From 6cf152140f1989741ac58e18b39a0b0b6b19ab83 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 28 Sep 2021 09:49:20 +0200 Subject: [PATCH 01/18] MAINT Move DistanceMetric under metrics --- doc/glossary.rst | 7 +-- doc/modules/classes.rst | 11 +++- doc/modules/density.rst | 2 +- sklearn/cluster/_agglomerative.py | 4 +- sklearn/cluster/_hierarchical_fast.pyx | 15 +++-- sklearn/cluster/tests/test_hierarchical.py | 5 +- sklearn/metrics/__init__.py | 3 + .../{neighbors => metrics}/_dist_metrics.pxd | 16 +++--- .../{neighbors => metrics}/_dist_metrics.pyx | 57 +++++++++---------- sklearn/metrics/pairwise.py | 2 +- sklearn/metrics/setup.py | 8 +++ .../tests/test_dist_metrics.py | 13 +---- sklearn/neighbors/__init__.py | 2 +- sklearn/neighbors/_binary_tree.pxi | 15 +++-- sklearn/neighbors/_classification.py | 8 +-- sklearn/neighbors/_distance_metric.py | 20 +++++++ sklearn/neighbors/_graph.py | 18 +++--- sklearn/neighbors/_partition_nodes.pxd | 2 +- sklearn/neighbors/_unsupervised.py | 4 +- sklearn/neighbors/setup.py | 13 ----- sklearn/neighbors/tests/test_ball_tree.py | 13 ++++- .../neighbors/tests/test_neighbors_tree.py | 2 +- sklearn/{neighbors => utils}/_typedefs.pxd | 0 sklearn/{neighbors => utils}/_typedefs.pyx | 0 sklearn/utils/setup.py | 7 +++ 25 files changed, 142 insertions(+), 105 deletions(-) rename sklearn/{neighbors => metrics}/_dist_metrics.pxd (87%) rename sklearn/{neighbors => metrics}/_dist_metrics.pyx (95%) rename sklearn/{neighbors => metrics}/tests/test_dist_metrics.py (95%) create mode 100644 sklearn/neighbors/_distance_metric.py rename sklearn/{neighbors => utils}/_typedefs.pxd (100%) rename sklearn/{neighbors => utils}/_typedefs.pyx (100%) diff --git a/doc/glossary.rst b/doc/glossary.rst index 010f16a361531..2b4c6af0d1866 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -644,9 +644,8 @@ General Concepts Note that for most distance metrics, we rely on implementations from :mod:`scipy.spatial.distance`, but may reimplement for 
efficiency in - our context. The :mod:`neighbors` module also duplicates some metric - implementations for integration with efficient binary tree search data - structures. + our context. The :class:`metrics.DistanceMetric` interface is used to implement + distance metrics for integration with efficient neighbors search. pd A shorthand for `Pandas `_ due to the @@ -1023,7 +1022,7 @@ such as: Further examples: -* :class:`neighbors.DistanceMetric` +* :class:`metrics.DistanceMetric` * :class:`gaussian_process.kernels.Kernel` * ``tree.Criterion`` diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 72b67b23e8dc3..b7000bcf7cbb2 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1058,6 +1058,16 @@ further details. metrics.consensus_score +Distance metrics +---------------- + +.. currentmodule:: sklearn + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + metrics.DistanceMetric Pairwise metrics ---------------- @@ -1317,7 +1327,6 @@ Model validation :template: class.rst neighbors.BallTree - neighbors.DistanceMetric neighbors.KDTree neighbors.KernelDensity neighbors.KNeighborsClassifier diff --git a/doc/modules/density.rst b/doc/modules/density.rst index 115d318183577..6440bf79ab729 100644 --- a/doc/modules/density.rst +++ b/doc/modules/density.rst @@ -136,7 +136,7 @@ The form of these kernels is as follows: :math:`K(x; h) \propto \cos(\frac{\pi x}{2h})` if :math:`x < h` The kernel density estimator can be used with any of the valid distance -metrics (see :class:`~sklearn.neighbors.DistanceMetric` for a list of available metrics), though +metrics (see :class:`~sklearn.metrics.DistanceMetric` for a list of available metrics), though the results are properly normalized only for the Euclidean metric. 
One particularly useful metric is the `Haversine distance `_ diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 6606f370b81eb..70b3a5028169b 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -16,8 +16,8 @@ from ..base import BaseEstimator, ClusterMixin from ..metrics.pairwise import paired_distances -from ..neighbors import DistanceMetric -from ..neighbors._dist_metrics import METRIC_MAPPING +from ..metrics import DistanceMetric +from ..metrics._dist_metrics import METRIC_MAPPING from ..utils import check_array from ..utils._fast_dict import IntFloatDict from ..utils.fixes import _astype_copy_false diff --git a/sklearn/cluster/_hierarchical_fast.pyx b/sklearn/cluster/_hierarchical_fast.pyx index 2a58757ce327d..11ea3294c086a 100644 --- a/sklearn/cluster/_hierarchical_fast.pyx +++ b/sklearn/cluster/_hierarchical_fast.pyx @@ -13,7 +13,7 @@ ctypedef np.int8_t INT8 np.import_array() -from ..neighbors._dist_metrics cimport DistanceMetric +from ..metrics._dist_metrics cimport DistanceMetric from ..utils._fast_dict cimport IntFloatDict # C++ @@ -236,8 +236,8 @@ def max_merge(IntFloatDict a, IntFloatDict b, def average_merge(IntFloatDict a, IntFloatDict b, np.ndarray[ITYPE_t, ndim=1] mask, ITYPE_t n_a, ITYPE_t n_b): - """Merge two IntFloatDicts with the average strategy: when the - same key is present in the two dicts, the weighted average of the two + """Merge two IntFloatDicts with the average strategy: when the + same key is present in the two dicts, the weighted average of the two values is used. 
Parameters @@ -290,13 +290,13 @@ def average_merge(IntFloatDict a, IntFloatDict b, ############################################################################### -# An edge object for fast comparisons +# An edge object for fast comparisons cdef class WeightedEdge: cdef public ITYPE_t a cdef public ITYPE_t b cdef public DTYPE_t weight - + def __init__(self, DTYPE_t weight, ITYPE_t a, ITYPE_t b): self.weight = weight self.a = a @@ -326,7 +326,7 @@ cdef class WeightedEdge: return self.weight > other.weight elif op == 5: return self.weight >= other.weight - + def __repr__(self): return "%s(weight=%f, a=%i, b=%i)" % (self.__class__.__name__, self.weight, @@ -475,7 +475,7 @@ def mst_linkage_core( dist_metric: DistanceMetric A DistanceMetric object conforming to the API from - ``sklearn.neighbors._dist_metrics.pxd`` that will be + ``sklearn.metrics._dist_metrics.pxd`` that will be used to compute distances. Returns @@ -534,4 +534,3 @@ def mst_linkage_core( current_node = new_node return np.array(result) - diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 92f92dc3736e3..3525643383c26 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -17,7 +17,7 @@ from scipy.sparse.csgraph import connected_components from sklearn.metrics.cluster import adjusted_rand_score -from sklearn.neighbors.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS +from sklearn.metrics.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS from sklearn.utils._testing import assert_almost_equal, create_memmap_backed_data from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import ignore_warnings @@ -31,6 +31,7 @@ _fix_connectivity, ) from sklearn.feature_extraction.image import grid_to_graph +from sklearn.metrics import DistanceMetric from sklearn.metrics.pairwise import ( PAIRED_DISTANCES, cosine_distances, @@ -38,7 +39,7 @@ pairwise_distances, ) from 
sklearn.metrics.cluster import normalized_mutual_info_score -from sklearn.neighbors import kneighbors_graph, DistanceMetric +from sklearn.neighbors import kneighbors_graph from sklearn.cluster._hierarchical_fast import ( average_merge, max_merge, diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 46958ea4ef7f8..e4339229c5b64 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -36,6 +36,8 @@ from ._classification import brier_score_loss from ._classification import multilabel_confusion_matrix +from ._dist_metrics import DistanceMetric + from . import cluster from .cluster import adjusted_mutual_info_score from .cluster import adjusted_rand_score @@ -115,6 +117,7 @@ "davies_bouldin_score", "DetCurveDisplay", "det_curve", + "DistanceMetric", "euclidean_distances", "explained_variance_score", "f1_score", diff --git a/sklearn/neighbors/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd similarity index 87% rename from sklearn/neighbors/_dist_metrics.pxd rename to sklearn/metrics/_dist_metrics.pxd index 5b223f8c6d8a8..61bb4fb2fe011 100644 --- a/sklearn/neighbors/_dist_metrics.pxd +++ b/sklearn/metrics/_dist_metrics.pxd @@ -1,14 +1,12 @@ -#!python -#cython: boundscheck=False -#cython: wraparound=False -#cython: cdivision=True +# cython: boundscheck=False +# cython: cdivision=True +# cython: initializedcheck=False +# cython: wraparound=False -cimport cython cimport numpy as np -from libc.math cimport fabs, sqrt, exp, cos, pow +from libc.math cimport sqrt, exp -from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t -from ._typedefs import DTYPE, ITYPE +from ..utils._typedefs cimport DTYPE_t, ITYPE_t ###################################################################### # Inline distance functions @@ -60,7 +58,7 @@ cdef class DistanceMetric: cdef DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1 - cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef DTYPE_t rdist(self, const DTYPE_t* 
x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1 cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1 diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx similarity index 95% rename from sklearn/neighbors/_dist_metrics.pyx rename to sklearn/metrics/_dist_metrics.pyx index 240a7a3f7d14d..eb1512fe25aef 100755 --- a/sklearn/neighbors/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1,8 +1,7 @@ -#!python -#cython: boundscheck=False -#cython: wraparound=False -#cython: initializedcheck=False -#cython: cdivision=True +# cython: boundscheck=False +# cython: cdivision=True +# cython: initializedcheck=False +# cython: wraparound=False # By Jake Vanderplas (2013) # written for the scikit-learn project @@ -19,7 +18,7 @@ cdef extern from "arrayobject.h": int typenum, void* data) -cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n): +cdef inline np.ndarray _buffer_to_ndarray(const DTYPE_t* x, np.npy_intp n): # Wrap a memory buffer with an ndarray. Warning: this is not robust. # In particular, if x is deallocated before the returned array goes # out of scope, this could cause memory errors. 
Since there is not @@ -33,8 +32,8 @@ cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n): from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin cdef DTYPE_t INF = np.inf -from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE -from ._typedefs import DTYPE, ITYPE +from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE +from ..utils._typedefs import DTYPE, ITYPE ###################################################################### @@ -98,7 +97,7 @@ cdef class DistanceMetric: Examples -------- - >>> from sklearn.neighbors import DistanceMetric + >>> from sklearn.metrics import DistanceMetric >>> dist = DistanceMetric.get_metric('euclidean') >>> X = [[0, 1, 2], [3, 4, 5]] @@ -291,14 +290,14 @@ cdef class DistanceMetric: cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: - """Compute the reduced distance between vectors x1 and x2. + """Compute the ranking-preserving distance between vectors x1 and x2. This can optionally be overridden in a base class. - The reduced distance is any measure that yields the same rank as the - distance, but is more efficient to compute. For example, for the - Euclidean metric, the reduced distance is the squared-euclidean - distance. + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For exampke, for the + Euclidean metric, the rank-preserving surrogate distance is the + squared-euclidean distance. 
""" return self.dist(x1, x2, size) @@ -323,25 +322,25 @@ cdef class DistanceMetric: return 0 cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1: - """Convert the reduced distance to the distance""" + """Convert the ranking-preserving distance to the distance""" return rdist cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - """Convert the distance to the reduced distance""" + """Convert the distance to the ranking-preserving distance""" return dist def rdist_to_dist(self, rdist): - """Convert the Reduced distance to the true distance. + """Convert the ranking-preserving distance to the true distance. - The reduced distance, defined for some metrics, is a computationally - more efficient measure which preserves the rank of the true distance. - For example, in the Euclidean distance metric, the reduced distance - is the squared-euclidean distance. + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For exampke, for the + Euclidean metric, the rank-preserving surrogate distance is the + squared-euclidean distance. Parameters ---------- rdist : double - Reduced distance. + Ranking-preserving distance. Returns ------- @@ -351,12 +350,12 @@ cdef class DistanceMetric: return rdist def dist_to_rdist(self, dist): - """Convert the true distance to the reduced distance. + """Convert the true distance to the rank-preserving surrogate distance. - The reduced distance, defined for some metrics, is a computationally - more efficient measure which preserves the rank of the true distance. - For example, in the Euclidean distance metric, the reduced distance - is the squared-euclidean distance. + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For exampke, for the + Euclidean metric, the rank-preserving surrogate distance is the + squared-euclidean distance. 
Parameters ---------- @@ -366,7 +365,7 @@ cdef class DistanceMetric: Returns ------- double - Reduced distance. + Ranking-preserving distance. """ return dist @@ -519,7 +518,7 @@ cdef class ChebyshevDistance(DistanceMetric): Examples -------- - >>> from sklearn.neighbors.dist_metrics import DistanceMetric + >>> from sklearn.metrics import DistanceMetric >>> dist = DistanceMetric.get_metric('chebyshev') >>> X = [[0, 1, 2], ... [3, 4, 5]] diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index d493ad68603ea..51cf80614cb3c 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -780,7 +780,7 @@ def haversine_distances(X, Y=None): array([[ 0. , 11099.54035582], [11099.54035582, 0. ]]) """ - from ..neighbors import DistanceMetric + from ..metrics import DistanceMetric return DistanceMetric.get_metric("haversine").pairwise(X, Y) diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py index df1a1caad17e0..69925a3590be6 100644 --- a/sklearn/metrics/setup.py +++ b/sklearn/metrics/setup.py @@ -1,4 +1,5 @@ import os +import numpy as np from numpy.distutils.misc_util import Configuration @@ -18,6 +19,13 @@ def configuration(parent_package="", top_path=None): "_pairwise_fast", sources=["_pairwise_fast.pyx"], libraries=libraries ) + config.add_extension( + "_dist_metrics", + sources=["_dist_metrics.pyx"], + include_dirs=[np.get_include(), os.path.join(np.get_include(), "numpy")], + libraries=libraries, + ) + config.add_subpackage("tests") return config diff --git a/sklearn/neighbors/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py similarity index 95% rename from sklearn/neighbors/tests/test_dist_metrics.py rename to sklearn/metrics/tests/test_dist_metrics.py index 08298f087c216..9440abba6f848 100644 --- a/sklearn/neighbors/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -7,8 +7,7 @@ import pytest from scipy.spatial.distance import cdist -from sklearn.neighbors
import DistanceMetric -from sklearn.neighbors import BallTree +from sklearn.metrics import DistanceMetric from sklearn.utils import check_random_state from sklearn.utils._testing import create_memmap_backed_data from sklearn.utils.fixes import sp_version, parse_version @@ -230,16 +229,6 @@ def test_pyfunc_metric(): assert_array_almost_equal(D1_pkl, D2_pkl) -def test_bad_pyfunc_metric(): - def wrong_distance(x, y): - return "1" - - X = np.ones((5, 2)) - msg = "Custom distance function must accept two vectors" - with pytest.raises(TypeError, match=msg): - BallTree(X, metric=wrong_distance) - - def test_input_data_size(): # Regression test for #6288 # Previously, a metric requiring a particular input dimension would fail diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index 8a0934eecf142..340910008f75c 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -5,7 +5,7 @@ from ._ball_tree import BallTree from ._kd_tree import KDTree -from ._dist_metrics import DistanceMetric +from ._distance_metric import DistanceMetric from ._graph import kneighbors_graph, radius_neighbors_graph from ._graph import KNeighborsTransformer, RadiusNeighborsTransformer from ._unsupervised import NearestNeighbors diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 9f90414994550..f25da86e2148c 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -153,11 +153,16 @@ import numpy as np import warnings from ..utils import check_array -from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t -from ._typedefs import DTYPE, ITYPE +from sklearn.utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t +from sklearn.utils._typedefs import DTYPE, ITYPE -from ._dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist, - euclidean_dist_to_rdist, euclidean_rdist_to_dist) +from ..metrics._dist_metrics cimport ( + DistanceMetric, + euclidean_dist, + euclidean_rdist, + 
euclidean_dist_to_rdist, + euclidean_rdist_to_dist, +) from ._partition_nodes cimport partition_node_indices @@ -878,7 +883,7 @@ def newObj(obj): ###################################################################### # define the reverse mapping of VALID_METRICS -from ._dist_metrics import get_valid_metric_ids +from sklearn.metrics._dist_metrics import get_valid_metric_ids VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS) diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index ced21c7885962..08790cd1976bb 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -67,8 +67,8 @@ class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase): metric : str or callable, default='minkowski' The distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`DistanceMetric` for a - list of available metrics. + metric. For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. @@ -348,8 +348,8 @@ class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, Neighbors metric : str or callable, default='minkowski' Distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`DistanceMetric` for a - list of available metrics. + metric. For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. 
diff --git a/sklearn/neighbors/_distance_metric.py b/sklearn/neighbors/_distance_metric.py new file mode 100644 index 0000000000000..10d6e24139068 --- /dev/null +++ b/sklearn/neighbors/_distance_metric.py @@ -0,0 +1,20 @@ +# TODO: Remove this file in 1.2 +import warnings + +from ..metrics import DistanceMetric as _DistanceMetric + + +class DistanceMetric(_DistanceMetric): + @classmethod + def _warn(cls): + warnings.warn( + "sklearn.neighbors.DistanceMetric has been moved " + "to sklearn.metrics.DistanceMetric in 1.0. " + "This import path will be removed in 1.2", + category=FutureWarning, + ) + + @classmethod + def get_metric(cls, metric, **kwargs): + DistanceMetric._warn() + return _DistanceMetric.get_metric(metric, **kwargs) diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index e6fdeffe3b291..9afa37b71a808 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -65,10 +65,11 @@ def kneighbors_graph( between neighbors according to the given metric. metric : str, default='minkowski' - The distance metric used to calculate the k-Neighbors for each sample - point. The DistanceMetric class gives a list of available metrics. - The default distance is 'euclidean' ('minkowski' metric with the p - param equal to 2.) + The distance metric to use for the tree. The default metric is + minkowski, and with p=2 is equivalent to the standard Euclidean + metric. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. p : int, default=2 Power parameter for the Minkowski metric. When p = 1, this is @@ -157,10 +158,11 @@ def radius_neighbors_graph( between neighbors according to the given metric. metric : str, default='minkowski' - The distance metric used to calculate the neighbors within a - given radius for each sample point. The DistanceMetric class - gives a list of available metrics. The default distance is - 'euclidean' ('minkowski' metric with the param equal to 2.) 
+ The distance metric to use for the tree. The default metric is + minkowski, and with p=2 is equivalent to the standard Euclidean + metric. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. p : int, default=2 Power parameter for the Minkowski metric. When p = 1, this is diff --git a/sklearn/neighbors/_partition_nodes.pxd b/sklearn/neighbors/_partition_nodes.pxd index 522e826632824..94b02002d7a1e 100644 --- a/sklearn/neighbors/_partition_nodes.pxd +++ b/sklearn/neighbors/_partition_nodes.pxd @@ -1,4 +1,4 @@ -from ._typedefs cimport DTYPE_t, ITYPE_t +from ..utils._typedefs cimport DTYPE_t, ITYPE_t cdef int partition_node_indices( DTYPE_t *data, diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 6b6eec1a3112b..440ac41eb71d5 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -41,8 +41,8 @@ class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase): metric : str or callable, default='minkowski' The distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`DistanceMetric` for a - list of available metrics. + metric. For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. 
diff --git a/sklearn/neighbors/setup.py b/sklearn/neighbors/setup.py index 85305efc29c78..aa19ba501b18d 100644 --- a/sklearn/neighbors/setup.py +++ b/sklearn/neighbors/setup.py @@ -32,19 +32,6 @@ def configuration(parent_package="", top_path=None): libraries=libraries, ) - config.add_extension( - "_dist_metrics", - sources=["_dist_metrics.pyx"], - include_dirs=[numpy.get_include(), os.path.join(numpy.get_include(), "numpy")], - libraries=libraries, - ) - - config.add_extension( - "_typedefs", - sources=["_typedefs.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) config.add_extension( "_quad_tree", sources=["_quad_tree.pyx"], diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index c751539f2a1ae..a823a03251a1b 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -4,7 +4,6 @@ import pytest from numpy.testing import assert_array_almost_equal from sklearn.neighbors._ball_tree import BallTree -from sklearn.neighbors import DistanceMetric from sklearn.utils import check_random_state from sklearn.utils.validation import check_array from sklearn.utils._testing import _convert_container @@ -40,6 +39,8 @@ def brute_force_neighbors(X, Y, k, metric, **kwargs): + from sklearn.metrics import DistanceMetric + X, Y = check_array(X), check_array(Y) D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X) ind = np.argsort(D, axis=1)[:, :k] @@ -84,3 +85,13 @@ def test_array_object_type(): X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): BallTree(X) + + +def test_bad_pyfunc_metric(): + def wrong_distance(x, y): + return "1" + + X = np.ones((5, 2)) + msg = "Custom distance function must accept two vectors" + with pytest.raises(TypeError, match=msg): + BallTree(X, metric=wrong_distance) diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py 
b/sklearn/neighbors/tests/test_neighbors_tree.py index de34b4d230171..e043ffb730708 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from sklearn.neighbors import DistanceMetric +from sklearn.metrics import DistanceMetric from sklearn.neighbors._ball_tree import ( BallTree, kernel_norm, diff --git a/sklearn/neighbors/_typedefs.pxd b/sklearn/utils/_typedefs.pxd similarity index 100% rename from sklearn/neighbors/_typedefs.pxd rename to sklearn/utils/_typedefs.pxd diff --git a/sklearn/neighbors/_typedefs.pyx b/sklearn/utils/_typedefs.pyx similarity index 100% rename from sklearn/neighbors/_typedefs.pyx rename to sklearn/utils/_typedefs.pyx diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py index c75cbe2d86495..ed78ecc5db76f 100644 --- a/sklearn/utils/setup.py +++ b/sklearn/utils/setup.py @@ -88,6 +88,13 @@ def configuration(parent_package="", top_path=None): libraries=libraries, ) + config.add_extension( + "_typedefs", + sources=["_typedefs.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + ) + config.add_subpackage("tests") return config From 2e0fff9f1116624872d92948a126405c46694025 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 28 Sep 2021 11:21:23 +0200 Subject: [PATCH 02/18] Add whats_new entry --- doc/whats_new/v1.1.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 3aabed6214771..23b03d4bcb027 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -45,6 +45,15 @@ Changelog message when the solver does not support sparse matrices with int64 indices. :pr:`21093` by `Tom Dupre la Tour`_. +:mod:`sklearn.metrics` +...................... + +- |API| :class:`metrics.DistanceMetric` has been moved from + :mod:`sklearn.neighbors` to :mod:`sklearn.metric`. 
+ Using :class:`neighbors.DistanceMetric` for imports is still valid for + backward compatibility, but this interface will be removed in 1.2. + :pr:`21177` by :user:`Julien Jerphanion `. + :mod:`sklearn.utils` .................... From ec7ca1b8fd5dcc89d628db4afc9310bb0eeff000 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 28 Sep 2021 11:39:38 +0200 Subject: [PATCH 03/18] Add a test for the deprecation cycle --- sklearn/neighbors/tests/test_neighbors.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 1e1f3a082786e..7c37cb55d7768 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1818,3 +1818,12 @@ def test_pairwise_deprecated(NearestNeighbors): msg = r"Attribute `_pairwise` was deprecated in version 0\.24" with pytest.warns(FutureWarning, match=msg): nn._pairwise + + +# TODO: Remove in 1.2 +def test_neighbors_distance_metric_deprecation(): + from sklearn.neighbors import DistanceMetric + + msg = r"This import path will be removed in 1\.2" + with pytest.warns(FutureWarning, match=msg): + DistanceMetric.get_metric("euclidean") From 4dbe6518395374757247e2c3a9fdae748622fdf2 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 28 Sep 2021 13:35:45 +0200 Subject: [PATCH 04/18] Add a space to make Sphinx happy --- doc/whats_new/v1.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 23b03d4bcb027..8d9eba46a9069 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -52,7 +52,7 @@ Changelog :mod:`sklearn.neighbors` to :mod:`sklearn.metric`. Using :class:`neighbors.DistanceMetric` for imports is still valid for backward compatibility, but this interface will be removed in 1.2. - :pr:`21177` by :user:`Julien Jerphanion `. + :pr:`21177` by :user:`Julien Jerphanion `. :mod:`sklearn.utils` .................... 
From a3e03cf4802c480edf757b10e66517778801d152 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 28 Sep 2021 18:41:05 +0200 Subject: [PATCH 05/18] Apply suggestions from review Co-authored-by: Olivier Grisel --- doc/modules/density.rst | 6 +++--- doc/whats_new/v1.1.rst | 2 +- sklearn/neighbors/_distance_metric.py | 4 ++-- sklearn/neighbors/tests/test_ball_tree.py | 10 ---------- sklearn/neighbors/tests/test_neighbors.py | 9 ++++++--- 5 files changed, 12 insertions(+), 19 deletions(-) diff --git a/doc/modules/density.rst b/doc/modules/density.rst index 6440bf79ab729..9e542b803ef68 100644 --- a/doc/modules/density.rst +++ b/doc/modules/density.rst @@ -136,9 +136,9 @@ The form of these kernels is as follows: :math:`K(x; h) \propto \cos(\frac{\pi x}{2h})` if :math:`x < h` The kernel density estimator can be used with any of the valid distance -metrics (see :class:`~sklearn.metrics.DistanceMetric` for a list of available metrics), though -the results are properly normalized only for the Euclidean metric. One -particularly useful metric is the +metrics (see :class:`~sklearn.metrics.DistanceMetric` for a list of +available metrics), though the results are properly normalized only +for the Euclidean metric. One particularly useful metric is the `Haversine distance `_ which measures the angular distance between points on a sphere. Here is an example of using a kernel density estimate for a visualization diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 8d9eba46a9069..99e1a87ca3017 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -51,7 +51,7 @@ Changelog - |API| :class:`metrics.DistanceMetric` has been moved from :mod:`sklearn.neighbors` to :mod:`sklearn.metric`. Using :class:`neighbors.DistanceMetric` for imports is still valid for - backward compatibility, but this interface will be removed in 1.2. + backward compatibility, but this alias will be removed in 1.3. :pr:`21177` by :user:`Julien Jerphanion `. 
:mod:`sklearn.utils` diff --git a/sklearn/neighbors/_distance_metric.py b/sklearn/neighbors/_distance_metric.py index 10d6e24139068..c973425d2e7b6 100644 --- a/sklearn/neighbors/_distance_metric.py +++ b/sklearn/neighbors/_distance_metric.py @@ -1,4 +1,4 @@ -# TODO: Remove this file in 1.2 +# TODO: Remove this file in 1.3 import warnings from ..metrics import DistanceMetric as _DistanceMetric @@ -10,7 +10,7 @@ def _warn(cls): warnings.warn( "sklearn.neighbors.DistanceMetric has been moved " "to sklearn.metrics.DistanceMetric in 1.0. " - "This import path will be removed in 1.2", + "This import path will be removed in 1.3", category=FutureWarning, ) diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index a823a03251a1b..41ccff25a260e 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -85,13 +85,3 @@ def test_array_object_type(): X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): BallTree(X) - - -def test_bad_pyfunc_metric(): - def wrong_distance(x, y): - return "1" - - X = np.ones((5, 2)) - msg = "Custom distance function must accept two vectors" - with pytest.raises(TypeError, match=msg): - BallTree(X, metric=wrong_distance) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 7c37cb55d7768..82144115ffbf3 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1820,10 +1820,13 @@ def test_pairwise_deprecated(NearestNeighbors): nn._pairwise -# TODO: Remove in 1.2 +# TODO: Remove in 1.3 def test_neighbors_distance_metric_deprecation(): from sklearn.neighbors import DistanceMetric + from sklearn.metrics import DistanceMetric as ActualDistanceMetric - msg = r"This import path will be removed in 1\.2" + msg = r"This import path will be removed in 1\.3" with 
pytest.warns(FutureWarning, match=msg): - DistanceMetric.get_metric("euclidean") + dist_metric = DistanceMetric.get_metric("euclidean") + + assert isinstance(dist_metric, ActualDistanceMetric) From b6e54bad9f0d831a0cb285705694c6dae5729cc1 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 6 Oct 2021 14:30:54 +0200 Subject: [PATCH 06/18] DOC Fix formatting in doc/whats_new/v1.1.rst --- doc/whats_new/v1.1.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 4374129315395..355830aa274b8 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -61,6 +61,7 @@ Changelog Using :class:`neighbors.DistanceMetric` for imports is still valid for backward compatibility, but this alias will be removed in 1.3. :pr:`21177` by :user:`Julien Jerphanion `. + :mod:`sklearn.model_selection` .............................. From cd3cd5d75a37ce06d7e7f2ce09bb6068bb63f3fc Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 8 Oct 2021 08:19:10 -0400 Subject: [PATCH 07/18] Fix formatting Co-authored-by: Roman Yurchak --- doc/whats_new/v1.1.rst | 2 +- sklearn/metrics/_dist_metrics.pyx | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 355830aa274b8..5bece85ad9f54 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -58,7 +58,7 @@ Changelog - |API| :class:`metrics.DistanceMetric` has been moved from :mod:`sklearn.neighbors` to :mod:`sklearn.metric`. - Using :class:`neighbors.DistanceMetric` for imports is still valid for + Using `neighbors.DistanceMetric` for imports is still valid for backward compatibility, but this alias will be removed in 1.3. :pr:`21177` by :user:`Julien Jerphanion `. 
diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index eb1512fe25aef..c592c1d8c5d4a 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -295,7 +295,7 @@ cdef class DistanceMetric: This can optionally be overridden in a base class. The rank-preserving surrogate distance is any measure that yields the same - rank as the distance, but is more efficient to compute. For exampke, for the + rank as the distance, but is more efficient to compute. For example, for the Euclidean metric, the rank-preserving surrogate distance is the squared-euclidean distance. """ @@ -333,7 +333,7 @@ cdef class DistanceMetric: """Convert the ranking-preserving distance to the true distance. The rank-preserving surrogate distance is any measure that yields the same - rank as the distance, but is more efficient to compute. For exampke, for the + rank as the distance, but is more efficient to compute. For example, for the Euclidean metric, the rank-preserving surrogate distance is the squared-euclidean distance. @@ -353,7 +353,7 @@ cdef class DistanceMetric: """Convert the true distance to the rank-preserving surrogate distance. The rank-preserving surrogate distance is any measure that yields the same - rank as the distance, but is more efficient to compute. For exampke, for the + rank as the distance, but is more efficient to compute. For example, for the Euclidean metric, the rank-preserving surrogate distance is the squared-euclidean distance. From cb8223b0f2209d71b6c099739f5cf160641c83bb Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 6 Oct 2021 15:05:57 +0200 Subject: [PATCH 08/18] FIX out of bound error in split_indices (#21130) Co-authored-by: Thomas J. 
Fan Co-authored-by: Olivier Grisel --- doc/whats_new/v1.1.rst | 9 ++++++- .../_hist_gradient_boosting/splitting.pyx | 26 ++++++++++++++----- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 5bece85ad9f54..1a9f773ce08df 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -38,7 +38,6 @@ Changelog :pr:`123456` by :user:`Joe Bloggs `. where 123456 is the *pull request* number, not the issue number. - :mod:`sklearn.calibration` .......................... @@ -46,6 +45,14 @@ Changelog `pos_label` to specify the positive class label. :pr:`21032` by :user:`Guillaume Lemaitre `. +:mod:`sklearn.ensemble` +........................... + +- |Fix| Fixed a bug that could produce a segfault in rare cases for + :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor`. + :pr:`21130` :user:`Christian Lorentzen `. + :mod:`sklearn.linear_model` ........................... diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 08ae7aaf0862c..232cf094876cb 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -388,11 +388,25 @@ cdef class Splitter: &left_indices_buffer[offset_in_buffers[thread_idx]], sizeof(unsigned int) * left_counts[thread_idx] ) - memcpy( - &sample_indices[right_offset[thread_idx]], - &right_indices_buffer[offset_in_buffers[thread_idx]], - sizeof(unsigned int) * right_counts[thread_idx] - ) + if right_counts[thread_idx] > 0: + # If we're splitting the rightmost node of the tree, i.e. 
the + # rightmost node in the partition array, and if n_threads >= 2, one + # might have right_counts[-1] = 0 and right_offset[-1] = len(sample_indices) + # leading to evaluating + # + # &sample_indices[right_offset[-1]] = &samples_indices[n_samples_at_node] + # = &partition[n_samples_in_tree] + # + # which is an out-of-bounds read access that can cause a segmentation fault. + # When boundscheck=True, removing this check produces this exception: + # + # IndexError: Out of bounds on buffer access + # + memcpy( + &sample_indices[right_offset[thread_idx]], + &right_indices_buffer[offset_in_buffers[thread_idx]], + sizeof(unsigned int) * right_counts[thread_idx] + ) return (sample_indices[:right_child_position], sample_indices[right_child_position:], @@ -839,7 +853,7 @@ cdef class Splitter: # other category. The low-support categories will always be mapped to # the right child. We scan the sorted categories array from left to # right and from right to left, and we stop at the middle. - + # Considering ordered categories A B C D, with E being a low-support # category: A B C D # ^ From 7cc80df0671851fbda57ed85ad33950d325974d2 Mon Sep 17 00:00:00 2001 From: Helder Geovane Gomes de Lima Date: Wed, 6 Oct 2021 11:21:15 -0300 Subject: [PATCH 09/18] DOC Remove unused import from example (#21253) --- sklearn/feature_extraction/text.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index a0b74a60dab4d..8dd743813fa27 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1557,7 +1557,6 @@ class TfidfTransformer(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator): >>> from sklearn.feature_extraction.text import TfidfTransformer >>> from sklearn.feature_extraction.text import CountVectorizer >>> from sklearn.pipeline import Pipeline - >>> import numpy as np >>> corpus = ['this is the first document', ... 'this document is the second document', ... 
'and this is the third one', From b1223e7e4598b77b093730ab04d43ef13b008096 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 7 Oct 2021 04:13:00 -0400 Subject: [PATCH 10/18] MAINT Enable and run black on examples (#20502) --- .../applications/plot_digits_denoising.py | 29 ++- .../applications/plot_face_recognition.py | 38 ++-- .../plot_model_complexity_influence.py | 161 +++++++------ .../plot_out_of_core_classification.py | 169 +++++++------- .../plot_outlier_detection_wine.py | 71 +++--- .../applications/plot_prediction_latency.py | 211 ++++++++++-------- .../plot_species_distribution_modeling.py | 91 +++++--- examples/applications/plot_stock_market.py | 209 +++++++++-------- .../plot_tomography_l1_reconstruction.py | 32 ++- .../plot_topics_extraction_with_nmf_lda.py | 97 ++++---- examples/applications/svm_gui.py | 140 ++++++++---- .../wikipedia_principal_eigenvector.py | 18 +- .../bicluster/plot_bicluster_newsgroups.py | 86 ++++--- .../bicluster/plot_spectral_biclustering.py | 17 +- .../bicluster/plot_spectral_coclustering.py | 7 +- examples/calibration/plot_calibration.py | 68 +++--- .../calibration/plot_calibration_curve.py | 90 +++++--- .../plot_calibration_multiclass.py | 155 ++++++++----- .../calibration/plot_compare_calibration.py | 38 ++-- .../plot_classification_probability.py | 39 ++-- .../plot_classifier_comparison.py | 84 ++++--- .../plot_digits_classification.py | 17 +- examples/classification/plot_lda.py | 58 +++-- examples/classification/plot_lda_qda.py | 116 ++++++---- .../plot_adjusted_for_chance_measures.py | 51 +++-- examples/cluster/plot_affinity_propagation.py | 40 ++-- .../cluster/plot_agglomerative_clustering.py | 36 +-- .../plot_agglomerative_clustering_metrics.py | 66 +++--- .../cluster/plot_agglomerative_dendrogram.py | 9 +- .../cluster/plot_birch_vs_minibatchkmeans.py | 43 ++-- examples/cluster/plot_cluster_comparison.py | 195 ++++++++++------ examples/cluster/plot_cluster_iris.py | 54 ++--- 
examples/cluster/plot_coin_segmentation.py | 19 +- .../cluster/plot_coin_ward_segmentation.py | 20 +- examples/cluster/plot_color_quantization.py | 16 +- examples/cluster/plot_dbscan.py | 50 +++-- examples/cluster/plot_dict_face_patches.py | 19 +- examples/cluster/plot_digits_agglomeration.py | 23 +- examples/cluster/plot_digits_linkage.py | 20 +- examples/cluster/plot_face_compress.py | 11 +- ...e_agglomeration_vs_univariate_selection.py | 21 +- examples/cluster/plot_kmeans_assumptions.py | 9 +- examples/cluster/plot_kmeans_digits.py | 58 +++-- examples/cluster/plot_kmeans_plusplus.py | 17 +- .../plot_kmeans_silhouette_analysis.py | 66 ++++-- .../plot_kmeans_stability_low_dim_dense.py | 54 +++-- examples/cluster/plot_linkage_comparison.py | 96 +++++--- examples/cluster/plot_mean_shift.py | 16 +- examples/cluster/plot_mini_batch_kmeans.py | 65 +++--- examples/cluster/plot_optics.py | 60 ++--- examples/cluster/plot_segmentation_toy.py | 8 +- .../plot_ward_structured_vs_unstructured.py | 36 ++- examples/compose/plot_column_transformer.py | 126 ++++++----- .../plot_column_transformer_mixed_types.py | 69 +++--- examples/compose/plot_compare_reduction.py | 48 ++-- examples/compose/plot_digits_pipe.py | 38 ++-- examples/compose/plot_feature_union.py | 8 +- examples/compose/plot_transformed_target.py | 137 +++++++----- .../covariance/plot_covariance_estimation.py | 53 +++-- examples/covariance/plot_lw_vs_oas.py | 45 +++- .../covariance/plot_mahalanobis_distances.py | 86 ++++--- .../plot_robust_vs_empirical_covariance.py | 90 +++++--- examples/covariance/plot_sparse_cov.py | 62 ++--- .../plot_compare_cross_decomposition.py | 56 ++--- .../cross_decomposition/plot_pcr_vs_pls.py | 55 +++-- examples/datasets/plot_digits_last_image.py | 2 +- examples/datasets/plot_iris_dataset.py | 22 +- examples/datasets/plot_random_dataset.py | 49 ++-- .../plot_random_multilabel_dataset.py | 77 ++++--- .../decomposition/plot_beta_divergence.py | 4 +- 
.../decomposition/plot_faces_decomposition.py | 206 ++++++++++------- .../plot_ica_blind_source_separation.py | 12 +- examples/decomposition/plot_ica_vs_pca.py | 36 +-- .../decomposition/plot_image_denoising.py | 75 ++++--- .../decomposition/plot_incremental_pca.py | 14 +- examples/decomposition/plot_kernel_pca.py | 36 ++- examples/decomposition/plot_pca_3d.py | 13 +- examples/decomposition/plot_pca_iris.py | 20 +- .../plot_pca_vs_fa_model_selection.py | 68 +++--- examples/decomposition/plot_pca_vs_lda.py | 26 ++- examples/decomposition/plot_sparse_coding.py | 79 ++++--- examples/decomposition/plot_varimax_fa.py | 8 +- .../ensemble/plot_adaboost_hastie_10_2.py | 56 +++-- examples/ensemble/plot_adaboost_multiclass.py | 68 +++--- examples/ensemble/plot_adaboost_regression.py | 5 +- examples/ensemble/plot_adaboost_twoclass.py | 70 +++--- examples/ensemble/plot_bias_variance.py | 39 ++-- examples/ensemble/plot_ensemble_oob.py | 49 ++-- examples/ensemble/plot_forest_importances.py | 29 ++- .../ensemble/plot_forest_importances_faces.py | 6 +- examples/ensemble/plot_forest_iris.py | 63 +++--- .../plot_gradient_boosting_categorical.py | 88 +++++--- .../plot_gradient_boosting_early_stopping.py | 63 +++--- .../ensemble/plot_gradient_boosting_oob.py | 37 +-- .../plot_gradient_boosting_quantile.py | 98 ++++---- .../plot_gradient_boosting_regression.py | 56 +++-- .../plot_gradient_boosting_regularization.py | 53 +++-- examples/ensemble/plot_isolation_forest.py | 20 +- .../ensemble/plot_monotonic_constraints.py | 4 +- .../ensemble/plot_random_forest_embedding.py | 19 +- ...ot_random_forest_regression_multioutput.py | 51 +++-- examples/ensemble/plot_stack_predictors.py | 119 ++++++---- .../ensemble/plot_voting_decision_regions.py | 27 +-- examples/ensemble/plot_voting_probas.py | 50 +++-- examples/ensemble/plot_voting_regressor.py | 21 +- examples/exercises/plot_cv_diabetes.py | 26 ++- examples/exercises/plot_cv_digits.py | 11 +- .../plot_digits_classification_exercise.py | 16 +- 
examples/exercises/plot_iris_exercise.py | 32 ++- .../feature_selection/plot_f_test_vs_mi.py | 5 +- .../plot_feature_selection.py | 43 ++-- .../plot_feature_selection_pipeline.py | 12 +- .../plot_rfe_with_cross_validation.py | 30 ++- .../plot_select_from_model_diabetes.py | 25 ++- .../gaussian_process/plot_compare_gpr_krr.py | 41 ++-- examples/gaussian_process/plot_gpc.py | 82 ++++--- examples/gaussian_process/plot_gpc_iris.py | 17 +- .../plot_gpc_isoprobability.py | 57 +++-- examples/gaussian_process/plot_gpc_xor.py | 30 +-- examples/gaussian_process/plot_gpr_co2.py | 62 ++--- examples/gaussian_process/plot_gpr_noisy.py | 74 +++--- .../plot_gpr_noisy_targets.py | 53 +++-- .../plot_gpr_on_structured_data.py | 112 ++++++---- ...t_iterative_imputer_variants_comparison.py | 35 ++- examples/impute/plot_missing_values.py | 129 ++++++----- .../inspection/plot_permutation_importance.py | 62 ++--- ...t_permutation_importance_multicollinear.py | 32 +-- .../plot_scalable_poly_kernels.py | 99 +++++--- examples/linear_model/plot_ard.py | 37 +-- examples/linear_model/plot_bayesian_ridge.py | 41 ++-- .../plot_bayesian_ridge_curvefit.py | 22 +- ...puted_gram_matrix_with_weighted_samples.py | 2 +- examples/linear_model/plot_huber_vs_ridge.py | 19 +- examples/linear_model/plot_iris_logistic.py | 12 +- .../linear_model/plot_lasso_and_elasticnet.py | 44 ++-- .../plot_lasso_dense_vs_sparse_data.py | 12 +- examples/linear_model/plot_lasso_lars.py | 12 +- .../plot_lasso_model_selection.py | 84 ++++--- examples/linear_model/plot_logistic.py | 21 +- .../plot_logistic_l1_l2_sparsity.py | 34 +-- .../linear_model/plot_logistic_multinomial.py | 23 +- examples/linear_model/plot_logistic_path.py | 24 +- .../plot_multi_task_lasso_support.py | 38 ++-- examples/linear_model/plot_nnls.py | 4 +- examples/linear_model/plot_ols.py | 12 +- examples/linear_model/plot_ols_3d.py | 26 ++- .../linear_model/plot_ols_ridge_variance.py | 23 +- examples/linear_model/plot_omp.py | 26 +-- 
...plot_poisson_regression_non_normal_loss.py | 203 +++++++++-------- .../plot_polynomial_interpolation.py | 37 +-- .../linear_model/plot_quantile_regression.py | 20 +- examples/linear_model/plot_ransac.py | 34 ++- examples/linear_model/plot_ridge_coeffs.py | 25 ++- examples/linear_model/plot_ridge_path.py | 12 +- examples/linear_model/plot_robust_fit.py | 58 +++-- examples/linear_model/plot_sgd_comparison.py | 21 +- .../linear_model/plot_sgd_early_stopping.py | 59 ++--- examples/linear_model/plot_sgd_iris.py | 23 +- .../linear_model/plot_sgd_loss_functions.py | 33 +-- examples/linear_model/plot_sgd_penalties.py | 30 +-- .../plot_sgd_separating_hyperplane.py | 9 +- .../linear_model/plot_sgd_weighted_samples.py | 22 +- .../linear_model/plot_sgdocsvm_vs_ocsvm.py | 105 +++++---- ...sparse_logistic_regression_20newsgroups.py | 85 +++---- .../plot_sparse_logistic_regression_mnist.py | 24 +- examples/linear_model/plot_theilsen.py | 48 ++-- ...lot_tweedie_regression_insurance_claims.py | 149 +++++++------ examples/manifold/plot_compare_methods.py | 39 ++-- examples/manifold/plot_manifold_sphere.py | 57 +++-- examples/manifold/plot_mds.py | 45 ++-- examples/manifold/plot_swissroll.py | 11 +- examples/manifold/plot_t_sne_perplexity.py | 35 +-- .../miscellaneous/plot_anomaly_comparison.py | 81 ++++--- .../plot_changed_only_pprint_parameter.py | 6 +- .../plot_display_object_visualization.py | 5 +- .../miscellaneous/plot_isotonic_regression.py | 16 +- .../plot_johnson_lindenstrauss_bound.py | 47 ++-- .../plot_kernel_approximation.py | 91 ++++---- .../plot_kernel_ridge_regression.py | 122 ++++++---- examples/miscellaneous/plot_multilabel.py | 60 +++-- .../plot_multioutput_face_completion.py | 37 +-- ...ot_partial_dependence_visualization_api.py | 25 ++- examples/mixture/plot_concentration_prior.py | 125 +++++++---- examples/mixture/plot_gmm.py | 44 ++-- examples/mixture/plot_gmm_covariances.py | 53 +++-- examples/mixture/plot_gmm_pdf.py | 21 +- 
examples/mixture/plot_gmm_selection.py | 67 +++--- examples/mixture/plot_gmm_sin.py | 119 ++++++---- .../grid_search_text_feature_extraction.py | 32 +-- .../model_selection/plot_confusion_matrix.py | 16 +- examples/model_selection/plot_cv_indices.py | 122 +++++++--- examples/model_selection/plot_cv_predict.py | 6 +- .../plot_grid_search_digits.py | 25 +-- .../plot_grid_search_refit_callable.py | 62 ++--- .../model_selection/plot_grid_search_stats.py | 114 +++++----- .../model_selection/plot_learning_curve.py | 86 ++++--- .../plot_multi_metric_evaluation.py | 71 +++--- .../plot_nested_cross_validation_iris.py | 38 ++-- .../model_selection/plot_randomized_search.py | 49 ++-- examples/model_selection/plot_roc.py | 105 +++++---- .../plot_successive_halving_heatmap.py | 58 ++--- .../plot_successive_halving_iterations.py | 40 ++-- .../plot_train_error_vs_test_error.py | 25 ++- .../plot_underfitting_overfitting.py | 25 ++- .../model_selection/plot_validation_curve.py | 42 +++- .../plot_classifier_chain_yeast.py | 64 +++--- .../approximate_nearest_neighbors.py | 155 +++++++------ .../plot_caching_nearest_neighbors.py | 31 +-- examples/neighbors/plot_classification.py | 26 ++- .../neighbors/plot_digits_kde_sampling.py | 14 +- examples/neighbors/plot_kde_1d.py | 75 ++++--- .../neighbors/plot_lof_novelty_detection.py | 33 +-- .../neighbors/plot_lof_outlier_detection.py | 16 +- examples/neighbors/plot_nca_classification.py | 61 +++-- examples/neighbors/plot_nca_dim_reduction.py | 32 ++- examples/neighbors/plot_nca_illustration.py | 34 +-- examples/neighbors/plot_nearest_centroid.py | 19 +- examples/neighbors/plot_regression.py | 11 +- examples/neighbors/plot_species_kde.py | 40 ++-- examples/neural_networks/plot_mlp_alpha.py | 78 ++++--- .../plot_mlp_training_curves.py | 114 +++++++--- .../neural_networks/plot_mnist_filters.py | 22 +- .../plot_rbm_logistic_classification.py | 58 +++-- examples/preprocessing/plot_all_scaling.py | 145 +++++++----- 
examples/preprocessing/plot_discretization.py | 37 +-- .../plot_discretization_classification.py | 142 +++++++----- .../plot_discretization_strategies.py | 41 ++-- .../preprocessing/plot_map_data_to_normal.py | 56 ++--- .../preprocessing/plot_scaling_importance.py | 73 +++--- .../plot_release_highlights_0_22_0.py | 53 ++--- .../plot_release_highlights_0_23_0.py | 47 ++-- .../plot_release_highlights_0_24_0.py | 60 +++-- .../plot_label_propagation_digits.py | 15 +- ...abel_propagation_digits_active_learning.py | 48 ++-- .../plot_label_propagation_structure.py | 61 +++-- .../plot_self_training_varying_threshold.py | 47 ++-- .../plot_semi_supervised_newsgroups.py | 63 +++--- .../plot_semi_supervised_versus_svm_iris.py | 37 +-- examples/svm/plot_custom_kernel.py | 9 +- examples/svm/plot_iris_svc.py | 34 +-- .../svm/plot_linearsvc_support_vectors.py | 29 ++- examples/svm/plot_oneclass.py | 36 +-- examples/svm/plot_rbf_parameters.py | 40 ++-- examples/svm/plot_separating_hyperplane.py | 17 +- .../plot_separating_hyperplane_unbalanced.py | 28 ++- examples/svm/plot_svm_anova.py | 19 +- examples/svm/plot_svm_kernels.py | 65 +++--- examples/svm/plot_svm_margin.py | 36 +-- examples/svm/plot_svm_nonlinear.py | 22 +- examples/svm/plot_svm_regression.py | 58 +++-- examples/svm/plot_svm_scale_c.py | 70 +++--- examples/svm/plot_svm_tie_breaking.py | 16 +- examples/svm/plot_weighted_samples.py | 21 +- ...ot_document_classification_20newsgroups.py | 207 +++++++++-------- examples/text/plot_document_clustering.py | 143 +++++++----- .../text/plot_hashing_vs_dict_vectorizer.py | 19 +- examples/tree/plot_cost_complexity_pruning.py | 19 +- examples/tree/plot_iris_dtc.py | 21 +- examples/tree/plot_tree_regression.py | 6 +- .../tree/plot_tree_regression_multioutput.py | 23 +- examples/tree/plot_unveil_tree_structure.py | 75 ++++--- pyproject.toml | 1 - 262 files changed, 7852 insertions(+), 5520 deletions(-) diff --git a/examples/applications/plot_digits_denoising.py 
b/examples/applications/plot_digits_denoising.py index 426a8c61111c0..004292cdbb762 100644 --- a/examples/applications/plot_digits_denoising.py +++ b/examples/applications/plot_digits_denoising.py @@ -87,9 +87,9 @@ def plot_digits(X, title): # Let's first have a look to see the difference between noise-free and noisy # images. We will check the test set in this regard. plot_digits(X_test, "Uncorrupted test images") -plot_digits(X_test_noisy, - f"Noisy test images\n" - f"MSE: {np.mean((X_test - X_test_noisy) ** 2):.2f}") +plot_digits( + X_test_noisy, f"Noisy test images\nMSE: {np.mean((X_test - X_test_noisy) ** 2):.2f}" +) # %% # Learn the `PCA` basis @@ -100,8 +100,9 @@ def plot_digits(X, title): from sklearn.decomposition import PCA, KernelPCA pca = PCA(n_components=32) -kernel_pca = KernelPCA(n_components=400, kernel="rbf", gamma=1e-3, - fit_inverse_transform=True, alpha=5e-3) +kernel_pca = KernelPCA( + n_components=400, kernel="rbf", gamma=1e-3, fit_inverse_transform=True, alpha=5e-3 +) pca.fit(X_train_noisy) _ = kernel_pca.fit(X_train_noisy) @@ -118,17 +119,21 @@ def plot_digits(X, title): # kernel to learn the PCA basis and a kernel ridge to learn the mapping # function. 
X_reconstructed_kernel_pca = kernel_pca.inverse_transform( - kernel_pca.transform(X_test_noisy)) + kernel_pca.transform(X_test_noisy) +) X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test_noisy)) # %% plot_digits(X_test, "Uncorrupted test images") -plot_digits(X_reconstructed_pca, - f"PCA reconstruction\n" - f"MSE: {np.mean((X_test - X_reconstructed_pca) ** 2):.2f}") -plot_digits(X_reconstructed_kernel_pca, - f"Kernel PCA reconstruction\n" - f"MSE: {np.mean((X_test - X_reconstructed_kernel_pca) ** 2):.2f}") +plot_digits( + X_reconstructed_pca, + f"PCA reconstruction\nMSE: {np.mean((X_test - X_reconstructed_pca) ** 2):.2f}", +) +plot_digits( + X_reconstructed_kernel_pca, + "Kernel PCA reconstruction\n" + f"MSE: {np.mean((X_test - X_reconstructed_kernel_pca) ** 2):.2f}", +) # %% # PCA has a lower MSE than kernel PCA. However, the qualitative analysis might diff --git a/examples/applications/plot_face_recognition.py b/examples/applications/plot_face_recognition.py index 41ef0ca0edde6..7e5d05102fa0c 100644 --- a/examples/applications/plot_face_recognition.py +++ b/examples/applications/plot_face_recognition.py @@ -43,7 +43,7 @@ print(__doc__) # Display progress logs on stdout -logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s") # ############################################################################# @@ -75,7 +75,8 @@ # split into a training and testing set X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.25, random_state=42) + X, y, test_size=0.25, random_state=42 +) # ############################################################################# @@ -83,11 +84,11 @@ # dataset): unsupervised feature extraction / dimensionality reduction n_components = 150 -print("Extracting the top %d eigenfaces from %d faces" - % (n_components, X_train.shape[0])) +print( + "Extracting the top %d eigenfaces from %d faces" % (n_components, 
X_train.shape[0]) +) t0 = time() -pca = PCA(n_components=n_components, svd_solver='randomized', - whiten=True).fit(X_train) +pca = PCA(n_components=n_components, svd_solver="randomized", whiten=True).fit(X_train) print("done in %0.3fs" % (time() - t0)) eigenfaces = pca.components_.reshape((n_components, h, w)) @@ -104,11 +105,11 @@ print("Fitting the classifier to the training set") t0 = time() -param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5], - 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } -clf = GridSearchCV( - SVC(kernel='rbf', class_weight='balanced'), param_grid -) +param_grid = { + "C": [1e3, 5e3, 1e4, 5e4, 1e5], + "gamma": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], +} +clf = GridSearchCV(SVC(kernel="rbf", class_weight="balanced"), param_grid) clf = clf.fit(X_train_pca, y_train) print("done in %0.3fs" % (time() - t0)) print("Best estimator found by grid search:") @@ -130,10 +131,11 @@ # ############################################################################# # Qualitative evaluation of the predictions using matplotlib + def plot_gallery(images, titles, h, w, n_row=3, n_col=4): """Helper function to plot a gallery of portraits""" plt.figure(figsize=(1.8 * n_col, 2.4 * n_row)) - plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35) + plt.subplots_adjust(bottom=0, left=0.01, right=0.99, top=0.90, hspace=0.35) for i in range(n_row * n_col): plt.subplot(n_row, n_col, i + 1) plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray) @@ -144,14 +146,16 @@ def plot_gallery(images, titles, h, w, n_row=3, n_col=4): # plot the result of the prediction on a portion of the test set + def title(y_pred, y_test, target_names, i): - pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1] - true_name = target_names[y_test[i]].rsplit(' ', 1)[-1] - return 'predicted: %s\ntrue: %s' % (pred_name, true_name) + pred_name = target_names[y_pred[i]].rsplit(" ", 1)[-1] + true_name = target_names[y_test[i]].rsplit(" ", 1)[-1] + return "predicted: %s\ntrue: %s" % 
(pred_name, true_name) -prediction_titles = [title(y_pred, y_test, target_names, i) - for i in range(y_pred.shape[0])] +prediction_titles = [ + title(y_pred, y_test, target_names, i) for i in range(y_pred.shape[0]) +] plot_gallery(X_test, prediction_titles, h, w) diff --git a/examples/applications/plot_model_complexity_influence.py b/examples/applications/plot_model_complexity_influence.py index 5748a546bdaad..241d9d4e33cca 100644 --- a/examples/applications/plot_model_complexity_influence.py +++ b/examples/applications/plot_model_complexity_influence.py @@ -72,23 +72,21 @@ def generate_data(case): """Generate regression/classification data.""" - if case == 'regression': + if case == "regression": X, y = datasets.load_diabetes(return_X_y=True) - elif case == 'classification': - X, y = datasets.fetch_20newsgroups_vectorized(subset='all', - return_X_y=True) + elif case == "classification": + X, y = datasets.fetch_20newsgroups_vectorized(subset="all", return_X_y=True) X, y = shuffle(X, y) offset = int(X.shape[0] * 0.8) X_train, y_train = X[:offset], y[:offset] X_test, y_test = X[offset:], y[offset:] - data = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, - 'y_test': y_test} + data = {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test} return data -regression_data = generate_data('regression') -classification_data = generate_data('classification') +regression_data = generate_data("regression") +classification_data = generate_data("classification") ############################################################################## @@ -110,26 +108,33 @@ def benchmark_influence(conf): prediction_times = [] prediction_powers = [] complexities = [] - for param_value in conf['changing_param_values']: - conf['tuned_params'][conf['changing_param']] = param_value - estimator = conf['estimator'](**conf['tuned_params']) + for param_value in conf["changing_param_values"]: + conf["tuned_params"][conf["changing_param"]] = param_value + estimator = 
conf["estimator"](**conf["tuned_params"]) print("Benchmarking %s" % estimator) - estimator.fit(conf['data']['X_train'], conf['data']['y_train']) - conf['postfit_hook'](estimator) - complexity = conf['complexity_computer'](estimator) + estimator.fit(conf["data"]["X_train"], conf["data"]["y_train"]) + conf["postfit_hook"](estimator) + complexity = conf["complexity_computer"](estimator) complexities.append(complexity) start_time = time.time() - for _ in range(conf['n_samples']): - y_pred = estimator.predict(conf['data']['X_test']) - elapsed_time = (time.time() - start_time) / float(conf['n_samples']) + for _ in range(conf["n_samples"]): + y_pred = estimator.predict(conf["data"]["X_test"]) + elapsed_time = (time.time() - start_time) / float(conf["n_samples"]) prediction_times.append(elapsed_time) - pred_score = conf['prediction_performance_computer']( - conf['data']['y_test'], y_pred) + pred_score = conf["prediction_performance_computer"]( + conf["data"]["y_test"], y_pred + ) prediction_powers.append(pred_score) - print("Complexity: %d | %s: %.4f | Pred. Time: %fs\n" % ( - complexity, conf['prediction_performance_label'], pred_score, - elapsed_time)) + print( + "Complexity: %d | %s: %.4f | Pred. Time: %fs\n" + % ( + complexity, + conf["prediction_performance_label"], + pred_score, + elapsed_time, + ) + ) return prediction_powers, prediction_times, complexities @@ -147,46 +152,58 @@ def benchmark_influence(conf): # different data. 
# + def _count_nonzero_coefficients(estimator): a = estimator.coef_.toarray() return np.count_nonzero(a) configurations = [ - {'estimator': SGDClassifier, - 'tuned_params': {'penalty': 'elasticnet', 'alpha': 0.001, 'loss': - 'modified_huber', 'fit_intercept': True, 'tol': 1e-3}, - 'changing_param': 'l1_ratio', - 'changing_param_values': [0.25, 0.5, 0.75, 0.9], - 'complexity_label': 'non_zero coefficients', - 'complexity_computer': _count_nonzero_coefficients, - 'prediction_performance_computer': hamming_loss, - 'prediction_performance_label': 'Hamming Loss (Misclassification Ratio)', - 'postfit_hook': lambda x: x.sparsify(), - 'data': classification_data, - 'n_samples': 30}, - {'estimator': NuSVR, - 'tuned_params': {'C': 1e3, 'gamma': 2 ** -15}, - 'changing_param': 'nu', - 'changing_param_values': [0.1, 0.25, 0.5, 0.75, 0.9], - 'complexity_label': 'n_support_vectors', - 'complexity_computer': lambda x: len(x.support_vectors_), - 'data': regression_data, - 'postfit_hook': lambda x: x, - 'prediction_performance_computer': mean_squared_error, - 'prediction_performance_label': 'MSE', - 'n_samples': 30}, - {'estimator': GradientBoostingRegressor, - 'tuned_params': {'loss': 'squared_error'}, - 'changing_param': 'n_estimators', - 'changing_param_values': [10, 50, 100, 200, 500], - 'complexity_label': 'n_trees', - 'complexity_computer': lambda x: x.n_estimators, - 'data': regression_data, - 'postfit_hook': lambda x: x, - 'prediction_performance_computer': mean_squared_error, - 'prediction_performance_label': 'MSE', - 'n_samples': 30}, + { + "estimator": SGDClassifier, + "tuned_params": { + "penalty": "elasticnet", + "alpha": 0.001, + "loss": "modified_huber", + "fit_intercept": True, + "tol": 1e-3, + }, + "changing_param": "l1_ratio", + "changing_param_values": [0.25, 0.5, 0.75, 0.9], + "complexity_label": "non_zero coefficients", + "complexity_computer": _count_nonzero_coefficients, + "prediction_performance_computer": hamming_loss, + "prediction_performance_label": 
"Hamming Loss (Misclassification Ratio)", + "postfit_hook": lambda x: x.sparsify(), + "data": classification_data, + "n_samples": 30, + }, + { + "estimator": NuSVR, + "tuned_params": {"C": 1e3, "gamma": 2 ** -15}, + "changing_param": "nu", + "changing_param_values": [0.1, 0.25, 0.5, 0.75, 0.9], + "complexity_label": "n_support_vectors", + "complexity_computer": lambda x: len(x.support_vectors_), + "data": regression_data, + "postfit_hook": lambda x: x, + "prediction_performance_computer": mean_squared_error, + "prediction_performance_label": "MSE", + "n_samples": 30, + }, + { + "estimator": GradientBoostingRegressor, + "tuned_params": {"loss": "squared_error"}, + "changing_param": "n_estimators", + "changing_param_values": [10, 50, 100, 200, 500], + "complexity_label": "n_trees", + "complexity_computer": lambda x: x.n_estimators, + "data": regression_data, + "postfit_hook": lambda x: x, + "prediction_performance_computer": mean_squared_error, + "prediction_performance_label": "MSE", + "n_samples": 30, + }, ] @@ -209,6 +226,7 @@ def _count_nonzero_coefficients(estimator): # ensemble is not as detrimental. # + def plot_influence(conf, mse_values, prediction_times, complexities): """ Plot influence of model complexity on both accuracy and latency. 
@@ -219,38 +237,37 @@ def plot_influence(conf, mse_values, prediction_times, complexities): # first axes (prediction error) ax1 = fig.add_subplot(111) - line1 = ax1.plot(complexities, mse_values, c='tab:blue', ls='-')[0] - ax1.set_xlabel('Model Complexity (%s)' % conf['complexity_label']) - y1_label = conf['prediction_performance_label'] + line1 = ax1.plot(complexities, mse_values, c="tab:blue", ls="-")[0] + ax1.set_xlabel("Model Complexity (%s)" % conf["complexity_label"]) + y1_label = conf["prediction_performance_label"] ax1.set_ylabel(y1_label) - ax1.spines['left'].set_color(line1.get_color()) + ax1.spines["left"].set_color(line1.get_color()) ax1.yaxis.label.set_color(line1.get_color()) - ax1.tick_params(axis='y', colors=line1.get_color()) + ax1.tick_params(axis="y", colors=line1.get_color()) # second axes (latency) ax2 = fig.add_subplot(111, sharex=ax1, frameon=False) - line2 = ax2.plot(complexities, prediction_times, c='tab:orange', ls='-')[0] + line2 = ax2.plot(complexities, prediction_times, c="tab:orange", ls="-")[0] ax2.yaxis.tick_right() ax2.yaxis.set_label_position("right") y2_label = "Time (s)" ax2.set_ylabel(y2_label) - ax1.spines['right'].set_color(line2.get_color()) + ax1.spines["right"].set_color(line2.get_color()) ax2.yaxis.label.set_color(line2.get_color()) - ax2.tick_params(axis='y', colors=line2.get_color()) + ax2.tick_params(axis="y", colors=line2.get_color()) - plt.legend((line1, line2), ("prediction error", "latency"), - loc='upper right') + plt.legend((line1, line2), ("prediction error", "latency"), loc="upper right") - plt.title("Influence of varying '%s' on %s" % (conf['changing_param'], - conf['estimator'].__name__)) + plt.title( + "Influence of varying '%s' on %s" + % (conf["changing_param"], conf["estimator"].__name__) + ) for conf in configurations: - prediction_performances, prediction_times, complexities = \ - benchmark_influence(conf) - plot_influence(conf, prediction_performances, prediction_times, - complexities) + 
prediction_performances, prediction_times, complexities = benchmark_influence(conf) + plot_influence(conf, prediction_performances, prediction_times, complexities) plt.show() diff --git a/examples/applications/plot_out_of_core_classification.py b/examples/applications/plot_out_of_core_classification.py index 62cf00c4a0daa..287188c35b807 100644 --- a/examples/applications/plot_out_of_core_classification.py +++ b/examples/applications/plot_out_of_core_classification.py @@ -41,7 +41,8 @@ def _not_in_sphinx(): # Hack to detect whether we are running by the sphinx builder - return '__file__' in globals() + return "__file__" in globals() + # %% # Reuters Dataset related routines @@ -55,17 +56,17 @@ def _not_in_sphinx(): class ReutersParser(HTMLParser): """Utility class to parse a SGML file and yield documents one at a time.""" - def __init__(self, encoding='latin-1'): + def __init__(self, encoding="latin-1"): HTMLParser.__init__(self) self._reset() self.encoding = encoding def handle_starttag(self, tag, attrs): - method = 'start_' + tag + method = "start_" + tag getattr(self, method, lambda x: None)(attrs) def handle_endtag(self, tag): - method = 'end_' + tag + method = "end_" + tag getattr(self, method, lambda: None)() def _reset(self): @@ -99,10 +100,10 @@ def start_reuters(self, attributes): pass def end_reuters(self): - self.body = re.sub(r'\s+', r' ', self.body) - self.docs.append({'title': self.title, - 'body': self.body, - 'topics': self.topics}) + self.body = re.sub(r"\s+", r" ", self.body) + self.docs.append( + {"title": self.title, "body": self.body, "topics": self.topics} + ) self._reset() def start_title(self, attributes): @@ -143,37 +144,36 @@ def stream_reuters_documents(data_path=None): """ - DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/' - 'reuters21578-mld/reuters21578.tar.gz') - ARCHIVE_FILENAME = 'reuters21578.tar.gz' + DOWNLOAD_URL = ( + "http://archive.ics.uci.edu/ml/machine-learning-databases/" + 
"reuters21578-mld/reuters21578.tar.gz" + ) + ARCHIVE_FILENAME = "reuters21578.tar.gz" if data_path is None: data_path = os.path.join(get_data_home(), "reuters") if not os.path.exists(data_path): """Download the dataset.""" - print("downloading dataset (once and for all) into %s" % - data_path) + print("downloading dataset (once and for all) into %s" % data_path) os.mkdir(data_path) def progress(blocknum, bs, size): - total_sz_mb = '%.2f MB' % (size / 1e6) - current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6) + total_sz_mb = "%.2f MB" % (size / 1e6) + current_sz_mb = "%.2f MB" % ((blocknum * bs) / 1e6) if _not_in_sphinx(): - sys.stdout.write( - '\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb)) + sys.stdout.write("\rdownloaded %s / %s" % (current_sz_mb, total_sz_mb)) archive_path = os.path.join(data_path, ARCHIVE_FILENAME) - urlretrieve(DOWNLOAD_URL, filename=archive_path, - reporthook=progress) + urlretrieve(DOWNLOAD_URL, filename=archive_path, reporthook=progress) if _not_in_sphinx(): - sys.stdout.write('\r') + sys.stdout.write("\r") print("untarring Reuters dataset...") - tarfile.open(archive_path, 'r:gz').extractall(data_path) + tarfile.open(archive_path, "r:gz").extractall(data_path) print("done.") parser = ReutersParser() for filename in glob(os.path.join(data_path, "*.sgm")): - for doc in parser.parse(open(filename, 'rb')): + for doc in parser.parse(open(filename, "rb")): yield doc @@ -184,8 +184,9 @@ def progress(blocknum, bs, size): # Create the vectorizer and limit the number of features to a reasonable # maximum -vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18, - alternate_sign=False) +vectorizer = HashingVectorizer( + decode_error="ignore", n_features=2 ** 18, alternate_sign=False +) # Iterator over parsed Reuters SGML files. @@ -196,14 +197,14 @@ def progress(blocknum, bs, size): # files. For other datasets, one should take care of creating a test set with # a realistic portion of positive instances. 
all_classes = np.array([0, 1]) -positive_class = 'acq' +positive_class = "acq" # Here are some classifiers that support the `partial_fit` method partial_fit_classifiers = { - 'SGD': SGDClassifier(max_iter=5), - 'Perceptron': Perceptron(), - 'NB Multinomial': MultinomialNB(alpha=0.01), - 'Passive-Aggressive': PassiveAggressiveClassifier(), + "SGD": SGDClassifier(max_iter=5), + "Perceptron": Perceptron(), + "NB Multinomial": MultinomialNB(alpha=0.01), + "Passive-Aggressive": PassiveAggressiveClassifier(), } @@ -213,9 +214,11 @@ def get_minibatch(doc_iter, size, pos_class=positive_class): Note: size is before excluding invalid docs with no topics assigned. """ - data = [('{title}\n\n{body}'.format(**doc), pos_class in doc['topics']) - for doc in itertools.islice(doc_iter, size) - if doc['topics']] + data = [ + ("{title}\n\n{body}".format(**doc), pos_class in doc["topics"]) + for doc in itertools.islice(doc_iter, size) + if doc["topics"] + ] if not len(data): return np.asarray([], dtype=int), np.asarray([], dtype=int) X_text, y = zip(*data) @@ -231,7 +234,7 @@ def iter_minibatches(doc_iter, minibatch_size): # test data statistics -test_stats = {'n_test': 0, 'n_test_pos': 0} +test_stats = {"n_test": 0, "n_test_pos": 0} # First we hold out a number of examples to estimate accuracy n_test_documents = 1000 @@ -241,28 +244,34 @@ def iter_minibatches(doc_iter, minibatch_size): tick = time.time() X_test = vectorizer.transform(X_test_text) vectorizing_time = time.time() - tick -test_stats['n_test'] += len(y_test) -test_stats['n_test_pos'] += sum(y_test) +test_stats["n_test"] += len(y_test) +test_stats["n_test_pos"] += sum(y_test) print("Test set is %d documents (%d positive)" % (len(y_test), sum(y_test))) def progress(cls_name, stats): """Report progress information, return a string.""" - duration = time.time() - stats['t0'] + duration = time.time() - stats["t0"] s = "%20s classifier : \t" % cls_name s += "%(n_train)6d train docs (%(n_train_pos)6d positive) " % stats s += 
"%(n_test)6d test docs (%(n_test_pos)6d positive) " % test_stats s += "accuracy: %(accuracy).3f " % stats - s += "in %.2fs (%5d docs/s)" % (duration, stats['n_train'] / duration) + s += "in %.2fs (%5d docs/s)" % (duration, stats["n_train"] / duration) return s cls_stats = {} for cls_name in partial_fit_classifiers: - stats = {'n_train': 0, 'n_train_pos': 0, - 'accuracy': 0.0, 'accuracy_history': [(0, 0)], 't0': time.time(), - 'runtime_history': [(0, 0)], 'total_fit_time': 0.0} + stats = { + "n_train": 0, + "n_train_pos": 0, + "accuracy": 0.0, + "accuracy_history": [(0, 0)], + "t0": time.time(), + "runtime_history": [(0, 0)], + "total_fit_time": 0.0, + } cls_stats[cls_name] = stats get_minibatch(data_stream, n_test_documents) @@ -291,23 +300,24 @@ def progress(cls_name, stats): cls.partial_fit(X_train, y_train, classes=all_classes) # accumulate test accuracy stats - cls_stats[cls_name]['total_fit_time'] += time.time() - tick - cls_stats[cls_name]['n_train'] += X_train.shape[0] - cls_stats[cls_name]['n_train_pos'] += sum(y_train) + cls_stats[cls_name]["total_fit_time"] += time.time() - tick + cls_stats[cls_name]["n_train"] += X_train.shape[0] + cls_stats[cls_name]["n_train_pos"] += sum(y_train) tick = time.time() - cls_stats[cls_name]['accuracy'] = cls.score(X_test, y_test) - cls_stats[cls_name]['prediction_time'] = time.time() - tick - acc_history = (cls_stats[cls_name]['accuracy'], - cls_stats[cls_name]['n_train']) - cls_stats[cls_name]['accuracy_history'].append(acc_history) - run_history = (cls_stats[cls_name]['accuracy'], - total_vect_time + cls_stats[cls_name]['total_fit_time']) - cls_stats[cls_name]['runtime_history'].append(run_history) + cls_stats[cls_name]["accuracy"] = cls.score(X_test, y_test) + cls_stats[cls_name]["prediction_time"] = time.time() - tick + acc_history = (cls_stats[cls_name]["accuracy"], cls_stats[cls_name]["n_train"]) + cls_stats[cls_name]["accuracy_history"].append(acc_history) + run_history = ( + cls_stats[cls_name]["accuracy"], + 
total_vect_time + cls_stats[cls_name]["total_fit_time"], + ) + cls_stats[cls_name]["runtime_history"].append(run_history) if i % 3 == 0: print(progress(cls_name, cls_stats[cls_name])) if i % 3 == 0: - print('\n') + print("\n") # %% @@ -326,64 +336,66 @@ def plot_accuracy(x, y, x_legend): """Plot accuracy as a function of x.""" x = np.array(x) y = np.array(y) - plt.title('Classification accuracy as a function of %s' % x_legend) - plt.xlabel('%s' % x_legend) - plt.ylabel('Accuracy') + plt.title("Classification accuracy as a function of %s" % x_legend) + plt.xlabel("%s" % x_legend) + plt.ylabel("Accuracy") plt.grid(True) plt.plot(x, y) -rcParams['legend.fontsize'] = 10 +rcParams["legend.fontsize"] = 10 cls_names = list(sorted(cls_stats.keys())) # Plot accuracy evolution plt.figure() for _, stats in sorted(cls_stats.items()): # Plot accuracy evolution with #examples - accuracy, n_examples = zip(*stats['accuracy_history']) + accuracy, n_examples = zip(*stats["accuracy_history"]) plot_accuracy(n_examples, accuracy, "training examples (#)") ax = plt.gca() ax.set_ylim((0.8, 1)) -plt.legend(cls_names, loc='best') +plt.legend(cls_names, loc="best") plt.figure() for _, stats in sorted(cls_stats.items()): # Plot accuracy evolution with runtime - accuracy, runtime = zip(*stats['runtime_history']) - plot_accuracy(runtime, accuracy, 'runtime (s)') + accuracy, runtime = zip(*stats["runtime_history"]) + plot_accuracy(runtime, accuracy, "runtime (s)") ax = plt.gca() ax.set_ylim((0.8, 1)) -plt.legend(cls_names, loc='best') +plt.legend(cls_names, loc="best") # Plot fitting times plt.figure() fig = plt.gcf() -cls_runtime = [stats['total_fit_time'] - for cls_name, stats in sorted(cls_stats.items())] +cls_runtime = [stats["total_fit_time"] for cls_name, stats in sorted(cls_stats.items())] cls_runtime.append(total_vect_time) -cls_names.append('Vectorization') -bar_colors = ['b', 'g', 'r', 'c', 'm', 'y'] +cls_names.append("Vectorization") +bar_colors = ["b", "g", "r", "c", "m", "y"] ax = 
plt.subplot(111) -rectangles = plt.bar(range(len(cls_names)), cls_runtime, width=0.5, - color=bar_colors) +rectangles = plt.bar(range(len(cls_names)), cls_runtime, width=0.5, color=bar_colors) ax.set_xticks(np.linspace(0, len(cls_names) - 1, len(cls_names))) ax.set_xticklabels(cls_names, fontsize=10) ymax = max(cls_runtime) * 1.2 ax.set_ylim((0, ymax)) -ax.set_ylabel('runtime (s)') -ax.set_title('Training Times') +ax.set_ylabel("runtime (s)") +ax.set_title("Training Times") def autolabel(rectangles): """attach some text vi autolabel on rectangles.""" for rect in rectangles: height = rect.get_height() - ax.text(rect.get_x() + rect.get_width() / 2., - 1.05 * height, '%.4f' % height, - ha='center', va='bottom') + ax.text( + rect.get_x() + rect.get_width() / 2.0, + 1.05 * height, + "%.4f" % height, + ha="center", + va="bottom", + ) plt.setp(plt.xticks()[1], rotation=30) @@ -396,23 +408,22 @@ def autolabel(rectangles): cls_runtime = [] cls_names = list(sorted(cls_stats.keys())) for cls_name, stats in sorted(cls_stats.items()): - cls_runtime.append(stats['prediction_time']) + cls_runtime.append(stats["prediction_time"]) cls_runtime.append(parsing_time) -cls_names.append('Read/Parse\n+Feat.Extr.') +cls_names.append("Read/Parse\n+Feat.Extr.") cls_runtime.append(vectorizing_time) -cls_names.append('Hashing\n+Vect.') +cls_names.append("Hashing\n+Vect.") ax = plt.subplot(111) -rectangles = plt.bar(range(len(cls_names)), cls_runtime, width=0.5, - color=bar_colors) +rectangles = plt.bar(range(len(cls_names)), cls_runtime, width=0.5, color=bar_colors) ax.set_xticks(np.linspace(0, len(cls_names) - 1, len(cls_names))) ax.set_xticklabels(cls_names, fontsize=8) plt.setp(plt.xticks()[1], rotation=30) ymax = max(cls_runtime) * 1.2 ax.set_ylim((0, ymax)) -ax.set_ylabel('runtime (s)') -ax.set_title('Prediction Times (%d instances)' % n_test_documents) +ax.set_ylabel("runtime (s)") +ax.set_title("Prediction Times (%d instances)" % n_test_documents) autolabel(rectangles) 
plt.tight_layout() plt.show() diff --git a/examples/applications/plot_outlier_detection_wine.py b/examples/applications/plot_outlier_detection_wine.py index 49210b800d1b6..182f613d11eaa 100644 --- a/examples/applications/plot_outlier_detection_wine.py +++ b/examples/applications/plot_outlier_detection_wine.py @@ -47,17 +47,18 @@ # Define "classifiers" to be used classifiers = { - "Empirical Covariance": EllipticEnvelope(support_fraction=1., - contamination=0.25), - "Robust Covariance (Minimum Covariance Determinant)": - EllipticEnvelope(contamination=0.25), - "OCSVM": OneClassSVM(nu=0.25, gamma=0.35)} -colors = ['m', 'g', 'b'] + "Empirical Covariance": EllipticEnvelope(support_fraction=1.0, contamination=0.25), + "Robust Covariance (Minimum Covariance Determinant)": EllipticEnvelope( + contamination=0.25 + ), + "OCSVM": OneClassSVM(nu=0.25, gamma=0.35), +} +colors = ["m", "g", "b"] legend1 = {} legend2 = {} # Get data -X1 = load_wine()['data'][:, [1, 2]] # two clusters +X1 = load_wine()["data"][:, [1, 2]] # two clusters # Learn a frontier for outlier detection with several classifiers xx1, yy1 = np.meshgrid(np.linspace(0, 6, 500), np.linspace(1, 4.5, 500)) @@ -67,7 +68,8 @@ Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()]) Z1 = Z1.reshape(xx1.shape) legend1[clf_name] = plt.contour( - xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[i]) + xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[i] + ) legend1_values_list = list(legend1.values()) legend1_keys_list = list(legend1.keys()) @@ -75,20 +77,30 @@ # Plot the results (= shape of the data points cloud) plt.figure(1) # two clusters plt.title("Outlier detection on a real data set (wine recognition)") -plt.scatter(X1[:, 0], X1[:, 1], color='black') +plt.scatter(X1[:, 0], X1[:, 1], color="black") bbox_args = dict(boxstyle="round", fc="0.8") arrow_args = dict(arrowstyle="->") -plt.annotate("outlying points", xy=(4, 2), - xycoords="data", textcoords="data", - xytext=(3, 1.25), bbox=bbox_args, 
arrowprops=arrow_args) +plt.annotate( + "outlying points", + xy=(4, 2), + xycoords="data", + textcoords="data", + xytext=(3, 1.25), + bbox=bbox_args, + arrowprops=arrow_args, +) plt.xlim((xx1.min(), xx1.max())) plt.ylim((yy1.min(), yy1.max())) -plt.legend((legend1_values_list[0].collections[0], - legend1_values_list[1].collections[0], - legend1_values_list[2].collections[0]), - (legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2]), - loc="upper center", - prop=matplotlib.font_manager.FontProperties(size=11)) +plt.legend( + ( + legend1_values_list[0].collections[0], + legend1_values_list[1].collections[0], + legend1_values_list[2].collections[0], + ), + (legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2]), + loc="upper center", + prop=matplotlib.font_manager.FontProperties(size=11), +) plt.ylabel("ash") plt.xlabel("malic_acid") @@ -107,7 +119,7 @@ # the data scatter matrix and the risk of over-fitting the data. # Get data -X2 = load_wine()['data'][:, [6, 9]] # "banana"-shaped +X2 = load_wine()["data"][:, [6, 9]] # "banana"-shaped # Learn a frontier for outlier detection with several classifiers xx2, yy2 = np.meshgrid(np.linspace(-1, 5.5, 500), np.linspace(-2.5, 19, 500)) @@ -117,7 +129,8 @@ Z2 = clf.decision_function(np.c_[xx2.ravel(), yy2.ravel()]) Z2 = Z2.reshape(xx2.shape) legend2[clf_name] = plt.contour( - xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i]) + xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i] + ) legend2_values_list = list(legend2.values()) legend2_keys_list = list(legend2.keys()) @@ -125,15 +138,19 @@ # Plot the results (= shape of the data points cloud) plt.figure(2) # "banana" shape plt.title("Outlier detection on a real data set (wine recognition)") -plt.scatter(X2[:, 0], X2[:, 1], color='black') +plt.scatter(X2[:, 0], X2[:, 1], color="black") plt.xlim((xx2.min(), xx2.max())) plt.ylim((yy2.min(), yy2.max())) -plt.legend((legend2_values_list[0].collections[0], - legend2_values_list[1].collections[0], - 
legend2_values_list[2].collections[0]), - (legend2_keys_list[0], legend2_keys_list[1], legend2_keys_list[2]), - loc="upper center", - prop=matplotlib.font_manager.FontProperties(size=11)) +plt.legend( + ( + legend2_values_list[0].collections[0], + legend2_values_list[1].collections[0], + legend2_values_list[2].collections[0], + ), + (legend2_keys_list[0], legend2_keys_list[1], legend2_keys_list[2]), + loc="upper center", + prop=matplotlib.font_manager.FontProperties(size=11), +) plt.ylabel("color_intensity") plt.xlabel("flavanoids") diff --git a/examples/applications/plot_prediction_latency.py b/examples/applications/plot_prediction_latency.py index e59402e47fe17..b9780e7974776 100644 --- a/examples/applications/plot_prediction_latency.py +++ b/examples/applications/plot_prediction_latency.py @@ -35,7 +35,7 @@ def _not_in_sphinx(): # Hack to detect whether we are running by the sphinx builder - return '__file__' in globals() + return "__file__" in globals() def atomic_benchmark_estimator(estimator, X_test, verbose=False): @@ -48,8 +48,12 @@ def atomic_benchmark_estimator(estimator, X_test, verbose=False): estimator.predict(instance) runtimes[i] = time.time() - start if verbose: - print("atomic_benchmark runtimes:", min(runtimes), np.percentile( - runtimes, 50), max(runtimes)) + print( + "atomic_benchmark runtimes:", + min(runtimes), + np.percentile(runtimes, 50), + max(runtimes), + ) return runtimes @@ -63,8 +67,12 @@ def bulk_benchmark_estimator(estimator, X_test, n_bulk_repeats, verbose): runtimes[i] = time.time() - start runtimes = np.array(list(map(lambda x: x / float(n_instances), runtimes))) if verbose: - print("bulk_benchmark runtimes:", min(runtimes), np.percentile( - runtimes, 50), max(runtimes)) + print( + "bulk_benchmark runtimes:", + min(runtimes), + np.percentile(runtimes, 50), + max(runtimes), + ) return runtimes @@ -85,8 +93,7 @@ def benchmark_estimator(estimator, X_test, n_bulk_repeats=30, verbose=False): """ atomic_runtimes = 
atomic_benchmark_estimator(estimator, X_test, verbose) - bulk_runtimes = bulk_benchmark_estimator(estimator, X_test, n_bulk_repeats, - verbose) + bulk_runtimes = bulk_benchmark_estimator(estimator, X_test, n_bulk_repeats, verbose) return atomic_runtimes, bulk_runtimes @@ -95,12 +102,14 @@ def generate_dataset(n_train, n_test, n_features, noise=0.1, verbose=False): if verbose: print("generating dataset...") - X, y, coef = make_regression(n_samples=n_train + n_test, - n_features=n_features, noise=noise, coef=True) + X, y, coef = make_regression( + n_samples=n_train + n_test, n_features=n_features, noise=noise, coef=True + ) random_seed = 13 X_train, X_test, y_train, y_test = train_test_split( - X, y, train_size=n_train, test_size=n_test, random_state=random_seed) + X, y, train_size=n_train, test_size=n_test, random_state=random_seed + ) X_train, y_train = shuffle(X_train, y_train, random_state=random_seed) X_scaler = StandardScaler() @@ -130,26 +139,32 @@ def boxplot_runtimes(runtimes, pred_type, configuration): """ fig, ax1 = plt.subplots(figsize=(10, 6)) - bp = plt.boxplot(runtimes, ) - - cls_infos = ['%s\n(%d %s)' % (estimator_conf['name'], - estimator_conf['complexity_computer']( - estimator_conf['instance']), - estimator_conf['complexity_label']) for - estimator_conf in configuration['estimators']] + bp = plt.boxplot( + runtimes, + ) + + cls_infos = [ + "%s\n(%d %s)" + % ( + estimator_conf["name"], + estimator_conf["complexity_computer"](estimator_conf["instance"]), + estimator_conf["complexity_label"], + ) + for estimator_conf in configuration["estimators"] + ] plt.setp(ax1, xticklabels=cls_infos) - plt.setp(bp['boxes'], color='black') - plt.setp(bp['whiskers'], color='black') - plt.setp(bp['fliers'], color='red', marker='+') + plt.setp(bp["boxes"], color="black") + plt.setp(bp["whiskers"], color="black") + plt.setp(bp["fliers"], color="red", marker="+") - ax1.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', - alpha=0.5) + ax1.yaxis.grid(True, 
linestyle="-", which="major", color="lightgrey", alpha=0.5) ax1.set_axisbelow(True) - ax1.set_title('Prediction Time per Instance - %s, %d feats.' % ( - pred_type.capitalize(), - configuration['n_features'])) - ax1.set_ylabel('Prediction Time (us)') + ax1.set_title( + "Prediction Time per Instance - %s, %d feats." + % (pred_type.capitalize(), configuration["n_features"]) + ) + ax1.set_ylabel("Prediction Time (us)") plt.show() @@ -157,24 +172,24 @@ def boxplot_runtimes(runtimes, pred_type, configuration): def benchmark(configuration): """Run the whole benchmark.""" X_train, y_train, X_test, y_test = generate_dataset( - configuration['n_train'], configuration['n_test'], - configuration['n_features']) + configuration["n_train"], configuration["n_test"], configuration["n_features"] + ) stats = {} - for estimator_conf in configuration['estimators']: - print("Benchmarking", estimator_conf['instance']) - estimator_conf['instance'].fit(X_train, y_train) + for estimator_conf in configuration["estimators"]: + print("Benchmarking", estimator_conf["instance"]) + estimator_conf["instance"].fit(X_train, y_train) gc.collect() - a, b = benchmark_estimator(estimator_conf['instance'], X_test) - stats[estimator_conf['name']] = {'atomic': a, 'bulk': b} + a, b = benchmark_estimator(estimator_conf["instance"], X_test) + stats[estimator_conf["name"]] = {"atomic": a, "bulk": b} - cls_names = [estimator_conf['name'] for estimator_conf in configuration[ - 'estimators']] - runtimes = [1e6 * stats[clf_name]['atomic'] for clf_name in cls_names] - boxplot_runtimes(runtimes, 'atomic', configuration) - runtimes = [1e6 * stats[clf_name]['bulk'] for clf_name in cls_names] - boxplot_runtimes(runtimes, 'bulk (%d)' % configuration['n_test'], - configuration) + cls_names = [ + estimator_conf["name"] for estimator_conf in configuration["estimators"] + ] + runtimes = [1e6 * stats[clf_name]["atomic"] for clf_name in cls_names] + boxplot_runtimes(runtimes, "atomic", configuration) + runtimes = [1e6 * 
stats[clf_name]["bulk"] for clf_name in cls_names] + boxplot_runtimes(runtimes, "bulk (%d)" % configuration["n_test"], configuration) def n_feature_influence(estimators, n_train, n_test, n_features, percentile): @@ -205,62 +220,72 @@ def n_feature_influence(estimators, n_train, n_test, n_features, percentile): estimator.fit(X_train, y_train) gc.collect() runtimes = bulk_benchmark_estimator(estimator, X_test, 30, False) - percentiles[cls_name][n] = 1e6 * np.percentile(runtimes, - percentile) + percentiles[cls_name][n] = 1e6 * np.percentile(runtimes, percentile) return percentiles def plot_n_features_influence(percentiles, percentile): fig, ax1 = plt.subplots(figsize=(10, 6)) - colors = ['r', 'g', 'b'] + colors = ["r", "g", "b"] for i, cls_name in enumerate(percentiles.keys()): x = np.array(sorted([n for n in percentiles[cls_name].keys()])) y = np.array([percentiles[cls_name][n] for n in x]) - plt.plot(x, y, color=colors[i], ) - ax1.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', - alpha=0.5) + plt.plot( + x, + y, + color=colors[i], + ) + ax1.yaxis.grid(True, linestyle="-", which="major", color="lightgrey", alpha=0.5) ax1.set_axisbelow(True) - ax1.set_title('Evolution of Prediction Time with #Features') - ax1.set_xlabel('#Features') - ax1.set_ylabel('Prediction Time at %d%%-ile (us)' % percentile) + ax1.set_title("Evolution of Prediction Time with #Features") + ax1.set_xlabel("#Features") + ax1.set_ylabel("Prediction Time at %d%%-ile (us)" % percentile) plt.show() def benchmark_throughputs(configuration, duration_secs=0.1): """benchmark throughput for different estimators.""" X_train, y_train, X_test, y_test = generate_dataset( - configuration['n_train'], configuration['n_test'], - configuration['n_features']) + configuration["n_train"], configuration["n_test"], configuration["n_features"] + ) throughputs = dict() - for estimator_config in configuration['estimators']: - estimator_config['instance'].fit(X_train, y_train) + for estimator_config in 
configuration["estimators"]: + estimator_config["instance"].fit(X_train, y_train) start_time = time.time() n_predictions = 0 while (time.time() - start_time) < duration_secs: - estimator_config['instance'].predict(X_test[[0]]) + estimator_config["instance"].predict(X_test[[0]]) n_predictions += 1 - throughputs[estimator_config['name']] = n_predictions / duration_secs + throughputs[estimator_config["name"]] = n_predictions / duration_secs return throughputs def plot_benchmark_throughput(throughputs, configuration): fig, ax = plt.subplots(figsize=(10, 6)) - colors = ['r', 'g', 'b'] - cls_infos = ['%s\n(%d %s)' % (estimator_conf['name'], - estimator_conf['complexity_computer']( - estimator_conf['instance']), - estimator_conf['complexity_label']) for - estimator_conf in configuration['estimators']] - cls_values = [throughputs[estimator_conf['name']] for estimator_conf in - configuration['estimators']] + colors = ["r", "g", "b"] + cls_infos = [ + "%s\n(%d %s)" + % ( + estimator_conf["name"], + estimator_conf["complexity_computer"](estimator_conf["instance"]), + estimator_conf["complexity_label"], + ) + for estimator_conf in configuration["estimators"] + ] + cls_values = [ + throughputs[estimator_conf["name"]] + for estimator_conf in configuration["estimators"] + ] plt.bar(range(len(throughputs)), cls_values, width=0.5, color=colors) ax.set_xticks(np.linspace(0.25, len(throughputs) - 0.75, len(throughputs))) ax.set_xticklabels(cls_infos, fontsize=10) ymax = max(cls_values) * 1.2 ax.set_ylim((0, ymax)) - ax.set_ylabel('Throughput (predictions/sec)') - ax.set_title('Prediction Throughput for different estimators (%d ' - 'features)' % configuration['n_features']) + ax.set_ylabel("Throughput (predictions/sec)") + ax.set_title( + "Prediction Throughput for different estimators (%d features)" + % configuration["n_features"] + ) plt.show() @@ -272,33 +297,43 @@ def plot_benchmark_throughput(throughputs, configuration): # 
############################################################################# # Benchmark bulk/atomic prediction speed for various regressors configuration = { - 'n_train': int(1e3), - 'n_test': int(1e2), - 'n_features': int(1e2), - 'estimators': [ - {'name': 'Linear Model', - 'instance': SGDRegressor(penalty='elasticnet', alpha=0.01, - l1_ratio=0.25, tol=1e-4), - 'complexity_label': 'non-zero coefficients', - 'complexity_computer': lambda clf: np.count_nonzero(clf.coef_)}, - {'name': 'RandomForest', - 'instance': RandomForestRegressor(), - 'complexity_label': 'estimators', - 'complexity_computer': lambda clf: clf.n_estimators}, - {'name': 'SVR', - 'instance': SVR(kernel='rbf'), - 'complexity_label': 'support vectors', - 'complexity_computer': lambda clf: len(clf.support_vectors_)}, - ] + "n_train": int(1e3), + "n_test": int(1e2), + "n_features": int(1e2), + "estimators": [ + { + "name": "Linear Model", + "instance": SGDRegressor( + penalty="elasticnet", alpha=0.01, l1_ratio=0.25, tol=1e-4 + ), + "complexity_label": "non-zero coefficients", + "complexity_computer": lambda clf: np.count_nonzero(clf.coef_), + }, + { + "name": "RandomForest", + "instance": RandomForestRegressor(), + "complexity_label": "estimators", + "complexity_computer": lambda clf: clf.n_estimators, + }, + { + "name": "SVR", + "instance": SVR(kernel="rbf"), + "complexity_label": "support vectors", + "complexity_computer": lambda clf: len(clf.support_vectors_), + }, + ], } benchmark(configuration) # benchmark n_features influence on prediction speed percentile = 90 -percentiles = n_feature_influence({'ridge': Ridge()}, - configuration['n_train'], - configuration['n_test'], - [100, 250, 500], percentile) +percentiles = n_feature_influence( + {"ridge": Ridge()}, + configuration["n_train"], + configuration["n_test"], + [100, 250, 500], + percentile, +) plot_n_features_influence(percentiles, percentile) # benchmark throughput diff --git a/examples/applications/plot_species_distribution_modeling.py 
b/examples/applications/plot_species_distribution_modeling.py index 4216f4fda0c2f..6dac08fe1942c 100644 --- a/examples/applications/plot_species_distribution_modeling.py +++ b/examples/applications/plot_species_distribution_modeling.py @@ -53,6 +53,7 @@ # otherwise, we'll improvise later... try: from mpl_toolkits.basemap import Basemap + basemap = True except ImportError: basemap = False @@ -93,31 +94,34 @@ def create_species_bunch(species_name, train, test, coverages, xgrid, ygrid): This will use the test/train record arrays to extract the data specific to the given species name. """ - bunch = Bunch(name=' '.join(species_name.split("_")[:2])) - species_name = species_name.encode('ascii') + bunch = Bunch(name=" ".join(species_name.split("_")[:2])) + species_name = species_name.encode("ascii") points = dict(test=test, train=train) for label, pts in points.items(): # choose points associated with the desired species - pts = pts[pts['species'] == species_name] - bunch['pts_%s' % label] = pts + pts = pts[pts["species"] == species_name] + bunch["pts_%s" % label] = pts # determine coverage values for each of the training & testing points - ix = np.searchsorted(xgrid, pts['dd long']) - iy = np.searchsorted(ygrid, pts['dd lat']) - bunch['cov_%s' % label] = coverages[:, -iy, ix].T + ix = np.searchsorted(xgrid, pts["dd long"]) + iy = np.searchsorted(ygrid, pts["dd lat"]) + bunch["cov_%s" % label] = coverages[:, -iy, ix].T return bunch -def plot_species_distribution(species=("bradypus_variegatus_0", - "microryzomys_minutus_0")): +def plot_species_distribution( + species=("bradypus_variegatus_0", "microryzomys_minutus_0") +): """ Plot the species distribution. 
""" if len(species) > 2: - print("Note: when more than two species are provided," - " only the first two will be used") + print( + "Note: when more than two species are provided," + " only the first two will be used" + ) t0 = time() @@ -131,19 +135,19 @@ def plot_species_distribution(species=("bradypus_variegatus_0", X, Y = np.meshgrid(xgrid, ygrid[::-1]) # create a bunch for each species - BV_bunch = create_species_bunch(species[0], - data.train, data.test, - data.coverages, xgrid, ygrid) - MM_bunch = create_species_bunch(species[1], - data.train, data.test, - data.coverages, xgrid, ygrid) + BV_bunch = create_species_bunch( + species[0], data.train, data.test, data.coverages, xgrid, ygrid + ) + MM_bunch = create_species_bunch( + species[1], data.train, data.test, data.coverages, xgrid, ygrid + ) # background points (grid coordinates) for evaluation np.random.seed(13) - background_points = np.c_[np.random.randint(low=0, high=data.Ny, - size=10000), - np.random.randint(low=0, high=data.Nx, - size=10000)].T + background_points = np.c_[ + np.random.randint(low=0, high=data.Ny, size=10000), + np.random.randint(low=0, high=data.Nx, size=10000), + ].T # We'll make use of the fact that coverages[6] has measurements at all # land points. This will help us decide between land and water. @@ -160,7 +164,7 @@ def plot_species_distribution(species=("bradypus_variegatus_0", train_cover_std = (species.cov_train - mean) / std # Fit OneClassSVM - print(" - fit OneClassSVM ... ", end='') + print(" - fit OneClassSVM ... 
", end="") clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.5) clf.fit(train_cover_std) print("done.") @@ -169,16 +173,21 @@ def plot_species_distribution(species=("bradypus_variegatus_0", plt.subplot(1, 2, i + 1) if basemap: print(" - plot coastlines using basemap") - m = Basemap(projection='cyl', llcrnrlat=Y.min(), - urcrnrlat=Y.max(), llcrnrlon=X.min(), - urcrnrlon=X.max(), resolution='c') + m = Basemap( + projection="cyl", + llcrnrlat=Y.min(), + urcrnrlat=Y.max(), + llcrnrlon=X.min(), + urcrnrlon=X.max(), + resolution="c", + ) m.drawcoastlines() m.drawcountries() else: print(" - plot coastlines from coverage") - plt.contour(X, Y, land_reference, - levels=[-9998], colors="k", - linestyles="solid") + plt.contour( + X, Y, land_reference, levels=[-9998], colors="k", linestyles="solid" + ) plt.xticks([]) plt.yticks([]) @@ -200,18 +209,28 @@ def plot_species_distribution(species=("bradypus_variegatus_0", # plot contours of the prediction plt.contourf(X, Y, Z, levels=levels, cmap=plt.cm.Reds) - plt.colorbar(format='%.2f') + plt.colorbar(format="%.2f") # scatter training/testing points - plt.scatter(species.pts_train['dd long'], species.pts_train['dd lat'], - s=2 ** 2, c='black', - marker='^', label='train') - plt.scatter(species.pts_test['dd long'], species.pts_test['dd lat'], - s=2 ** 2, c='black', - marker='x', label='test') + plt.scatter( + species.pts_train["dd long"], + species.pts_train["dd lat"], + s=2 ** 2, + c="black", + marker="^", + label="train", + ) + plt.scatter( + species.pts_test["dd long"], + species.pts_test["dd lat"], + s=2 ** 2, + c="black", + marker="x", + label="test", + ) plt.legend() plt.title(species.name) - plt.axis('equal') + plt.axis("equal") # Compute AUC with regards to background points pred_background = Z[background_points[0], background_points[1]] diff --git a/examples/applications/plot_stock_market.py b/examples/applications/plot_stock_market.py index fb6f9b2ec27d8..5116d8939de5d 100644 --- 
a/examples/applications/plot_stock_market.py +++ b/examples/applications/plot_stock_market.py @@ -85,62 +85,63 @@ # alphavantage.co ones. symbol_dict = { - 'TOT': 'Total', - 'XOM': 'Exxon', - 'CVX': 'Chevron', - 'COP': 'ConocoPhillips', - 'VLO': 'Valero Energy', - 'MSFT': 'Microsoft', - 'IBM': 'IBM', - 'TWX': 'Time Warner', - 'CMCSA': 'Comcast', - 'CVC': 'Cablevision', - 'YHOO': 'Yahoo', - 'DELL': 'Dell', - 'HPQ': 'HP', - 'AMZN': 'Amazon', - 'TM': 'Toyota', - 'CAJ': 'Canon', - 'SNE': 'Sony', - 'F': 'Ford', - 'HMC': 'Honda', - 'NAV': 'Navistar', - 'NOC': 'Northrop Grumman', - 'BA': 'Boeing', - 'KO': 'Coca Cola', - 'MMM': '3M', - 'MCD': 'McDonald\'s', - 'PEP': 'Pepsi', - 'K': 'Kellogg', - 'UN': 'Unilever', - 'MAR': 'Marriott', - 'PG': 'Procter Gamble', - 'CL': 'Colgate-Palmolive', - 'GE': 'General Electrics', - 'WFC': 'Wells Fargo', - 'JPM': 'JPMorgan Chase', - 'AIG': 'AIG', - 'AXP': 'American express', - 'BAC': 'Bank of America', - 'GS': 'Goldman Sachs', - 'AAPL': 'Apple', - 'SAP': 'SAP', - 'CSCO': 'Cisco', - 'TXN': 'Texas Instruments', - 'XRX': 'Xerox', - 'WMT': 'Wal-Mart', - 'HD': 'Home Depot', - 'GSK': 'GlaxoSmithKline', - 'PFE': 'Pfizer', - 'SNY': 'Sanofi-Aventis', - 'NVS': 'Novartis', - 'KMB': 'Kimberly-Clark', - 'R': 'Ryder', - 'GD': 'General Dynamics', - 'RTN': 'Raytheon', - 'CVS': 'CVS', - 'CAT': 'Caterpillar', - 'DD': 'DuPont de Nemours'} + "TOT": "Total", + "XOM": "Exxon", + "CVX": "Chevron", + "COP": "ConocoPhillips", + "VLO": "Valero Energy", + "MSFT": "Microsoft", + "IBM": "IBM", + "TWX": "Time Warner", + "CMCSA": "Comcast", + "CVC": "Cablevision", + "YHOO": "Yahoo", + "DELL": "Dell", + "HPQ": "HP", + "AMZN": "Amazon", + "TM": "Toyota", + "CAJ": "Canon", + "SNE": "Sony", + "F": "Ford", + "HMC": "Honda", + "NAV": "Navistar", + "NOC": "Northrop Grumman", + "BA": "Boeing", + "KO": "Coca Cola", + "MMM": "3M", + "MCD": "McDonald's", + "PEP": "Pepsi", + "K": "Kellogg", + "UN": "Unilever", + "MAR": "Marriott", + "PG": "Procter Gamble", + "CL": 
"Colgate-Palmolive", + "GE": "General Electrics", + "WFC": "Wells Fargo", + "JPM": "JPMorgan Chase", + "AIG": "AIG", + "AXP": "American express", + "BAC": "Bank of America", + "GS": "Goldman Sachs", + "AAPL": "Apple", + "SAP": "SAP", + "CSCO": "Cisco", + "TXN": "Texas Instruments", + "XRX": "Xerox", + "WMT": "Wal-Mart", + "HD": "Home Depot", + "GSK": "GlaxoSmithKline", + "PFE": "Pfizer", + "SNY": "Sanofi-Aventis", + "NVS": "Novartis", + "KMB": "Kimberly-Clark", + "R": "Ryder", + "GD": "General Dynamics", + "RTN": "Raytheon", + "CVS": "CVS", + "CAT": "Caterpillar", + "DD": "DuPont de Nemours", +} symbols, names = np.array(sorted(symbol_dict.items())).T @@ -148,13 +149,15 @@ quotes = [] for symbol in symbols: - print('Fetching quote history for %r' % symbol, file=sys.stderr) - url = ('https://raw.githubusercontent.com/scikit-learn/examples-data/' - 'master/financial-data/{}.csv') + print("Fetching quote history for %r" % symbol, file=sys.stderr) + url = ( + "https://raw.githubusercontent.com/scikit-learn/examples-data/" + "master/financial-data/{}.csv" + ) quotes.append(pd.read_csv(url.format(symbol))) -close_prices = np.vstack([q['close'] for q in quotes]) -open_prices = np.vstack([q['open'] for q in quotes]) +close_prices = np.vstack([q["close"] for q in quotes]) +open_prices = np.vstack([q["open"] for q in quotes]) # The daily variations of the quotes are what carry most information variation = close_prices - open_prices @@ -173,12 +176,11 @@ # ############################################################################# # Cluster using affinity propagation -_, labels = cluster.affinity_propagation(edge_model.covariance_, - random_state=0) +_, labels = cluster.affinity_propagation(edge_model.covariance_, random_state=0) n_labels = labels.max() for i in range(n_labels + 1): - print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i]))) + print("Cluster %i: %s" % ((i + 1), ", ".join(names[labels == i]))) # 
############################################################################# # Find a low-dimension embedding for visualization: find the best position of @@ -188,46 +190,48 @@ # initiated with random vectors that we don't control). In addition, we # use a large number of neighbors to capture the large-scale structure. node_position_model = manifold.LocallyLinearEmbedding( - n_components=2, eigen_solver='dense', n_neighbors=6) + n_components=2, eigen_solver="dense", n_neighbors=6 +) embedding = node_position_model.fit_transform(X.T).T # ############################################################################# # Visualization -plt.figure(1, facecolor='w', figsize=(10, 8)) +plt.figure(1, facecolor="w", figsize=(10, 8)) plt.clf() -ax = plt.axes([0., 0., 1., 1.]) -plt.axis('off') +ax = plt.axes([0.0, 0.0, 1.0, 1.0]) +plt.axis("off") # Display a graph of the partial correlations partial_correlations = edge_model.precision_.copy() d = 1 / np.sqrt(np.diag(partial_correlations)) partial_correlations *= d partial_correlations *= d[:, np.newaxis] -non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02) +non_zero = np.abs(np.triu(partial_correlations, k=1)) > 0.02 # Plot the nodes using the coordinates of our embedding -plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels, - cmap=plt.cm.nipy_spectral) +plt.scatter( + embedding[0], embedding[1], s=100 * d ** 2, c=labels, cmap=plt.cm.nipy_spectral +) # Plot the edges start_idx, end_idx = np.where(non_zero) # a sequence of (*line0*, *line1*, *line2*), where:: # linen = (x0, y0), (x1, y1), ... 
(xm, ym) -segments = [[embedding[:, start], embedding[:, stop]] - for start, stop in zip(start_idx, end_idx)] +segments = [ + [embedding[:, start], embedding[:, stop]] for start, stop in zip(start_idx, end_idx) +] values = np.abs(partial_correlations[non_zero]) -lc = LineCollection(segments, - zorder=0, cmap=plt.cm.hot_r, - norm=plt.Normalize(0, .7 * values.max())) +lc = LineCollection( + segments, zorder=0, cmap=plt.cm.hot_r, norm=plt.Normalize(0, 0.7 * values.max()) +) lc.set_array(values) lc.set_linewidths(15 * values) ax.add_collection(lc) # Add a label to each node. The challenge here is that we want to # position the labels to avoid overlap with other labels -for index, (name, label, (x, y)) in enumerate( - zip(names, labels, embedding.T)): +for index, (name, label, (x, y)) in enumerate(zip(names, labels, embedding.T)): dx = x - embedding[0] dx[index] = 1 @@ -236,27 +240,38 @@ this_dx = dx[np.argmin(np.abs(dy))] this_dy = dy[np.argmin(np.abs(dx))] if this_dx > 0: - horizontalalignment = 'left' - x = x + .002 + horizontalalignment = "left" + x = x + 0.002 else: - horizontalalignment = 'right' - x = x - .002 + horizontalalignment = "right" + x = x - 0.002 if this_dy > 0: - verticalalignment = 'bottom' - y = y + .002 + verticalalignment = "bottom" + y = y + 0.002 else: - verticalalignment = 'top' - y = y - .002 - plt.text(x, y, name, size=10, - horizontalalignment=horizontalalignment, - verticalalignment=verticalalignment, - bbox=dict(facecolor='w', - edgecolor=plt.cm.nipy_spectral(label / float(n_labels)), - alpha=.6)) - -plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(), - embedding[0].max() + .10 * embedding[0].ptp(),) -plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(), - embedding[1].max() + .03 * embedding[1].ptp()) + verticalalignment = "top" + y = y - 0.002 + plt.text( + x, + y, + name, + size=10, + horizontalalignment=horizontalalignment, + verticalalignment=verticalalignment, + bbox=dict( + facecolor="w", + edgecolor=plt.cm.nipy_spectral(label 
/ float(n_labels)), + alpha=0.6, + ), + ) + +plt.xlim( + embedding[0].min() - 0.15 * embedding[0].ptp(), + embedding[0].max() + 0.10 * embedding[0].ptp(), +) +plt.ylim( + embedding[1].min() - 0.03 * embedding[1].ptp(), + embedding[1].max() + 0.03 * embedding[1].ptp(), +) plt.show() diff --git a/examples/applications/plot_tomography_l1_reconstruction.py b/examples/applications/plot_tomography_l1_reconstruction.py index 0c218c1b0fa4a..8e96dbff3dafb 100644 --- a/examples/applications/plot_tomography_l1_reconstruction.py +++ b/examples/applications/plot_tomography_l1_reconstruction.py @@ -57,14 +57,14 @@ def _weights(x, dx=1, orig=0): def _generate_center_coordinates(l_x): X, Y = np.mgrid[:l_x, :l_x].astype(np.float64) - center = l_x / 2. + center = l_x / 2.0 X += 0.5 - center Y += 0.5 - center return X, Y def build_projection_operator(l_x, n_dir): - """ Compute the tomography design matrix. + """Compute the tomography design matrix. Parameters ---------- @@ -83,8 +83,7 @@ def build_projection_operator(l_x, n_dir): angles = np.linspace(0, np.pi, n_dir, endpoint=False) data_inds, weights, camera_inds = [], [], [] data_unravel_indices = np.arange(l_x ** 2) - data_unravel_indices = np.hstack((data_unravel_indices, - data_unravel_indices)) + data_unravel_indices = np.hstack((data_unravel_indices, data_unravel_indices)) for i, angle in enumerate(angles): Xrot = np.cos(angle) * X - np.sin(angle) * Y inds, w = _weights(Xrot, dx=1, orig=X.min()) @@ -97,11 +96,11 @@ def build_projection_operator(l_x, n_dir): def generate_synthetic_data(): - """ Synthetic binary data """ + """Synthetic binary data""" rs = np.random.RandomState(0) n_pts = 36 x, y = np.ogrid[0:l, 0:l] - mask_outer = (x - l / 2.) ** 2 + (y - l / 2.) ** 2 < (l / 2.) 
** 2 + mask_outer = (x - l / 2.0) ** 2 + (y - l / 2.0) ** 2 < (l / 2.0) ** 2 mask = np.zeros((l, l)) points = l * rs.rand(2, n_pts) mask[(points[0]).astype(int), (points[1]).astype(int)] = 1 @@ -131,19 +130,18 @@ def generate_synthetic_data(): plt.figure(figsize=(8, 3.3)) plt.subplot(131) -plt.imshow(data, cmap=plt.cm.gray, interpolation='nearest') -plt.axis('off') -plt.title('original image') +plt.imshow(data, cmap=plt.cm.gray, interpolation="nearest") +plt.axis("off") +plt.title("original image") plt.subplot(132) -plt.imshow(rec_l2, cmap=plt.cm.gray, interpolation='nearest') -plt.title('L2 penalization') -plt.axis('off') +plt.imshow(rec_l2, cmap=plt.cm.gray, interpolation="nearest") +plt.title("L2 penalization") +plt.axis("off") plt.subplot(133) -plt.imshow(rec_l1, cmap=plt.cm.gray, interpolation='nearest') -plt.title('L1 penalization') -plt.axis('off') +plt.imshow(rec_l1, cmap=plt.cm.gray, interpolation="nearest") +plt.title("L1 penalization") +plt.axis("off") -plt.subplots_adjust(hspace=0.01, wspace=0.01, top=1, bottom=0, left=0, - right=1) +plt.subplots_adjust(hspace=0.01, wspace=0.01, top=1, bottom=0, left=0, right=1) plt.show() diff --git a/examples/applications/plot_topics_extraction_with_nmf_lda.py b/examples/applications/plot_topics_extraction_with_nmf_lda.py index c677fa3b6650a..48b69d710226b 100644 --- a/examples/applications/plot_topics_extraction_with_nmf_lda.py +++ b/examples/applications/plot_topics_extraction_with_nmf_lda.py @@ -43,17 +43,16 @@ def plot_top_words(model, feature_names, n_top_words, title): fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True) axes = axes.flatten() for topic_idx, topic in enumerate(model.components_): - top_features_ind = topic.argsort()[:-n_top_words - 1:-1] + top_features_ind = topic.argsort()[: -n_top_words - 1 : -1] top_features = [feature_names[i] for i in top_features_ind] weights = topic[top_features_ind] ax = axes[topic_idx] ax.barh(top_features, weights, height=0.7) - ax.set_title(f'Topic {topic_idx 
+1}', - fontdict={'fontsize': 30}) + ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30}) ax.invert_yaxis() - ax.tick_params(axis='both', which='major', labelsize=20) - for i in 'top right left'.split(): + ax.tick_params(axis="both", which="major", labelsize=20) + for i in "top right left".split(): ax.spines[i].set_visible(False) fig.suptitle(title, fontsize=40) @@ -68,69 +67,91 @@ def plot_top_words(model, feature_names, n_top_words, title): print("Loading dataset...") t0 = time() -data, _ = fetch_20newsgroups(shuffle=True, random_state=1, - remove=('headers', 'footers', 'quotes'), - return_X_y=True) +data, _ = fetch_20newsgroups( + shuffle=True, + random_state=1, + remove=("headers", "footers", "quotes"), + return_X_y=True, +) data_samples = data[:n_samples] print("done in %0.3fs." % (time() - t0)) # Use tf-idf features for NMF. print("Extracting tf-idf features for NMF...") -tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, - max_features=n_features, - stop_words='english') +tfidf_vectorizer = TfidfVectorizer( + max_df=0.95, min_df=2, max_features=n_features, stop_words="english" +) t0 = time() tfidf = tfidf_vectorizer.fit_transform(data_samples) print("done in %0.3fs." % (time() - t0)) # Use tf (raw term count) features for LDA. print("Extracting tf features for LDA...") -tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, - max_features=n_features, - stop_words='english') +tf_vectorizer = CountVectorizer( + max_df=0.95, min_df=2, max_features=n_features, stop_words="english" +) t0 = time() tf = tf_vectorizer.fit_transform(data_samples) print("done in %0.3fs." % (time() - t0)) print() # Fit the NMF model -print("Fitting the NMF model (Frobenius norm) with tf-idf features, " - "n_samples=%d and n_features=%d..." - % (n_samples, n_features)) +print( + "Fitting the NMF model (Frobenius norm) with tf-idf features, " + "n_samples=%d and n_features=%d..." 
% (n_samples, n_features) +) t0 = time() -nmf = NMF(n_components=n_components, random_state=1, - alpha=.1, l1_ratio=.5).fit(tfidf) +nmf = NMF(n_components=n_components, random_state=1, alpha=0.1, l1_ratio=0.5).fit(tfidf) print("done in %0.3fs." % (time() - t0)) tfidf_feature_names = tfidf_vectorizer.get_feature_names_out() -plot_top_words(nmf, tfidf_feature_names, n_top_words, - 'Topics in NMF model (Frobenius norm)') +plot_top_words( + nmf, tfidf_feature_names, n_top_words, "Topics in NMF model (Frobenius norm)" +) # Fit the NMF model -print('\n' * 2, "Fitting the NMF model (generalized Kullback-Leibler " - "divergence) with tf-idf features, n_samples=%d and n_features=%d..." - % (n_samples, n_features)) +print( + "\n" * 2, + "Fitting the NMF model (generalized Kullback-Leibler " + "divergence) with tf-idf features, n_samples=%d and n_features=%d..." + % (n_samples, n_features), +) t0 = time() -nmf = NMF(n_components=n_components, random_state=1, - beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1, - l1_ratio=.5).fit(tfidf) +nmf = NMF( + n_components=n_components, + random_state=1, + beta_loss="kullback-leibler", + solver="mu", + max_iter=1000, + alpha=0.1, + l1_ratio=0.5, +).fit(tfidf) print("done in %0.3fs." % (time() - t0)) tfidf_feature_names = tfidf_vectorizer.get_feature_names_out() -plot_top_words(nmf, tfidf_feature_names, n_top_words, - 'Topics in NMF model (generalized Kullback-Leibler divergence)') - -print('\n' * 2, "Fitting LDA models with tf features, " - "n_samples=%d and n_features=%d..." - % (n_samples, n_features)) -lda = LatentDirichletAllocation(n_components=n_components, max_iter=5, - learning_method='online', - learning_offset=50., - random_state=0) +plot_top_words( + nmf, + tfidf_feature_names, + n_top_words, + "Topics in NMF model (generalized Kullback-Leibler divergence)", +) + +print( + "\n" * 2, + "Fitting LDA models with tf features, n_samples=%d and n_features=%d..." 
+ % (n_samples, n_features), +) +lda = LatentDirichletAllocation( + n_components=n_components, + max_iter=5, + learning_method="online", + learning_offset=50.0, + random_state=0, +) t0 = time() lda.fit(tf) print("done in %0.3fs." % (time() - t0)) tf_feature_names = tf_vectorizer.get_feature_names_out() -plot_top_words(lda, tf_feature_names, n_top_words, 'Topics in LDA model') +plot_top_words(lda, tf_feature_names, n_top_words, "Topics in LDA model") diff --git a/examples/applications/svm_gui.py b/examples/applications/svm_gui.py index d9065b0fe8db4..f480b36ad1d94 100644 --- a/examples/applications/svm_gui.py +++ b/examples/applications/svm_gui.py @@ -21,14 +21,16 @@ # License: BSD 3 clause import matplotlib -matplotlib.use('TkAgg') + +matplotlib.use("TkAgg") from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg + try: from matplotlib.backends.backend_tkagg import NavigationToolbar2Tk except ImportError: # NavigationToolbar2TkAgg was deprecated in matplotlib 2.2 from matplotlib.backends.backend_tkagg import ( - NavigationToolbar2TkAgg as NavigationToolbar2Tk + NavigationToolbar2TkAgg as NavigationToolbar2Tk, ) from matplotlib.figure import Figure from matplotlib.contour import ContourSet @@ -58,12 +60,12 @@ def __init__(self): self.surface_type = 0 def changed(self, event): - """Notify the observers. """ + """Notify the observers.""" for observer in self.observers: observer.update(event, self) def add_observer(self, observer): - """Register an observer. 
""" + """Register an observer.""" self.observers.append(observer) def set_surface(self, surface): @@ -96,14 +98,23 @@ def fit(self): degree = int(self.degree.get()) kernel_map = {0: "linear", 1: "rbf", 2: "poly"} if len(np.unique(y)) == 1: - clf = svm.OneClassSVM(kernel=kernel_map[self.kernel.get()], - gamma=gamma, coef0=coef0, degree=degree) + clf = svm.OneClassSVM( + kernel=kernel_map[self.kernel.get()], + gamma=gamma, + coef0=coef0, + degree=degree, + ) clf.fit(X) else: - clf = svm.SVC(kernel=kernel_map[self.kernel.get()], C=C, - gamma=gamma, coef0=coef0, degree=degree) + clf = svm.SVC( + kernel=kernel_map[self.kernel.get()], + C=C, + gamma=gamma, + coef0=coef0, + degree=degree, + ) clf.fit(X, y) - if hasattr(clf, 'score'): + if hasattr(clf, "score"): print("Accuracy:", clf.score(X, y) * 100) X1, X2, Z = self.decision_surface(clf) self.model.clf = clf @@ -134,13 +145,13 @@ def add_example(self, x, y, label): self.refit() def refit(self): - """Refit the model if already fitted. """ + """Refit the model if already fitted.""" if self.fitted: self.fit() class View: - """Test docstring. 
""" + """Test docstring.""" def __init__(self, root, controller): f = Figure() @@ -157,7 +168,7 @@ def __init__(self, root, controller): canvas.show() canvas.get_tk_widget().pack(side=Tk.TOP, fill=Tk.BOTH, expand=1) canvas._tkcanvas.pack(side=Tk.TOP, fill=Tk.BOTH, expand=1) - canvas.mpl_connect('button_press_event', self.onclick) + canvas.mpl_connect("button_press_event", self.onclick) toolbar = NavigationToolbar2Tk(canvas, root) toolbar.update() self.controllbar = ControllBar(root, controller) @@ -184,9 +195,9 @@ def onclick(self, event): def update_example(self, model, idx): x, y, l = model.data[idx] if l == 1: - color = 'w' + color = "w" elif l == -1: - color = 'k' + color = "k" self.ax.plot([x], [y], "%so" % color, scalex=0.0, scaley=0.0) def update(self, event, model): @@ -227,25 +238,33 @@ def plot_support_vectors(self, support_vectors): """Plot the support vectors by placing circles over the corresponding data points and adds the circle collection to the contours list.""" - cs = self.ax.scatter(support_vectors[:, 0], support_vectors[:, 1], - s=80, edgecolors="k", facecolors="none") + cs = self.ax.scatter( + support_vectors[:, 0], + support_vectors[:, 1], + s=80, + edgecolors="k", + facecolors="none", + ) self.contours.append(cs) def plot_decision_surface(self, surface, type): X1, X2, Z = surface if type == 0: levels = [-1.0, 0.0, 1.0] - linestyles = ['dashed', 'solid', 'dashed'] - colors = 'k' - self.contours.append(self.ax.contour(X1, X2, Z, levels, - colors=colors, - linestyles=linestyles)) + linestyles = ["dashed", "solid", "dashed"] + colors = "k" + self.contours.append( + self.ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles) + ) elif type == 1: - self.contours.append(self.ax.contourf(X1, X2, Z, 10, - cmap=matplotlib.cm.bone, - origin='lower', alpha=0.85)) - self.contours.append(self.ax.contour(X1, X2, Z, [0.0], colors='k', - linestyles=['solid'])) + self.contours.append( + self.ax.contourf( + X1, X2, Z, 10, cmap=matplotlib.cm.bone, 
origin="lower", alpha=0.85 + ) + ) + self.contours.append( + self.ax.contour(X1, X2, Z, [0.0], colors="k", linestyles=["solid"]) + ) else: raise ValueError("surface type unknown") @@ -254,12 +273,27 @@ class ControllBar: def __init__(self, root, controller): fm = Tk.Frame(root) kernel_group = Tk.Frame(fm) - Tk.Radiobutton(kernel_group, text="Linear", variable=controller.kernel, - value=0, command=controller.refit).pack(anchor=Tk.W) - Tk.Radiobutton(kernel_group, text="RBF", variable=controller.kernel, - value=1, command=controller.refit).pack(anchor=Tk.W) - Tk.Radiobutton(kernel_group, text="Poly", variable=controller.kernel, - value=2, command=controller.refit).pack(anchor=Tk.W) + Tk.Radiobutton( + kernel_group, + text="Linear", + variable=controller.kernel, + value=0, + command=controller.refit, + ).pack(anchor=Tk.W) + Tk.Radiobutton( + kernel_group, + text="RBF", + variable=controller.kernel, + value=1, + command=controller.refit, + ).pack(anchor=Tk.W) + Tk.Radiobutton( + kernel_group, + text="Poly", + variable=controller.kernel, + value=2, + command=controller.refit, + ).pack(anchor=Tk.W) kernel_group.pack(side=Tk.LEFT) valbox = Tk.Frame(fm) @@ -267,8 +301,7 @@ def __init__(self, root, controller): controller.complexity.set("1.0") c = Tk.Frame(valbox) Tk.Label(c, text="C:", anchor="e", width=7).pack(side=Tk.LEFT) - Tk.Entry(c, width=6, textvariable=controller.complexity).pack( - side=Tk.LEFT) + Tk.Entry(c, width=6, textvariable=controller.complexity).pack(side=Tk.LEFT) c.pack() controller.gamma = Tk.StringVar() @@ -294,29 +327,42 @@ def __init__(self, root, controller): valbox.pack(side=Tk.LEFT) cmap_group = Tk.Frame(fm) - Tk.Radiobutton(cmap_group, text="Hyperplanes", - variable=controller.surface_type, value=0, - command=controller.refit).pack(anchor=Tk.W) - Tk.Radiobutton(cmap_group, text="Surface", - variable=controller.surface_type, value=1, - command=controller.refit).pack(anchor=Tk.W) + Tk.Radiobutton( + cmap_group, + text="Hyperplanes", + 
variable=controller.surface_type, + value=0, + command=controller.refit, + ).pack(anchor=Tk.W) + Tk.Radiobutton( + cmap_group, + text="Surface", + variable=controller.surface_type, + value=1, + command=controller.refit, + ).pack(anchor=Tk.W) cmap_group.pack(side=Tk.LEFT) - train_button = Tk.Button(fm, text='Fit', width=5, - command=controller.fit) + train_button = Tk.Button(fm, text="Fit", width=5, command=controller.fit) train_button.pack() fm.pack(side=Tk.LEFT) - Tk.Button(fm, text='Clear', width=5, - command=controller.clear_data).pack(side=Tk.LEFT) + Tk.Button(fm, text="Clear", width=5, command=controller.clear_data).pack( + side=Tk.LEFT + ) def get_parser(): from optparse import OptionParser + op = OptionParser() - op.add_option("--output", - action="store", type="str", dest="output", - help="Path where to dump data.") + op.add_option( + "--output", + action="store", + type="str", + dest="output", + help="Path where to dump data.", + ) return op diff --git a/examples/applications/wikipedia_principal_eigenvector.py b/examples/applications/wikipedia_principal_eigenvector.py index 1a148497af1b4..1d86076431ed8 100644 --- a/examples/applications/wikipedia_principal_eigenvector.py +++ b/examples/applications/wikipedia_principal_eigenvector.py @@ -65,7 +65,7 @@ if not os.path.exists(filename): print("Downloading data from '%s', please wait..." 
% url) opener = urlopen(url) - open(filename, 'wb').write(opener.read()) + open(filename, "wb").write(opener.read()) print() @@ -163,7 +163,8 @@ def get_adjacency_matrix(redirects_filename, page_links_filename, limit=None): # stop after 5M links to make it possible to work in RAM X, redirects, index_map = get_adjacency_matrix( - redirects_filename, page_links_filename, limit=5000000) + redirects_filename, page_links_filename, limit=5000000 +) names = {i: name for name, i in index_map.items()} print("Computing the principal singular vectors using randomized_svd") @@ -195,16 +196,17 @@ def centrality_scores(X, alpha=0.85, max_iter=100, tol=1e-10): print("Normalizing the graph") for i in incoming_counts.nonzero()[0]: - X.data[X.indptr[i]:X.indptr[i + 1]] *= 1.0 / incoming_counts[i] - dangle = np.asarray(np.where(np.isclose(X.sum(axis=1), 0), - 1.0 / n, 0)).ravel() + X.data[X.indptr[i] : X.indptr[i + 1]] *= 1.0 / incoming_counts[i] + dangle = np.asarray(np.where(np.isclose(X.sum(axis=1), 0), 1.0 / n, 0)).ravel() - scores = np.full(n, 1. / n, dtype=np.float32) # initial guess + scores = np.full(n, 1.0 / n, dtype=np.float32) # initial guess for i in range(max_iter): print("power iteration #%d" % i) prev_scores = scores - scores = (alpha * (scores * X + np.dot(dangle, prev_scores)) - + (1 - alpha) * prev_scores.sum() / n) + scores = ( + alpha * (scores * X + np.dot(dangle, prev_scores)) + + (1 - alpha) * prev_scores.sum() / n + ) # check convergence: normalized l_inf norm scores_max = np.abs(scores).max() if scores_max == 0.0: diff --git a/examples/bicluster/plot_bicluster_newsgroups.py b/examples/bicluster/plot_bicluster_newsgroups.py index c01807e345928..eb2c0c8dafb50 100644 --- a/examples/bicluster/plot_bicluster_newsgroups.py +++ b/examples/bicluster/plot_bicluster_newsgroups.py @@ -38,7 +38,7 @@ def number_normalizer(tokens): - """ Map all numeric tokens to a placeholder. + """Map all numeric tokens to a placeholder. 
For many applications, tokens that begin with a number are not directly useful, but the fact that such a token exists can be relevant. By applying @@ -54,22 +54,35 @@ def build_tokenizer(self): # exclude 'comp.os.ms-windows.misc' -categories = ['alt.atheism', 'comp.graphics', - 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', - 'comp.windows.x', 'misc.forsale', 'rec.autos', - 'rec.motorcycles', 'rec.sport.baseball', - 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', - 'sci.med', 'sci.space', 'soc.religion.christian', - 'talk.politics.guns', 'talk.politics.mideast', - 'talk.politics.misc', 'talk.religion.misc'] +categories = [ + "alt.atheism", + "comp.graphics", + "comp.sys.ibm.pc.hardware", + "comp.sys.mac.hardware", + "comp.windows.x", + "misc.forsale", + "rec.autos", + "rec.motorcycles", + "rec.sport.baseball", + "rec.sport.hockey", + "sci.crypt", + "sci.electronics", + "sci.med", + "sci.space", + "soc.religion.christian", + "talk.politics.guns", + "talk.politics.mideast", + "talk.politics.misc", + "talk.religion.misc", +] newsgroups = fetch_20newsgroups(categories=categories) y_true = newsgroups.target -vectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5) -cocluster = SpectralCoclustering(n_clusters=len(categories), - svd_method='arpack', random_state=0) -kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000, - random_state=0) +vectorizer = NumberNormalizingVectorizer(stop_words="english", min_df=5) +cocluster = SpectralCoclustering( + n_clusters=len(categories), svd_method="arpack", random_state=0 +) +kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000, random_state=0) print("Vectorizing...") X = vectorizer.fit_transform(newsgroups.data) @@ -78,16 +91,20 @@ def build_tokenizer(self): start_time = time() cocluster.fit(X) y_cocluster = cocluster.row_labels_ -print("Done in {:.2f}s. V-measure: {:.4f}".format( - time() - start_time, - v_measure_score(y_cocluster, y_true))) +print( + "Done in {:.2f}s. 
V-measure: {:.4f}".format( + time() - start_time, v_measure_score(y_cocluster, y_true) + ) +) print("MiniBatchKMeans...") start_time = time() y_kmeans = kmeans.fit_predict(X) -print("Done in {:.2f}s. V-measure: {:.4f}".format( - time() - start_time, - v_measure_score(y_kmeans, y_true))) +print( + "Done in {:.2f}s. V-measure: {:.4f}".format( + time() - start_time, v_measure_score(y_kmeans, y_true) + ) +) feature_names = vectorizer.get_feature_names_out() document_names = list(newsgroups.target_names[i] for i in newsgroups.target) @@ -97,14 +114,14 @@ def bicluster_ncut(i): rows, cols = cocluster.get_indices(i) if not (np.any(rows) and np.any(cols)): import sys + return sys.float_info.max row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0] col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0] # Note: the following is identical to X[rows[:, np.newaxis], # cols].sum() but much faster in scipy <= 0.16 weight = X[rows][:, cols].sum() - cut = (X[row_complement][:, cols].sum() + - X[rows][:, col_complement].sum()) + cut = X[row_complement][:, cols].sum() + X[rows][:, col_complement].sum() return cut / weight @@ -116,8 +133,7 @@ def most_common(d): return sorted(d.items(), key=operator.itemgetter(1), reverse=True) -bicluster_ncuts = list(bicluster_ncut(i) - for i in range(len(newsgroups.target_names))) +bicluster_ncuts = list(bicluster_ncut(i) for i in range(len(newsgroups.target_names))) best_idx = np.argsort(bicluster_ncuts)[:5] print() @@ -133,20 +149,24 @@ def most_common(d): counter = defaultdict(int) for i in cluster_docs: counter[document_names[i]] += 1 - cat_string = ", ".join("{:.0f}% {}".format(float(c) / n_rows * 100, name) - for name, c in most_common(counter)[:3]) + cat_string = ", ".join( + "{:.0f}% {}".format(float(c) / n_rows * 100, name) + for name, c in most_common(counter)[:3] + ) # words out_of_cluster_docs = cocluster.row_labels_ != cluster out_of_cluster_docs = np.where(out_of_cluster_docs)[0] word_col = X[:, 
cluster_words] - word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) - - word_col[out_of_cluster_docs, :].sum(axis=0)) + word_scores = np.array( + word_col[cluster_docs, :].sum(axis=0) + - word_col[out_of_cluster_docs, :].sum(axis=0) + ) word_scores = word_scores.ravel() - important_words = list(feature_names[cluster_words[i]] - for i in word_scores.argsort()[:-11:-1]) + important_words = list( + feature_names[cluster_words[i]] for i in word_scores.argsort()[:-11:-1] + ) - print("bicluster {} : {} documents, {} words".format( - idx, n_rows, n_cols)) + print("bicluster {} : {} documents, {} words".format(idx, n_rows, n_cols)) print("categories : {}".format(cat_string)) - print("words : {}\n".format(', '.join(important_words))) + print("words : {}\n".format(", ".join(important_words))) diff --git a/examples/bicluster/plot_spectral_biclustering.py b/examples/bicluster/plot_spectral_biclustering.py index abc63879a8420..754853749a784 100644 --- a/examples/bicluster/plot_spectral_biclustering.py +++ b/examples/bicluster/plot_spectral_biclustering.py @@ -30,8 +30,8 @@ n_clusters = (4, 3) data, rows, columns = make_checkerboard( - shape=(300, 300), n_clusters=n_clusters, noise=10, - shuffle=False, random_state=0) + shape=(300, 300), n_clusters=n_clusters, noise=10, shuffle=False, random_state=0 +) plt.matshow(data, cmap=plt.cm.Blues) plt.title("Original dataset") @@ -45,11 +45,9 @@ plt.matshow(data, cmap=plt.cm.Blues) plt.title("Shuffled dataset") -model = SpectralBiclustering(n_clusters=n_clusters, method='log', - random_state=0) +model = SpectralBiclustering(n_clusters=n_clusters, method="log", random_state=0) model.fit(data) -score = consensus_score(model.biclusters_, - (rows[:, row_idx], columns[:, col_idx])) +score = consensus_score(model.biclusters_, (rows[:, row_idx], columns[:, col_idx])) print("consensus score: {:.1f}".format(score)) @@ -59,9 +57,10 @@ plt.matshow(fit_data, cmap=plt.cm.Blues) plt.title("After biclustering; rearranged to show biclusters") 
-plt.matshow(np.outer(np.sort(model.row_labels_) + 1, - np.sort(model.column_labels_) + 1), - cmap=plt.cm.Blues) +plt.matshow( + np.outer(np.sort(model.row_labels_) + 1, np.sort(model.column_labels_) + 1), + cmap=plt.cm.Blues, +) plt.title("Checkerboard structure of rearranged data") plt.show() diff --git a/examples/bicluster/plot_spectral_coclustering.py b/examples/bicluster/plot_spectral_coclustering.py index 0681d51e0bfd3..26494bf4f1c4f 100644 --- a/examples/bicluster/plot_spectral_coclustering.py +++ b/examples/bicluster/plot_spectral_coclustering.py @@ -27,8 +27,8 @@ from sklearn.metrics import consensus_score data, rows, columns = make_biclusters( - shape=(300, 300), n_clusters=5, noise=5, - shuffle=False, random_state=0) + shape=(300, 300), n_clusters=5, noise=5, shuffle=False, random_state=0 +) plt.matshow(data, cmap=plt.cm.Blues) plt.title("Original dataset") @@ -44,8 +44,7 @@ model = SpectralCoclustering(n_clusters=5, random_state=0) model.fit(data) -score = consensus_score(model.biclusters_, - (rows[:, row_idx], columns[:, col_idx])) +score = consensus_score(model.biclusters_, (rows[:, row_idx], columns[:, col_idx])) print("consensus score: {:.3f}".format(score)) diff --git a/examples/calibration/plot_calibration.py b/examples/calibration/plot_calibration.py index e8d8d5184a178..c1e1acea0c7c9 100644 --- a/examples/calibration/plot_calibration.py +++ b/examples/calibration/plot_calibration.py @@ -47,16 +47,16 @@ # half positive samples and half negative samples. Probability in this # blob is therefore 0.5. 
centers = [(-5, -5), (0, 0), (5, 5)] -X, y = make_blobs(n_samples=n_samples, centers=centers, shuffle=False, - random_state=42) +X, y = make_blobs(n_samples=n_samples, centers=centers, shuffle=False, random_state=42) -y[:n_samples // 2] = 0 -y[n_samples // 2:] = 1 +y[: n_samples // 2] = 0 +y[n_samples // 2 :] = 1 sample_weight = np.random.RandomState(42).rand(y.shape[0]) # split train, test for calibration -X_train, X_test, y_train, y_test, sw_train, sw_test = \ - train_test_split(X, y, sample_weight, test_size=0.9, random_state=42) +X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split( + X, y, sample_weight, test_size=0.9, random_state=42 +) # Gaussian Naive-Bayes with no calibration clf = GaussianNB() @@ -64,12 +64,12 @@ prob_pos_clf = clf.predict_proba(X_test)[:, 1] # Gaussian Naive-Bayes with isotonic calibration -clf_isotonic = CalibratedClassifierCV(clf, cv=2, method='isotonic') +clf_isotonic = CalibratedClassifierCV(clf, cv=2, method="isotonic") clf_isotonic.fit(X_train, y_train, sample_weight=sw_train) prob_pos_isotonic = clf_isotonic.predict_proba(X_test)[:, 1] # Gaussian Naive-Bayes with sigmoid calibration -clf_sigmoid = CalibratedClassifierCV(clf, cv=2, method='sigmoid') +clf_sigmoid = CalibratedClassifierCV(clf, cv=2, method="sigmoid") clf_sigmoid.fit(X_train, y_train, sample_weight=sw_train) prob_pos_sigmoid = clf_sigmoid.predict_proba(X_test)[:, 1] @@ -78,12 +78,10 @@ clf_score = brier_score_loss(y_test, prob_pos_clf, sample_weight=sw_test) print("No calibration: %1.3f" % clf_score) -clf_isotonic_score = brier_score_loss(y_test, prob_pos_isotonic, - sample_weight=sw_test) +clf_isotonic_score = brier_score_loss(y_test, prob_pos_isotonic, sample_weight=sw_test) print("With isotonic calibration: %1.3f" % clf_isotonic_score) -clf_sigmoid_score = brier_score_loss(y_test, prob_pos_sigmoid, - sample_weight=sw_test) +clf_sigmoid_score = brier_score_loss(y_test, prob_pos_sigmoid, sample_weight=sw_test) print("With sigmoid calibration: %1.3f" 
% clf_sigmoid_score) # ############################################################################# @@ -94,26 +92,42 @@ for this_y, color in zip(y_unique, colors): this_X = X_train[y_train == this_y] this_sw = sw_train[y_train == this_y] - plt.scatter(this_X[:, 0], this_X[:, 1], s=this_sw * 50, - c=color[np.newaxis, :], - alpha=0.5, edgecolor='k', - label="Class %s" % this_y) + plt.scatter( + this_X[:, 0], + this_X[:, 1], + s=this_sw * 50, + c=color[np.newaxis, :], + alpha=0.5, + edgecolor="k", + label="Class %s" % this_y, + ) plt.legend(loc="best") plt.title("Data") plt.figure() -order = np.lexsort((prob_pos_clf, )) -plt.plot(prob_pos_clf[order], 'r', label='No calibration (%1.3f)' % clf_score) -plt.plot(prob_pos_isotonic[order], 'g', linewidth=3, - label='Isotonic calibration (%1.3f)' % clf_isotonic_score) -plt.plot(prob_pos_sigmoid[order], 'b', linewidth=3, - label='Sigmoid calibration (%1.3f)' % clf_sigmoid_score) -plt.plot(np.linspace(0, y_test.size, 51)[1::2], - y_test[order].reshape(25, -1).mean(1), - 'k', linewidth=3, label=r'Empirical') +order = np.lexsort((prob_pos_clf,)) +plt.plot(prob_pos_clf[order], "r", label="No calibration (%1.3f)" % clf_score) +plt.plot( + prob_pos_isotonic[order], + "g", + linewidth=3, + label="Isotonic calibration (%1.3f)" % clf_isotonic_score, +) +plt.plot( + prob_pos_sigmoid[order], + "b", + linewidth=3, + label="Sigmoid calibration (%1.3f)" % clf_sigmoid_score, +) +plt.plot( + np.linspace(0, y_test.size, 51)[1::2], + y_test[order].reshape(25, -1).mean(1), + "k", + linewidth=3, + label=r"Empirical", +) plt.ylim([-0.05, 1.05]) -plt.xlabel("Instances sorted according to predicted probability " - "(uncalibrated GNB)") +plt.xlabel("Instances sorted according to predicted probability (uncalibrated GNB)") plt.ylabel("P(y=1)") plt.legend(loc="upper left") plt.title("Gaussian naive Bayes probabilities") diff --git a/examples/calibration/plot_calibration_curve.py b/examples/calibration/plot_calibration_curve.py index 
d4bfda5a3a55d..c52cff1a858b0 100644 --- a/examples/calibration/plot_calibration_curve.py +++ b/examples/calibration/plot_calibration_curve.py @@ -29,11 +29,13 @@ from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split -X, y = make_classification(n_samples=100_000, n_features=20, n_informative=2, - n_redundant=10, random_state=42) +X, y = make_classification( + n_samples=100_000, n_features=20, n_informative=2, n_redundant=10, random_state=42 +) -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.99, - random_state=42) +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.99, random_state=42 +) # %% # Calibration curves @@ -62,33 +64,40 @@ from sklearn.linear_model import LogisticRegression from sklearn.naive_bayes import GaussianNB -lr = LogisticRegression(C=1.) +lr = LogisticRegression(C=1.0) gnb = GaussianNB() -gnb_isotonic = CalibratedClassifierCV(gnb, cv=2, method='isotonic') -gnb_sigmoid = CalibratedClassifierCV(gnb, cv=2, method='sigmoid') +gnb_isotonic = CalibratedClassifierCV(gnb, cv=2, method="isotonic") +gnb_sigmoid = CalibratedClassifierCV(gnb, cv=2, method="sigmoid") -clf_list = [(lr, 'Logistic'), - (gnb, 'Naive Bayes'), - (gnb_isotonic, 'Naive Bayes + Isotonic'), - (gnb_sigmoid, 'Naive Bayes + Sigmoid')] +clf_list = [ + (lr, "Logistic"), + (gnb, "Naive Bayes"), + (gnb_isotonic, "Naive Bayes + Isotonic"), + (gnb_sigmoid, "Naive Bayes + Sigmoid"), +] # %% fig = plt.figure(figsize=(10, 10)) gs = GridSpec(4, 2) -colors = plt.cm.get_cmap('Dark2') +colors = plt.cm.get_cmap("Dark2") ax_calibration_curve = fig.add_subplot(gs[:2, :2]) calibration_displays = {} for i, (clf, name) in enumerate(clf_list): clf.fit(X_train, y_train) display = CalibrationDisplay.from_estimator( - clf, X_test, y_test, n_bins=10, name=name, ax=ax_calibration_curve, - color=colors(i) + clf, + X_test, + y_test, + n_bins=10, + name=name, + ax=ax_calibration_curve, + color=colors(i), ) 
calibration_displays[name] = display ax_calibration_curve.grid() -ax_calibration_curve.set_title('Calibration plots (Naive Bayes)') +ax_calibration_curve.set_title("Calibration plots (Naive Bayes)") # Add histogram grid_positions = [(2, 0), (2, 1), (3, 0), (3, 1)] @@ -97,8 +106,11 @@ ax = fig.add_subplot(gs[row, col]) ax.hist( - calibration_displays[name].y_prob, range=(0, 1), bins=10, label=name, - color=colors(i) + calibration_displays[name].y_prob, + range=(0, 1), + bins=10, + label=name, + color=colors(i), ) ax.set(title=name, xlabel="Mean predicted probability", ylabel="Count") @@ -128,8 +140,14 @@ import pandas as pd -from sklearn.metrics import (precision_score, recall_score, f1_score, - brier_score_loss, log_loss, roc_auc_score) +from sklearn.metrics import ( + precision_score, + recall_score, + f1_score, + brier_score_loss, + log_loss, + roc_auc_score, +) scores = defaultdict(list) for i, (clf, name) in enumerate(clf_list): @@ -204,15 +222,17 @@ def predict_proba(self, X): # %% -lr = LogisticRegression(C=1.) 
+lr = LogisticRegression(C=1.0) svc = NaivelyCalibratedLinearSVC(max_iter=10_000) -svc_isotonic = CalibratedClassifierCV(svc, cv=2, method='isotonic') -svc_sigmoid = CalibratedClassifierCV(svc, cv=2, method='sigmoid') +svc_isotonic = CalibratedClassifierCV(svc, cv=2, method="isotonic") +svc_sigmoid = CalibratedClassifierCV(svc, cv=2, method="sigmoid") -clf_list = [(lr, 'Logistic'), - (svc, 'SVC'), - (svc_isotonic, 'SVC + Isotonic'), - (svc_sigmoid, 'SVC + Sigmoid')] +clf_list = [ + (lr, "Logistic"), + (svc, "SVC"), + (svc_isotonic, "SVC + Isotonic"), + (svc_sigmoid, "SVC + Sigmoid"), +] # %% fig = plt.figure(figsize=(10, 10)) @@ -223,13 +243,18 @@ def predict_proba(self, X): for i, (clf, name) in enumerate(clf_list): clf.fit(X_train, y_train) display = CalibrationDisplay.from_estimator( - clf, X_test, y_test, n_bins=10, name=name, ax=ax_calibration_curve, - color=colors(i) + clf, + X_test, + y_test, + n_bins=10, + name=name, + ax=ax_calibration_curve, + color=colors(i), ) calibration_displays[name] = display ax_calibration_curve.grid() -ax_calibration_curve.set_title('Calibration plots (SVC)') +ax_calibration_curve.set_title("Calibration plots (SVC)") # Add histogram grid_positions = [(2, 0), (2, 1), (3, 0), (3, 1)] @@ -238,8 +263,11 @@ def predict_proba(self, X): ax = fig.add_subplot(gs[row, col]) ax.hist( - calibration_displays[name].y_prob, range=(0, 1), bins=10, label=name, - color=colors(i) + calibration_displays[name].y_prob, + range=(0, 1), + bins=10, + label=name, + color=colors(i), ) ax.set(title=name, xlabel="Mean predicted probability", ylabel="Count") diff --git a/examples/calibration/plot_calibration_multiclass.py b/examples/calibration/plot_calibration_multiclass.py index ef1a53056009d..a8713dfc1f849 100644 --- a/examples/calibration/plot_calibration_multiclass.py +++ b/examples/calibration/plot_calibration_multiclass.py @@ -34,8 +34,9 @@ class of an instance (red: class 1, green: class 2, blue: class 3). 
np.random.seed(0) -X, y = make_blobs(n_samples=2000, n_features=2, centers=3, random_state=42, - cluster_std=5.0) +X, y = make_blobs( + n_samples=2000, n_features=2, centers=3, random_state=42, cluster_std=5.0 +) X_train, y_train = X[:600], y[:600] X_valid, y_valid = X[600:1000], y[600:1000] X_train_valid, y_train_valid = X[:1000], y[:1000] @@ -82,58 +83,96 @@ class of an instance (red: class 1, green: class 2, blue: class 3). cal_clf_probs = cal_clf.predict_proba(X_test) # Plot arrows for i in range(clf_probs.shape[0]): - plt.arrow(clf_probs[i, 0], clf_probs[i, 1], - cal_clf_probs[i, 0] - clf_probs[i, 0], - cal_clf_probs[i, 1] - clf_probs[i, 1], - color=colors[y_test[i]], head_width=1e-2) + plt.arrow( + clf_probs[i, 0], + clf_probs[i, 1], + cal_clf_probs[i, 0] - clf_probs[i, 0], + cal_clf_probs[i, 1] - clf_probs[i, 1], + color=colors[y_test[i]], + head_width=1e-2, + ) # Plot perfect predictions, at each vertex -plt.plot([1.0], [0.0], 'ro', ms=20, label="Class 1") -plt.plot([0.0], [1.0], 'go', ms=20, label="Class 2") -plt.plot([0.0], [0.0], 'bo', ms=20, label="Class 3") +plt.plot([1.0], [0.0], "ro", ms=20, label="Class 1") +plt.plot([0.0], [1.0], "go", ms=20, label="Class 2") +plt.plot([0.0], [0.0], "bo", ms=20, label="Class 3") # Plot boundaries of unit simplex -plt.plot([0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], 'k', label="Simplex") +plt.plot([0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], "k", label="Simplex") # Annotate points 6 points around the simplex, and mid point inside simplex -plt.annotate(r'($\frac{1}{3}$, $\frac{1}{3}$, $\frac{1}{3}$)', - xy=(1.0/3, 1.0/3), xytext=(1.0/3, .23), xycoords='data', - arrowprops=dict(facecolor='black', shrink=0.05), - horizontalalignment='center', verticalalignment='center') -plt.plot([1.0/3], [1.0/3], 'ko', ms=5) -plt.annotate(r'($\frac{1}{2}$, $0$, $\frac{1}{2}$)', - xy=(.5, .0), xytext=(.5, .1), xycoords='data', - arrowprops=dict(facecolor='black', shrink=0.05), - horizontalalignment='center', verticalalignment='center') 
-plt.annotate(r'($0$, $\frac{1}{2}$, $\frac{1}{2}$)', - xy=(.0, .5), xytext=(.1, .5), xycoords='data', - arrowprops=dict(facecolor='black', shrink=0.05), - horizontalalignment='center', verticalalignment='center') -plt.annotate(r'($\frac{1}{2}$, $\frac{1}{2}$, $0$)', - xy=(.5, .5), xytext=(.6, .6), xycoords='data', - arrowprops=dict(facecolor='black', shrink=0.05), - horizontalalignment='center', verticalalignment='center') -plt.annotate(r'($0$, $0$, $1$)', - xy=(0, 0), xytext=(.1, .1), xycoords='data', - arrowprops=dict(facecolor='black', shrink=0.05), - horizontalalignment='center', verticalalignment='center') -plt.annotate(r'($1$, $0$, $0$)', - xy=(1, 0), xytext=(1, .1), xycoords='data', - arrowprops=dict(facecolor='black', shrink=0.05), - horizontalalignment='center', verticalalignment='center') -plt.annotate(r'($0$, $1$, $0$)', - xy=(0, 1), xytext=(.1, 1), xycoords='data', - arrowprops=dict(facecolor='black', shrink=0.05), - horizontalalignment='center', verticalalignment='center') +plt.annotate( + r"($\frac{1}{3}$, $\frac{1}{3}$, $\frac{1}{3}$)", + xy=(1.0 / 3, 1.0 / 3), + xytext=(1.0 / 3, 0.23), + xycoords="data", + arrowprops=dict(facecolor="black", shrink=0.05), + horizontalalignment="center", + verticalalignment="center", +) +plt.plot([1.0 / 3], [1.0 / 3], "ko", ms=5) +plt.annotate( + r"($\frac{1}{2}$, $0$, $\frac{1}{2}$)", + xy=(0.5, 0.0), + xytext=(0.5, 0.1), + xycoords="data", + arrowprops=dict(facecolor="black", shrink=0.05), + horizontalalignment="center", + verticalalignment="center", +) +plt.annotate( + r"($0$, $\frac{1}{2}$, $\frac{1}{2}$)", + xy=(0.0, 0.5), + xytext=(0.1, 0.5), + xycoords="data", + arrowprops=dict(facecolor="black", shrink=0.05), + horizontalalignment="center", + verticalalignment="center", +) +plt.annotate( + r"($\frac{1}{2}$, $\frac{1}{2}$, $0$)", + xy=(0.5, 0.5), + xytext=(0.6, 0.6), + xycoords="data", + arrowprops=dict(facecolor="black", shrink=0.05), + horizontalalignment="center", + verticalalignment="center", +) 
+plt.annotate( + r"($0$, $0$, $1$)", + xy=(0, 0), + xytext=(0.1, 0.1), + xycoords="data", + arrowprops=dict(facecolor="black", shrink=0.05), + horizontalalignment="center", + verticalalignment="center", +) +plt.annotate( + r"($1$, $0$, $0$)", + xy=(1, 0), + xytext=(1, 0.1), + xycoords="data", + arrowprops=dict(facecolor="black", shrink=0.05), + horizontalalignment="center", + verticalalignment="center", +) +plt.annotate( + r"($0$, $1$, $0$)", + xy=(0, 1), + xytext=(0.1, 1), + xycoords="data", + arrowprops=dict(facecolor="black", shrink=0.05), + horizontalalignment="center", + verticalalignment="center", +) # Add grid plt.grid(False) for x in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]: - plt.plot([0, x], [x, 0], 'k', alpha=0.2) - plt.plot([0, 0 + (1-x)/2], [x, x + (1-x)/2], 'k', alpha=0.2) - plt.plot([x, x + (1-x)/2], [0, 0 + (1-x)/2], 'k', alpha=0.2) + plt.plot([0, x], [x, 0], "k", alpha=0.2) + plt.plot([0, 0 + (1 - x) / 2], [x, x + (1 - x) / 2], "k", alpha=0.2) + plt.plot([x, x + (1 - x) / 2], [0, 0 + (1 - x) / 2], "k", alpha=0.2) -plt.title("Change of predicted probabilities on test samples " - "after sigmoid calibration") +plt.title("Change of predicted probabilities on test samples after sigmoid calibration") plt.xlabel("Probability class 1") plt.ylabel("Probability class 2") plt.xlim(-0.05, 1.05) @@ -193,9 +232,12 @@ class of an instance (red: class 1, green: class 2, blue: class 3). # Use the three class-wise calibrators to compute calibrated probabilities calibrated_classifier = cal_clf.calibrated_classifiers_[0] -prediction = np.vstack([calibrator.predict(this_p) - for calibrator, this_p in - zip(calibrated_classifier.calibrators, p.T)]).T +prediction = np.vstack( + [ + calibrator.predict(this_p) + for calibrator, this_p in zip(calibrated_classifier.calibrators, p.T) + ] +).T # Re-normalize the calibrated predictions to make sure they stay inside the # simplex. 
This same renormalization step is performed internally by the @@ -204,18 +246,23 @@ class of an instance (red: class 1, green: class 2, blue: class 3). # Plot changes in predicted probabilities induced by the calibrators for i in range(prediction.shape[0]): - plt.arrow(p[i, 0], p[i, 1], - prediction[i, 0] - p[i, 0], prediction[i, 1] - p[i, 1], - head_width=1e-2, color=colors[np.argmax(p[i])]) + plt.arrow( + p[i, 0], + p[i, 1], + prediction[i, 0] - p[i, 0], + prediction[i, 1] - p[i, 1], + head_width=1e-2, + color=colors[np.argmax(p[i])], + ) # Plot the boundaries of the unit simplex -plt.plot([0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], 'k', label="Simplex") +plt.plot([0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], "k", label="Simplex") plt.grid(False) for x in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]: - plt.plot([0, x], [x, 0], 'k', alpha=0.2) - plt.plot([0, 0 + (1-x)/2], [x, x + (1-x)/2], 'k', alpha=0.2) - plt.plot([x, x + (1-x)/2], [0, 0 + (1-x)/2], 'k', alpha=0.2) + plt.plot([0, x], [x, 0], "k", alpha=0.2) + plt.plot([0, 0 + (1 - x) / 2], [x, x + (1 - x) / 2], "k", alpha=0.2) + plt.plot([x, x + (1 - x) / 2], [0, 0 + (1 - x) / 2], "k", alpha=0.2) plt.title("Learned sigmoid calibration map") plt.xlabel("Probability class 1") diff --git a/examples/calibration/plot_compare_calibration.py b/examples/calibration/plot_compare_calibration.py index 7ee4eaf4da7df..f866e45e0ba2b 100644 --- a/examples/calibration/plot_compare_calibration.py +++ b/examples/calibration/plot_compare_calibration.py @@ -32,13 +32,15 @@ from sklearn.model_selection import train_test_split X, y = make_classification( - n_samples=100_000, n_features=20, n_informative=2, n_redundant=2, - random_state=42 + n_samples=100_000, n_features=20, n_informative=2, n_redundant=2, random_state=42 ) train_samples = 100 # Samples used for training the models X_train, X_test, y_train, y_test = train_test_split( - X, y, shuffle=False, test_size=100_000 - train_samples, + X, + y, + shuffle=False, + 
test_size=100_000 - train_samples, ) # %% @@ -92,10 +94,12 @@ def predict_proba(self, X): svc = NaivelyCalibratedLinearSVC(C=1.0) rfc = RandomForestClassifier() -clf_list = [(lr, 'Logistic'), - (gnb, 'Naive Bayes'), - (svc, 'SVC'), - (rfc, 'Random forest')] +clf_list = [ + (lr, "Logistic"), + (gnb, "Naive Bayes"), + (svc, "SVC"), + (rfc, "Random forest"), +] # %% @@ -104,20 +108,25 @@ def predict_proba(self, X): fig = plt.figure(figsize=(10, 10)) gs = GridSpec(4, 2) -colors = plt.cm.get_cmap('Dark2') +colors = plt.cm.get_cmap("Dark2") ax_calibration_curve = fig.add_subplot(gs[:2, :2]) calibration_displays = {} for i, (clf, name) in enumerate(clf_list): clf.fit(X_train, y_train) display = CalibrationDisplay.from_estimator( - clf, X_test, y_test, n_bins=10, name=name, ax=ax_calibration_curve, - color=colors(i) + clf, + X_test, + y_test, + n_bins=10, + name=name, + ax=ax_calibration_curve, + color=colors(i), ) calibration_displays[name] = display ax_calibration_curve.grid() -ax_calibration_curve.set_title('Calibration plots') +ax_calibration_curve.set_title("Calibration plots") # Add histogram grid_positions = [(2, 0), (2, 1), (3, 0), (3, 1)] @@ -126,8 +135,11 @@ def predict_proba(self, X): ax = fig.add_subplot(gs[row, col]) ax.hist( - calibration_displays[name].y_prob, range=(0, 1), bins=10, label=name, - color=colors(i) + calibration_displays[name].y_prob, + range=(0, 1), + bins=10, + label=name, + color=colors(i), ) ax.set(title=name, xlabel="Mean predicted probability", ylabel="Count") diff --git a/examples/classification/plot_classification_probability.py b/examples/classification/plot_classification_probability.py index ea4df9e6fb583..f88c749cb2b40 100644 --- a/examples/classification/plot_classification_probability.py +++ b/examples/classification/plot_classification_probability.py @@ -41,27 +41,23 @@ # Create different classifiers. 
classifiers = { - 'L1 logistic': LogisticRegression(C=C, penalty='l1', - solver='saga', - multi_class='multinomial', - max_iter=10000), - 'L2 logistic (Multinomial)': LogisticRegression(C=C, penalty='l2', - solver='saga', - multi_class='multinomial', - max_iter=10000), - 'L2 logistic (OvR)': LogisticRegression(C=C, penalty='l2', - solver='saga', - multi_class='ovr', - max_iter=10000), - 'Linear SVC': SVC(kernel='linear', C=C, probability=True, - random_state=0), - 'GPC': GaussianProcessClassifier(kernel) + "L1 logistic": LogisticRegression( + C=C, penalty="l1", solver="saga", multi_class="multinomial", max_iter=10000 + ), + "L2 logistic (Multinomial)": LogisticRegression( + C=C, penalty="l2", solver="saga", multi_class="multinomial", max_iter=10000 + ), + "L2 logistic (OvR)": LogisticRegression( + C=C, penalty="l2", solver="saga", multi_class="ovr", max_iter=10000 + ), + "Linear SVC": SVC(kernel="linear", C=C, probability=True, random_state=0), + "GPC": GaussianProcessClassifier(kernel), } n_classifiers = len(classifiers) plt.figure(figsize=(3 * 2, n_classifiers * 2)) -plt.subplots_adjust(bottom=.2, top=.95) +plt.subplots_adjust(bottom=0.2, top=0.95) xx = np.linspace(3, 9, 100) yy = np.linspace(1, 5, 100).T @@ -83,16 +79,17 @@ plt.title("Class %d" % k) if k == 0: plt.ylabel(name) - imshow_handle = plt.imshow(probas[:, k].reshape((100, 100)), - extent=(3, 9, 1, 5), origin='lower') + imshow_handle = plt.imshow( + probas[:, k].reshape((100, 100)), extent=(3, 9, 1, 5), origin="lower" + ) plt.xticks(()) plt.yticks(()) - idx = (y_pred == k) + idx = y_pred == k if idx.any(): - plt.scatter(X[idx, 0], X[idx, 1], marker='o', c='w', edgecolor='k') + plt.scatter(X[idx, 0], X[idx, 1], marker="o", c="w", edgecolor="k") ax = plt.axes([0.15, 0.04, 0.7, 0.05]) plt.title("Probability") -plt.colorbar(imshow_handle, cax=ax, orientation='horizontal') +plt.colorbar(imshow_handle, cax=ax, orientation="horizontal") plt.show() diff --git 
a/examples/classification/plot_classifier_comparison.py b/examples/classification/plot_classifier_comparison.py index 83019e821dae5..1b38b7427b6c6 100644 --- a/examples/classification/plot_classifier_comparison.py +++ b/examples/classification/plot_classifier_comparison.py @@ -44,11 +44,20 @@ from sklearn.naive_bayes import GaussianNB from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis -h = .02 # step size in the mesh - -names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process", - "Decision Tree", "Random Forest", "Neural Net", "AdaBoost", - "Naive Bayes", "QDA"] +h = 0.02 # step size in the mesh + +names = [ + "Nearest Neighbors", + "Linear SVM", + "RBF SVM", + "Gaussian Process", + "Decision Tree", + "Random Forest", + "Neural Net", + "AdaBoost", + "Naive Bayes", + "QDA", +] classifiers = [ KNeighborsClassifier(3), @@ -60,18 +69,21 @@ MLPClassifier(alpha=1, max_iter=1000), AdaBoostClassifier(), GaussianNB(), - QuadraticDiscriminantAnalysis()] + QuadraticDiscriminantAnalysis(), +] -X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, - random_state=1, n_clusters_per_class=1) +X, y = make_classification( + n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1 +) rng = np.random.RandomState(2) X += 2 * rng.uniform(size=X.shape) linearly_separable = (X, y) -datasets = [make_moons(noise=0.3, random_state=0), - make_circles(noise=0.2, factor=0.5, random_state=1), - linearly_separable - ] +datasets = [ + make_moons(noise=0.3, random_state=0), + make_circles(noise=0.2, factor=0.5, random_state=1), + linearly_separable, +] figure = plt.figure(figsize=(27, 9)) i = 1 @@ -80,26 +92,26 @@ # preprocess dataset, split into training and test part X, y = ds X = StandardScaler().fit_transform(X) - X_train, X_test, y_train, y_test = \ - train_test_split(X, y, test_size=.4, random_state=42) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.4, random_state=42 + ) - x_min, 
x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 - y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 - xx, yy = np.meshgrid(np.arange(x_min, x_max, h), - np.arange(y_min, y_max, h)) + x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5 + y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5 + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) # just plot the dataset first cm = plt.cm.RdBu - cm_bright = ListedColormap(['#FF0000', '#0000FF']) + cm_bright = ListedColormap(["#FF0000", "#0000FF"]) ax = plt.subplot(len(datasets), len(classifiers) + 1, i) if ds_cnt == 0: ax.set_title("Input data") # Plot the training points - ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, - edgecolors='k') + ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k") # Plot the testing points - ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, - edgecolors='k') + ax.scatter( + X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors="k" + ) ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max()) ax.set_xticks(()) @@ -121,14 +133,21 @@ # Put the result into a color plot Z = Z.reshape(xx.shape) - ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) + ax.contourf(xx, yy, Z, cmap=cm, alpha=0.8) # Plot the training points - ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, - edgecolors='k') + ax.scatter( + X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k" + ) # Plot the testing points - ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, - edgecolors='k', alpha=0.6) + ax.scatter( + X_test[:, 0], + X_test[:, 1], + c=y_test, + cmap=cm_bright, + edgecolors="k", + alpha=0.6, + ) ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max()) @@ -136,8 +155,13 @@ ax.set_yticks(()) if ds_cnt == 0: ax.set_title(name) - ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'), - size=15, horizontalalignment='right') + 
ax.text( + xx.max() - 0.3, + yy.min() + 0.3, + ("%.2f" % score).lstrip("0"), + size=15, + horizontalalignment="right", + ) i += 1 plt.tight_layout() diff --git a/examples/classification/plot_digits_classification.py b/examples/classification/plot_digits_classification.py index 8cb61df2c9736..1c4f150c37374 100644 --- a/examples/classification/plot_digits_classification.py +++ b/examples/classification/plot_digits_classification.py @@ -38,8 +38,8 @@ _, axes = plt.subplots(nrows=1, ncols=4, figsize=(10, 3)) for ax, image, label in zip(axes, digits.images, digits.target): ax.set_axis_off() - ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') - ax.set_title('Training: %i' % label) + ax.imshow(image, cmap=plt.cm.gray_r, interpolation="nearest") + ax.set_title("Training: %i" % label) ############################################################################### # Classification @@ -65,7 +65,8 @@ # Split data into 50% train and 50% test subsets X_train, X_test, y_train, y_test = train_test_split( - data, digits.target, test_size=0.5, shuffle=False) + data, digits.target, test_size=0.5, shuffle=False +) # Learn the digits on the train subset clf.fit(X_train, y_train) @@ -81,15 +82,17 @@ for ax, image, prediction in zip(axes, X_test, predicted): ax.set_axis_off() image = image.reshape(8, 8) - ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') - ax.set_title(f'Prediction: {prediction}') + ax.imshow(image, cmap=plt.cm.gray_r, interpolation="nearest") + ax.set_title(f"Prediction: {prediction}") ############################################################################### # :func:`~sklearn.metrics.classification_report` builds a text report showing # the main classification metrics. 
-print(f"Classification report for classifier {clf}:\n" - f"{metrics.classification_report(y_test, predicted)}\n") +print( + f"Classification report for classifier {clf}:\n" + f"{metrics.classification_report(y_test, predicted)}\n" +) ############################################################################### # We can also plot a :ref:`confusion matrix ` of the diff --git a/examples/classification/plot_lda.py b/examples/classification/plot_lda.py index ad16e7b0d2efa..856f2e206e9c9 100644 --- a/examples/classification/plot_lda.py +++ b/examples/classification/plot_lda.py @@ -45,13 +45,12 @@ def generate_data(n_samples, n_features): for _ in range(n_averages): X, y = generate_data(n_train, n_features) - clf1 = LinearDiscriminantAnalysis(solver='lsqr', - shrinkage='auto').fit(X, y) - clf2 = LinearDiscriminantAnalysis(solver='lsqr', - shrinkage=None).fit(X, y) + clf1 = LinearDiscriminantAnalysis(solver="lsqr", shrinkage="auto").fit(X, y) + clf2 = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=None).fit(X, y) oa = OAS(store_precision=False, assume_centered=False) - clf3 = LinearDiscriminantAnalysis(solver='lsqr', - covariance_estimator=oa).fit(X, y) + clf3 = LinearDiscriminantAnalysis(solver="lsqr", covariance_estimator=oa).fit( + X, y + ) X, y = generate_data(n_test, n_features) score_clf1 += clf1.score(X, y) @@ -64,18 +63,37 @@ def generate_data(n_samples, n_features): features_samples_ratio = np.array(n_features_range) / n_train -plt.plot(features_samples_ratio, acc_clf1, linewidth=2, - label="Linear Discriminant Analysis with Ledoit Wolf", color='navy') -plt.plot(features_samples_ratio, acc_clf2, linewidth=2, - label="Linear Discriminant Analysis", color='gold') -plt.plot(features_samples_ratio, acc_clf3, linewidth=2, - label="Linear Discriminant Analysis with OAS", color='red') - -plt.xlabel('n_features / n_samples') -plt.ylabel('Classification accuracy') - -plt.legend(loc=3, prop={'size': 12}) -plt.suptitle('Linear Discriminant Analysis vs. 
' + '\n' - + 'Shrinkage Linear Discriminant Analysis vs. ' + '\n' - + 'OAS Linear Discriminant Analysis (1 discriminative feature)') +plt.plot( + features_samples_ratio, + acc_clf1, + linewidth=2, + label="Linear Discriminant Analysis with Ledoit Wolf", + color="navy", +) +plt.plot( + features_samples_ratio, + acc_clf2, + linewidth=2, + label="Linear Discriminant Analysis", + color="gold", +) +plt.plot( + features_samples_ratio, + acc_clf3, + linewidth=2, + label="Linear Discriminant Analysis with OAS", + color="red", +) + +plt.xlabel("n_features / n_samples") +plt.ylabel("Classification accuracy") + +plt.legend(loc=3, prop={"size": 12}) +plt.suptitle( + "Linear Discriminant Analysis vs. " + + "\n" + + "Shrinkage Linear Discriminant Analysis vs. " + + "\n" + + "OAS Linear Discriminant Analysis (1 discriminative feature)" +) plt.show() diff --git a/examples/classification/plot_lda_qda.py b/examples/classification/plot_lda_qda.py index d02adb03e6028..951f6b8c4cd29 100644 --- a/examples/classification/plot_lda_qda.py +++ b/examples/classification/plot_lda_qda.py @@ -23,33 +23,40 @@ class has its own standard deviation with QDA. 
# ############################################################################# # Colormap cmap = colors.LinearSegmentedColormap( - 'red_blue_classes', - {'red': [(0, 1, 1), (1, 0.7, 0.7)], - 'green': [(0, 0.7, 0.7), (1, 0.7, 0.7)], - 'blue': [(0, 0.7, 0.7), (1, 1, 1)]}) + "red_blue_classes", + { + "red": [(0, 1, 1), (1, 0.7, 0.7)], + "green": [(0, 0.7, 0.7), (1, 0.7, 0.7)], + "blue": [(0, 0.7, 0.7), (1, 1, 1)], + }, +) plt.cm.register_cmap(cmap=cmap) # ############################################################################# # Generate datasets def dataset_fixed_cov(): - '''Generate 2 Gaussians samples with the same covariance matrix''' + """Generate 2 Gaussians samples with the same covariance matrix""" n, dim = 300, 2 np.random.seed(0) - C = np.array([[0., -0.23], [0.83, .23]]) - X = np.r_[np.dot(np.random.randn(n, dim), C), - np.dot(np.random.randn(n, dim), C) + np.array([1, 1])] + C = np.array([[0.0, -0.23], [0.83, 0.23]]) + X = np.r_[ + np.dot(np.random.randn(n, dim), C), + np.dot(np.random.randn(n, dim), C) + np.array([1, 1]), + ] y = np.hstack((np.zeros(n), np.ones(n))) return X, y def dataset_cov(): - '''Generate 2 Gaussians samples with different covariance matrices''' + """Generate 2 Gaussians samples with different covariance matrices""" n, dim = 300, 2 np.random.seed(0) - C = np.array([[0., -1.], [2.5, .7]]) * 2. 
- X = np.r_[np.dot(np.random.randn(n, dim), C), - np.dot(np.random.randn(n, dim), C.T) + np.array([1, 4])] + C = np.array([[0.0, -1.0], [2.5, 0.7]]) * 2.0 + X = np.r_[ + np.dot(np.random.randn(n, dim), C), + np.dot(np.random.randn(n, dim), C.T) + np.array([1, 4]), + ] y = np.hstack((np.zeros(n), np.ones(n))) return X, y @@ -59,46 +66,58 @@ def dataset_cov(): def plot_data(lda, X, y, y_pred, fig_index): splot = plt.subplot(2, 2, fig_index) if fig_index == 1: - plt.title('Linear Discriminant Analysis') - plt.ylabel('Data with\n fixed covariance') + plt.title("Linear Discriminant Analysis") + plt.ylabel("Data with\n fixed covariance") elif fig_index == 2: - plt.title('Quadratic Discriminant Analysis') + plt.title("Quadratic Discriminant Analysis") elif fig_index == 3: - plt.ylabel('Data with\n varying covariances') + plt.ylabel("Data with\n varying covariances") - tp = (y == y_pred) # True Positive + tp = y == y_pred # True Positive tp0, tp1 = tp[y == 0], tp[y == 1] X0, X1 = X[y == 0], X[y == 1] X0_tp, X0_fp = X0[tp0], X0[~tp0] X1_tp, X1_fp = X1[tp1], X1[~tp1] # class 0: dots - plt.scatter(X0_tp[:, 0], X0_tp[:, 1], marker='.', color='red') - plt.scatter(X0_fp[:, 0], X0_fp[:, 1], marker='x', - s=20, color='#990000') # dark red + plt.scatter(X0_tp[:, 0], X0_tp[:, 1], marker=".", color="red") + plt.scatter(X0_fp[:, 0], X0_fp[:, 1], marker="x", s=20, color="#990000") # dark red # class 1: dots - plt.scatter(X1_tp[:, 0], X1_tp[:, 1], marker='.', color='blue') - plt.scatter(X1_fp[:, 0], X1_fp[:, 1], marker='x', - s=20, color='#000099') # dark blue + plt.scatter(X1_tp[:, 0], X1_tp[:, 1], marker=".", color="blue") + plt.scatter( + X1_fp[:, 0], X1_fp[:, 1], marker="x", s=20, color="#000099" + ) # dark blue # class 0 and 1 : areas nx, ny = 200, 100 x_min, x_max = plt.xlim() y_min, y_max = plt.ylim() - xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx), - np.linspace(y_min, y_max, ny)) + xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx), np.linspace(y_min, y_max, ny)) Z = 
lda.predict_proba(np.c_[xx.ravel(), yy.ravel()]) Z = Z[:, 1].reshape(xx.shape) - plt.pcolormesh(xx, yy, Z, cmap='red_blue_classes', - norm=colors.Normalize(0., 1.), zorder=0) - plt.contour(xx, yy, Z, [0.5], linewidths=2., colors='white') + plt.pcolormesh( + xx, yy, Z, cmap="red_blue_classes", norm=colors.Normalize(0.0, 1.0), zorder=0 + ) + plt.contour(xx, yy, Z, [0.5], linewidths=2.0, colors="white") # means - plt.plot(lda.means_[0][0], lda.means_[0][1], - '*', color='yellow', markersize=15, markeredgecolor='grey') - plt.plot(lda.means_[1][0], lda.means_[1][1], - '*', color='yellow', markersize=15, markeredgecolor='grey') + plt.plot( + lda.means_[0][0], + lda.means_[0][1], + "*", + color="yellow", + markersize=15, + markeredgecolor="grey", + ) + plt.plot( + lda.means_[1][0], + lda.means_[1][1], + "*", + color="yellow", + markersize=15, + markeredgecolor="grey", + ) return splot @@ -109,9 +128,15 @@ def plot_ellipse(splot, mean, cov, color): angle = np.arctan(u[1] / u[0]) angle = 180 * angle / np.pi # convert to degrees # filled Gaussian at 2 standard deviation - ell = mpl.patches.Ellipse(mean, 2 * v[0] ** 0.5, 2 * v[1] ** 0.5, - 180 + angle, facecolor=color, - edgecolor='black', linewidth=2) + ell = mpl.patches.Ellipse( + mean, + 2 * v[0] ** 0.5, + 2 * v[1] ** 0.5, + 180 + angle, + facecolor=color, + edgecolor="black", + linewidth=2, + ) ell.set_clip_box(splot.bbox) ell.set_alpha(0.2) splot.add_artist(ell) @@ -120,32 +145,35 @@ def plot_ellipse(splot, mean, cov, color): def plot_lda_cov(lda, splot): - plot_ellipse(splot, lda.means_[0], lda.covariance_, 'red') - plot_ellipse(splot, lda.means_[1], lda.covariance_, 'blue') + plot_ellipse(splot, lda.means_[0], lda.covariance_, "red") + plot_ellipse(splot, lda.means_[1], lda.covariance_, "blue") def plot_qda_cov(qda, splot): - plot_ellipse(splot, qda.means_[0], qda.covariance_[0], 'red') - plot_ellipse(splot, qda.means_[1], qda.covariance_[1], 'blue') + plot_ellipse(splot, qda.means_[0], qda.covariance_[0], "red") + 
plot_ellipse(splot, qda.means_[1], qda.covariance_[1], "blue") -plt.figure(figsize=(10, 8), facecolor='white') -plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant Analysis', - y=0.98, fontsize=15) +plt.figure(figsize=(10, 8), facecolor="white") +plt.suptitle( + "Linear Discriminant Analysis vs Quadratic Discriminant Analysis", + y=0.98, + fontsize=15, +) for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]): # Linear Discriminant Analysis lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True) y_pred = lda.fit(X, y).predict(X) splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1) plot_lda_cov(lda, splot) - plt.axis('tight') + plt.axis("tight") # Quadratic Discriminant Analysis qda = QuadraticDiscriminantAnalysis(store_covariance=True) y_pred = qda.fit(X, y).predict(X) splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2) plot_qda_cov(qda, splot) - plt.axis('tight') + plt.axis("tight") plt.tight_layout() plt.subplots_adjust(top=0.92) plt.show() diff --git a/examples/cluster/plot_adjusted_for_chance_measures.py b/examples/cluster/plot_adjusted_for_chance_measures.py index c84266378bb3f..0b77144ef5256 100644 --- a/examples/cluster/plot_adjusted_for_chance_measures.py +++ b/examples/cluster/plot_adjusted_for_chance_measures.py @@ -31,8 +31,9 @@ from sklearn import metrics -def uniform_labelings_scores(score_func, n_samples, n_clusters_range, - fixed_n_classes=None, n_runs=5, seed=42): +def uniform_labelings_scores( + score_func, n_samples, n_clusters_range, fixed_n_classes=None, n_runs=5, seed=42 +): """Compute score for 2 random uniform cluster labelings. 
Both random labelings have the same number of clusters for each value @@ -77,20 +78,24 @@ def ami_score(U, V): plots = [] names = [] for score_func in score_funcs: - print("Computing %s for %d values of n_clusters and n_samples=%d" - % (score_func.__name__, len(n_clusters_range), n_samples)) + print( + "Computing %s for %d values of n_clusters and n_samples=%d" + % (score_func.__name__, len(n_clusters_range), n_samples) + ) t0 = time() scores = uniform_labelings_scores(score_func, n_samples, n_clusters_range) print("done in %0.3fs" % (time() - t0)) - plots.append(plt.errorbar( - n_clusters_range, np.median(scores, axis=1), scores.std(axis=1))[0]) + plots.append( + plt.errorbar(n_clusters_range, np.median(scores, axis=1), scores.std(axis=1))[0] + ) names.append(score_func.__name__) -plt.title("Clustering measures for 2 random uniform labelings\n" - "with equal number of clusters") -plt.xlabel('Number of clusters (Number of samples is fixed to %d)' % n_samples) -plt.ylabel('Score value') +plt.title( + "Clustering measures for 2 random uniform labelings\nwith equal number of clusters" +) +plt.xlabel("Number of clusters (Number of samples is fixed to %d)" % n_samples) +plt.ylabel("Score value") plt.legend(plots, names) plt.ylim(bottom=-0.05, top=1.05) @@ -107,21 +112,27 @@ def ami_score(U, V): plots = [] names = [] for score_func in score_funcs: - print("Computing %s for %d values of n_clusters and n_samples=%d" - % (score_func.__name__, len(n_clusters_range), n_samples)) + print( + "Computing %s for %d values of n_clusters and n_samples=%d" + % (score_func.__name__, len(n_clusters_range), n_samples) + ) t0 = time() - scores = uniform_labelings_scores(score_func, n_samples, n_clusters_range, - fixed_n_classes=n_classes) + scores = uniform_labelings_scores( + score_func, n_samples, n_clusters_range, fixed_n_classes=n_classes + ) print("done in %0.3fs" % (time() - t0)) - plots.append(plt.errorbar( - n_clusters_range, scores.mean(axis=1), scores.std(axis=1))[0]) + 
plots.append( + plt.errorbar(n_clusters_range, scores.mean(axis=1), scores.std(axis=1))[0] + ) names.append(score_func.__name__) -plt.title("Clustering measures for random uniform labeling\n" - "against reference assignment with %d classes" % n_classes) -plt.xlabel('Number of clusters (Number of samples is fixed to %d)' % n_samples) -plt.ylabel('Score value') +plt.title( + "Clustering measures for random uniform labeling\n" + "against reference assignment with %d classes" % n_classes +) +plt.xlabel("Number of clusters (Number of samples is fixed to %d)" % n_samples) +plt.ylabel("Score value") plt.ylim(bottom=-0.05, top=1.05) plt.legend(plots, names) plt.show() diff --git a/examples/cluster/plot_affinity_propagation.py b/examples/cluster/plot_affinity_propagation.py index 101a60b74ec06..799d8d2d949b7 100644 --- a/examples/cluster/plot_affinity_propagation.py +++ b/examples/cluster/plot_affinity_propagation.py @@ -17,8 +17,9 @@ # ############################################################################# # Generate sample data centers = [[1, 1], [-1, -1], [1, -1]] -X, labels_true = make_blobs(n_samples=300, centers=centers, cluster_std=0.5, - random_state=0) +X, labels_true = make_blobs( + n_samples=300, centers=centers, cluster_std=0.5, random_state=0 +) # ############################################################################# # Compute Affinity Propagation @@ -28,35 +29,44 @@ n_clusters_ = len(cluster_centers_indices) -print('Estimated number of clusters: %d' % n_clusters_) +print("Estimated number of clusters: %d" % n_clusters_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) -print("Adjusted Rand Index: %0.3f" - % metrics.adjusted_rand_score(labels_true, labels)) -print("Adjusted Mutual Information: %0.3f" - % metrics.adjusted_mutual_info_score(labels_true, labels)) 
-print("Silhouette Coefficient: %0.3f" - % metrics.silhouette_score(X, labels, metric='sqeuclidean')) +print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) +print( + "Adjusted Mutual Information: %0.3f" + % metrics.adjusted_mutual_info_score(labels_true, labels) +) +print( + "Silhouette Coefficient: %0.3f" + % metrics.silhouette_score(X, labels, metric="sqeuclidean") +) # ############################################################################# # Plot result import matplotlib.pyplot as plt from itertools import cycle -plt.close('all') +plt.close("all") plt.figure(1) plt.clf() -colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') +colors = cycle("bgrcmykbgrcmykbgrcmykbgrcmyk") for k, col in zip(range(n_clusters_), colors): class_members = labels == k cluster_center = X[cluster_centers_indices[k]] - plt.plot(X[class_members, 0], X[class_members, 1], col + '.') - plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, - markeredgecolor='k', markersize=14) + plt.plot(X[class_members, 0], X[class_members, 1], col + ".") + plt.plot( + cluster_center[0], + cluster_center[1], + "o", + markerfacecolor=col, + markeredgecolor="k", + markersize=14, + ) for x in X[class_members]: plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col) -plt.title('Estimated number of clusters: %d' % n_clusters_) +plt.title("Estimated number of clusters: %d" % n_clusters_) plt.show() diff --git a/examples/cluster/plot_agglomerative_clustering.py b/examples/cluster/plot_agglomerative_clustering.py index 5b846591bee70..bc3adbf376bb6 100644 --- a/examples/cluster/plot_agglomerative_clustering.py +++ b/examples/cluster/plot_agglomerative_clustering.py @@ -40,7 +40,7 @@ X = np.concatenate((x, y)) -X += .7 * np.random.randn(2, n_samples) +X += 0.7 * np.random.randn(2, n_samples) X = X.T # Create a graph capturing local connectivity. 
Larger number of neighbors @@ -53,28 +53,28 @@ for connectivity in (None, knn_graph): for n_clusters in (30, 3): plt.figure(figsize=(10, 4)) - for index, linkage in enumerate(('average', - 'complete', - 'ward', - 'single')): + for index, linkage in enumerate(("average", "complete", "ward", "single")): plt.subplot(1, 4, index + 1) - model = AgglomerativeClustering(linkage=linkage, - connectivity=connectivity, - n_clusters=n_clusters) + model = AgglomerativeClustering( + linkage=linkage, connectivity=connectivity, n_clusters=n_clusters + ) t0 = time.time() model.fit(X) elapsed_time = time.time() - t0 - plt.scatter(X[:, 0], X[:, 1], c=model.labels_, - cmap=plt.cm.nipy_spectral) - plt.title('linkage=%s\n(time %.2fs)' % (linkage, elapsed_time), - fontdict=dict(verticalalignment='top')) - plt.axis('equal') - plt.axis('off') + plt.scatter(X[:, 0], X[:, 1], c=model.labels_, cmap=plt.cm.nipy_spectral) + plt.title( + "linkage=%s\n(time %.2fs)" % (linkage, elapsed_time), + fontdict=dict(verticalalignment="top"), + ) + plt.axis("equal") + plt.axis("off") - plt.subplots_adjust(bottom=0, top=.83, wspace=0, - left=0, right=1) - plt.suptitle('n_cluster=%i, connectivity=%r' % - (n_clusters, connectivity is not None), size=17) + plt.subplots_adjust(bottom=0, top=0.83, wspace=0, left=0, right=1) + plt.suptitle( + "n_cluster=%i, connectivity=%r" + % (n_clusters, connectivity is not None), + size=17, + ) plt.show() diff --git a/examples/cluster/plot_agglomerative_clustering_metrics.py b/examples/cluster/plot_agglomerative_clustering_metrics.py index 704cf08106055..4901403a4d54b 100644 --- a/examples/cluster/plot_agglomerative_clustering_metrics.py +++ b/examples/cluster/plot_agglomerative_clustering_metrics.py @@ -54,17 +54,21 @@ def sqr(x): X = list() y = list() -for i, (phi, a) in enumerate([(.5, .15), (.5, .6), (.3, .2)]): +for i, (phi, a) in enumerate([(0.5, 0.15), (0.5, 0.6), (0.3, 0.2)]): for _ in range(30): - phase_noise = .01 * np.random.normal() - amplitude_noise = .04 * 
np.random.normal() + phase_noise = 0.01 * np.random.normal() + amplitude_noise = 0.04 * np.random.normal() additional_noise = 1 - 2 * np.random.rand(n_features) # Make the noise sparse - additional_noise[np.abs(additional_noise) < .997] = 0 - - X.append(12 * ((a + amplitude_noise) - * (sqr(6 * (t + phi + phase_noise))) - + additional_noise)) + additional_noise[np.abs(additional_noise) < 0.997] = 0 + + X.append( + 12 + * ( + (a + amplitude_noise) * (sqr(6 * (t + phi + phase_noise))) + + additional_noise + ) + ) y.append(i) X = np.array(X) @@ -72,20 +76,19 @@ def sqr(x): n_clusters = 3 -labels = ('Waveform 1', 'Waveform 2', 'Waveform 3') +labels = ("Waveform 1", "Waveform 2", "Waveform 3") # Plot the ground-truth labelling plt.figure() plt.axes([0, 0, 1, 1]) -for l, c, n in zip(range(n_clusters), 'rgb', - labels): - lines = plt.plot(X[y == l].T, c=c, alpha=.5) +for l, c, n in zip(range(n_clusters), "rgb", labels): + lines = plt.plot(X[y == l].T, c=c, alpha=0.5) lines[0].set_label(n) -plt.legend(loc='best') +plt.legend(loc="best") -plt.axis('tight') -plt.axis('off') +plt.axis("tight") +plt.axis("off") plt.suptitle("Ground truth", size=20) @@ -95,17 +98,21 @@ def sqr(x): plt.figure(figsize=(5, 4.5)) for i in range(n_clusters): for j in range(n_clusters): - avg_dist[i, j] = pairwise_distances(X[y == i], X[y == j], - metric=metric).mean() + avg_dist[i, j] = pairwise_distances( + X[y == i], X[y == j], metric=metric + ).mean() avg_dist /= avg_dist.max() for i in range(n_clusters): for j in range(n_clusters): - plt.text(i, j, '%5.3f' % avg_dist[i, j], - verticalalignment='center', - horizontalalignment='center') - - plt.imshow(avg_dist, interpolation='nearest', cmap=plt.cm.gnuplot2, - vmin=0) + plt.text( + i, + j, + "%5.3f" % avg_dist[i, j], + verticalalignment="center", + horizontalalignment="center", + ) + + plt.imshow(avg_dist, interpolation="nearest", cmap=plt.cm.gnuplot2, vmin=0) plt.xticks(range(n_clusters), labels, rotation=45) plt.yticks(range(n_clusters), labels) 
plt.colorbar() @@ -115,15 +122,16 @@ def sqr(x): # Plot clustering results for index, metric in enumerate(["cosine", "euclidean", "cityblock"]): - model = AgglomerativeClustering(n_clusters=n_clusters, - linkage="average", affinity=metric) + model = AgglomerativeClustering( + n_clusters=n_clusters, linkage="average", affinity=metric + ) model.fit(X) plt.figure() plt.axes([0, 0, 1, 1]) - for l, c in zip(np.arange(model.n_clusters), 'rgbk'): - plt.plot(X[model.labels_ == l].T, c=c, alpha=.5) - plt.axis('tight') - plt.axis('off') + for l, c in zip(np.arange(model.n_clusters), "rgbk"): + plt.plot(X[model.labels_ == l].T, c=c, alpha=0.5) + plt.axis("tight") + plt.axis("off") plt.suptitle("AgglomerativeClustering(affinity=%s)" % metric, size=20) diff --git a/examples/cluster/plot_agglomerative_dendrogram.py b/examples/cluster/plot_agglomerative_dendrogram.py index 401d28803b946..94ede3c2451c0 100644 --- a/examples/cluster/plot_agglomerative_dendrogram.py +++ b/examples/cluster/plot_agglomerative_dendrogram.py @@ -31,8 +31,9 @@ def plot_dendrogram(model, **kwargs): current_count += counts[child_idx - n_samples] counts[i] = current_count - linkage_matrix = np.column_stack([model.children_, model.distances_, - counts]).astype(float) + linkage_matrix = np.column_stack( + [model.children_, model.distances_, counts] + ).astype(float) # Plot the corresponding dendrogram dendrogram(linkage_matrix, **kwargs) @@ -45,8 +46,8 @@ def plot_dendrogram(model, **kwargs): model = AgglomerativeClustering(distance_threshold=0, n_clusters=None) model = model.fit(X) -plt.title('Hierarchical Clustering Dendrogram') +plt.title("Hierarchical Clustering Dendrogram") # plot the top three levels of the dendrogram -plot_dendrogram(model, truncate_mode='level', p=3) +plot_dendrogram(model, truncate_mode="level", p=3) plt.xlabel("Number of points in node (or index of point if no parenthesis).") plt.show() diff --git a/examples/cluster/plot_birch_vs_minibatchkmeans.py 
b/examples/cluster/plot_birch_vs_minibatchkmeans.py index c4648ee5bd795..67c554c4469f2 100644 --- a/examples/cluster/plot_birch_vs_minibatchkmeans.py +++ b/examples/cluster/plot_birch_vs_minibatchkmeans.py @@ -33,8 +33,7 @@ xx = np.linspace(-22, 22, 10) yy = np.linspace(-22, 22, 10) xx, yy = np.meshgrid(xx, yy) -n_centres = np.hstack((np.ravel(xx)[:, np.newaxis], - np.ravel(yy)[:, np.newaxis])) +n_centres = np.hstack((np.ravel(xx)[:, np.newaxis], np.ravel(yy)[:, np.newaxis])) # Generate blobs to do a comparison between MiniBatchKMeans and BIRCH. X, y = make_blobs(n_samples=100000, centers=n_centres, random_state=0) @@ -47,16 +46,17 @@ # Compute clustering with BIRCH with and without the final clustering step # and plot. -birch_models = [Birch(threshold=1.7, n_clusters=None), - Birch(threshold=1.7, n_clusters=100)] -final_step = ['without global clustering', 'with global clustering'] +birch_models = [ + Birch(threshold=1.7, n_clusters=None), + Birch(threshold=1.7, n_clusters=100), +] +final_step = ["without global clustering", "with global clustering"] for ind, (birch_model, info) in enumerate(zip(birch_models, final_step)): t = time() birch_model.fit(X) time_ = time() - t - print("BIRCH %s as the final step took %0.2f seconds" % ( - info, (time() - t))) + print("BIRCH %s as the final step took %0.2f seconds" % (info, (time() - t))) # Plot result labels = birch_model.labels_ @@ -67,20 +67,24 @@ ax = fig.add_subplot(1, 3, ind + 1) for this_centroid, k, col in zip(centroids, range(n_clusters), colors_): mask = labels == k - ax.scatter(X[mask, 0], X[mask, 1], - c='w', edgecolor=col, marker='.', alpha=0.5) + ax.scatter(X[mask, 0], X[mask, 1], c="w", edgecolor=col, marker=".", alpha=0.5) if birch_model.n_clusters is None: - ax.scatter(this_centroid[0], this_centroid[1], marker='+', - c='k', s=25) + ax.scatter(this_centroid[0], this_centroid[1], marker="+", c="k", s=25) ax.set_ylim([-25, 25]) ax.set_xlim([-25, 25]) ax.set_autoscaley_on(False) - ax.set_title('BIRCH %s' % 
info) + ax.set_title("BIRCH %s" % info) # Compute clustering with MiniBatchKMeans. -mbk = MiniBatchKMeans(init='k-means++', n_clusters=100, batch_size=100, - n_init=10, max_no_improvement=10, verbose=0, - random_state=0) +mbk = MiniBatchKMeans( + init="k-means++", + n_clusters=100, + batch_size=100, + n_init=10, + max_no_improvement=10, + verbose=0, + random_state=0, +) t0 = time() mbk.fit(X) t_mini_batch = time() - t0 @@ -88,13 +92,10 @@ mbk_means_labels_unique = np.unique(mbk.labels_) ax = fig.add_subplot(1, 3, 3) -for this_centroid, k, col in zip(mbk.cluster_centers_, - range(n_clusters), colors_): +for this_centroid, k, col in zip(mbk.cluster_centers_, range(n_clusters), colors_): mask = mbk.labels_ == k - ax.scatter(X[mask, 0], X[mask, 1], marker='.', - c='w', edgecolor=col, alpha=0.5) - ax.scatter(this_centroid[0], this_centroid[1], marker='+', - c='k', s=25) + ax.scatter(X[mask, 0], X[mask, 1], marker=".", c="w", edgecolor=col, alpha=0.5) + ax.scatter(this_centroid[0], this_centroid[1], marker="+", c="k", s=25) ax.set_xlim([-25, 25]) ax.set_ylim([-25, 25]) ax.set_title("MiniBatchKMeans") diff --git a/examples/cluster/plot_cluster_comparison.py b/examples/cluster/plot_cluster_comparison.py index 4025d9e8b8591..43b9a7e333e45 100644 --- a/examples/cluster/plot_cluster_comparison.py +++ b/examples/cluster/plot_cluster_comparison.py @@ -42,9 +42,8 @@ # of the algorithms, but not too big to avoid too long running times # ============ n_samples = 1500 -noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5, - noise=.05) -noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05) +noisy_circles = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05) +noisy_moons = datasets.make_moons(n_samples=n_samples, noise=0.05) blobs = datasets.make_blobs(n_samples=n_samples, random_state=8) no_structure = np.random.rand(n_samples, 2), None @@ -56,40 +55,68 @@ aniso = (X_aniso, y) # blobs with varied variances -varied = 
datasets.make_blobs(n_samples=n_samples, - cluster_std=[1.0, 2.5, 0.5], - random_state=random_state) +varied = datasets.make_blobs( + n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state +) # ============ # Set up cluster parameters # ============ plt.figure(figsize=(9 * 2 + 3, 13)) -plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.95, wspace=.05, - hspace=.01) +plt.subplots_adjust( + left=0.02, right=0.98, bottom=0.001, top=0.95, wspace=0.05, hspace=0.01 +) plot_num = 1 -default_base = {'quantile': .3, - 'eps': .3, - 'damping': .9, - 'preference': -200, - 'n_neighbors': 10, - 'n_clusters': 3, - 'min_samples': 20, - 'xi': 0.05, - 'min_cluster_size': 0.1} +default_base = { + "quantile": 0.3, + "eps": 0.3, + "damping": 0.9, + "preference": -200, + "n_neighbors": 10, + "n_clusters": 3, + "min_samples": 20, + "xi": 0.05, + "min_cluster_size": 0.1, +} datasets = [ - (noisy_circles, {'damping': .77, 'preference': -240, - 'quantile': .2, 'n_clusters': 2, - 'min_samples': 20, 'xi': 0.25}), - (noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}), - (varied, {'eps': .18, 'n_neighbors': 2, - 'min_samples': 5, 'xi': 0.035, 'min_cluster_size': .2}), - (aniso, {'eps': .15, 'n_neighbors': 2, - 'min_samples': 20, 'xi': 0.1, 'min_cluster_size': .2}), + ( + noisy_circles, + { + "damping": 0.77, + "preference": -240, + "quantile": 0.2, + "n_clusters": 2, + "min_samples": 20, + "xi": 0.25, + }, + ), + (noisy_moons, {"damping": 0.75, "preference": -220, "n_clusters": 2}), + ( + varied, + { + "eps": 0.18, + "n_neighbors": 2, + "min_samples": 5, + "xi": 0.035, + "min_cluster_size": 0.2, + }, + ), + ( + aniso, + { + "eps": 0.15, + "n_neighbors": 2, + "min_samples": 20, + "xi": 0.1, + "min_cluster_size": 0.2, + }, + ), (blobs, {}), - (no_structure, {})] + (no_structure, {}), +] for i_dataset, (dataset, algo_params) in enumerate(datasets): # update parameters with dataset-specific values @@ -102,11 +129,12 @@ X = 
StandardScaler().fit_transform(X) # estimate bandwidth for mean shift - bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile']) + bandwidth = cluster.estimate_bandwidth(X, quantile=params["quantile"]) # connectivity matrix for structured Ward connectivity = kneighbors_graph( - X, n_neighbors=params['n_neighbors'], include_self=False) + X, n_neighbors=params["n_neighbors"], include_self=False + ) # make connectivity symmetric connectivity = 0.5 * (connectivity + connectivity.T) @@ -114,38 +142,46 @@ # Create cluster objects # ============ ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True) - two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters']) + two_means = cluster.MiniBatchKMeans(n_clusters=params["n_clusters"]) ward = cluster.AgglomerativeClustering( - n_clusters=params['n_clusters'], linkage='ward', - connectivity=connectivity) + n_clusters=params["n_clusters"], linkage="ward", connectivity=connectivity + ) spectral = cluster.SpectralClustering( - n_clusters=params['n_clusters'], eigen_solver='arpack', - affinity="nearest_neighbors") - dbscan = cluster.DBSCAN(eps=params['eps']) - optics = cluster.OPTICS(min_samples=params['min_samples'], - xi=params['xi'], - min_cluster_size=params['min_cluster_size']) + n_clusters=params["n_clusters"], + eigen_solver="arpack", + affinity="nearest_neighbors", + ) + dbscan = cluster.DBSCAN(eps=params["eps"]) + optics = cluster.OPTICS( + min_samples=params["min_samples"], + xi=params["xi"], + min_cluster_size=params["min_cluster_size"], + ) affinity_propagation = cluster.AffinityPropagation( - damping=params['damping'], preference=params['preference'], - random_state=0) + damping=params["damping"], preference=params["preference"], random_state=0 + ) average_linkage = cluster.AgglomerativeClustering( - linkage="average", affinity="cityblock", - n_clusters=params['n_clusters'], connectivity=connectivity) - birch = cluster.Birch(n_clusters=params['n_clusters']) + linkage="average", + 
affinity="cityblock", + n_clusters=params["n_clusters"], + connectivity=connectivity, + ) + birch = cluster.Birch(n_clusters=params["n_clusters"]) gmm = mixture.GaussianMixture( - n_components=params['n_clusters'], covariance_type='full') + n_components=params["n_clusters"], covariance_type="full" + ) clustering_algorithms = ( - ('MiniBatch\nKMeans', two_means), - ('Affinity\nPropagation', affinity_propagation), - ('MeanShift', ms), - ('Spectral\nClustering', spectral), - ('Ward', ward), - ('Agglomerative\nClustering', average_linkage), - ('DBSCAN', dbscan), - ('OPTICS', optics), - ('BIRCH', birch), - ('Gaussian\nMixture', gmm) + ("MiniBatch\nKMeans", two_means), + ("Affinity\nPropagation", affinity_propagation), + ("MeanShift", ms), + ("Spectral\nClustering", spectral), + ("Ward", ward), + ("Agglomerative\nClustering", average_linkage), + ("DBSCAN", dbscan), + ("OPTICS", optics), + ("BIRCH", birch), + ("Gaussian\nMixture", gmm), ) for name, algorithm in clustering_algorithms: @@ -155,19 +191,21 @@ with warnings.catch_warnings(): warnings.filterwarnings( "ignore", - message="the number of connected components of the " + - "connectivity matrix is [0-9]{1,2}" + - " > 1. Completing it to avoid stopping the tree early.", - category=UserWarning) + message="the number of connected components of the " + + "connectivity matrix is [0-9]{1,2}" + + " > 1. 
Completing it to avoid stopping the tree early.", + category=UserWarning, + ) warnings.filterwarnings( "ignore", - message="Graph is not fully connected, spectral embedding" + - " may not work as expected.", - category=UserWarning) + message="Graph is not fully connected, spectral embedding" + + " may not work as expected.", + category=UserWarning, + ) algorithm.fit(X) t1 = time.time() - if hasattr(algorithm, 'labels_'): + if hasattr(algorithm, "labels_"): y_pred = algorithm.labels_.astype(int) else: y_pred = algorithm.predict(X) @@ -176,10 +214,26 @@ if i_dataset == 0: plt.title(name, size=18) - colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a', - '#f781bf', '#a65628', '#984ea3', - '#999999', '#e41a1c', '#dede00']), - int(max(y_pred) + 1)))) + colors = np.array( + list( + islice( + cycle( + [ + "#377eb8", + "#ff7f00", + "#4daf4a", + "#f781bf", + "#a65628", + "#984ea3", + "#999999", + "#e41a1c", + "#dede00", + ] + ), + int(max(y_pred) + 1), + ) + ) + ) # add black color for outliers (if any) colors = np.append(colors, ["#000000"]) plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred]) @@ -188,9 +242,14 @@ plt.ylim(-2.5, 2.5) plt.xticks(()) plt.yticks(()) - plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'), - transform=plt.gca().transAxes, size=15, - horizontalalignment='right') + plt.text( + 0.99, + 0.01, + ("%.2fs" % (t1 - t0)).lstrip("0"), + transform=plt.gca().transAxes, + size=15, + horizontalalignment="right", + ) plot_num += 1 plt.show() diff --git a/examples/cluster/plot_cluster_iris.py b/examples/cluster/plot_cluster_iris.py index f20d1a2374008..2e2844cf3fcd3 100755 --- a/examples/cluster/plot_cluster_iris.py +++ b/examples/cluster/plot_cluster_iris.py @@ -25,6 +25,7 @@ import numpy as np import matplotlib.pyplot as plt + # Though the following import is not directly being used, it is required # for 3D projection to work from mpl_toolkits.mplot3d import Axes3D @@ -38,55 +39,56 @@ X = iris.data y = iris.target -estimators = 
[('k_means_iris_8', KMeans(n_clusters=8)), - ('k_means_iris_3', KMeans(n_clusters=3)), - ('k_means_iris_bad_init', KMeans(n_clusters=3, n_init=1, - init='random'))] +estimators = [ + ("k_means_iris_8", KMeans(n_clusters=8)), + ("k_means_iris_3", KMeans(n_clusters=3)), + ("k_means_iris_bad_init", KMeans(n_clusters=3, n_init=1, init="random")), +] fignum = 1 -titles = ['8 clusters', '3 clusters', '3 clusters, bad initialization'] +titles = ["8 clusters", "3 clusters", "3 clusters, bad initialization"] for name, est in estimators: fig = plt.figure(fignum, figsize=(4, 3)) - ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134) + ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134) est.fit(X) labels = est.labels_ - ax.scatter(X[:, 3], X[:, 0], X[:, 2], - c=labels.astype(float), edgecolor='k') + ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float), edgecolor="k") ax.w_xaxis.set_ticklabels([]) ax.w_yaxis.set_ticklabels([]) ax.w_zaxis.set_ticklabels([]) - ax.set_xlabel('Petal width') - ax.set_ylabel('Sepal length') - ax.set_zlabel('Petal length') + ax.set_xlabel("Petal width") + ax.set_ylabel("Sepal length") + ax.set_zlabel("Petal length") ax.set_title(titles[fignum - 1]) ax.dist = 12 fignum = fignum + 1 # Plot the ground truth fig = plt.figure(fignum, figsize=(4, 3)) -ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134) - -for name, label in [('Setosa', 0), - ('Versicolour', 1), - ('Virginica', 2)]: - ax.text3D(X[y == label, 3].mean(), - X[y == label, 0].mean(), - X[y == label, 2].mean() + 2, name, - horizontalalignment='center', - bbox=dict(alpha=.2, edgecolor='w', facecolor='w')) +ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134) + +for name, label in [("Setosa", 0), ("Versicolour", 1), ("Virginica", 2)]: + ax.text3D( + X[y == label, 3].mean(), + X[y == label, 0].mean(), + X[y == label, 2].mean() + 2, + name, + horizontalalignment="center", + bbox=dict(alpha=0.2, edgecolor="w", facecolor="w"), + ) # Reorder the labels to have colors matching 
the cluster results y = np.choose(y, [1, 2, 0]).astype(float) -ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor='k') +ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor="k") ax.w_xaxis.set_ticklabels([]) ax.w_yaxis.set_ticklabels([]) ax.w_zaxis.set_ticklabels([]) -ax.set_xlabel('Petal width') -ax.set_ylabel('Sepal length') -ax.set_zlabel('Petal length') -ax.set_title('Ground Truth') +ax.set_xlabel("Petal width") +ax.set_ylabel("Sepal length") +ax.set_zlabel("Petal length") +ax.set_title("Ground Truth") ax.dist = 12 fig.show() diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index 9fb9b11be2753..09cd8974e3eab 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -36,8 +36,8 @@ from sklearn.utils.fixes import parse_version # these were introduced in skimage-0.14 -if parse_version(skimage.__version__) >= parse_version('0.14'): - rescale_params = {'anti_aliasing': False, 'multichannel': False} +if parse_version(skimage.__version__) >= parse_version("0.14"): + rescale_params = {"anti_aliasing": False, "multichannel": False} else: rescale_params = {} @@ -48,8 +48,7 @@ # Applying a Gaussian filter for smoothing prior to down-scaling # reduces aliasing artifacts. smoothened_coins = gaussian_filter(orig_coins, sigma=2) -rescaled_coins = rescale(smoothened_coins, 0.2, mode="reflect", - **rescale_params) +rescaled_coins = rescale(smoothened_coins, 0.2, mode="reflect", **rescale_params) # Convert the image into a graph with the value of the gradient on the # edges. 
@@ -69,21 +68,21 @@ # %% # Visualize the resulting regions -for assign_labels in ('kmeans', 'discretize'): +for assign_labels in ("kmeans", "discretize"): t0 = time.time() - labels = spectral_clustering(graph, n_clusters=N_REGIONS, - assign_labels=assign_labels, random_state=42) + labels = spectral_clustering( + graph, n_clusters=N_REGIONS, assign_labels=assign_labels, random_state=42 + ) t1 = time.time() labels = labels.reshape(rescaled_coins.shape) plt.figure(figsize=(5, 5)) plt.imshow(rescaled_coins, cmap=plt.cm.gray) for l in range(N_REGIONS): - plt.contour(labels == l, - colors=[plt.cm.nipy_spectral(l / float(N_REGIONS))]) + plt.contour(labels == l, colors=[plt.cm.nipy_spectral(l / float(N_REGIONS))]) plt.xticks(()) plt.yticks(()) - title = 'Spectral clustering: %s, %.2fs' % (assign_labels, (t1 - t0)) + title = "Spectral clustering: %s, %.2fs" % (assign_labels, (t1 - t0)) print(title) plt.title(title) plt.show() diff --git a/examples/cluster/plot_coin_ward_segmentation.py b/examples/cluster/plot_coin_ward_segmentation.py index 218e6fb4c8d86..b674700fb3726 100644 --- a/examples/cluster/plot_coin_ward_segmentation.py +++ b/examples/cluster/plot_coin_ward_segmentation.py @@ -30,8 +30,8 @@ from sklearn.utils.fixes import parse_version # these were introduced in skimage-0.14 -if parse_version(skimage.__version__) >= parse_version('0.14'): - rescale_params = {'anti_aliasing': False, 'multichannel': False} +if parse_version(skimage.__version__) >= parse_version("0.14"): + rescale_params = {"anti_aliasing": False, "multichannel": False} else: rescale_params = {} @@ -43,8 +43,7 @@ # Applying a Gaussian filter for smoothing prior to down-scaling # reduces aliasing artifacts. 
smoothened_coins = gaussian_filter(orig_coins, sigma=2) -rescaled_coins = rescale(smoothened_coins, 0.2, mode="reflect", - **rescale_params) +rescaled_coins = rescale(smoothened_coins, 0.2, mode="reflect", **rescale_params) X = np.reshape(rescaled_coins, (-1, 1)) @@ -57,8 +56,9 @@ print("Compute structured hierarchical clustering...") st = time.time() n_clusters = 27 # number of regions -ward = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward', - connectivity=connectivity) +ward = AgglomerativeClustering( + n_clusters=n_clusters, linkage="ward", connectivity=connectivity +) ward.fit(X) label = np.reshape(ward.labels_, rescaled_coins.shape) print("Elapsed time: ", time.time() - st) @@ -70,8 +70,12 @@ plt.figure(figsize=(5, 5)) plt.imshow(rescaled_coins, cmap=plt.cm.gray) for l in range(n_clusters): - plt.contour(label == l, - colors=[plt.cm.nipy_spectral(l / float(n_clusters)), ]) + plt.contour( + label == l, + colors=[ + plt.cm.nipy_spectral(l / float(n_clusters)), + ], + ) plt.xticks(()) plt.yticks(()) plt.show() diff --git a/examples/cluster/plot_color_quantization.py b/examples/cluster/plot_color_quantization.py index 384e58f75e328..90b75a8cd1352 100644 --- a/examples/cluster/plot_color_quantization.py +++ b/examples/cluster/plot_color_quantization.py @@ -64,9 +64,7 @@ codebook_random = shuffle(image_array, random_state=0, n_samples=n_colors) print("Predicting color indices on the full image (random)") t0 = time() -labels_random = pairwise_distances_argmin(codebook_random, - image_array, - axis=0) +labels_random = pairwise_distances_argmin(codebook_random, image_array, axis=0) print(f"done in {time() - t0:0.3f}s.") @@ -78,19 +76,19 @@ def recreate_image(codebook, labels, w, h): # Display all results, alongside original image plt.figure(1) plt.clf() -plt.axis('off') -plt.title('Original image (96,615 colors)') +plt.axis("off") +plt.title("Original image (96,615 colors)") plt.imshow(china) plt.figure(2) plt.clf() -plt.axis('off') -plt.title(f'Quantized 
image ({n_colors} colors, K-Means)') +plt.axis("off") +plt.title(f"Quantized image ({n_colors} colors, K-Means)") plt.imshow(recreate_image(kmeans.cluster_centers_, labels, w, h)) plt.figure(3) plt.clf() -plt.axis('off') -plt.title(f'Quantized image ({n_colors} colors, Random)') +plt.axis("off") +plt.title(f"Quantized image ({n_colors} colors, Random)") plt.imshow(recreate_image(codebook_random, labels_random, w, h)) plt.show() diff --git a/examples/cluster/plot_dbscan.py b/examples/cluster/plot_dbscan.py index d7cfc3ec524b3..29d5f1b768210 100644 --- a/examples/cluster/plot_dbscan.py +++ b/examples/cluster/plot_dbscan.py @@ -20,8 +20,9 @@ # ############################################################################# # Generate sample data centers = [[1, 1], [-1, -1], [1, -1]] -X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4, - random_state=0) +X, labels_true = make_blobs( + n_samples=750, centers=centers, cluster_std=0.4, random_state=0 +) X = StandardScaler().fit_transform(X) @@ -36,17 +37,17 @@ n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) n_noise_ = list(labels).count(-1) -print('Estimated number of clusters: %d' % n_clusters_) -print('Estimated number of noise points: %d' % n_noise_) +print("Estimated number of clusters: %d" % n_clusters_) +print("Estimated number of noise points: %d" % n_noise_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) -print("Adjusted Rand Index: %0.3f" - % metrics.adjusted_rand_score(labels_true, labels)) -print("Adjusted Mutual Information: %0.3f" - % metrics.adjusted_mutual_info_score(labels_true, labels)) -print("Silhouette Coefficient: %0.3f" - % metrics.silhouette_score(X, labels)) +print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) +print( + "Adjusted Mutual Information: 
%0.3f" + % metrics.adjusted_mutual_info_score(labels_true, labels) +) +print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels)) # ############################################################################# # Plot result @@ -54,22 +55,33 @@ # Black removed and is used for noise instead. unique_labels = set(labels) -colors = [plt.cm.Spectral(each) - for each in np.linspace(0, 1, len(unique_labels))] +colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise. col = [0, 0, 0, 1] - class_member_mask = (labels == k) + class_member_mask = labels == k xy = X[class_member_mask & core_samples_mask] - plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), - markeredgecolor='k', markersize=14) + plt.plot( + xy[:, 0], + xy[:, 1], + "o", + markerfacecolor=tuple(col), + markeredgecolor="k", + markersize=14, + ) xy = X[class_member_mask & ~core_samples_mask] - plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), - markeredgecolor='k', markersize=6) - -plt.title('Estimated number of clusters: %d' % n_clusters_) + plt.plot( + xy[:, 0], + xy[:, 1], + "o", + markerfacecolor=tuple(col), + markeredgecolor="k", + markersize=6, + ) + +plt.title("Estimated number of clusters: %d" % n_clusters_) plt.show() diff --git a/examples/cluster/plot_dict_face_patches.py b/examples/cluster/plot_dict_face_patches.py index ac144e7213cc5..242774550d063 100644 --- a/examples/cluster/plot_dict_face_patches.py +++ b/examples/cluster/plot_dict_face_patches.py @@ -36,7 +36,7 @@ # ############################################################################# # Learn the dictionary of images -print('Learning the dictionary... ') +print("Learning the dictionary... 
") rng = np.random.RandomState(0) kmeans = MiniBatchKMeans(n_clusters=81, random_state=rng, verbose=True) patch_size = (20, 20) @@ -48,8 +48,7 @@ index = 0 for _ in range(6): for img in faces.images: - data = extract_patches_2d(img, patch_size, max_patches=50, - random_state=rng) + data = extract_patches_2d(img, patch_size, max_patches=50, random_state=rng) data = np.reshape(data, (len(data), -1)) buffer.append(data) index += 1 @@ -60,25 +59,25 @@ kmeans.partial_fit(data) buffer = [] if index % 100 == 0: - print('Partial fit of %4i out of %i' - % (index, 6 * len(faces.images))) + print("Partial fit of %4i out of %i" % (index, 6 * len(faces.images))) dt = time.time() - t0 -print('done in %.2fs.' % dt) +print("done in %.2fs." % dt) # ############################################################################# # Plot the results plt.figure(figsize=(4.2, 4)) for i, patch in enumerate(kmeans.cluster_centers_): plt.subplot(9, 9, i + 1) - plt.imshow(patch.reshape(patch_size), cmap=plt.cm.gray, - interpolation='nearest') + plt.imshow(patch.reshape(patch_size), cmap=plt.cm.gray, interpolation="nearest") plt.xticks(()) plt.yticks(()) -plt.suptitle('Patches of faces\nTrain time %.1fs on %d patches' % - (dt, 8 * len(faces.images)), fontsize=16) +plt.suptitle( + "Patches of faces\nTrain time %.1fs on %d patches" % (dt, 8 * len(faces.images)), + fontsize=16, +) plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23) plt.show() diff --git a/examples/cluster/plot_digits_agglomeration.py b/examples/cluster/plot_digits_agglomeration.py index ce8bd8daf3bf0..c6590c0e24771 100644 --- a/examples/cluster/plot_digits_agglomeration.py +++ b/examples/cluster/plot_digits_agglomeration.py @@ -26,8 +26,7 @@ X = np.reshape(images, (len(images), -1)) connectivity = grid_to_graph(*images[0].shape) -agglo = cluster.FeatureAgglomeration(connectivity=connectivity, - n_clusters=32) +agglo = cluster.FeatureAgglomeration(connectivity=connectivity, n_clusters=32) agglo.fit(X) X_reduced = 
agglo.transform(X) @@ -36,26 +35,28 @@ images_restored = np.reshape(X_restored, images.shape) plt.figure(1, figsize=(4, 3.5)) plt.clf() -plt.subplots_adjust(left=.01, right=.99, bottom=.01, top=.91) +plt.subplots_adjust(left=0.01, right=0.99, bottom=0.01, top=0.91) for i in range(4): plt.subplot(3, 4, i + 1) - plt.imshow(images[i], cmap=plt.cm.gray, vmax=16, interpolation='nearest') + plt.imshow(images[i], cmap=plt.cm.gray, vmax=16, interpolation="nearest") plt.xticks(()) plt.yticks(()) if i == 1: - plt.title('Original data') + plt.title("Original data") plt.subplot(3, 4, 4 + i + 1) - plt.imshow(images_restored[i], cmap=plt.cm.gray, vmax=16, - interpolation='nearest') + plt.imshow(images_restored[i], cmap=plt.cm.gray, vmax=16, interpolation="nearest") if i == 1: - plt.title('Agglomerated data') + plt.title("Agglomerated data") plt.xticks(()) plt.yticks(()) plt.subplot(3, 4, 10) -plt.imshow(np.reshape(agglo.labels_, images[0].shape), - interpolation='nearest', cmap=plt.cm.nipy_spectral) +plt.imshow( + np.reshape(agglo.labels_, images[0].shape), + interpolation="nearest", + cmap=plt.cm.nipy_spectral, +) plt.xticks(()) plt.yticks(()) -plt.title('Labels') +plt.title("Labels") plt.show() diff --git a/examples/cluster/plot_digits_linkage.py b/examples/cluster/plot_digits_linkage.py index 44cd821b58e19..c5d78e362fb38 100644 --- a/examples/cluster/plot_digits_linkage.py +++ b/examples/cluster/plot_digits_linkage.py @@ -41,9 +41,9 @@ def nudge_images(X, y): # methods, but we multiply the size of the dataset only by 2, as the # cost of the hierarchical clustering methods are strongly # super-linear in n_samples - shift = lambda x: ndimage.shift(x.reshape((8, 8)), - .3 * np.random.normal(size=2), - mode='constant').ravel() + shift = lambda x: ndimage.shift( + x.reshape((8, 8)), 0.3 * np.random.normal(size=2), mode="constant" + ).ravel() X = np.concatenate([X, np.apply_along_axis(shift, 1, X)]) Y = np.concatenate([y, y], axis=0) return X, Y @@ -60,15 +60,19 @@ def 
plot_clustering(X_red, labels, title=None): plt.figure(figsize=(6, 4)) for i in range(X_red.shape[0]): - plt.text(X_red[i, 0], X_red[i, 1], str(y[i]), - color=plt.cm.nipy_spectral(labels[i] / 10.), - fontdict={'weight': 'bold', 'size': 9}) + plt.text( + X_red[i, 0], + X_red[i, 1], + str(y[i]), + color=plt.cm.nipy_spectral(labels[i] / 10.0), + fontdict={"weight": "bold", "size": 9}, + ) plt.xticks([]) plt.yticks([]) if title is not None: plt.title(title, size=17) - plt.axis('off') + plt.axis("off") plt.tight_layout(rect=[0, 0.03, 1, 0.95]) @@ -80,7 +84,7 @@ def plot_clustering(X_red, labels, title=None): from sklearn.cluster import AgglomerativeClustering -for linkage in ('ward', 'average', 'complete', 'single'): +for linkage in ("ward", "average", "complete", "single"): clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10) t0 = time() clustering.fit(X_red) diff --git a/examples/cluster/plot_face_compress.py b/examples/cluster/plot_face_compress.py index 4eed00d623f9b..8b2e329ed257d 100644 --- a/examples/cluster/plot_face_compress.py +++ b/examples/cluster/plot_face_compress.py @@ -27,6 +27,7 @@ try: # SciPy >= 0.16 have face in misc from scipy.misc import face + face = face(gray=True) except ImportError: face = sp.face(gray=True) @@ -58,7 +59,7 @@ # equal bins face regular_values = np.linspace(0, 256, n_clusters + 1) regular_labels = np.searchsorted(regular_values, face) - 1 -regular_values = .5 * (regular_values[1:] + regular_values[:-1]) # mean +regular_values = 0.5 * (regular_values[1:] + regular_values[:-1]) # mean regular_face = np.choose(regular_labels.ravel(), regular_values, mode="clip") regular_face.shape = face.shape plt.figure(3, figsize=(3, 2.2)) @@ -67,15 +68,15 @@ # histogram plt.figure(4, figsize=(3, 2.2)) plt.clf() -plt.axes([.01, .01, .98, .98]) -plt.hist(X, bins=256, color='.5', edgecolor='.5') +plt.axes([0.01, 0.01, 0.98, 0.98]) +plt.hist(X, bins=256, color=".5", edgecolor=".5") plt.yticks(()) plt.xticks(regular_values) values = 
np.sort(values) for center_1, center_2 in zip(values[:-1], values[1:]): - plt.axvline(.5 * (center_1 + center_2), color='b') + plt.axvline(0.5 * (center_1 + center_2), color="b") for center_1, center_2 in zip(regular_values[:-1], regular_values[1:]): - plt.axvline(.5 * (center_1 + center_2), color='b', linestyle='--') + plt.axvline(0.5 * (center_1 + center_2), color="b", linestyle="--") plt.show() diff --git a/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py b/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py index e148647446613..afb31751ce7a5 100644 --- a/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py +++ b/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py @@ -39,13 +39,13 @@ n_samples = 200 size = 40 # image size roi_size = 15 -snr = 5. +snr = 5.0 np.random.seed(0) mask = np.ones([size, size], dtype=bool) coef = np.zeros((size, size)) -coef[0:roi_size, 0:roi_size] = -1. -coef[-roi_size:, -roi_size:] = 1. 
+coef[0:roi_size, 0:roi_size] = -1.0 +coef[-roi_size:, -roi_size:] = 1.0 X = np.random.randn(n_samples, size ** 2) for x in X: # smooth data @@ -55,7 +55,7 @@ y = np.dot(X, coef.ravel()) noise = np.random.randn(y.shape[0]) -noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2) +noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.0)) / linalg.norm(noise, 2) y += noise_coef * noise # add noise # ############################################################################# @@ -67,11 +67,10 @@ # Ward agglomeration followed by BayesianRidge connectivity = grid_to_graph(n_x=size, n_y=size) -ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity, - memory=mem) -clf = Pipeline([('ward', ward), ('ridge', ridge)]) +ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity, memory=mem) +clf = Pipeline([("ward", ward), ("ridge", ridge)]) # Select the optimal number of parcels with grid search -clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv) +clf = GridSearchCV(clf, {"ward__n_clusters": [10, 20, 30]}, n_jobs=1, cv=cv) clf.fit(X, y) # set the best parameters coef_ = clf.best_estimator_.steps[-1][1].coef_ coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_) @@ -80,9 +79,9 @@ # Anova univariate feature selection followed by BayesianRidge f_regression = mem.cache(feature_selection.f_regression) # caching function anova = feature_selection.SelectPercentile(f_regression) -clf = Pipeline([('anova', anova), ('ridge', ridge)]) +clf = Pipeline([("anova", anova), ("ridge", ridge)]) # Select the optimal percentage of features with grid search -clf = GridSearchCV(clf, {'anova__percentile': [5, 10, 20]}, cv=cv) +clf = GridSearchCV(clf, {"anova__percentile": [5, 10, 20]}, cv=cv) clf.fit(X, y) # set the best parameters coef_ = clf.best_estimator_.steps[-1][1].coef_ coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1)) @@ -90,7 +89,7 @@ # 
############################################################################# # Inverse the transformation to plot the results on an image -plt.close('all') +plt.close("all") plt.figure(figsize=(7.3, 2.7)) plt.subplot(1, 3, 1) plt.imshow(coef, interpolation="nearest", cmap=plt.cm.RdBu_r) diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index 2ff04d523855a..4360f89faee3c 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -43,9 +43,9 @@ plt.title("Anisotropicly Distributed Blobs") # Different variance -X_varied, y_varied = make_blobs(n_samples=n_samples, - cluster_std=[1.0, 2.5, 0.5], - random_state=random_state) +X_varied, y_varied = make_blobs( + n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state +) y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_varied) plt.subplot(223) @@ -54,8 +54,7 @@ # Unevenly sized blobs X_filtered = np.vstack((X[y == 0][:500], X[y == 1][:100], X[y == 2][:10])) -y_pred = KMeans(n_clusters=3, - random_state=random_state).fit_predict(X_filtered) +y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_filtered) plt.subplot(224) plt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred) diff --git a/examples/cluster/plot_kmeans_digits.py b/examples/cluster/plot_kmeans_digits.py index 9fc0abebc4464..8190c6e2792e0 100644 --- a/examples/cluster/plot_kmeans_digits.py +++ b/examples/cluster/plot_kmeans_digits.py @@ -39,9 +39,7 @@ data, labels = load_digits(return_X_y=True) (n_samples, n_features), n_digits = data.shape, np.unique(labels).size -print( - f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}" -) +print(f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}") # %% # Define our evaluation benchmark @@ -95,13 +93,18 @@ def bench_k_means(kmeans, name, data, labels): # The silhouette score requires the full dataset results += [ - 
metrics.silhouette_score(data, estimator[-1].labels_, - metric="euclidean", sample_size=300,) + metrics.silhouette_score( + data, + estimator[-1].labels_, + metric="euclidean", + sample_size=300, + ) ] # Show the results - formatter_result = ("{:9s}\t{:.3f}s\t{:.0f}\t{:.3f}\t{:.3f}" - "\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}") + formatter_result = ( + "{:9s}\t{:.3f}s\t{:.0f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}" + ) print(formatter_result.format(*results)) @@ -122,11 +125,10 @@ def bench_k_means(kmeans, name, data, labels): from sklearn.cluster import KMeans from sklearn.decomposition import PCA -print(82 * '_') -print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette') +print(82 * "_") +print("init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette") -kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4, - random_state=0) +kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4, random_state=0) bench_k_means(kmeans=kmeans, name="k-means++", data=data, labels=labels) kmeans = KMeans(init="random", n_clusters=n_digits, n_init=4, random_state=0) @@ -136,7 +138,7 @@ def bench_k_means(kmeans, name, data, labels): kmeans = KMeans(init=pca.components_, n_clusters=n_digits, n_init=1) bench_k_means(kmeans=kmeans, name="PCA-based", data=data, labels=labels) -print(82 * '_') +print(82 * "_") # %% # Visualize the results on PCA-reduced data @@ -153,7 +155,7 @@ def bench_k_means(kmeans, name, data, labels): kmeans.fit(reduced_data) # Step size of the mesh. Decrease to increase the quality of the VQ. -h = .02 # point in the mesh [x_min, x_max]x[y_min, y_max]. +h = 0.02 # point in the mesh [x_min, x_max]x[y_min, y_max]. # Plot the decision boundary. 
For that, we will assign a color to each x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1 @@ -167,17 +169,31 @@ def bench_k_means(kmeans, name, data, labels): Z = Z.reshape(xx.shape) plt.figure(1) plt.clf() -plt.imshow(Z, interpolation="nearest", - extent=(xx.min(), xx.max(), yy.min(), yy.max()), - cmap=plt.cm.Paired, aspect="auto", origin="lower") +plt.imshow( + Z, + interpolation="nearest", + extent=(xx.min(), xx.max(), yy.min(), yy.max()), + cmap=plt.cm.Paired, + aspect="auto", + origin="lower", +) -plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2) +plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2) # Plot the centroids as a white X centroids = kmeans.cluster_centers_ -plt.scatter(centroids[:, 0], centroids[:, 1], marker="x", s=169, linewidths=3, - color="w", zorder=10) -plt.title("K-means clustering on the digits dataset (PCA-reduced data)\n" - "Centroids are marked with white cross") +plt.scatter( + centroids[:, 0], + centroids[:, 1], + marker="x", + s=169, + linewidths=3, + color="w", + zorder=10, +) +plt.title( + "K-means clustering on the digits dataset (PCA-reduced data)\n" + "Centroids are marked with white cross" +) plt.xlim(x_min, x_max) plt.ylim(y_min, y_max) plt.xticks(()) diff --git a/examples/cluster/plot_kmeans_plusplus.py b/examples/cluster/plot_kmeans_plusplus.py index d9821db2d452e..8afbb62dfdda4 100644 --- a/examples/cluster/plot_kmeans_plusplus.py +++ b/examples/cluster/plot_kmeans_plusplus.py @@ -19,26 +19,23 @@ n_samples = 4000 n_components = 4 -X, y_true = make_blobs(n_samples=n_samples, - centers=n_components, - cluster_std=0.60, - random_state=0) +X, y_true = make_blobs( + n_samples=n_samples, centers=n_components, cluster_std=0.60, random_state=0 +) X = X[:, ::-1] # Calculate seeds from kmeans++ -centers_init, indices = kmeans_plusplus(X, n_clusters=4, - random_state=0) +centers_init, indices = kmeans_plusplus(X, n_clusters=4, random_state=0) # Plot init seeds along side 
sample data plt.figure(1) -colors = ['#4EACC5', '#FF9C34', '#4E9A06', 'm'] +colors = ["#4EACC5", "#FF9C34", "#4E9A06", "m"] for k, col in enumerate(colors): cluster_data = y_true == k - plt.scatter(X[cluster_data, 0], X[cluster_data, 1], - c=col, marker='.', s=10) + plt.scatter(X[cluster_data, 0], X[cluster_data, 1], c=col, marker=".", s=10) -plt.scatter(centers_init[:, 0], centers_init[:, 1], c='b', s=50) +plt.scatter(centers_init[:, 0], centers_init[:, 1], c="b", s=50) plt.title("K-Means++ Initialization") plt.xticks([]) plt.yticks([]) diff --git a/examples/cluster/plot_kmeans_silhouette_analysis.py b/examples/cluster/plot_kmeans_silhouette_analysis.py index 8cd65cfb6a865..63efe0c406fd8 100644 --- a/examples/cluster/plot_kmeans_silhouette_analysis.py +++ b/examples/cluster/plot_kmeans_silhouette_analysis.py @@ -42,13 +42,15 @@ # Generating the sample data from make_blobs # This particular setting has one distinct cluster and 3 clusters placed close # together. -X, y = make_blobs(n_samples=500, - n_features=2, - centers=4, - cluster_std=1, - center_box=(-10.0, 10.0), - shuffle=True, - random_state=1) # For reproducibility +X, y = make_blobs( + n_samples=500, + n_features=2, + centers=4, + cluster_std=1, + center_box=(-10.0, 10.0), + shuffle=True, + random_state=1, +) # For reproducibility range_n_clusters = [2, 3, 4, 5, 6] @@ -74,8 +76,12 @@ # This gives a perspective into the density and separation of the formed # clusters silhouette_avg = silhouette_score(X, cluster_labels) - print("For n_clusters =", n_clusters, - "The average silhouette_score is :", silhouette_avg) + print( + "For n_clusters =", + n_clusters, + "The average silhouette_score is :", + silhouette_avg, + ) # Compute the silhouette scores for each sample sample_silhouette_values = silhouette_samples(X, cluster_labels) @@ -84,8 +90,7 @@ for i in range(n_clusters): # Aggregate the silhouette scores for samples belonging to # cluster i, and sort them - ith_cluster_silhouette_values = \ - 
sample_silhouette_values[cluster_labels == i] + ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i] ith_cluster_silhouette_values.sort() @@ -93,9 +98,14 @@ y_upper = y_lower + size_cluster_i color = cm.nipy_spectral(float(i) / n_clusters) - ax1.fill_betweenx(np.arange(y_lower, y_upper), - 0, ith_cluster_silhouette_values, - facecolor=color, edgecolor=color, alpha=0.7) + ax1.fill_betweenx( + np.arange(y_lower, y_upper), + 0, + ith_cluster_silhouette_values, + facecolor=color, + edgecolor=color, + alpha=0.7, + ) # Label the silhouette plots with their cluster numbers at the middle ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) @@ -115,25 +125,35 @@ # 2nd Plot showing the actual clusters formed colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters) - ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7, - c=colors, edgecolor='k') + ax2.scatter( + X[:, 0], X[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k" + ) # Labeling the clusters centers = clusterer.cluster_centers_ # Draw white circles at cluster centers - ax2.scatter(centers[:, 0], centers[:, 1], marker='o', - c="white", alpha=1, s=200, edgecolor='k') + ax2.scatter( + centers[:, 0], + centers[:, 1], + marker="o", + c="white", + alpha=1, + s=200, + edgecolor="k", + ) for i, c in enumerate(centers): - ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, - s=50, edgecolor='k') + ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k") ax2.set_title("The visualization of the clustered data.") ax2.set_xlabel("Feature space for the 1st feature") ax2.set_ylabel("Feature space for the 2nd feature") - plt.suptitle(("Silhouette analysis for KMeans clustering on sample data " - "with n_clusters = %d" % n_clusters), - fontsize=14, fontweight='bold') + plt.suptitle( + "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d" + % n_clusters, + fontsize=14, + fontweight="bold", + ) plt.show() diff --git 
a/examples/cluster/plot_kmeans_stability_low_dim_dense.py b/examples/cluster/plot_kmeans_stability_low_dim_dense.py index a6771f6350135..dd9f32d01485d 100644 --- a/examples/cluster/plot_kmeans_stability_low_dim_dense.py +++ b/examples/cluster/plot_kmeans_stability_low_dim_dense.py @@ -54,19 +54,18 @@ def make_data(random_state, n_samples_per_center, grid_size, scale): random_state = check_random_state(random_state) - centers = np.array([[i, j] - for i in range(grid_size) - for j in range(grid_size)]) + centers = np.array([[i, j] for i in range(grid_size) for j in range(grid_size)]) n_clusters_true, n_features = centers.shape noise = random_state.normal( - scale=scale, size=(n_samples_per_center, centers.shape[1])) + scale=scale, size=(n_samples_per_center, centers.shape[1]) + ) X = np.concatenate([c + noise for c in centers]) - y = np.concatenate([[i] * n_samples_per_center - for i in range(n_clusters_true)]) + y = np.concatenate([[i] * n_samples_per_center for i in range(n_clusters_true)]) return shuffle(X, y, random_state=random_state) + # Part 1: Quantitative evaluation of various init methods @@ -75,10 +74,10 @@ def make_data(random_state, n_samples_per_center, grid_size, scale): legends = [] cases = [ - (KMeans, 'k-means++', {}), - (KMeans, 'random', {}), - (MiniBatchKMeans, 'k-means++', {'max_no_improvement': 3}), - (MiniBatchKMeans, 'random', {'max_no_improvement': 3, 'init_size': 500}), + (KMeans, "k-means++", {}), + (KMeans, "random", {}), + (MiniBatchKMeans, "k-means++", {"max_no_improvement": 3}), + (MiniBatchKMeans, "random", {"max_no_improvement": 3, "init_size": 500}), ] for factory, init, params in cases: @@ -88,33 +87,46 @@ def make_data(random_state, n_samples_per_center, grid_size, scale): for run_id in range(n_runs): X, y = make_data(run_id, n_samples_per_center, grid_size, scale) for i, n_init in enumerate(n_init_range): - km = factory(n_clusters=n_clusters, init=init, random_state=run_id, - n_init=n_init, **params).fit(X) + km = factory( + 
n_clusters=n_clusters, + init=init, + random_state=run_id, + n_init=n_init, + **params, + ).fit(X) inertia[i, run_id] = km.inertia_ p = plt.errorbar(n_init_range, inertia.mean(axis=1), inertia.std(axis=1)) plots.append(p[0]) legends.append("%s with %s init" % (factory.__name__, init)) -plt.xlabel('n_init') -plt.ylabel('inertia') +plt.xlabel("n_init") +plt.ylabel("inertia") plt.legend(plots, legends) plt.title("Mean inertia for various k-means init across %d runs" % n_runs) # Part 2: Qualitative visual inspection of the convergence X, y = make_data(random_state, n_samples_per_center, grid_size, scale) -km = MiniBatchKMeans(n_clusters=n_clusters, init='random', n_init=1, - random_state=random_state).fit(X) +km = MiniBatchKMeans( + n_clusters=n_clusters, init="random", n_init=1, random_state=random_state +).fit(X) plt.figure() for k in range(n_clusters): my_members = km.labels_ == k color = cm.nipy_spectral(float(k) / n_clusters, 1) - plt.plot(X[my_members, 0], X[my_members, 1], 'o', marker='.', c=color) + plt.plot(X[my_members, 0], X[my_members, 1], "o", marker=".", c=color) cluster_center = km.cluster_centers_[k] - plt.plot(cluster_center[0], cluster_center[1], 'o', - markerfacecolor=color, markeredgecolor='k', markersize=6) - plt.title("Example cluster allocation with a single random init\n" - "with MiniBatchKMeans") + plt.plot( + cluster_center[0], + cluster_center[1], + "o", + markerfacecolor=color, + markeredgecolor="k", + markersize=6, + ) + plt.title( + "Example cluster allocation with a single random init\nwith MiniBatchKMeans" + ) plt.show() diff --git a/examples/cluster/plot_linkage_comparison.py b/examples/cluster/plot_linkage_comparison.py index 390aa1fe889d2..0a4667855f2ec 100644 --- a/examples/cluster/plot_linkage_comparison.py +++ b/examples/cluster/plot_linkage_comparison.py @@ -40,9 +40,8 @@ # of the algorithms, but not too big to avoid too long running times n_samples = 1500 -noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5, - 
noise=.05) -noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05) +noisy_circles = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05) +noisy_moons = datasets.make_moons(n_samples=n_samples, noise=0.05) blobs = datasets.make_blobs(n_samples=n_samples, random_state=8) no_structure = np.random.rand(n_samples, 2), None @@ -54,30 +53,31 @@ aniso = (X_aniso, y) # blobs with varied variances -varied = datasets.make_blobs(n_samples=n_samples, - cluster_std=[1.0, 2.5, 0.5], - random_state=random_state) +varied = datasets.make_blobs( + n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state +) # %% # Run the clustering and plot # Set up cluster parameters plt.figure(figsize=(9 * 1.3 + 2, 14.5)) -plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, - hspace=.01) +plt.subplots_adjust( + left=0.02, right=0.98, bottom=0.001, top=0.96, wspace=0.05, hspace=0.01 +) plot_num = 1 -default_base = {'n_neighbors': 10, - 'n_clusters': 3} +default_base = {"n_neighbors": 10, "n_clusters": 3} datasets = [ - (noisy_circles, {'n_clusters': 2}), - (noisy_moons, {'n_clusters': 2}), - (varied, {'n_neighbors': 2}), - (aniso, {'n_neighbors': 2}), + (noisy_circles, {"n_clusters": 2}), + (noisy_moons, {"n_clusters": 2}), + (varied, {"n_neighbors": 2}), + (aniso, {"n_neighbors": 2}), (blobs, {}), - (no_structure, {})] + (no_structure, {}), +] for i_dataset, (dataset, algo_params) in enumerate(datasets): # update parameters with dataset-specific values @@ -93,19 +93,23 @@ # Create cluster objects # ============ ward = cluster.AgglomerativeClustering( - n_clusters=params['n_clusters'], linkage='ward') + n_clusters=params["n_clusters"], linkage="ward" + ) complete = cluster.AgglomerativeClustering( - n_clusters=params['n_clusters'], linkage='complete') + n_clusters=params["n_clusters"], linkage="complete" + ) average = cluster.AgglomerativeClustering( - n_clusters=params['n_clusters'], linkage='average') + 
n_clusters=params["n_clusters"], linkage="average" + ) single = cluster.AgglomerativeClustering( - n_clusters=params['n_clusters'], linkage='single') + n_clusters=params["n_clusters"], linkage="single" + ) clustering_algorithms = ( - ('Single Linkage', single), - ('Average Linkage', average), - ('Complete Linkage', complete), - ('Ward Linkage', ward), + ("Single Linkage", single), + ("Average Linkage", average), + ("Complete Linkage", complete), + ("Ward Linkage", ward), ) for name, algorithm in clustering_algorithms: @@ -115,14 +119,15 @@ with warnings.catch_warnings(): warnings.filterwarnings( "ignore", - message="the number of connected components of the " + - "connectivity matrix is [0-9]{1,2}" + - " > 1. Completing it to avoid stopping the tree early.", - category=UserWarning) + message="the number of connected components of the " + + "connectivity matrix is [0-9]{1,2}" + + " > 1. Completing it to avoid stopping the tree early.", + category=UserWarning, + ) algorithm.fit(X) t1 = time.time() - if hasattr(algorithm, 'labels_'): + if hasattr(algorithm, "labels_"): y_pred = algorithm.labels_.astype(int) else: y_pred = algorithm.predict(X) @@ -131,19 +136,40 @@ if i_dataset == 0: plt.title(name, size=18) - colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a', - '#f781bf', '#a65628', '#984ea3', - '#999999', '#e41a1c', '#dede00']), - int(max(y_pred) + 1)))) + colors = np.array( + list( + islice( + cycle( + [ + "#377eb8", + "#ff7f00", + "#4daf4a", + "#f781bf", + "#a65628", + "#984ea3", + "#999999", + "#e41a1c", + "#dede00", + ] + ), + int(max(y_pred) + 1), + ) + ) + ) plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred]) plt.xlim(-2.5, 2.5) plt.ylim(-2.5, 2.5) plt.xticks(()) plt.yticks(()) - plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'), - transform=plt.gca().transAxes, size=15, - horizontalalignment='right') + plt.text( + 0.99, + 0.01, + ("%.2fs" % (t1 - t0)).lstrip("0"), + transform=plt.gca().transAxes, + size=15, + 
horizontalalignment="right", + ) plot_num += 1 plt.show() diff --git a/examples/cluster/plot_mean_shift.py b/examples/cluster/plot_mean_shift.py index dce7b35503232..53e052ed71d36 100644 --- a/examples/cluster/plot_mean_shift.py +++ b/examples/cluster/plot_mean_shift.py @@ -45,12 +45,18 @@ plt.figure(1) plt.clf() -colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') +colors = cycle("bgrcmykbgrcmykbgrcmykbgrcmyk") for k, col in zip(range(n_clusters_), colors): my_members = labels == k cluster_center = cluster_centers[k] - plt.plot(X[my_members, 0], X[my_members, 1], col + '.') - plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, - markeredgecolor='k', markersize=14) -plt.title('Estimated number of clusters: %d' % n_clusters_) + plt.plot(X[my_members, 0], X[my_members, 1], col + ".") + plt.plot( + cluster_center[0], + cluster_center[1], + "o", + markerfacecolor=col, + markeredgecolor="k", + markersize=14, + ) +plt.title("Estimated number of clusters: %d" % n_clusters_) plt.show() diff --git a/examples/cluster/plot_mini_batch_kmeans.py b/examples/cluster/plot_mini_batch_kmeans.py index e8f78556f80d2..99b5311c897d0 100644 --- a/examples/cluster/plot_mini_batch_kmeans.py +++ b/examples/cluster/plot_mini_batch_kmeans.py @@ -35,7 +35,7 @@ # ############################################################################# # Compute clustering with Means -k_means = KMeans(init='k-means++', n_clusters=3, n_init=10) +k_means = KMeans(init="k-means++", n_clusters=3, n_init=10) t0 = time.time() k_means.fit(X) t_batch = time.time() - t0 @@ -43,8 +43,14 @@ # ############################################################################# # Compute clustering with MiniBatchKMeans -mbk = MiniBatchKMeans(init='k-means++', n_clusters=3, batch_size=batch_size, - n_init=10, max_no_improvement=10, verbose=0) +mbk = MiniBatchKMeans( + init="k-means++", + n_clusters=3, + batch_size=batch_size, + n_init=10, + max_no_improvement=10, + verbose=0, +) t0 = time.time() mbk.fit(X) 
t_mini_batch = time.time() - t0 @@ -54,14 +60,13 @@ fig = plt.figure(figsize=(8, 3)) fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9) -colors = ['#4EACC5', '#FF9C34', '#4E9A06'] +colors = ["#4EACC5", "#FF9C34", "#4E9A06"] # We want to have the same colors for the same cluster from the # MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per # closest one. k_means_cluster_centers = k_means.cluster_centers_ -order = pairwise_distances_argmin(k_means.cluster_centers_, - mbk.cluster_centers_) +order = pairwise_distances_argmin(k_means.cluster_centers_, mbk.cluster_centers_) mbk_means_cluster_centers = mbk.cluster_centers_[order] k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers) @@ -72,44 +77,50 @@ for k, col in zip(range(n_clusters), colors): my_members = k_means_labels == k cluster_center = k_means_cluster_centers[k] - ax.plot(X[my_members, 0], X[my_members, 1], 'w', - markerfacecolor=col, marker='.') - ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, - markeredgecolor='k', markersize=6) -ax.set_title('KMeans') + ax.plot(X[my_members, 0], X[my_members, 1], "w", markerfacecolor=col, marker=".") + ax.plot( + cluster_center[0], + cluster_center[1], + "o", + markerfacecolor=col, + markeredgecolor="k", + markersize=6, + ) +ax.set_title("KMeans") ax.set_xticks(()) ax.set_yticks(()) -plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' % ( - t_batch, k_means.inertia_)) +plt.text(-3.5, 1.8, "train time: %.2fs\ninertia: %f" % (t_batch, k_means.inertia_)) # MiniBatchKMeans ax = fig.add_subplot(1, 3, 2) for k, col in zip(range(n_clusters), colors): my_members = mbk_means_labels == k cluster_center = mbk_means_cluster_centers[k] - ax.plot(X[my_members, 0], X[my_members, 1], 'w', - markerfacecolor=col, marker='.') - ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, - markeredgecolor='k', markersize=6) -ax.set_title('MiniBatchKMeans') + ax.plot(X[my_members, 0], X[my_members, 1], 
"w", markerfacecolor=col, marker=".") + ax.plot( + cluster_center[0], + cluster_center[1], + "o", + markerfacecolor=col, + markeredgecolor="k", + markersize=6, + ) +ax.set_title("MiniBatchKMeans") ax.set_xticks(()) ax.set_yticks(()) -plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' % - (t_mini_batch, mbk.inertia_)) +plt.text(-3.5, 1.8, "train time: %.2fs\ninertia: %f" % (t_mini_batch, mbk.inertia_)) # Initialise the different array to all False -different = (mbk_means_labels == 4) +different = mbk_means_labels == 4 ax = fig.add_subplot(1, 3, 3) for k in range(n_clusters): - different += ((k_means_labels == k) != (mbk_means_labels == k)) + different += (k_means_labels == k) != (mbk_means_labels == k) identic = np.logical_not(different) -ax.plot(X[identic, 0], X[identic, 1], 'w', - markerfacecolor='#bbbbbb', marker='.') -ax.plot(X[different, 0], X[different, 1], 'w', - markerfacecolor='m', marker='.') -ax.set_title('Difference') +ax.plot(X[identic, 0], X[identic, 1], "w", markerfacecolor="#bbbbbb", marker=".") +ax.plot(X[different, 0], X[different, 1], "w", markerfacecolor="m", marker=".") +ax.set_title("Difference") ax.set_xticks(()) ax.set_yticks(()) diff --git a/examples/cluster/plot_optics.py b/examples/cluster/plot_optics.py index 211fb84aede30..4ca81be91de1b 100644 --- a/examples/cluster/plot_optics.py +++ b/examples/cluster/plot_optics.py @@ -30,25 +30,31 @@ np.random.seed(0) n_points_per_cluster = 250 -C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2) -C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2) -C3 = [1, -2] + .2 * np.random.randn(n_points_per_cluster, 2) -C4 = [-2, 3] + .3 * np.random.randn(n_points_per_cluster, 2) +C1 = [-5, -2] + 0.8 * np.random.randn(n_points_per_cluster, 2) +C2 = [4, -1] + 0.1 * np.random.randn(n_points_per_cluster, 2) +C3 = [1, -2] + 0.2 * np.random.randn(n_points_per_cluster, 2) +C4 = [-2, 3] + 0.3 * np.random.randn(n_points_per_cluster, 2) C5 = [3, -2] + 1.6 * np.random.randn(n_points_per_cluster, 2) 
C6 = [5, 6] + 2 * np.random.randn(n_points_per_cluster, 2) X = np.vstack((C1, C2, C3, C4, C5, C6)) -clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05) +clust = OPTICS(min_samples=50, xi=0.05, min_cluster_size=0.05) # Run the fit clust.fit(X) -labels_050 = cluster_optics_dbscan(reachability=clust.reachability_, - core_distances=clust.core_distances_, - ordering=clust.ordering_, eps=0.5) -labels_200 = cluster_optics_dbscan(reachability=clust.reachability_, - core_distances=clust.core_distances_, - ordering=clust.ordering_, eps=2) +labels_050 = cluster_optics_dbscan( + reachability=clust.reachability_, + core_distances=clust.core_distances_, + ordering=clust.ordering_, + eps=0.5, +) +labels_200 = cluster_optics_dbscan( + reachability=clust.reachability_, + core_distances=clust.core_distances_, + ordering=clust.ordering_, + eps=2, +) space = np.arange(len(X)) reachability = clust.reachability_[clust.ordering_] @@ -62,40 +68,40 @@ ax4 = plt.subplot(G[1, 2]) # Reachability plot -colors = ['g.', 'r.', 'b.', 'y.', 'c.'] +colors = ["g.", "r.", "b.", "y.", "c."] for klass, color in zip(range(0, 5), colors): Xk = space[labels == klass] Rk = reachability[labels == klass] ax1.plot(Xk, Rk, color, alpha=0.3) -ax1.plot(space[labels == -1], reachability[labels == -1], 'k.', alpha=0.3) -ax1.plot(space, np.full_like(space, 2., dtype=float), 'k-', alpha=0.5) -ax1.plot(space, np.full_like(space, 0.5, dtype=float), 'k-.', alpha=0.5) -ax1.set_ylabel('Reachability (epsilon distance)') -ax1.set_title('Reachability Plot') +ax1.plot(space[labels == -1], reachability[labels == -1], "k.", alpha=0.3) +ax1.plot(space, np.full_like(space, 2.0, dtype=float), "k-", alpha=0.5) +ax1.plot(space, np.full_like(space, 0.5, dtype=float), "k-.", alpha=0.5) +ax1.set_ylabel("Reachability (epsilon distance)") +ax1.set_title("Reachability Plot") # OPTICS -colors = ['g.', 'r.', 'b.', 'y.', 'c.'] +colors = ["g.", "r.", "b.", "y.", "c."] for klass, color in zip(range(0, 5), colors): Xk = X[clust.labels_ 
== klass] ax2.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3) -ax2.plot(X[clust.labels_ == -1, 0], X[clust.labels_ == -1, 1], 'k+', alpha=0.1) -ax2.set_title('Automatic Clustering\nOPTICS') +ax2.plot(X[clust.labels_ == -1, 0], X[clust.labels_ == -1, 1], "k+", alpha=0.1) +ax2.set_title("Automatic Clustering\nOPTICS") # DBSCAN at 0.5 -colors = ['g', 'greenyellow', 'olive', 'r', 'b', 'c'] +colors = ["g", "greenyellow", "olive", "r", "b", "c"] for klass, color in zip(range(0, 6), colors): Xk = X[labels_050 == klass] - ax3.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3, marker='.') -ax3.plot(X[labels_050 == -1, 0], X[labels_050 == -1, 1], 'k+', alpha=0.1) -ax3.set_title('Clustering at 0.5 epsilon cut\nDBSCAN') + ax3.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3, marker=".") +ax3.plot(X[labels_050 == -1, 0], X[labels_050 == -1, 1], "k+", alpha=0.1) +ax3.set_title("Clustering at 0.5 epsilon cut\nDBSCAN") # DBSCAN at 2. -colors = ['g.', 'm.', 'y.', 'c.'] +colors = ["g.", "m.", "y.", "c."] for klass, color in zip(range(0, 4), colors): Xk = X[labels_200 == klass] ax4.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3) -ax4.plot(X[labels_200 == -1, 0], X[labels_200 == -1, 1], 'k+', alpha=0.1) -ax4.set_title('Clustering at 2.0 epsilon cut\nDBSCAN') +ax4.plot(X[labels_200 == -1, 0], X[labels_200 == -1, 1], "k+", alpha=0.1) +ax4.set_title("Clustering at 2.0 epsilon cut\nDBSCAN") plt.tight_layout() plt.show() diff --git a/examples/cluster/plot_segmentation_toy.py b/examples/cluster/plot_segmentation_toy.py index a6980c5f271ef..f4709358b63a4 100644 --- a/examples/cluster/plot_segmentation_toy.py +++ b/examples/cluster/plot_segmentation_toy.py @@ -73,8 +73,8 @@ # Force the solver to be arpack, since amg is numerically # unstable on this example -labels = spectral_clustering(graph, n_clusters=4, eigen_solver='arpack') -label_im = np.full(mask.shape, -1.) 
+labels = spectral_clustering(graph, n_clusters=4, eigen_solver="arpack") +label_im = np.full(mask.shape, -1.0) label_im[mask] = labels plt.matshow(img) @@ -91,8 +91,8 @@ graph = image.img_to_graph(img, mask=mask) graph.data = np.exp(-graph.data / graph.data.std()) -labels = spectral_clustering(graph, n_clusters=2, eigen_solver='arpack') -label_im = np.full(mask.shape, -1.) +labels = spectral_clustering(graph, n_clusters=2, eigen_solver="arpack") +label_im = np.full(mask.shape, -1.0) label_im[mask] = labels plt.matshow(img) diff --git a/examples/cluster/plot_ward_structured_vs_unstructured.py b/examples/cluster/plot_ward_structured_vs_unstructured.py index 671bdd2735280..75a2aecb6fd3b 100644 --- a/examples/cluster/plot_ward_structured_vs_unstructured.py +++ b/examples/cluster/plot_ward_structured_vs_unstructured.py @@ -39,13 +39,13 @@ noise = 0.05 X, _ = make_swiss_roll(n_samples, noise=noise) # Make it thinner -X[:, 1] *= .5 +X[:, 1] *= 0.5 # ############################################################################# # Compute clustering print("Compute unstructured hierarchical clustering...") st = time.time() -ward = AgglomerativeClustering(n_clusters=6, linkage='ward').fit(X) +ward = AgglomerativeClustering(n_clusters=6, linkage="ward").fit(X) elapsed_time = time.time() - st label = ward.labels_ print("Elapsed time: %.2fs" % elapsed_time) @@ -57,23 +57,30 @@ ax = p3.Axes3D(fig) ax.view_init(7, -80) for l in np.unique(label): - ax.scatter(X[label == l, 0], X[label == l, 1], X[label == l, 2], - color=plt.cm.jet(float(l) / np.max(label + 1)), - s=20, edgecolor='k') -plt.title('Without connectivity constraints (time %.2fs)' % elapsed_time) + ax.scatter( + X[label == l, 0], + X[label == l, 1], + X[label == l, 2], + color=plt.cm.jet(float(l) / np.max(label + 1)), + s=20, + edgecolor="k", + ) +plt.title("Without connectivity constraints (time %.2fs)" % elapsed_time) # ############################################################################# # Define the structure 
A of the data. Here a 10 nearest neighbors from sklearn.neighbors import kneighbors_graph + connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False) # ############################################################################# # Compute clustering print("Compute structured hierarchical clustering...") st = time.time() -ward = AgglomerativeClustering(n_clusters=6, connectivity=connectivity, - linkage='ward').fit(X) +ward = AgglomerativeClustering( + n_clusters=6, connectivity=connectivity, linkage="ward" +).fit(X) elapsed_time = time.time() - st label = ward.labels_ print("Elapsed time: %.2fs" % elapsed_time) @@ -85,9 +92,14 @@ ax = p3.Axes3D(fig) ax.view_init(7, -80) for l in np.unique(label): - ax.scatter(X[label == l, 0], X[label == l, 1], X[label == l, 2], - color=plt.cm.jet(float(l) / np.max(label + 1)), - s=20, edgecolor='k') -plt.title('With connectivity constraints (time %.2fs)' % elapsed_time) + ax.scatter( + X[label == l, 0], + X[label == l, 1], + X[label == l, 2], + color=plt.cm.jet(float(l) / np.max(label + 1)), + s=20, + edgecolor="k", + ) +plt.title("With connectivity constraints (time %.2fs)" % elapsed_time) plt.show() diff --git a/examples/compose/plot_column_transformer.py b/examples/compose/plot_column_transformer.py index f6144ed2c491b..81fd448b56d18 100644 --- a/examples/compose/plot_column_transformer.py +++ b/examples/compose/plot_column_transformer.py @@ -43,17 +43,21 @@ # a specific date. We will only use posts from 2 categories to speed up running # time. 
-categories = ['sci.med', 'sci.space'] -X_train, y_train = fetch_20newsgroups(random_state=1, - subset='train', - categories=categories, - remove=('footers', 'quotes'), - return_X_y=True) -X_test, y_test = fetch_20newsgroups(random_state=1, - subset='test', - categories=categories, - remove=('footers', 'quotes'), - return_X_y=True) +categories = ["sci.med", "sci.space"] +X_train, y_train = fetch_20newsgroups( + random_state=1, + subset="train", + categories=categories, + remove=("footers", "quotes"), + return_X_y=True, +) +X_test, y_test = fetch_20newsgroups( + random_state=1, + subset="test", + categories=categories, + remove=("footers", "quotes"), + return_X_y=True, +) ############################################################################## # Each feature comprises meta information about that post, such as the subject, @@ -79,16 +83,16 @@ def subject_body_extractor(posts): features = np.empty(shape=(len(posts), 2), dtype=object) for i, text in enumerate(posts): # temporary variable `_` stores '\n\n' - headers, _, body = text.partition('\n\n') + headers, _, body = text.partition("\n\n") # store body text in second column features[i, 1] = body - prefix = 'Subject:' - sub = '' + prefix = "Subject:" + sub = "" # save text after 'Subject:' in first column - for line in headers.split('\n'): + for line in headers.split("\n"): if line.startswith(prefix): - sub = line[len(prefix):] + sub = line[len(prefix) :] break features[i, 0] = sub @@ -103,9 +107,7 @@ def subject_body_extractor(posts): def text_stats(posts): - return [{'length': len(text), - 'num_sentences': text.count('.')} - for text in posts] + return [{"length": len(text), "num_sentences": text.count(".")} for text in posts] text_stats_transformer = FunctionTransformer(text_stats) @@ -121,35 +123,59 @@ def text_stats(posts): # ``ColumnTransformer``. We combine them, with weights, then train a # classifier on the combined set of features. 
-pipeline = Pipeline([ - # Extract subject & body - ('subjectbody', subject_body_transformer), - # Use ColumnTransformer to combine the subject and body features - ('union', ColumnTransformer( - [ - # bag-of-words for subject (col 0) - ('subject', TfidfVectorizer(min_df=50), 0), - # bag-of-words with decomposition for body (col 1) - ('body_bow', Pipeline([ - ('tfidf', TfidfVectorizer()), - ('best', TruncatedSVD(n_components=50)), - ]), 1), - # Pipeline for pulling text stats from post's body - ('body_stats', Pipeline([ - ('stats', text_stats_transformer), # returns a list of dicts - ('vect', DictVectorizer()), # list of dicts -> feature matrix - ]), 1), - ], - # weight above ColumnTransformer features - transformer_weights={ - 'subject': 0.8, - 'body_bow': 0.5, - 'body_stats': 1.0, - } - )), - # Use a SVC classifier on the combined features - ('svc', LinearSVC(dual=False)), -], verbose=True) +pipeline = Pipeline( + [ + # Extract subject & body + ("subjectbody", subject_body_transformer), + # Use ColumnTransformer to combine the subject and body features + ( + "union", + ColumnTransformer( + [ + # bag-of-words for subject (col 0) + ("subject", TfidfVectorizer(min_df=50), 0), + # bag-of-words with decomposition for body (col 1) + ( + "body_bow", + Pipeline( + [ + ("tfidf", TfidfVectorizer()), + ("best", TruncatedSVD(n_components=50)), + ] + ), + 1, + ), + # Pipeline for pulling text stats from post's body + ( + "body_stats", + Pipeline( + [ + ( + "stats", + text_stats_transformer, + ), # returns a list of dicts + ( + "vect", + DictVectorizer(), + ), # list of dicts -> feature matrix + ] + ), + 1, + ), + ], + # weight above ColumnTransformer features + transformer_weights={ + "subject": 0.8, + "body_bow": 0.5, + "body_stats": 1.0, + }, + ), + ), + # Use a SVC classifier on the combined features + ("svc", LinearSVC(dual=False)), + ], + verbose=True, +) ############################################################################## # Finally, we fit our pipeline on the 
training data and use it to predict @@ -157,6 +183,4 @@ def text_stats(posts): pipeline.fit(X_train, y_train) y_pred = pipeline.predict(X_test) -print('Classification report:\n\n{}'.format( - classification_report(y_test, y_pred)) -) +print("Classification report:\n\n{}".format(classification_report(y_test, y_pred))) diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index f51a55742926d..f6ea68105dc97 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -66,26 +66,28 @@ # Note that ``pclass`` could either be treated as a categorical or numeric # feature. -numeric_features = ['age', 'fare'] -numeric_transformer = Pipeline(steps=[ - ('imputer', SimpleImputer(strategy='median')), - ('scaler', StandardScaler())]) +numeric_features = ["age", "fare"] +numeric_transformer = Pipeline( + steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())] +) -categorical_features = ['embarked', 'sex', 'pclass'] -categorical_transformer = OneHotEncoder(handle_unknown='ignore') +categorical_features = ["embarked", "sex", "pclass"] +categorical_transformer = OneHotEncoder(handle_unknown="ignore") preprocessor = ColumnTransformer( transformers=[ - ('num', numeric_transformer, numeric_features), - ('cat', categorical_transformer, categorical_features)]) + ("num", numeric_transformer, numeric_features), + ("cat", categorical_transformer, categorical_features), + ] +) # Append classifier to preprocessing pipeline. # Now we have a full prediction pipeline. 
-clf = Pipeline(steps=[('preprocessor', preprocessor), - ('classifier', LogisticRegression())]) +clf = Pipeline( + steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())] +) -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, - random_state=0) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) clf.fit(X_train, y_train) print("model score: %.3f" % clf.score(X_test, y_test)) @@ -97,7 +99,7 @@ # representation of the estimator is displayed as follows: from sklearn import set_config -set_config(display='diagram') +set_config(display="diagram") clf # %% @@ -110,7 +112,7 @@ # First, let's only select a subset of columns to simplify our # example. -subset_feature = ['embarked', 'sex', 'pclass', 'age', 'fare'] +subset_feature = ["embarked", "sex", "pclass", "age", "fare"] X_train, X_test = X_train[subset_feature], X_test[subset_feature] # %% @@ -134,12 +136,15 @@ from sklearn.compose import make_column_selector as selector -preprocessor = ColumnTransformer(transformers=[ - ('num', numeric_transformer, selector(dtype_exclude="category")), - ('cat', categorical_transformer, selector(dtype_include="category")) -]) -clf = Pipeline(steps=[('preprocessor', preprocessor), - ('classifier', LogisticRegression())]) +preprocessor = ColumnTransformer( + transformers=[ + ("num", numeric_transformer, selector(dtype_exclude="category")), + ("cat", categorical_transformer, selector(dtype_include="category")), + ] +) +clf = Pipeline( + steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())] +) clf.fit(X_train, y_train) @@ -167,8 +172,8 @@ # :class:`~sklearn.model_selection.GridSearchCV`. 
param_grid = { - 'preprocessor__num__imputer__strategy': ['mean', 'median'], - 'classifier__C': [0.1, 1.0, 10, 100], + "preprocessor__num__imputer__strategy": ["mean", "median"], + "classifier__C": [0.1, 1.0, 10, 100], } grid_search = GridSearchCV(clf, param_grid, cv=10) @@ -193,15 +198,23 @@ cv_results = pd.DataFrame(grid_search.cv_results_) cv_results = cv_results.sort_values("mean_test_score", ascending=False) -cv_results[["mean_test_score", "std_test_score", - "param_preprocessor__num__imputer__strategy", - "param_classifier__C" - ]].head(5) +cv_results[ + [ + "mean_test_score", + "std_test_score", + "param_preprocessor__num__imputer__strategy", + "param_classifier__C", + ] +].head(5) # %% # The best hyper-parameters have be used to re-fit a final model on the full # training set. We can evaluate that final model on held out test data that was # not used for hyperparameter tuning. # -print(("best logistic regression from grid search: %.3f" - % grid_search.score(X_test, y_test))) +print( + ( + "best logistic regression from grid search: %.3f" + % grid_search.score(X_test, y_test) + ) +) diff --git a/examples/compose/plot_compare_reduction.py b/examples/compose/plot_compare_reduction.py index 4e7d2c5900420..b31ac5d150998 100755 --- a/examples/compose/plot_compare_reduction.py +++ b/examples/compose/plot_compare_reduction.py @@ -38,51 +38,52 @@ print(__doc__) -pipe = Pipeline([ - # the reduce_dim stage is populated by the param_grid - ('reduce_dim', 'passthrough'), - ('classify', LinearSVC(dual=False, max_iter=10000)) -]) +pipe = Pipeline( + [ + # the reduce_dim stage is populated by the param_grid + ("reduce_dim", "passthrough"), + ("classify", LinearSVC(dual=False, max_iter=10000)), + ] +) N_FEATURES_OPTIONS = [2, 4, 8] C_OPTIONS = [1, 10, 100, 1000] param_grid = [ { - 'reduce_dim': [PCA(iterated_power=7), NMF()], - 'reduce_dim__n_components': N_FEATURES_OPTIONS, - 'classify__C': C_OPTIONS + "reduce_dim": [PCA(iterated_power=7), NMF()], + 
"reduce_dim__n_components": N_FEATURES_OPTIONS, + "classify__C": C_OPTIONS, }, { - 'reduce_dim': [SelectKBest(chi2)], - 'reduce_dim__k': N_FEATURES_OPTIONS, - 'classify__C': C_OPTIONS + "reduce_dim": [SelectKBest(chi2)], + "reduce_dim__k": N_FEATURES_OPTIONS, + "classify__C": C_OPTIONS, }, ] -reducer_labels = ['PCA', 'NMF', 'KBest(chi2)'] +reducer_labels = ["PCA", "NMF", "KBest(chi2)"] grid = GridSearchCV(pipe, n_jobs=1, param_grid=param_grid) X, y = load_digits(return_X_y=True) grid.fit(X, y) -mean_scores = np.array(grid.cv_results_['mean_test_score']) +mean_scores = np.array(grid.cv_results_["mean_test_score"]) # scores are in the order of param_grid iteration, which is alphabetical mean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS)) # select score for best C mean_scores = mean_scores.max(axis=0) -bar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) * - (len(reducer_labels) + 1) + .5) +bar_offsets = np.arange(len(N_FEATURES_OPTIONS)) * (len(reducer_labels) + 1) + 0.5 plt.figure() -COLORS = 'bgrcmyk' +COLORS = "bgrcmyk" for i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores)): plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i]) plt.title("Comparing feature reduction techniques") -plt.xlabel('Reduced number of features') +plt.xlabel("Reduced number of features") plt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS) -plt.ylabel('Digit classification accuracy') +plt.ylabel("Digit classification accuracy") plt.ylim((0, 1)) -plt.legend(loc='upper left') +plt.legend(loc="upper left") plt.show() @@ -103,11 +104,12 @@ from shutil import rmtree # Create a temporary folder to store the transformers of the pipeline -location = 'cachedir' +location = "cachedir" memory = Memory(location=location, verbose=10) -cached_pipe = Pipeline([('reduce_dim', PCA()), - ('classify', LinearSVC(dual=False, max_iter=10000))], - memory=memory) +cached_pipe = Pipeline( + [("reduce_dim", PCA()), ("classify", 
LinearSVC(dual=False, max_iter=10000))], + memory=memory, +) # This time, a cached pipeline will be used within the grid search diff --git a/examples/compose/plot_digits_pipe.py b/examples/compose/plot_digits_pipe.py index 7c7ddf938a14f..8974ccc0e651e 100644 --- a/examples/compose/plot_digits_pipe.py +++ b/examples/compose/plot_digits_pipe.py @@ -36,14 +36,14 @@ pca = PCA() # set the tolerance to a large value to make the example faster logistic = LogisticRegression(max_iter=10000, tol=0.1) -pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)]) +pipe = Pipeline(steps=[("pca", pca), ("logistic", logistic)]) X_digits, y_digits = datasets.load_digits(return_X_y=True) # Parameters of pipelines can be set using ‘__’ separated parameter names: param_grid = { - 'pca__n_components': [5, 15, 30, 45, 64], - 'logistic__C': np.logspace(-4, 4, 4), + "pca__n_components": [5, 15, 30, 45, 64], + "logistic__C": np.logspace(-4, 4, 4), } search = GridSearchCV(pipe, param_grid, n_jobs=-1) search.fit(X_digits, y_digits) @@ -54,24 +54,30 @@ pca.fit(X_digits) fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6)) -ax0.plot(np.arange(1, pca.n_components_ + 1), - pca.explained_variance_ratio_, '+', linewidth=2) -ax0.set_ylabel('PCA explained variance ratio') - -ax0.axvline(search.best_estimator_.named_steps['pca'].n_components, - linestyle=':', label='n_components chosen') +ax0.plot( + np.arange(1, pca.n_components_ + 1), pca.explained_variance_ratio_, "+", linewidth=2 +) +ax0.set_ylabel("PCA explained variance ratio") + +ax0.axvline( + search.best_estimator_.named_steps["pca"].n_components, + linestyle=":", + label="n_components chosen", +) ax0.legend(prop=dict(size=12)) # For each number of components, find the best classifier results results = pd.DataFrame(search.cv_results_) -components_col = 'param_pca__n_components' +components_col = "param_pca__n_components" best_clfs = results.groupby(components_col).apply( - lambda g: g.nlargest(1, 'mean_test_score')) - 
-best_clfs.plot(x=components_col, y='mean_test_score', yerr='std_test_score', - legend=False, ax=ax1) -ax1.set_ylabel('Classification accuracy (val)') -ax1.set_xlabel('n_components') + lambda g: g.nlargest(1, "mean_test_score") +) + +best_clfs.plot( + x=components_col, y="mean_test_score", yerr="std_test_score", legend=False, ax=ax1 +) +ax1.set_ylabel("Classification accuracy (val)") +ax1.set_xlabel("n_components") plt.xlim(-1, 70) diff --git a/examples/compose/plot_feature_union.py b/examples/compose/plot_feature_union.py index c76548a2d376f..09e6e6919ab9a 100644 --- a/examples/compose/plot_feature_union.py +++ b/examples/compose/plot_feature_union.py @@ -50,9 +50,11 @@ pipeline = Pipeline([("features", combined_features), ("svm", svm)]) -param_grid = dict(features__pca__n_components=[1, 2, 3], - features__univ_select__k=[1, 2], - svm__C=[0.1, 1, 10]) +param_grid = dict( + features__pca__n_components=[1, 2, 3], + features__univ_select__k=[1, 2], + svm__C=[0.1, 1, 10], +) grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10) grid_search.fit(X, y) diff --git a/examples/compose/plot_transformed_target.py b/examples/compose/plot_transformed_target.py index eac10a926ee46..c7f64e265b4b9 100755 --- a/examples/compose/plot_transformed_target.py +++ b/examples/compose/plot_transformed_target.py @@ -32,10 +32,10 @@ ############################################################################## # `normed` is being deprecated in favor of `density` in histograms -if parse_version(matplotlib.__version__) >= parse_version('2.1'): - density_param = {'density': True} +if parse_version(matplotlib.__version__) >= parse_version("2.1"): + density_param = {"density": True} else: - density_param = {'normed': True} + density_param = {"normed": True} # %% # A synthetic random regression dataset is generated. 
The targets ``y`` are @@ -62,14 +62,14 @@ ax0.hist(y, bins=100, **density_param) ax0.set_xlim([0, 2000]) -ax0.set_ylabel('Probability') -ax0.set_xlabel('Target') -ax0.set_title('Target distribution') +ax0.set_ylabel("Probability") +ax0.set_xlabel("Target") +ax0.set_title("Target distribution") ax1.hist(y_trans, bins=100, **density_param) -ax1.set_ylabel('Probability') -ax1.set_xlabel('Target') -ax1.set_title('Transformed target distribution') +ax1.set_ylabel("Probability") +ax1.set_xlabel("Target") +ax1.set_title("Transformed target distribution") f.suptitle("Synthetic data", y=0.06, x=0.53) f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95]) @@ -90,28 +90,36 @@ y_pred = regr.predict(X_test) # Plot results ax0.scatter(y_test, y_pred) -ax0.plot([0, 2000], [0, 2000], '--k') -ax0.set_ylabel('Target predicted') -ax0.set_xlabel('True Target') -ax0.set_title('Ridge regression \n without target transformation') -ax0.text(100, 1750, r'$R^2$=%.2f, MAE=%.2f' % ( - r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred))) +ax0.plot([0, 2000], [0, 2000], "--k") +ax0.set_ylabel("Target predicted") +ax0.set_xlabel("True Target") +ax0.set_title("Ridge regression \n without target transformation") +ax0.text( + 100, + 1750, + r"$R^2$=%.2f, MAE=%.2f" + % (r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)), +) ax0.set_xlim([0, 2000]) ax0.set_ylim([0, 2000]) # Transform targets and use same linear model -regr_trans = TransformedTargetRegressor(regressor=RidgeCV(), - func=np.log1p, - inverse_func=np.expm1) +regr_trans = TransformedTargetRegressor( + regressor=RidgeCV(), func=np.log1p, inverse_func=np.expm1 +) regr_trans.fit(X_train, y_train) y_pred = regr_trans.predict(X_test) ax1.scatter(y_test, y_pred) -ax1.plot([0, 2000], [0, 2000], '--k') -ax1.set_ylabel('Target predicted') -ax1.set_xlabel('True Target') -ax1.set_title('Ridge regression \n with target transformation') -ax1.text(100, 1750, r'$R^2$=%.2f, MAE=%.2f' % ( - r2_score(y_test, y_pred), 
median_absolute_error(y_test, y_pred))) +ax1.plot([0, 2000], [0, 2000], "--k") +ax1.set_ylabel("Target predicted") +ax1.set_xlabel("True Target") +ax1.set_title("Ridge regression \n with target transformation") +ax1.text( + 100, + 1750, + r"$R^2$=%.2f, MAE=%.2f" + % (r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)), +) ax1.set_xlim([0, 2000]) ax1.set_ylim([0, 2000]) @@ -133,12 +141,11 @@ # Keep only numeric columns X = ames.data.select_dtypes(np.number) # Remove columns with NaN or Inf values -X = X.drop(columns=['LotFrontage', 'GarageYrBlt', 'MasVnrArea']) +X = X.drop(columns=["LotFrontage", "GarageYrBlt", "MasVnrArea"]) y = ames.target -y_trans = quantile_transform(y.to_frame(), - n_quantiles=900, - output_distribution='normal', - copy=True).squeeze() +y_trans = quantile_transform( + y.to_frame(), n_quantiles=900, output_distribution="normal", copy=True +).squeeze() # %% # A :class:`~sklearn.preprocessing.QuantileTransformer` is used to normalize # the target distribution before applying a @@ -147,15 +154,15 @@ f, (ax0, ax1) = plt.subplots(1, 2) ax0.hist(y, bins=100, **density_param) -ax0.set_ylabel('Probability') -ax0.set_xlabel('Target') -ax0.text(s='Target distribution', x=1.2e5, y=9.8e-6, fontsize=12) +ax0.set_ylabel("Probability") +ax0.set_xlabel("Target") +ax0.text(s="Target distribution", x=1.2e5, y=9.8e-6, fontsize=12) ax0.ticklabel_format(axis="both", style="sci", scilimits=(0, 0)) ax1.hist(y_trans, bins=100, **density_param) -ax1.set_ylabel('Probability') -ax1.set_xlabel('Target') -ax1.text(s='Transformed target distribution', x=-6.8, y=0.479, fontsize=12) +ax1.set_ylabel("Probability") +ax1.set_xlabel("Target") +ax1.text(s="Transformed target distribution", x=-6.8, y=0.479, fontsize=12) f.suptitle("Ames housing data: selling price", y=0.04) f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95]) @@ -171,51 +178,69 @@ # target. With target transformation, the shape is more linear indicating # better model fit. 
-f, (ax0, ax1) = plt.subplots(2, 2, sharey='row', figsize=(6.5, 8)) +f, (ax0, ax1) = plt.subplots(2, 2, sharey="row", figsize=(6.5, 8)) regr = RidgeCV() regr.fit(X_train, y_train) y_pred = regr.predict(X_test) ax0[0].scatter(y_pred, y_test, s=8) -ax0[0].plot([0, 7e5], [0, 7e5], '--k') -ax0[0].set_ylabel('True target') -ax0[0].set_xlabel('Predicted target') -ax0[0].text(s='Ridge regression \n without target transformation', x=-5e4, - y=8e5, fontsize=12, multialignment='center') -ax0[0].text(3e4, 64e4, r'$R^2$=%.2f, MAE=%.2f' % ( - r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred))) +ax0[0].plot([0, 7e5], [0, 7e5], "--k") +ax0[0].set_ylabel("True target") +ax0[0].set_xlabel("Predicted target") +ax0[0].text( + s="Ridge regression \n without target transformation", + x=-5e4, + y=8e5, + fontsize=12, + multialignment="center", +) +ax0[0].text( + 3e4, + 64e4, + r"$R^2$=%.2f, MAE=%.2f" + % (r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)), +) ax0[0].set_xlim([0, 7e5]) ax0[0].set_ylim([0, 7e5]) ax0[0].ticklabel_format(axis="both", style="sci", scilimits=(0, 0)) ax1[0].scatter(y_pred, (y_pred - y_test), s=8) -ax1[0].set_ylabel('Residual') -ax1[0].set_xlabel('Predicted target') +ax1[0].set_ylabel("Residual") +ax1[0].set_xlabel("Predicted target") ax1[0].ticklabel_format(axis="both", style="sci", scilimits=(0, 0)) regr_trans = TransformedTargetRegressor( regressor=RidgeCV(), - transformer=QuantileTransformer(n_quantiles=900, - output_distribution='normal')) + transformer=QuantileTransformer(n_quantiles=900, output_distribution="normal"), +) regr_trans.fit(X_train, y_train) y_pred = regr_trans.predict(X_test) ax0[1].scatter(y_pred, y_test, s=8) -ax0[1].plot([0, 7e5], [0, 7e5], '--k') -ax0[1].set_ylabel('True target') -ax0[1].set_xlabel('Predicted target') -ax0[1].text(s='Ridge regression \n with target transformation', x=-5e4, - y=8e5, fontsize=12, multialignment='center') -ax0[1].text(3e4, 64e4, r'$R^2$=%.2f, MAE=%.2f' % ( - r2_score(y_test, 
y_pred), median_absolute_error(y_test, y_pred))) +ax0[1].plot([0, 7e5], [0, 7e5], "--k") +ax0[1].set_ylabel("True target") +ax0[1].set_xlabel("Predicted target") +ax0[1].text( + s="Ridge regression \n with target transformation", + x=-5e4, + y=8e5, + fontsize=12, + multialignment="center", +) +ax0[1].text( + 3e4, + 64e4, + r"$R^2$=%.2f, MAE=%.2f" + % (r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)), +) ax0[1].set_xlim([0, 7e5]) ax0[1].set_ylim([0, 7e5]) ax0[1].ticklabel_format(axis="both", style="sci", scilimits=(0, 0)) ax1[1].scatter(y_pred, (y_pred - y_test), s=8) -ax1[1].set_ylabel('Residual') -ax1[1].set_xlabel('Predicted target') +ax1[1].set_ylabel("Residual") +ax1[1].set_xlabel("Predicted target") ax1[1].ticklabel_format(axis="both", style="sci", scilimits=(0, 0)) f.suptitle("Ames housing data: selling price", y=0.035) diff --git a/examples/covariance/plot_covariance_estimation.py b/examples/covariance/plot_covariance_estimation.py index c0540a75f9e67..eae43b3c7d4d3 100644 --- a/examples/covariance/plot_covariance_estimation.py +++ b/examples/covariance/plot_covariance_estimation.py @@ -47,8 +47,13 @@ import matplotlib.pyplot as plt from scipy import linalg -from sklearn.covariance import LedoitWolf, OAS, ShrunkCovariance, \ - log_likelihood, empirical_covariance +from sklearn.covariance import ( + LedoitWolf, + OAS, + ShrunkCovariance, + log_likelihood, + empirical_covariance, +) from sklearn.model_selection import GridSearchCV @@ -69,8 +74,9 @@ # spanning a range of possible shrinkage coefficient values shrinkages = np.logspace(-2, 0, 30) -negative_logliks = [-ShrunkCovariance(shrinkage=s).fit(X_train).score(X_test) - for s in shrinkages] +negative_logliks = [ + -ShrunkCovariance(shrinkage=s).fit(X_train).score(X_test) for s in shrinkages +] # under the ground-truth model, which we would not have access to in real # settings @@ -82,7 +88,7 @@ # Compare different approaches to setting the parameter # GridSearch for an optimal shrinkage 
coefficient -tuned_parameters = [{'shrinkage': shrinkages}] +tuned_parameters = [{"shrinkage": shrinkages}] cv = GridSearchCV(ShrunkCovariance(), tuned_parameters) cv.fit(X_train) @@ -98,31 +104,42 @@ # Plot results fig = plt.figure() plt.title("Regularized covariance: likelihood and shrinkage coefficient") -plt.xlabel('Regularization parameter: shrinkage coefficient') -plt.ylabel('Error: negative log-likelihood on test data') +plt.xlabel("Regularization parameter: shrinkage coefficient") +plt.ylabel("Error: negative log-likelihood on test data") # range shrinkage curve plt.loglog(shrinkages, negative_logliks, label="Negative log-likelihood") -plt.plot(plt.xlim(), 2 * [loglik_real], '--r', - label="Real covariance likelihood") +plt.plot(plt.xlim(), 2 * [loglik_real], "--r", label="Real covariance likelihood") # adjust view lik_max = np.amax(negative_logliks) lik_min = np.amin(negative_logliks) -ymin = lik_min - 6. * np.log((plt.ylim()[1] - plt.ylim()[0])) -ymax = lik_max + 10. * np.log(lik_max - lik_min) +ymin = lik_min - 6.0 * np.log((plt.ylim()[1] - plt.ylim()[0])) +ymax = lik_max + 10.0 * np.log(lik_max - lik_min) xmin = shrinkages[0] xmax = shrinkages[-1] # LW likelihood -plt.vlines(lw.shrinkage_, ymin, -loglik_lw, color='magenta', - linewidth=3, label='Ledoit-Wolf estimate') +plt.vlines( + lw.shrinkage_, + ymin, + -loglik_lw, + color="magenta", + linewidth=3, + label="Ledoit-Wolf estimate", +) # OAS likelihood -plt.vlines(oa.shrinkage_, ymin, -loglik_oa, color='purple', - linewidth=3, label='OAS estimate') +plt.vlines( + oa.shrinkage_, ymin, -loglik_oa, color="purple", linewidth=3, label="OAS estimate" +) # best CV estimator likelihood -plt.vlines(cv.best_estimator_.shrinkage, ymin, - -cv.best_estimator_.score(X_test), color='cyan', - linewidth=3, label='Cross-validation best estimate') +plt.vlines( + cv.best_estimator_.shrinkage, + ymin, + -cv.best_estimator_.score(X_test), + color="cyan", + linewidth=3, + label="Cross-validation best estimate", +) 
plt.ylim(ymin, ymax) plt.xlim(xmin, xmax) diff --git a/examples/covariance/plot_lw_vs_oas.py b/examples/covariance/plot_lw_vs_oas.py index f9cd11bbb81b4..abd451da3875a 100644 --- a/examples/covariance/plot_lw_vs_oas.py +++ b/examples/covariance/plot_lw_vs_oas.py @@ -44,8 +44,7 @@ oa_shrinkage = np.zeros((n_samples_range.size, repeat)) for i, n_samples in enumerate(n_samples_range): for j in range(repeat): - X = np.dot( - np.random.normal(size=(n_samples, n_features)), coloring_matrix.T) + X = np.dot(np.random.normal(size=(n_samples, n_features)), coloring_matrix.T) lw = LedoitWolf(store_precision=False, assume_centered=True) lw.fit(X) @@ -59,10 +58,22 @@ # plot MSE plt.subplot(2, 1, 1) -plt.errorbar(n_samples_range, lw_mse.mean(1), yerr=lw_mse.std(1), - label='Ledoit-Wolf', color='navy', lw=2) -plt.errorbar(n_samples_range, oa_mse.mean(1), yerr=oa_mse.std(1), - label='OAS', color='darkorange', lw=2) +plt.errorbar( + n_samples_range, + lw_mse.mean(1), + yerr=lw_mse.std(1), + label="Ledoit-Wolf", + color="navy", + lw=2, +) +plt.errorbar( + n_samples_range, + oa_mse.mean(1), + yerr=oa_mse.std(1), + label="OAS", + color="darkorange", + lw=2, +) plt.ylabel("Squared error") plt.legend(loc="upper right") plt.title("Comparison of covariance estimators") @@ -70,14 +81,26 @@ # plot shrinkage coefficient plt.subplot(2, 1, 2) -plt.errorbar(n_samples_range, lw_shrinkage.mean(1), yerr=lw_shrinkage.std(1), - label='Ledoit-Wolf', color='navy', lw=2) -plt.errorbar(n_samples_range, oa_shrinkage.mean(1), yerr=oa_shrinkage.std(1), - label='OAS', color='darkorange', lw=2) +plt.errorbar( + n_samples_range, + lw_shrinkage.mean(1), + yerr=lw_shrinkage.std(1), + label="Ledoit-Wolf", + color="navy", + lw=2, +) +plt.errorbar( + n_samples_range, + oa_shrinkage.mean(1), + yerr=oa_shrinkage.std(1), + label="OAS", + color="darkorange", + lw=2, +) plt.xlabel("n_samples") plt.ylabel("Shrinkage") plt.legend(loc="lower right") -plt.ylim(plt.ylim()[0], 1. + (plt.ylim()[1] - plt.ylim()[0]) / 10.) 
+plt.ylim(plt.ylim()[0], 1.0 + (plt.ylim()[1] - plt.ylim()[0]) / 10.0) plt.xlim(5, 31) plt.show() diff --git a/examples/covariance/plot_mahalanobis_distances.py b/examples/covariance/plot_mahalanobis_distances.py index ab2e9fe8471d5..b93d68a269706 100644 --- a/examples/covariance/plot_mahalanobis_distances.py +++ b/examples/covariance/plot_mahalanobis_distances.py @@ -84,11 +84,11 @@ # generate Gaussian data of shape (125, 2) gen_cov = np.eye(n_features) -gen_cov[0, 0] = 2. +gen_cov[0, 0] = 2.0 X = np.dot(np.random.randn(n_samples, n_features), gen_cov) # add some outliers outliers_cov = np.eye(n_features) -outliers_cov[np.arange(1, n_features), np.arange(1, n_features)] = 7. +outliers_cov[np.arange(1, n_features), np.arange(1, n_features)] = 7.0 X[-n_outliers:] = np.dot(np.random.randn(n_outliers, n_features), outliers_cov) # %% @@ -109,9 +109,11 @@ robust_cov = MinCovDet().fit(X) # fit a MLE estimator to data emp_cov = EmpiricalCovariance().fit(X) -print('Estimated covariance matrix:\n' - 'MCD (Robust):\n{}\n' - 'MLE:\n{}'.format(robust_cov.covariance_, emp_cov.covariance_)) +print( + "Estimated covariance matrix:\nMCD (Robust):\n{}\nMLE:\n{}".format( + robust_cov.covariance_, emp_cov.covariance_ + ) +) # %% # To better visualize the difference, we plot contours of the @@ -122,33 +124,44 @@ fig, ax = plt.subplots(figsize=(10, 5)) # Plot data set -inlier_plot = ax.scatter(X[:, 0], X[:, 1], - color='black', label='inliers') -outlier_plot = ax.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:], - color='red', label='outliers') -ax.set_xlim(ax.get_xlim()[0], 10.) 
+inlier_plot = ax.scatter(X[:, 0], X[:, 1], color="black", label="inliers") +outlier_plot = ax.scatter( + X[:, 0][-n_outliers:], X[:, 1][-n_outliers:], color="red", label="outliers" +) +ax.set_xlim(ax.get_xlim()[0], 10.0) ax.set_title("Mahalanobis distances of a contaminated data set") # Create meshgrid of feature 1 and feature 2 values -xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 100), - np.linspace(plt.ylim()[0], plt.ylim()[1], 100)) +xx, yy = np.meshgrid( + np.linspace(plt.xlim()[0], plt.xlim()[1], 100), + np.linspace(plt.ylim()[0], plt.ylim()[1], 100), +) zz = np.c_[xx.ravel(), yy.ravel()] # Calculate the MLE based Mahalanobis distances of the meshgrid mahal_emp_cov = emp_cov.mahalanobis(zz) mahal_emp_cov = mahal_emp_cov.reshape(xx.shape) -emp_cov_contour = plt.contour(xx, yy, np.sqrt(mahal_emp_cov), - cmap=plt.cm.PuBu_r, linestyles='dashed') +emp_cov_contour = plt.contour( + xx, yy, np.sqrt(mahal_emp_cov), cmap=plt.cm.PuBu_r, linestyles="dashed" +) # Calculate the MCD based Mahalanobis distances mahal_robust_cov = robust_cov.mahalanobis(zz) mahal_robust_cov = mahal_robust_cov.reshape(xx.shape) -robust_contour = ax.contour(xx, yy, np.sqrt(mahal_robust_cov), - cmap=plt.cm.YlOrBr_r, linestyles='dotted') +robust_contour = ax.contour( + xx, yy, np.sqrt(mahal_robust_cov), cmap=plt.cm.YlOrBr_r, linestyles="dotted" +) # Add legend -ax.legend([emp_cov_contour.collections[1], robust_contour.collections[1], - inlier_plot, outlier_plot], - ['MLE dist', 'MCD dist', 'inliers', 'outliers'], - loc="upper right", borderaxespad=0) +ax.legend( + [ + emp_cov_contour.collections[1], + robust_contour.collections[1], + inlier_plot, + outlier_plot, + ], + ["MLE dist", "MCD dist", "inliers", "outliers"], + loc="upper right", + borderaxespad=0, +) plt.show() @@ -161,32 +174,37 @@ # distribution of inlier samples for robust MCD based Mahalanobis distances. 
fig, (ax1, ax2) = plt.subplots(1, 2) -plt.subplots_adjust(wspace=.6) +plt.subplots_adjust(wspace=0.6) # Calculate cubic root of MLE Mahalanobis distances for samples emp_mahal = emp_cov.mahalanobis(X - np.mean(X, 0)) ** (0.33) # Plot boxplots -ax1.boxplot([emp_mahal[:-n_outliers], emp_mahal[-n_outliers:]], widths=.25) +ax1.boxplot([emp_mahal[:-n_outliers], emp_mahal[-n_outliers:]], widths=0.25) # Plot individual samples -ax1.plot(np.full(n_samples - n_outliers, 1.26), emp_mahal[:-n_outliers], - '+k', markeredgewidth=1) -ax1.plot(np.full(n_outliers, 2.26), emp_mahal[-n_outliers:], - '+k', markeredgewidth=1) -ax1.axes.set_xticklabels(('inliers', 'outliers'), size=15) +ax1.plot( + np.full(n_samples - n_outliers, 1.26), + emp_mahal[:-n_outliers], + "+k", + markeredgewidth=1, +) +ax1.plot(np.full(n_outliers, 2.26), emp_mahal[-n_outliers:], "+k", markeredgewidth=1) +ax1.axes.set_xticklabels(("inliers", "outliers"), size=15) ax1.set_ylabel(r"$\sqrt[3]{\rm{(Mahal. dist.)}}$", size=16) ax1.set_title("Using non-robust estimates\n(Maximum Likelihood)") # Calculate cubic root of MCD Mahalanobis distances for samples robust_mahal = robust_cov.mahalanobis(X - robust_cov.location_) ** (0.33) # Plot boxplots -ax2.boxplot([robust_mahal[:-n_outliers], robust_mahal[-n_outliers:]], - widths=.25) +ax2.boxplot([robust_mahal[:-n_outliers], robust_mahal[-n_outliers:]], widths=0.25) # Plot individual samples -ax2.plot(np.full(n_samples - n_outliers, 1.26), robust_mahal[:-n_outliers], - '+k', markeredgewidth=1) -ax2.plot(np.full(n_outliers, 2.26), robust_mahal[-n_outliers:], - '+k', markeredgewidth=1) -ax2.axes.set_xticklabels(('inliers', 'outliers'), size=15) +ax2.plot( + np.full(n_samples - n_outliers, 1.26), + robust_mahal[:-n_outliers], + "+k", + markeredgewidth=1, +) +ax2.plot(np.full(n_outliers, 2.26), robust_mahal[-n_outliers:], "+k", markeredgewidth=1) +ax2.axes.set_xticklabels(("inliers", "outliers"), size=15) ax2.set_ylabel(r"$\sqrt[3]{\rm{(Mahal. 
dist.)}}$", size=16) ax2.set_title("Using robust estimates\n(Minimum Covariance Determinant)") diff --git a/examples/covariance/plot_robust_vs_empirical_covariance.py b/examples/covariance/plot_robust_vs_empirical_covariance.py index 8ca0a49d66642..a4fb6ef8941ed 100644 --- a/examples/covariance/plot_robust_vs_empirical_covariance.py +++ b/examples/covariance/plot_robust_vs_empirical_covariance.py @@ -66,8 +66,11 @@ repeat = 10 range_n_outliers = np.concatenate( - (np.linspace(0, n_samples / 8, 5), - np.linspace(n_samples / 8, n_samples / 2, 5)[1:-1])).astype(int) + ( + np.linspace(0, n_samples / 8, 5), + np.linspace(n_samples / 8, n_samples / 2, 5)[1:-1], + ) +).astype(int) # definition of arrays to store results err_loc_mcd = np.zeros((range_n_outliers.size, repeat)) @@ -87,8 +90,9 @@ X = rng.randn(n_samples, n_features) # add some outliers outliers_index = rng.permutation(n_samples)[:n_outliers] - outliers_offset = 10. * \ - (np.random.randint(2, size=(n_outliers, n_features)) - 0.5) + outliers_offset = 10.0 * ( + np.random.randint(2, size=(n_outliers, n_features)) - 0.5 + ) X[outliers_index] += outliers_offset inliers_mask = np.ones(n_samples).astype(bool) inliers_mask[outliers_index] = False @@ -102,8 +106,9 @@ # compare estimators learned from the full data set with true # parameters err_loc_emp_full[i, j] = np.sum(X.mean(0) ** 2) - err_cov_emp_full[i, j] = EmpiricalCovariance().fit(X).error_norm( - np.eye(n_features)) + err_cov_emp_full[i, j] = ( + EmpiricalCovariance().fit(X).error_norm(np.eye(n_features)) + ) # compare with an empirical covariance learned from a pure data set # (i.e. 
"perfect" mcd) @@ -117,34 +122,63 @@ font_prop = matplotlib.font_manager.FontProperties(size=11) plt.subplot(2, 1, 1) lw = 2 -plt.errorbar(range_n_outliers, err_loc_mcd.mean(1), - yerr=err_loc_mcd.std(1) / np.sqrt(repeat), - label="Robust location", lw=lw, color='m') -plt.errorbar(range_n_outliers, err_loc_emp_full.mean(1), - yerr=err_loc_emp_full.std(1) / np.sqrt(repeat), - label="Full data set mean", lw=lw, color='green') -plt.errorbar(range_n_outliers, err_loc_emp_pure.mean(1), - yerr=err_loc_emp_pure.std(1) / np.sqrt(repeat), - label="Pure data set mean", lw=lw, color='black') +plt.errorbar( + range_n_outliers, + err_loc_mcd.mean(1), + yerr=err_loc_mcd.std(1) / np.sqrt(repeat), + label="Robust location", + lw=lw, + color="m", +) +plt.errorbar( + range_n_outliers, + err_loc_emp_full.mean(1), + yerr=err_loc_emp_full.std(1) / np.sqrt(repeat), + label="Full data set mean", + lw=lw, + color="green", +) +plt.errorbar( + range_n_outliers, + err_loc_emp_pure.mean(1), + yerr=err_loc_emp_pure.std(1) / np.sqrt(repeat), + label="Pure data set mean", + lw=lw, + color="black", +) plt.title("Influence of outliers on the location estimation") plt.ylabel(r"Error ($||\mu - \hat{\mu}||_2^2$)") plt.legend(loc="upper left", prop=font_prop) plt.subplot(2, 1, 2) x_size = range_n_outliers.size -plt.errorbar(range_n_outliers, err_cov_mcd.mean(1), - yerr=err_cov_mcd.std(1), - label="Robust covariance (mcd)", color='m') -plt.errorbar(range_n_outliers[:(x_size // 5 + 1)], - err_cov_emp_full.mean(1)[:(x_size // 5 + 1)], - yerr=err_cov_emp_full.std(1)[:(x_size // 5 + 1)], - label="Full data set empirical covariance", color='green') -plt.plot(range_n_outliers[(x_size // 5):(x_size // 2 - 1)], - err_cov_emp_full.mean(1)[(x_size // 5):(x_size // 2 - 1)], - color='green', ls='--') -plt.errorbar(range_n_outliers, err_cov_emp_pure.mean(1), - yerr=err_cov_emp_pure.std(1), - label="Pure data set empirical covariance", color='black') +plt.errorbar( + range_n_outliers, + err_cov_mcd.mean(1), + 
yerr=err_cov_mcd.std(1), + label="Robust covariance (mcd)", + color="m", +) +plt.errorbar( + range_n_outliers[: (x_size // 5 + 1)], + err_cov_emp_full.mean(1)[: (x_size // 5 + 1)], + yerr=err_cov_emp_full.std(1)[: (x_size // 5 + 1)], + label="Full data set empirical covariance", + color="green", +) +plt.plot( + range_n_outliers[(x_size // 5) : (x_size // 2 - 1)], + err_cov_emp_full.mean(1)[(x_size // 5) : (x_size // 2 - 1)], + color="green", + ls="--", +) +plt.errorbar( + range_n_outliers, + err_cov_emp_pure.mean(1), + yerr=err_cov_emp_pure.std(1), + label="Pure data set empirical covariance", + color="black", +) plt.title("Influence of outliers on the covariance estimation") plt.xlabel("Amount of contamination (%)") plt.ylabel("RMSE") diff --git a/examples/covariance/plot_sparse_cov.py b/examples/covariance/plot_sparse_cov.py index c595e7d3d9661..ddb8de244a3b9 100644 --- a/examples/covariance/plot_sparse_cov.py +++ b/examples/covariance/plot_sparse_cov.py @@ -65,10 +65,9 @@ n_features = 20 prng = np.random.RandomState(1) -prec = make_sparse_spd_matrix(n_features, alpha=.98, - smallest_coef=.4, - largest_coef=.7, - random_state=prng) +prec = make_sparse_spd_matrix( + n_features, alpha=0.98, smallest_coef=0.4, largest_coef=0.7, random_state=prng +) cov = linalg.inv(prec) d = np.sqrt(np.diag(cov)) cov /= d @@ -97,42 +96,55 @@ plt.subplots_adjust(left=0.02, right=0.98) # plot the covariances -covs = [('Empirical', emp_cov), ('Ledoit-Wolf', lw_cov_), - ('GraphicalLassoCV', cov_), ('True', cov)] +covs = [ + ("Empirical", emp_cov), + ("Ledoit-Wolf", lw_cov_), + ("GraphicalLassoCV", cov_), + ("True", cov), +] vmax = cov_.max() for i, (name, this_cov) in enumerate(covs): plt.subplot(2, 4, i + 1) - plt.imshow(this_cov, interpolation='nearest', vmin=-vmax, vmax=vmax, - cmap=plt.cm.RdBu_r) + plt.imshow( + this_cov, interpolation="nearest", vmin=-vmax, vmax=vmax, cmap=plt.cm.RdBu_r + ) plt.xticks(()) plt.yticks(()) - plt.title('%s covariance' % name) + plt.title("%s 
covariance" % name) # plot the precisions -precs = [('Empirical', linalg.inv(emp_cov)), ('Ledoit-Wolf', lw_prec_), - ('GraphicalLasso', prec_), ('True', prec)] -vmax = .9 * prec_.max() +precs = [ + ("Empirical", linalg.inv(emp_cov)), + ("Ledoit-Wolf", lw_prec_), + ("GraphicalLasso", prec_), + ("True", prec), +] +vmax = 0.9 * prec_.max() for i, (name, this_prec) in enumerate(precs): ax = plt.subplot(2, 4, i + 5) - plt.imshow(np.ma.masked_equal(this_prec, 0), - interpolation='nearest', vmin=-vmax, vmax=vmax, - cmap=plt.cm.RdBu_r) + plt.imshow( + np.ma.masked_equal(this_prec, 0), + interpolation="nearest", + vmin=-vmax, + vmax=vmax, + cmap=plt.cm.RdBu_r, + ) plt.xticks(()) plt.yticks(()) - plt.title('%s precision' % name) - if hasattr(ax, 'set_facecolor'): - ax.set_facecolor('.7') + plt.title("%s precision" % name) + if hasattr(ax, "set_facecolor"): + ax.set_facecolor(".7") else: - ax.set_axis_bgcolor('.7') + ax.set_axis_bgcolor(".7") # plot the model selection metric plt.figure(figsize=(4, 3)) -plt.axes([.2, .15, .75, .7]) -plt.plot(model.cv_results_["alphas"], model.cv_results_["mean_score"], 'o-') -plt.axvline(model.alpha_, color='.5') -plt.title('Model selection') -plt.ylabel('Cross-validation score') -plt.xlabel('alpha') +plt.axes([0.2, 0.15, 0.75, 0.7]) +plt.plot(model.cv_results_["alphas"], model.cv_results_["mean_score"], "o-") +plt.axvline(model.alpha_, color=".5") +plt.title("Model selection") +plt.ylabel("Cross-validation score") +plt.xlabel("alpha") plt.show() diff --git a/examples/cross_decomposition/plot_compare_cross_decomposition.py b/examples/cross_decomposition/plot_compare_cross_decomposition.py index 2f1f1d21f30d5..21b735e401711 100644 --- a/examples/cross_decomposition/plot_compare_cross_decomposition.py +++ b/examples/cross_decomposition/plot_compare_cross_decomposition.py @@ -36,10 +36,10 @@ X = latents + np.random.normal(size=4 * n).reshape((n, 4)) Y = latents + np.random.normal(size=4 * n).reshape((n, 4)) -X_train = X[:n // 2] -Y_train = Y[:n 
// 2] -X_test = X[n // 2:] -Y_test = Y[n // 2:] +X_train = X[: n // 2] +Y_train = Y[: n // 2] +X_test = X[n // 2 :] +Y_test = Y[n // 2 :] print("Corr(X)") print(np.round(np.corrcoef(X.T), 2)) @@ -61,54 +61,54 @@ # 1) On diagonal plot X vs Y scores on each components plt.figure(figsize=(12, 8)) plt.subplot(221) -plt.scatter(X_train_r[:, 0], Y_train_r[:, 0], label="train", - marker="o", s=25) -plt.scatter(X_test_r[:, 0], Y_test_r[:, 0], label="test", - marker="o", s=25) +plt.scatter(X_train_r[:, 0], Y_train_r[:, 0], label="train", marker="o", s=25) +plt.scatter(X_test_r[:, 0], Y_test_r[:, 0], label="test", marker="o", s=25) plt.xlabel("x scores") plt.ylabel("y scores") -plt.title('Comp. 1: X vs Y (test corr = %.2f)' % - np.corrcoef(X_test_r[:, 0], Y_test_r[:, 0])[0, 1]) +plt.title( + "Comp. 1: X vs Y (test corr = %.2f)" + % np.corrcoef(X_test_r[:, 0], Y_test_r[:, 0])[0, 1] +) plt.xticks(()) plt.yticks(()) plt.legend(loc="best") plt.subplot(224) -plt.scatter(X_train_r[:, 1], Y_train_r[:, 1], label="train", - marker="o", s=25) -plt.scatter(X_test_r[:, 1], Y_test_r[:, 1], label="test", - marker="o", s=25) +plt.scatter(X_train_r[:, 1], Y_train_r[:, 1], label="train", marker="o", s=25) +plt.scatter(X_test_r[:, 1], Y_test_r[:, 1], label="test", marker="o", s=25) plt.xlabel("x scores") plt.ylabel("y scores") -plt.title('Comp. 2: X vs Y (test corr = %.2f)' % - np.corrcoef(X_test_r[:, 1], Y_test_r[:, 1])[0, 1]) +plt.title( + "Comp. 2: X vs Y (test corr = %.2f)" + % np.corrcoef(X_test_r[:, 1], Y_test_r[:, 1])[0, 1] +) plt.xticks(()) plt.yticks(()) plt.legend(loc="best") # 2) Off diagonal plot components 1 vs 2 for X and Y plt.subplot(222) -plt.scatter(X_train_r[:, 0], X_train_r[:, 1], label="train", - marker="*", s=50) -plt.scatter(X_test_r[:, 0], X_test_r[:, 1], label="test", - marker="*", s=50) +plt.scatter(X_train_r[:, 0], X_train_r[:, 1], label="train", marker="*", s=50) +plt.scatter(X_test_r[:, 0], X_test_r[:, 1], label="test", marker="*", s=50) plt.xlabel("X comp. 
1") plt.ylabel("X comp. 2") -plt.title('X comp. 1 vs X comp. 2 (test corr = %.2f)' - % np.corrcoef(X_test_r[:, 0], X_test_r[:, 1])[0, 1]) +plt.title( + "X comp. 1 vs X comp. 2 (test corr = %.2f)" + % np.corrcoef(X_test_r[:, 0], X_test_r[:, 1])[0, 1] +) plt.legend(loc="best") plt.xticks(()) plt.yticks(()) plt.subplot(223) -plt.scatter(Y_train_r[:, 0], Y_train_r[:, 1], label="train", - marker="*", s=50) -plt.scatter(Y_test_r[:, 0], Y_test_r[:, 1], label="test", - marker="*", s=50) +plt.scatter(Y_train_r[:, 0], Y_train_r[:, 1], label="train", marker="*", s=50) +plt.scatter(Y_test_r[:, 0], Y_test_r[:, 1], label="test", marker="*", s=50) plt.xlabel("Y comp. 1") plt.ylabel("Y comp. 2") -plt.title('Y comp. 1 vs Y comp. 2 , (test corr = %.2f)' - % np.corrcoef(Y_test_r[:, 0], Y_test_r[:, 1])[0, 1]) +plt.title( + "Y comp. 1 vs Y comp. 2 , (test corr = %.2f)" + % np.corrcoef(Y_test_r[:, 0], Y_test_r[:, 1])[0, 1] +) plt.legend(loc="best") plt.xticks(()) plt.yticks(()) diff --git a/examples/cross_decomposition/plot_pcr_vs_pls.py b/examples/cross_decomposition/plot_pcr_vs_pls.py index cc22f3bd0ebc6..09633e988c1f1 100644 --- a/examples/cross_decomposition/plot_pcr_vs_pls.py +++ b/examples/cross_decomposition/plot_pcr_vs_pls.py @@ -48,20 +48,27 @@ rng = np.random.RandomState(0) n_samples = 500 -cov = [[3, 3], - [3, 4]] +cov = [[3, 3], [3, 4]] X = rng.multivariate_normal(mean=[0, 0], cov=cov, size=n_samples) pca = PCA(n_components=2).fit(X) -plt.scatter(X[:, 0], X[:, 1], alpha=.3, label='samples') +plt.scatter(X[:, 0], X[:, 1], alpha=0.3, label="samples") for i, (comp, var) in enumerate(zip(pca.components_, pca.explained_variance_)): comp = comp * var # scale component by its variance explanation power - plt.plot([0, comp[0]], [0, comp[1]], label=f"Component {i}", linewidth=5, - color=f"C{i + 2}") -plt.gca().set(aspect='equal', - title="2-dimensional dataset with principal components", - xlabel='first feature', ylabel='second feature') + plt.plot( + [0, comp[0]], + [0, comp[1]], + 
label=f"Component {i}", + linewidth=5, + color=f"C{i + 2}", + ) +plt.gca().set( + aspect="equal", + title="2-dimensional dataset with principal components", + xlabel="first feature", + ylabel="second feature", +) plt.legend() plt.show() @@ -74,10 +81,10 @@ fig, axes = plt.subplots(1, 2, figsize=(10, 3)) -axes[0].scatter(X.dot(pca.components_[0]), y, alpha=.3) -axes[0].set(xlabel='Projected data onto first PCA component', ylabel='y') -axes[1].scatter(X.dot(pca.components_[1]), y, alpha=.3) -axes[1].set(xlabel='Projected data onto second PCA component', ylabel='y') +axes[0].scatter(X.dot(pca.components_[0]), y, alpha=0.3) +axes[0].set(xlabel="Projected data onto first PCA component", ylabel="y") +axes[1].scatter(X.dot(pca.components_[1]), y, alpha=0.3) +axes[1].set(xlabel="Projected data onto second PCA component", ylabel="y") plt.tight_layout() plt.show() @@ -104,23 +111,25 @@ pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression()) pcr.fit(X_train, y_train) -pca = pcr.named_steps['pca'] # retrieve the PCA step of the pipeline +pca = pcr.named_steps["pca"] # retrieve the PCA step of the pipeline pls = PLSRegression(n_components=1) pls.fit(X_train, y_train) fig, axes = plt.subplots(1, 2, figsize=(10, 3)) -axes[0].scatter(pca.transform(X_test), y_test, alpha=.3, label='ground truth') -axes[0].scatter(pca.transform(X_test), pcr.predict(X_test), alpha=.3, - label='predictions') -axes[0].set(xlabel='Projected data onto first PCA component', - ylabel='y', title='PCR / PCA') +axes[0].scatter(pca.transform(X_test), y_test, alpha=0.3, label="ground truth") +axes[0].scatter( + pca.transform(X_test), pcr.predict(X_test), alpha=0.3, label="predictions" +) +axes[0].set( + xlabel="Projected data onto first PCA component", ylabel="y", title="PCR / PCA" +) axes[0].legend() -axes[1].scatter(pls.transform(X_test), y_test, alpha=.3, label='ground truth') -axes[1].scatter(pls.transform(X_test), pls.predict(X_test), alpha=.3, - label='predictions') 
-axes[1].set(xlabel='Projected data onto first PLS component', - ylabel='y', title='PLS') +axes[1].scatter(pls.transform(X_test), y_test, alpha=0.3, label="ground truth") +axes[1].scatter( + pls.transform(X_test), pls.predict(X_test), alpha=0.3, label="predictions" +) +axes[1].set(xlabel="Projected data onto first PLS component", ylabel="y", title="PLS") axes[1].legend() plt.tight_layout() plt.show() diff --git a/examples/datasets/plot_digits_last_image.py b/examples/datasets/plot_digits_last_image.py index 27314b6c9dcdb..98620d98702c7 100644 --- a/examples/datasets/plot_digits_last_image.py +++ b/examples/datasets/plot_digits_last_image.py @@ -31,5 +31,5 @@ # Display the first digit plt.figure(1, figsize=(3, 3)) -plt.imshow(digits.images[-1], cmap=plt.cm.gray_r, interpolation='nearest') +plt.imshow(digits.images[-1], cmap=plt.cm.gray_r, interpolation="nearest") plt.show() diff --git a/examples/datasets/plot_iris_dataset.py b/examples/datasets/plot_iris_dataset.py index d6e23253aa53e..c37057d230c1d 100644 --- a/examples/datasets/plot_iris_dataset.py +++ b/examples/datasets/plot_iris_dataset.py @@ -33,17 +33,16 @@ X = iris.data[:, :2] # we only take the first two features. 
y = iris.target -x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 -y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 +x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5 +y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5 plt.figure(2, figsize=(8, 6)) plt.clf() # Plot the training points -plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1, - edgecolor='k') -plt.xlabel('Sepal length') -plt.ylabel('Sepal width') +plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1, edgecolor="k") +plt.xlabel("Sepal length") +plt.ylabel("Sepal width") plt.xlim(x_min, x_max) plt.ylim(y_min, y_max) @@ -55,8 +54,15 @@ fig = plt.figure(1, figsize=(8, 6)) ax = Axes3D(fig, elev=-150, azim=110) X_reduced = PCA(n_components=3).fit_transform(iris.data) -ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=y, - cmap=plt.cm.Set1, edgecolor='k', s=40) +ax.scatter( + X_reduced[:, 0], + X_reduced[:, 1], + X_reduced[:, 2], + c=y, + cmap=plt.cm.Set1, + edgecolor="k", + s=40, +) ax.set_title("First three PCA directions") ax.set_xlabel("1st eigenvector") ax.w_xaxis.set_ticklabels([]) diff --git a/examples/datasets/plot_random_dataset.py b/examples/datasets/plot_random_dataset.py index 2f8d4be8ac383..7a94eaa5550f7 100644 --- a/examples/datasets/plot_random_dataset.py +++ b/examples/datasets/plot_random_dataset.py @@ -22,47 +22,42 @@ from sklearn.datasets import make_gaussian_quantiles plt.figure(figsize=(8, 8)) -plt.subplots_adjust(bottom=.05, top=.9, left=.05, right=.95) +plt.subplots_adjust(bottom=0.05, top=0.9, left=0.05, right=0.95) plt.subplot(321) -plt.title("One informative feature, one cluster per class", fontsize='small') -X1, Y1 = make_classification(n_features=2, n_redundant=0, n_informative=1, - n_clusters_per_class=1) -plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1, - s=25, edgecolor='k') +plt.title("One informative feature, one cluster per class", fontsize="small") +X1, Y1 = make_classification( + n_features=2, n_redundant=0, n_informative=1, 
n_clusters_per_class=1 +) +plt.scatter(X1[:, 0], X1[:, 1], marker="o", c=Y1, s=25, edgecolor="k") plt.subplot(322) -plt.title("Two informative features, one cluster per class", fontsize='small') -X1, Y1 = make_classification(n_features=2, n_redundant=0, n_informative=2, - n_clusters_per_class=1) -plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1, - s=25, edgecolor='k') +plt.title("Two informative features, one cluster per class", fontsize="small") +X1, Y1 = make_classification( + n_features=2, n_redundant=0, n_informative=2, n_clusters_per_class=1 +) +plt.scatter(X1[:, 0], X1[:, 1], marker="o", c=Y1, s=25, edgecolor="k") plt.subplot(323) -plt.title("Two informative features, two clusters per class", - fontsize='small') +plt.title("Two informative features, two clusters per class", fontsize="small") X2, Y2 = make_classification(n_features=2, n_redundant=0, n_informative=2) -plt.scatter(X2[:, 0], X2[:, 1], marker='o', c=Y2, - s=25, edgecolor='k') +plt.scatter(X2[:, 0], X2[:, 1], marker="o", c=Y2, s=25, edgecolor="k") plt.subplot(324) -plt.title("Multi-class, two informative features, one cluster", - fontsize='small') -X1, Y1 = make_classification(n_features=2, n_redundant=0, n_informative=2, - n_clusters_per_class=1, n_classes=3) -plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1, - s=25, edgecolor='k') +plt.title("Multi-class, two informative features, one cluster", fontsize="small") +X1, Y1 = make_classification( + n_features=2, n_redundant=0, n_informative=2, n_clusters_per_class=1, n_classes=3 +) +plt.scatter(X1[:, 0], X1[:, 1], marker="o", c=Y1, s=25, edgecolor="k") plt.subplot(325) -plt.title("Three blobs", fontsize='small') +plt.title("Three blobs", fontsize="small") X1, Y1 = make_blobs(n_features=2, centers=3) -plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1, - s=25, edgecolor='k') +plt.scatter(X1[:, 0], X1[:, 1], marker="o", c=Y1, s=25, edgecolor="k") plt.subplot(326) -plt.title("Gaussian divided into three quantiles", fontsize='small') +plt.title("Gaussian 
divided into three quantiles", fontsize="small") X1, Y1 = make_gaussian_quantiles(n_features=2, n_classes=3) -plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1, - s=25, edgecolor='k') +plt.scatter(X1[:, 0], X1[:, 1], marker="o", c=Y1, s=25, edgecolor="k") plt.show() diff --git a/examples/datasets/plot_random_multilabel_dataset.py b/examples/datasets/plot_random_multilabel_dataset.py index 5cb54689d64be..a7ceba36e30db 100644 --- a/examples/datasets/plot_random_multilabel_dataset.py +++ b/examples/datasets/plot_random_multilabel_dataset.py @@ -41,15 +41,18 @@ print(__doc__) -COLORS = np.array(['!', - '#FF3333', # red - '#0198E1', # blue - '#BF5FFF', # purple - '#FCD116', # yellow - '#FF7216', # orange - '#4DBD33', # green - '#87421F' # brown - ]) +COLORS = np.array( + [ + "!", + "#FF3333", # red + "#0198E1", # blue + "#BF5FFF", # purple + "#FCD116", # yellow + "#FF7216", # orange + "#4DBD33", # green + "#87421F", # brown + ] +) # Use same random seed for multiple calls to make_multilabel_classification to # ensure same distributions @@ -57,38 +60,48 @@ def plot_2d(ax, n_labels=1, n_classes=3, length=50): - X, Y, p_c, p_w_c = make_ml_clf(n_samples=150, n_features=2, - n_classes=n_classes, n_labels=n_labels, - length=length, allow_unlabeled=False, - return_distributions=True, - random_state=RANDOM_SEED) - - ax.scatter(X[:, 0], X[:, 1], color=COLORS.take((Y * [1, 2, 4] - ).sum(axis=1)), - marker='.') - ax.scatter(p_w_c[0] * length, p_w_c[1] * length, - marker='*', linewidth=.5, edgecolor='black', - s=20 + 1500 * p_c ** 2, - color=COLORS.take([1, 2, 4])) - ax.set_xlabel('Feature 0 count') + X, Y, p_c, p_w_c = make_ml_clf( + n_samples=150, + n_features=2, + n_classes=n_classes, + n_labels=n_labels, + length=length, + allow_unlabeled=False, + return_distributions=True, + random_state=RANDOM_SEED, + ) + + ax.scatter( + X[:, 0], X[:, 1], color=COLORS.take((Y * [1, 2, 4]).sum(axis=1)), marker="." 
+ ) + ax.scatter( + p_w_c[0] * length, + p_w_c[1] * length, + marker="*", + linewidth=0.5, + edgecolor="black", + s=20 + 1500 * p_c ** 2, + color=COLORS.take([1, 2, 4]), + ) + ax.set_xlabel("Feature 0 count") return p_c, p_w_c -_, (ax1, ax2) = plt.subplots(1, 2, sharex='row', sharey='row', figsize=(8, 4)) -plt.subplots_adjust(bottom=.15) +_, (ax1, ax2) = plt.subplots(1, 2, sharex="row", sharey="row", figsize=(8, 4)) +plt.subplots_adjust(bottom=0.15) p_c, p_w_c = plot_2d(ax1, n_labels=1) -ax1.set_title('n_labels=1, length=50') -ax1.set_ylabel('Feature 1 count') +ax1.set_title("n_labels=1, length=50") +ax1.set_ylabel("Feature 1 count") plot_2d(ax2, n_labels=3) -ax2.set_title('n_labels=3, length=50') +ax2.set_title("n_labels=3, length=50") ax2.set_xlim(left=0, auto=True) ax2.set_ylim(bottom=0, auto=True) plt.show() -print('The data was generated from (random_state=%d):' % RANDOM_SEED) -print('Class', 'P(C)', 'P(w0|C)', 'P(w1|C)', sep='\t') -for k, p, p_w in zip(['red', 'blue', 'yellow'], p_c, p_w_c.T): - print('%s\t%0.2f\t%0.2f\t%0.2f' % (k, p, p_w[0], p_w[1])) +print("The data was generated from (random_state=%d):" % RANDOM_SEED) +print("Class", "P(C)", "P(w0|C)", "P(w1|C)", sep="\t") +for k, p, p_w in zip(["red", "blue", "yellow"], p_c, p_w_c.T): + print("%s\t%0.2f\t%0.2f\t%0.2f" % (k, p, p_w[0], p_w[1])) diff --git a/examples/decomposition/plot_beta_divergence.py b/examples/decomposition/plot_beta_divergence.py index 8f39039446e88..41c908e273c72 100644 --- a/examples/decomposition/plot_beta_divergence.py +++ b/examples/decomposition/plot_beta_divergence.py @@ -15,8 +15,8 @@ x = np.linspace(0.001, 4, 1000) y = np.zeros(x.shape) -colors = 'mbgyr' -for j, beta in enumerate((0., 0.5, 1., 1.5, 2.)): +colors = "mbgyr" +for j, beta in enumerate((0.0, 0.5, 1.0, 1.5, 2.0)): for i, xi in enumerate(x): y[i] = _beta_divergence(1, xi, 1, beta) name = "beta = %1.1f" % beta diff --git a/examples/decomposition/plot_faces_decomposition.py 
b/examples/decomposition/plot_faces_decomposition.py index 84e6f923f0d3b..7c873e867aa8b 100644 --- a/examples/decomposition/plot_faces_decomposition.py +++ b/examples/decomposition/plot_faces_decomposition.py @@ -25,8 +25,7 @@ from sklearn import decomposition # Display progress logs on stdout -logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") n_row, n_col = 2, 3 n_components = n_row * n_col image_shape = (64, 64) @@ -34,8 +33,7 @@ # ############################################################################# # Load faces data -faces, _ = fetch_olivetti_faces(return_X_y=True, shuffle=True, - random_state=rng) +faces, _ = fetch_olivetti_faces(return_X_y=True, shuffle=True, random_state=rng) n_samples, n_features = faces.shape # global centering @@ -48,56 +46,78 @@ def plot_gallery(title, images, n_col=n_col, n_row=n_row, cmap=plt.cm.gray): - plt.figure(figsize=(2. * n_col, 2.26 * n_row)) + plt.figure(figsize=(2.0 * n_col, 2.26 * n_row)) plt.suptitle(title, size=16) for i, comp in enumerate(images): plt.subplot(n_row, n_col, i + 1) vmax = max(comp.max(), -comp.min()) - plt.imshow(comp.reshape(image_shape), cmap=cmap, - interpolation='nearest', - vmin=-vmax, vmax=vmax) + plt.imshow( + comp.reshape(image_shape), + cmap=cmap, + interpolation="nearest", + vmin=-vmax, + vmax=vmax, + ) plt.xticks(()) plt.yticks(()) - plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.) + plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.0) # ############################################################################# # List of the different estimators, whether to center and transpose the # problem, and whether the transformer uses the clustering API. 
estimators = [ - ('Eigenfaces - PCA using randomized SVD', - decomposition.PCA(n_components=n_components, svd_solver='randomized', - whiten=True), - True), - - ('Non-negative components - NMF', - decomposition.NMF(n_components=n_components, init='nndsvda', tol=5e-3), - False), - - ('Independent components - FastICA', - decomposition.FastICA(n_components=n_components, whiten=True), - True), - - ('Sparse comp. - MiniBatchSparsePCA', - decomposition.MiniBatchSparsePCA(n_components=n_components, alpha=0.8, - n_iter=100, batch_size=3, - random_state=rng), - True), - - ('MiniBatchDictionaryLearning', - decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1, - n_iter=50, batch_size=3, - random_state=rng), - True), - - ('Cluster centers - MiniBatchKMeans', - MiniBatchKMeans(n_clusters=n_components, tol=1e-3, batch_size=20, - max_iter=50, random_state=rng), - True), - - ('Factor Analysis components - FA', - decomposition.FactorAnalysis(n_components=n_components, max_iter=20), - True), + ( + "Eigenfaces - PCA using randomized SVD", + decomposition.PCA( + n_components=n_components, svd_solver="randomized", whiten=True + ), + True, + ), + ( + "Non-negative components - NMF", + decomposition.NMF(n_components=n_components, init="nndsvda", tol=5e-3), + False, + ), + ( + "Independent components - FastICA", + decomposition.FastICA(n_components=n_components, whiten=True), + True, + ), + ( + "Sparse comp. 
- MiniBatchSparsePCA", + decomposition.MiniBatchSparsePCA( + n_components=n_components, + alpha=0.8, + n_iter=100, + batch_size=3, + random_state=rng, + ), + True, + ), + ( + "MiniBatchDictionaryLearning", + decomposition.MiniBatchDictionaryLearning( + n_components=15, alpha=0.1, n_iter=50, batch_size=3, random_state=rng + ), + True, + ), + ( + "Cluster centers - MiniBatchKMeans", + MiniBatchKMeans( + n_clusters=n_components, + tol=1e-3, + batch_size=20, + max_iter=50, + random_state=rng, + ), + True, + ), + ( + "Factor Analysis components - FA", + decomposition.FactorAnalysis(n_components=n_components, max_iter=20), + True, + ), ] @@ -116,9 +136,9 @@ def plot_gallery(title, images, n_col=n_col, n_row=n_row, cmap=plt.cm.gray): if center: data = faces_centered estimator.fit(data) - train_time = (time() - t0) + train_time = time() - t0 print("done in %0.3fs" % train_time) - if hasattr(estimator, 'cluster_centers_'): + if hasattr(estimator, "cluster_centers_"): components_ = estimator.cluster_centers_ else: components_ = estimator.components_ @@ -128,53 +148,79 @@ def plot_gallery(title, images, n_col=n_col, n_row=n_row, cmap=plt.cm.gray): # via the PCA decomposition, also provides a scalar noise_variance_ # (the mean of pixelwise variance) that cannot be displayed as an image # so we skip it. 
- if (hasattr(estimator, 'noise_variance_') and - estimator.noise_variance_.ndim > 0): # Skip the Eigenfaces case - plot_gallery("Pixelwise variance", - estimator.noise_variance_.reshape(1, -1), n_col=1, - n_row=1) - plot_gallery('%s - Train time %.1fs' % (name, train_time), - components_[:n_components]) + if ( + hasattr(estimator, "noise_variance_") and estimator.noise_variance_.ndim > 0 + ): # Skip the Eigenfaces case + plot_gallery( + "Pixelwise variance", + estimator.noise_variance_.reshape(1, -1), + n_col=1, + n_row=1, + ) + plot_gallery( + "%s - Train time %.1fs" % (name, train_time), components_[:n_components] + ) plt.show() # ############################################################################# # Various positivity constraints applied to dictionary learning. estimators = [ - ('Dictionary learning', - decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1, - n_iter=50, batch_size=3, - random_state=rng), - True), - ('Dictionary learning - positive dictionary', - decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1, - n_iter=50, batch_size=3, - random_state=rng, - positive_dict=True), - True), - ('Dictionary learning - positive code', - decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1, - n_iter=50, batch_size=3, - fit_algorithm='cd', - random_state=rng, - positive_code=True), - True), - ('Dictionary learning - positive dictionary & code', - decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1, - n_iter=50, batch_size=3, - fit_algorithm='cd', - random_state=rng, - positive_dict=True, - positive_code=True), - True), + ( + "Dictionary learning", + decomposition.MiniBatchDictionaryLearning( + n_components=15, alpha=0.1, n_iter=50, batch_size=3, random_state=rng + ), + True, + ), + ( + "Dictionary learning - positive dictionary", + decomposition.MiniBatchDictionaryLearning( + n_components=15, + alpha=0.1, + n_iter=50, + batch_size=3, + random_state=rng, + positive_dict=True, + ), + True, + ), 
+ ( + "Dictionary learning - positive code", + decomposition.MiniBatchDictionaryLearning( + n_components=15, + alpha=0.1, + n_iter=50, + batch_size=3, + fit_algorithm="cd", + random_state=rng, + positive_code=True, + ), + True, + ), + ( + "Dictionary learning - positive dictionary & code", + decomposition.MiniBatchDictionaryLearning( + n_components=15, + alpha=0.1, + n_iter=50, + batch_size=3, + fit_algorithm="cd", + random_state=rng, + positive_dict=True, + positive_code=True, + ), + True, + ), ] # ############################################################################# # Plot a sample of the input data -plot_gallery("First centered Olivetti faces", faces_centered[:n_components], - cmap=plt.cm.RdBu) +plot_gallery( + "First centered Olivetti faces", faces_centered[:n_components], cmap=plt.cm.RdBu +) # ############################################################################# # Do the estimation and plot it @@ -186,7 +232,7 @@ def plot_gallery(title, images, n_col=n_col, n_row=n_row, cmap=plt.cm.gray): if center: data = faces_centered estimator.fit(data) - train_time = (time() - t0) + train_time = time() - t0 print("done in %0.3fs" % train_time) components_ = estimator.components_ plot_gallery(name, components_[:n_components], cmap=plt.cm.RdBu) diff --git a/examples/decomposition/plot_ica_blind_source_separation.py b/examples/decomposition/plot_ica_blind_source_separation.py index 92fda7c20adf2..bbb9ac1c897e6 100644 --- a/examples/decomposition/plot_ica_blind_source_separation.py +++ b/examples/decomposition/plot_ica_blind_source_separation.py @@ -57,11 +57,13 @@ plt.figure() models = [X, S, S_, H] -names = ['Observations (mixed signal)', - 'True Sources', - 'ICA recovered signals', - 'PCA recovered signals'] -colors = ['red', 'steelblue', 'orange'] +names = [ + "Observations (mixed signal)", + "True Sources", + "ICA recovered signals", + "PCA recovered signals", +] +colors = ["red", "steelblue", "orange"] for ii, (model, name) in enumerate(zip(models, 
names), 1): plt.subplot(4, 1, ii) diff --git a/examples/decomposition/plot_ica_vs_pca.py b/examples/decomposition/plot_ica_vs_pca.py index b4fa513dbc36e..769fe47a028f4 100644 --- a/examples/decomposition/plot_ica_vs_pca.py +++ b/examples/decomposition/plot_ica_vs_pca.py @@ -41,7 +41,7 @@ # Generate sample data rng = np.random.RandomState(42) S = rng.standard_t(1.5, size=(20000, 2)) -S[:, 0] *= 2. +S[:, 0] *= 2.0 # Mix data A = np.array([[1, 1], [0, 2]]) # Mixing matrix @@ -60,47 +60,57 @@ # ############################################################################# # Plot results + def plot_samples(S, axis_list=None): - plt.scatter(S[:, 0], S[:, 1], s=2, marker='o', zorder=10, - color='steelblue', alpha=0.5) + plt.scatter( + S[:, 0], S[:, 1], s=2, marker="o", zorder=10, color="steelblue", alpha=0.5 + ) if axis_list is not None: - colors = ['orange', 'red'] + colors = ["orange", "red"] for color, axis in zip(colors, axis_list): axis /= axis.std() x_axis, y_axis = axis # Trick to get legend to work plt.plot(0.1 * x_axis, 0.1 * y_axis, linewidth=2, color=color) - plt.quiver((0, 0), (0, 0), x_axis, y_axis, zorder=11, width=0.01, - scale=6, color=color) + plt.quiver( + (0, 0), + (0, 0), + x_axis, + y_axis, + zorder=11, + width=0.01, + scale=6, + color=color, + ) plt.hlines(0, -3, 3) plt.vlines(0, -3, 3) plt.xlim(-3, 3) plt.ylim(-3, 3) - plt.xlabel('x') - plt.ylabel('y') + plt.xlabel("x") + plt.ylabel("y") plt.figure() plt.subplot(2, 2, 1) plot_samples(S / S.std()) -plt.title('True Independent Sources') +plt.title("True Independent Sources") axis_list = [pca.components_.T, ica.mixing_] plt.subplot(2, 2, 2) plot_samples(X / np.std(X), axis_list=axis_list) -legend = plt.legend(['PCA', 'ICA'], loc='upper right') +legend = plt.legend(["PCA", "ICA"], loc="upper right") legend.set_zorder(100) -plt.title('Observations') +plt.title("Observations") plt.subplot(2, 2, 3) plot_samples(S_pca_ / np.std(S_pca_, axis=0)) -plt.title('PCA recovered signals') +plt.title("PCA recovered 
signals") plt.subplot(2, 2, 4) plot_samples(S_ica_ / np.std(S_ica_)) -plt.title('ICA recovered signals') +plt.title("ICA recovered signals") plt.subplots_adjust(0.09, 0.04, 0.94, 0.94, 0.26, 0.36) plt.show() diff --git a/examples/decomposition/plot_image_denoising.py b/examples/decomposition/plot_image_denoising.py index 0cf505b7548c5..df08e052d3ec1 100644 --- a/examples/decomposition/plot_image_denoising.py +++ b/examples/decomposition/plot_image_denoising.py @@ -46,13 +46,14 @@ try: # SciPy >= 0.16 have face in misc from scipy.misc import face + face = face(gray=True) except ImportError: face = sp.face(gray=True) # Convert from uint8 representation with values between 0 and 255 to # a floating point representation with values between 0 and 1. -face = face / 255. +face = face / 255.0 # downsample for higher speed face = face[::4, ::4] + face[1::4, ::4] + face[::4, 1::4] + face[1::4, 1::4] @@ -60,92 +61,92 @@ height, width = face.shape # Distort the right half of the image -print('Distorting image...') +print("Distorting image...") distorted = face.copy() -distorted[:, width // 2:] += 0.075 * np.random.randn(height, width // 2) +distorted[:, width // 2 :] += 0.075 * np.random.randn(height, width // 2) # Extract all reference patches from the left half of the image -print('Extracting reference patches...') +print("Extracting reference patches...") t0 = time() patch_size = (7, 7) -data = extract_patches_2d(distorted[:, :width // 2], patch_size) +data = extract_patches_2d(distorted[:, : width // 2], patch_size) data = data.reshape(data.shape[0], -1) data -= np.mean(data, axis=0) data /= np.std(data, axis=0) -print('done in %.2fs.' % (time() - t0)) +print("done in %.2fs." 
% (time() - t0)) # ############################################################################# # Learn the dictionary from reference patches -print('Learning the dictionary...') +print("Learning the dictionary...") t0 = time() dico = MiniBatchDictionaryLearning(n_components=100, alpha=1, n_iter=500) V = dico.fit(data).components_ dt = time() - t0 -print('done in %.2fs.' % dt) +print("done in %.2fs." % dt) plt.figure(figsize=(4.2, 4)) for i, comp in enumerate(V[:100]): plt.subplot(10, 10, i + 1) - plt.imshow(comp.reshape(patch_size), cmap=plt.cm.gray_r, - interpolation='nearest') + plt.imshow(comp.reshape(patch_size), cmap=plt.cm.gray_r, interpolation="nearest") plt.xticks(()) plt.yticks(()) -plt.suptitle('Dictionary learned from face patches\n' + - 'Train time %.1fs on %d patches' % (dt, len(data)), - fontsize=16) +plt.suptitle( + "Dictionary learned from face patches\n" + + "Train time %.1fs on %d patches" % (dt, len(data)), + fontsize=16, +) plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23) # ############################################################################# # Display the distorted image + def show_with_diff(image, reference, title): """Helper function to display denoising""" plt.figure(figsize=(5, 3.3)) plt.subplot(1, 2, 1) - plt.title('Image') - plt.imshow(image, vmin=0, vmax=1, cmap=plt.cm.gray, - interpolation='nearest') + plt.title("Image") + plt.imshow(image, vmin=0, vmax=1, cmap=plt.cm.gray, interpolation="nearest") plt.xticks(()) plt.yticks(()) plt.subplot(1, 2, 2) difference = image - reference - plt.title('Difference (norm: %.2f)' % np.sqrt(np.sum(difference ** 2))) - plt.imshow(difference, vmin=-0.5, vmax=0.5, cmap=plt.cm.PuOr, - interpolation='nearest') + plt.title("Difference (norm: %.2f)" % np.sqrt(np.sum(difference ** 2))) + plt.imshow( + difference, vmin=-0.5, vmax=0.5, cmap=plt.cm.PuOr, interpolation="nearest" + ) plt.xticks(()) plt.yticks(()) plt.suptitle(title, size=16) plt.subplots_adjust(0.02, 0.02, 0.98, 0.79, 0.02, 0.2) 
-show_with_diff(distorted, face, 'Distorted image') +show_with_diff(distorted, face, "Distorted image") # ############################################################################# # Extract noisy patches and reconstruct them using the dictionary -print('Extracting noisy patches... ') +print("Extracting noisy patches... ") t0 = time() -data = extract_patches_2d(distorted[:, width // 2:], patch_size) +data = extract_patches_2d(distorted[:, width // 2 :], patch_size) data = data.reshape(data.shape[0], -1) intercept = np.mean(data, axis=0) data -= intercept -print('done in %.2fs.' % (time() - t0)) +print("done in %.2fs." % (time() - t0)) transform_algorithms = [ - ('Orthogonal Matching Pursuit\n1 atom', 'omp', - {'transform_n_nonzero_coefs': 1}), - ('Orthogonal Matching Pursuit\n2 atoms', 'omp', - {'transform_n_nonzero_coefs': 2}), - ('Least-angle regression\n5 atoms', 'lars', - {'transform_n_nonzero_coefs': 5}), - ('Thresholding\n alpha=0.1', 'threshold', {'transform_alpha': .1})] + ("Orthogonal Matching Pursuit\n1 atom", "omp", {"transform_n_nonzero_coefs": 1}), + ("Orthogonal Matching Pursuit\n2 atoms", "omp", {"transform_n_nonzero_coefs": 2}), + ("Least-angle regression\n5 atoms", "lars", {"transform_n_nonzero_coefs": 5}), + ("Thresholding\n alpha=0.1", "threshold", {"transform_alpha": 0.1}), +] reconstructions = {} for title, transform_algorithm, kwargs in transform_algorithms: - print(title + '...') + print(title + "...") reconstructions[title] = face.copy() t0 = time() dico.set_params(transform_algorithm=transform_algorithm, **kwargs) @@ -154,14 +155,14 @@ def show_with_diff(image, reference, title): patches += intercept patches = patches.reshape(len(data), *patch_size) - if transform_algorithm == 'threshold': + if transform_algorithm == "threshold": patches -= patches.min() patches /= patches.max() - reconstructions[title][:, width // 2:] = reconstruct_from_patches_2d( - patches, (height, width // 2)) + reconstructions[title][:, width // 2 :] = 
reconstruct_from_patches_2d( + patches, (height, width // 2) + ) dt = time() - t0 - print('done in %.2fs.' % dt) - show_with_diff(reconstructions[title], face, - title + ' (time: %.1fs)' % dt) + print("done in %.2fs." % dt) + show_with_diff(reconstructions[title], face, title + " (time: %.1fs)" % dt) plt.show() diff --git a/examples/decomposition/plot_incremental_pca.py b/examples/decomposition/plot_incremental_pca.py index 980f9d019ea1c..88cd6a679c479 100644 --- a/examples/decomposition/plot_incremental_pca.py +++ b/examples/decomposition/plot_incremental_pca.py @@ -40,18 +40,22 @@ pca = PCA(n_components=n_components) X_pca = pca.fit_transform(X) -colors = ['navy', 'turquoise', 'darkorange'] +colors = ["navy", "turquoise", "darkorange"] for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]: plt.figure(figsize=(8, 8)) for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names): - plt.scatter(X_transformed[y == i, 0], X_transformed[y == i, 1], - color=color, lw=2, label=target_name) + plt.scatter( + X_transformed[y == i, 0], + X_transformed[y == i, 1], + color=color, + lw=2, + label=target_name, + ) if "Incremental" in title: err = np.abs(np.abs(X_pca) - np.abs(X_ipca)).mean() - plt.title(title + " of iris dataset\nMean absolute unsigned error " - "%.6f" % err) + plt.title(title + " of iris dataset\nMean absolute unsigned error %.6f" % err) else: plt.title(title + " of iris dataset") plt.legend(loc="best", shadow=False, scatterpoints=1) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index cfec4f4ec8b1d..8a9ad066cb181 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -20,7 +20,7 @@ np.random.seed(0) -X, y = make_circles(n_samples=400, factor=.3, noise=.05) +X, y = make_circles(n_samples=400, factor=0.3, noise=0.05) kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10) X_kpca = kpca.fit_transform(X) @@ -31,15 +31,13 
@@ # Plot results plt.figure() -plt.subplot(2, 2, 1, aspect='equal') +plt.subplot(2, 2, 1, aspect="equal") plt.title("Original space") reds = y == 0 blues = y == 1 -plt.scatter(X[reds, 0], X[reds, 1], c="red", - s=20, edgecolor='k') -plt.scatter(X[blues, 0], X[blues, 1], c="blue", - s=20, edgecolor='k') +plt.scatter(X[reds, 0], X[reds, 1], c="red", s=20, edgecolor="k") +plt.scatter(X[blues, 0], X[blues, 1], c="blue", s=20, edgecolor="k") plt.xlabel("$x_1$") plt.ylabel("$x_2$") @@ -47,31 +45,25 @@ X_grid = np.array([np.ravel(X1), np.ravel(X2)]).T # projection on the first principal component (in the phi space) Z_grid = kpca.transform(X_grid)[:, 0].reshape(X1.shape) -plt.contour(X1, X2, Z_grid, colors='grey', linewidths=1, origin='lower') +plt.contour(X1, X2, Z_grid, colors="grey", linewidths=1, origin="lower") -plt.subplot(2, 2, 2, aspect='equal') -plt.scatter(X_pca[reds, 0], X_pca[reds, 1], c="red", - s=20, edgecolor='k') -plt.scatter(X_pca[blues, 0], X_pca[blues, 1], c="blue", - s=20, edgecolor='k') +plt.subplot(2, 2, 2, aspect="equal") +plt.scatter(X_pca[reds, 0], X_pca[reds, 1], c="red", s=20, edgecolor="k") +plt.scatter(X_pca[blues, 0], X_pca[blues, 1], c="blue", s=20, edgecolor="k") plt.title("Projection by PCA") plt.xlabel("1st principal component") plt.ylabel("2nd component") -plt.subplot(2, 2, 3, aspect='equal') -plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", - s=20, edgecolor='k') -plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", - s=20, edgecolor='k') +plt.subplot(2, 2, 3, aspect="equal") +plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", s=20, edgecolor="k") +plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=20, edgecolor="k") plt.title("Projection by KPCA") plt.xlabel(r"1st principal component in space induced by $\phi$") plt.ylabel("2nd component") -plt.subplot(2, 2, 4, aspect='equal') -plt.scatter(X_back[reds, 0], X_back[reds, 1], c="red", - s=20, edgecolor='k') -plt.scatter(X_back[blues, 0], X_back[blues, 1], c="blue", 
- s=20, edgecolor='k') +plt.subplot(2, 2, 4, aspect="equal") +plt.scatter(X_back[reds, 0], X_back[reds, 1], c="red", s=20, edgecolor="k") +plt.scatter(X_back[blues, 0], X_back[blues, 1], c="blue", s=20, edgecolor="k") plt.title("Original space after inverse transform") plt.xlabel("$x_1$") plt.ylabel("$x_2$") diff --git a/examples/decomposition/plot_pca_3d.py b/examples/decomposition/plot_pca_3d.py index 4d7a851a938da..27d3a34e2dd75 100644 --- a/examples/decomposition/plot_pca_3d.py +++ b/examples/decomposition/plot_pca_3d.py @@ -34,8 +34,7 @@ def pdf(x): - return 0.5 * (stats.norm(scale=0.25 / e).pdf(x) - + stats.norm(scale=4 / e).pdf(x)) + return 0.5 * (stats.norm(scale=0.25 / e).pdf(x) + stats.norm(scale=4 / e).pdf(x)) y = np.random.normal(scale=0.5, size=(30000)) @@ -61,9 +60,9 @@ def pdf(x): def plot_figs(fig_num, elev, azim): fig = plt.figure(fig_num, figsize=(4, 3)) plt.clf() - ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=elev, azim=azim) + ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=elev, azim=azim) - ax.scatter(a[::10], b[::10], c[::10], c=density[::10], marker='+', alpha=.4) + ax.scatter(a[::10], b[::10], c[::10], c=density[::10], marker="+", alpha=0.4) Y = np.c_[a, b, c] # Using SciPy's SVD, this would be: @@ -74,9 +73,9 @@ def plot_figs(fig_num, elev, azim): V = pca.components_.T x_pca_axis, y_pca_axis, z_pca_axis = 3 * V - x_pca_plane = np.r_[x_pca_axis[:2], - x_pca_axis[1::-1]] - y_pca_plane = np.r_[y_pca_axis[:2], - y_pca_axis[1::-1]] - z_pca_plane = np.r_[z_pca_axis[:2], - z_pca_axis[1::-1]] + x_pca_plane = np.r_[x_pca_axis[:2], -x_pca_axis[1::-1]] + y_pca_plane = np.r_[y_pca_axis[:2], -y_pca_axis[1::-1]] + z_pca_plane = np.r_[z_pca_axis[:2], -z_pca_axis[1::-1]] x_pca_plane.shape = (2, 2) y_pca_plane.shape = (2, 2) z_pca_plane.shape = (2, 2) diff --git a/examples/decomposition/plot_pca_iris.py b/examples/decomposition/plot_pca_iris.py index 67baf0deefdb3..8ed89104cddc8 100644 --- a/examples/decomposition/plot_pca_iris.py +++ 
b/examples/decomposition/plot_pca_iris.py @@ -34,23 +34,25 @@ fig = plt.figure(1, figsize=(4, 3)) plt.clf() -ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134) +ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134) plt.cla() pca = decomposition.PCA(n_components=3) pca.fit(X) X = pca.transform(X) -for name, label in [('Setosa', 0), ('Versicolour', 1), ('Virginica', 2)]: - ax.text3D(X[y == label, 0].mean(), - X[y == label, 1].mean() + 1.5, - X[y == label, 2].mean(), name, - horizontalalignment='center', - bbox=dict(alpha=.5, edgecolor='w', facecolor='w')) +for name, label in [("Setosa", 0), ("Versicolour", 1), ("Virginica", 2)]: + ax.text3D( + X[y == label, 0].mean(), + X[y == label, 1].mean() + 1.5, + X[y == label, 2].mean(), + name, + horizontalalignment="center", + bbox=dict(alpha=0.5, edgecolor="w", facecolor="w"), + ) # Reorder the labels to have colors matching the cluster results y = np.choose(y, [1, 2, 0]).astype(float) -ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=plt.cm.nipy_spectral, - edgecolor='k') +ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=plt.cm.nipy_spectral, edgecolor="k") ax.w_xaxis.set_ticklabels([]) ax.w_yaxis.set_ticklabels([]) diff --git a/examples/decomposition/plot_pca_vs_fa_model_selection.py b/examples/decomposition/plot_pca_vs_fa_model_selection.py index b858434d910e3..65c04838f8796 100644 --- a/examples/decomposition/plot_pca_vs_fa_model_selection.py +++ b/examples/decomposition/plot_pca_vs_fa_model_selection.py @@ -43,7 +43,7 @@ # Create the data n_samples, n_features, rank = 1000, 50, 10 -sigma = 1. +sigma = 1.0 rng = np.random.RandomState(42) U, _, _ = linalg.svd(rng.randn(n_features, n_features)) X = np.dot(rng.randn(n_samples, rank), U[:, :rank].T) @@ -52,7 +52,7 @@ X_homo = X + sigma * rng.randn(n_samples, n_features) # Adding heteroscedastic noise -sigmas = sigma * rng.rand(n_features) + sigma / 2. 
+sigmas = sigma * rng.rand(n_features) + sigma / 2.0 X_hetero = X + rng.randn(n_samples, n_features) * sigmas # ############################################################################# @@ -62,7 +62,7 @@ def compute_scores(X): - pca = PCA(svd_solver='full') + pca = PCA(svd_solver="full") fa = FactorAnalysis() pca_scores, fa_scores = [], [] @@ -77,7 +77,7 @@ def compute_scores(X): def shrunk_cov_score(X): shrinkages = np.logspace(-2, 0, 30) - cv = GridSearchCV(ShrunkCovariance(), {'shrinkage': shrinkages}) + cv = GridSearchCV(ShrunkCovariance(), {"shrinkage": shrinkages}) return np.mean(cross_val_score(cv.fit(X).best_estimator_, X)) @@ -85,13 +85,12 @@ def lw_score(X): return np.mean(cross_val_score(LedoitWolf(), X)) -for X, title in [(X_homo, 'Homoscedastic Noise'), - (X_hetero, 'Heteroscedastic Noise')]: +for X, title in [(X_homo, "Homoscedastic Noise"), (X_hetero, "Heteroscedastic Noise")]: pca_scores, fa_scores = compute_scores(X) n_components_pca = n_components[np.argmax(pca_scores)] n_components_fa = n_components[np.argmax(fa_scores)] - pca = PCA(svd_solver='full', n_components='mle') + pca = PCA(svd_solver="full", n_components="mle") pca.fit(X) n_components_pca_mle = pca.n_components_ @@ -100,26 +99,45 @@ def lw_score(X): print("best n_components by PCA MLE = %d" % n_components_pca_mle) plt.figure() - plt.plot(n_components, pca_scores, 'b', label='PCA scores') - plt.plot(n_components, fa_scores, 'r', label='FA scores') - plt.axvline(rank, color='g', label='TRUTH: %d' % rank, linestyle='-') - plt.axvline(n_components_pca, color='b', - label='PCA CV: %d' % n_components_pca, linestyle='--') - plt.axvline(n_components_fa, color='r', - label='FactorAnalysis CV: %d' % n_components_fa, - linestyle='--') - plt.axvline(n_components_pca_mle, color='k', - label='PCA MLE: %d' % n_components_pca_mle, linestyle='--') + plt.plot(n_components, pca_scores, "b", label="PCA scores") + plt.plot(n_components, fa_scores, "r", label="FA scores") + plt.axvline(rank, color="g", 
label="TRUTH: %d" % rank, linestyle="-") + plt.axvline( + n_components_pca, + color="b", + label="PCA CV: %d" % n_components_pca, + linestyle="--", + ) + plt.axvline( + n_components_fa, + color="r", + label="FactorAnalysis CV: %d" % n_components_fa, + linestyle="--", + ) + plt.axvline( + n_components_pca_mle, + color="k", + label="PCA MLE: %d" % n_components_pca_mle, + linestyle="--", + ) # compare with other covariance estimators - plt.axhline(shrunk_cov_score(X), color='violet', - label='Shrunk Covariance MLE', linestyle='-.') - plt.axhline(lw_score(X), color='orange', - label='LedoitWolf MLE' % n_components_pca_mle, linestyle='-.') - - plt.xlabel('nb of components') - plt.ylabel('CV scores') - plt.legend(loc='lower right') + plt.axhline( + shrunk_cov_score(X), + color="violet", + label="Shrunk Covariance MLE", + linestyle="-.", + ) + plt.axhline( + lw_score(X), + color="orange", + label="LedoitWolf MLE" % n_components_pca_mle, + linestyle="-.", + ) + + plt.xlabel("nb of components") + plt.ylabel("CV scores") + plt.legend(loc="lower right") plt.title(title) plt.show() diff --git a/examples/decomposition/plot_pca_vs_lda.py b/examples/decomposition/plot_pca_vs_lda.py index 051b96ffedf2c..f9abf21b3ad0e 100644 --- a/examples/decomposition/plot_pca_vs_lda.py +++ b/examples/decomposition/plot_pca_vs_lda.py @@ -37,24 +37,28 @@ X_r2 = lda.fit(X, y).transform(X) # Percentage of variance explained for each components -print('explained variance ratio (first two components): %s' - % str(pca.explained_variance_ratio_)) +print( + "explained variance ratio (first two components): %s" + % str(pca.explained_variance_ratio_) +) plt.figure() -colors = ['navy', 'turquoise', 'darkorange'] +colors = ["navy", "turquoise", "darkorange"] lw = 2 for color, i, target_name in zip(colors, [0, 1, 2], target_names): - plt.scatter(X_r[y == i, 0], X_r[y == i, 1], color=color, alpha=.8, lw=lw, - label=target_name) -plt.legend(loc='best', shadow=False, scatterpoints=1) -plt.title('PCA of IRIS 
dataset') + plt.scatter( + X_r[y == i, 0], X_r[y == i, 1], color=color, alpha=0.8, lw=lw, label=target_name + ) +plt.legend(loc="best", shadow=False, scatterpoints=1) +plt.title("PCA of IRIS dataset") plt.figure() for color, i, target_name in zip(colors, [0, 1, 2], target_names): - plt.scatter(X_r2[y == i, 0], X_r2[y == i, 1], alpha=.8, color=color, - label=target_name) -plt.legend(loc='best', shadow=False, scatterpoints=1) -plt.title('LDA of IRIS dataset') + plt.scatter( + X_r2[y == i, 0], X_r2[y == i, 1], alpha=0.8, color=color, label=target_name + ) +plt.legend(loc="best", shadow=False, scatterpoints=1) +plt.title("LDA of IRIS dataset") plt.show() diff --git a/examples/decomposition/plot_sparse_coding.py b/examples/decomposition/plot_sparse_coding.py index 144401073a7fe..681b1ca1942c0 100644 --- a/examples/decomposition/plot_sparse_coding.py +++ b/examples/decomposition/plot_sparse_coding.py @@ -26,9 +26,11 @@ def ricker_function(resolution, center, width): """Discrete sub-sampled Ricker (Mexican hat) wavelet""" x = np.linspace(0, resolution - 1, resolution) - x = ((2 / (np.sqrt(3 * width) * np.pi ** .25)) - * (1 - (x - center) ** 2 / width ** 2) - * np.exp(-(x - center) ** 2 / (2 * width ** 2))) + x = ( + (2 / (np.sqrt(3 * width) * np.pi ** 0.25)) + * (1 - (x - center) ** 2 / width ** 2) + * np.exp(-((x - center) ** 2) / (2 * width ** 2)) + ) return x @@ -48,57 +50,74 @@ def ricker_matrix(width, resolution, n_components): n_components = resolution // subsampling # Compute a wavelet dictionary -D_fixed = ricker_matrix(width=width, resolution=resolution, - n_components=n_components) -D_multi = np.r_[tuple(ricker_matrix(width=w, resolution=resolution, - n_components=n_components // 5) - for w in (10, 50, 100, 500, 1000))] +D_fixed = ricker_matrix(width=width, resolution=resolution, n_components=n_components) +D_multi = np.r_[ + tuple( + ricker_matrix(width=w, resolution=resolution, n_components=n_components // 5) + for w in (10, 50, 100, 500, 1000) + ) +] # 
Generate a signal y = np.linspace(0, resolution - 1, resolution) first_quarter = y < resolution / 4 -y[first_quarter] = 3. -y[np.logical_not(first_quarter)] = -1. +y[first_quarter] = 3.0 +y[np.logical_not(first_quarter)] = -1.0 # List the different sparse coding methods in the following format: # (title, transform_algorithm, transform_alpha, # transform_n_nozero_coefs, color) -estimators = [('OMP', 'omp', None, 15, 'navy'), - ('Lasso', 'lasso_lars', 2, None, 'turquoise'), ] +estimators = [ + ("OMP", "omp", None, 15, "navy"), + ("Lasso", "lasso_lars", 2, None, "turquoise"), +] lw = 2 # Avoid FutureWarning about default value change when numpy >= 1.14 -lstsq_rcond = None if np_version >= parse_version('1.14') else -1 +lstsq_rcond = None if np_version >= parse_version("1.14") else -1 plt.figure(figsize=(13, 6)) -for subplot, (D, title) in enumerate(zip((D_fixed, D_multi), - ('fixed width', 'multiple widths'))): +for subplot, (D, title) in enumerate( + zip((D_fixed, D_multi), ("fixed width", "multiple widths")) +): plt.subplot(1, 2, subplot + 1) - plt.title('Sparse coding against %s dictionary' % title) - plt.plot(y, lw=lw, linestyle='--', label='Original signal') + plt.title("Sparse coding against %s dictionary" % title) + plt.plot(y, lw=lw, linestyle="--", label="Original signal") # Do a wavelet approximation for title, algo, alpha, n_nonzero, color in estimators: - coder = SparseCoder(dictionary=D, transform_n_nonzero_coefs=n_nonzero, - transform_alpha=alpha, transform_algorithm=algo) + coder = SparseCoder( + dictionary=D, + transform_n_nonzero_coefs=n_nonzero, + transform_alpha=alpha, + transform_algorithm=algo, + ) x = coder.transform(y.reshape(1, -1)) density = len(np.flatnonzero(x)) x = np.ravel(np.dot(x, D)) squared_error = np.sum((y - x) ** 2) - plt.plot(x, color=color, lw=lw, - label='%s: %s nonzero coefs,\n%.2f error' - % (title, density, squared_error)) + plt.plot( + x, + color=color, + lw=lw, + label="%s: %s nonzero coefs,\n%.2f error" % (title, density, 
squared_error), + ) # Soft thresholding debiasing - coder = SparseCoder(dictionary=D, transform_algorithm='threshold', - transform_alpha=20) + coder = SparseCoder( + dictionary=D, transform_algorithm="threshold", transform_alpha=20 + ) x = coder.transform(y.reshape(1, -1)) _, idx = np.where(x != 0) x[0, idx], _, _, _ = np.linalg.lstsq(D[idx, :].T, y, rcond=lstsq_rcond) x = np.ravel(np.dot(x, D)) squared_error = np.sum((y - x) ** 2) - plt.plot(x, color='darkorange', lw=lw, - label='Thresholding w/ debiasing:\n%d nonzero coefs, %.2f error' - % (len(idx), squared_error)) - plt.axis('tight') - plt.legend(shadow=False, loc='best') -plt.subplots_adjust(.04, .07, .97, .90, .09, .2) + plt.plot( + x, + color="darkorange", + lw=lw, + label="Thresholding w/ debiasing:\n%d nonzero coefs, %.2f error" + % (len(idx), squared_error), + ) + plt.axis("tight") + plt.legend(shadow=False, loc="best") +plt.subplots_adjust(0.04, 0.07, 0.97, 0.90, 0.09, 0.2) plt.show() diff --git a/examples/decomposition/plot_varimax_fa.py b/examples/decomposition/plot_varimax_fa.py index 4e786406bdbce..82644595daf94 100644 --- a/examples/decomposition/plot_varimax_fa.py +++ b/examples/decomposition/plot_varimax_fa.py @@ -52,9 +52,11 @@ # Run factor analysis with Varimax rotation n_comps = 2 -methods = [('PCA', PCA()), - ('Unrotated FA', FactorAnalysis()), - ('Varimax FA', FactorAnalysis(rotation='varimax'))] +methods = [ + ("PCA", PCA()), + ("Unrotated FA", FactorAnalysis()), + ("Varimax FA", FactorAnalysis(rotation="varimax")), +] fig, axes = plt.subplots(ncols=len(methods), figsize=(10, 8)) for ax, (method, fa) in zip(axes, methods): diff --git a/examples/ensemble/plot_adaboost_hastie_10_2.py b/examples/ensemble/plot_adaboost_hastie_10_2.py index 4d48d13dd24f2..a4bf4d3875ed2 100644 --- a/examples/ensemble/plot_adaboost_hastie_10_2.py +++ b/examples/ensemble/plot_adaboost_hastie_10_2.py @@ -36,7 +36,7 @@ n_estimators = 400 # A learning rate of 1. 
may not be optimal for both SAMME and SAMME.R -learning_rate = 1. +learning_rate = 1.0 X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1) @@ -55,23 +55,23 @@ base_estimator=dt_stump, learning_rate=learning_rate, n_estimators=n_estimators, - algorithm="SAMME") + algorithm="SAMME", +) ada_discrete.fit(X_train, y_train) ada_real = AdaBoostClassifier( base_estimator=dt_stump, learning_rate=learning_rate, n_estimators=n_estimators, - algorithm="SAMME.R") + algorithm="SAMME.R", +) ada_real.fit(X_train, y_train) fig = plt.figure() ax = fig.add_subplot(111) -ax.plot([1, n_estimators], [dt_stump_err] * 2, 'k-', - label='Decision Stump Error') -ax.plot([1, n_estimators], [dt_err] * 2, 'k--', - label='Decision Tree Error') +ax.plot([1, n_estimators], [dt_stump_err] * 2, "k-", label="Decision Stump Error") +ax.plot([1, n_estimators], [dt_err] * 2, "k--", label="Decision Tree Error") ada_discrete_err = np.zeros((n_estimators,)) for i, y_pred in enumerate(ada_discrete.staged_predict(X_test)): @@ -89,24 +89,36 @@ for i, y_pred in enumerate(ada_real.staged_predict(X_train)): ada_real_err_train[i] = zero_one_loss(y_pred, y_train) -ax.plot(np.arange(n_estimators) + 1, ada_discrete_err, - label='Discrete AdaBoost Test Error', - color='red') -ax.plot(np.arange(n_estimators) + 1, ada_discrete_err_train, - label='Discrete AdaBoost Train Error', - color='blue') -ax.plot(np.arange(n_estimators) + 1, ada_real_err, - label='Real AdaBoost Test Error', - color='orange') -ax.plot(np.arange(n_estimators) + 1, ada_real_err_train, - label='Real AdaBoost Train Error', - color='green') +ax.plot( + np.arange(n_estimators) + 1, + ada_discrete_err, + label="Discrete AdaBoost Test Error", + color="red", +) +ax.plot( + np.arange(n_estimators) + 1, + ada_discrete_err_train, + label="Discrete AdaBoost Train Error", + color="blue", +) +ax.plot( + np.arange(n_estimators) + 1, + ada_real_err, + label="Real AdaBoost Test Error", + color="orange", +) +ax.plot( + np.arange(n_estimators) + 1, + 
ada_real_err_train, + label="Real AdaBoost Train Error", + color="green", +) ax.set_ylim((0.0, 0.5)) -ax.set_xlabel('n_estimators') -ax.set_ylabel('error rate') +ax.set_xlabel("n_estimators") +ax.set_ylabel("error rate") -leg = ax.legend(loc='upper right', fancybox=True) +leg = ax.legend(loc="upper right", fancybox=True) leg.get_frame().set_alpha(0.7) plt.show() diff --git a/examples/ensemble/plot_adaboost_multiclass.py b/examples/ensemble/plot_adaboost_multiclass.py index 0ee08c5ed322e..af28b3fe940bf 100644 --- a/examples/ensemble/plot_adaboost_multiclass.py +++ b/examples/ensemble/plot_adaboost_multiclass.py @@ -37,8 +37,9 @@ from sklearn.tree import DecisionTreeClassifier -X, y = make_gaussian_quantiles(n_samples=13000, n_features=10, - n_classes=3, random_state=1) +X, y = make_gaussian_quantiles( + n_samples=13000, n_features=10, n_classes=3, random_state=1 +) n_split = 3000 @@ -46,15 +47,15 @@ y_train, y_test = y[:n_split], y[n_split:] bdt_real = AdaBoostClassifier( - DecisionTreeClassifier(max_depth=2), - n_estimators=600, - learning_rate=1) + DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1 +) bdt_discrete = AdaBoostClassifier( DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1.5, - algorithm="SAMME") + algorithm="SAMME", +) bdt_real.fit(X_train, y_train) bdt_discrete.fit(X_train, y_train) @@ -63,11 +64,10 @@ discrete_test_errors = [] for real_test_predict, discrete_train_predict in zip( - bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)): - real_test_errors.append( - 1. - accuracy_score(real_test_predict, y_test)) - discrete_test_errors.append( - 1. 
- accuracy_score(discrete_train_predict, y_test)) + bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test) +): + real_test_errors.append(1.0 - accuracy_score(real_test_predict, y_test)) + discrete_test_errors.append(1.0 - accuracy_score(discrete_train_predict, y_test)) n_trees_discrete = len(bdt_discrete) n_trees_real = len(bdt_real) @@ -81,35 +81,41 @@ plt.figure(figsize=(15, 5)) plt.subplot(131) -plt.plot(range(1, n_trees_discrete + 1), - discrete_test_errors, c='black', label='SAMME') -plt.plot(range(1, n_trees_real + 1), - real_test_errors, c='black', - linestyle='dashed', label='SAMME.R') +plt.plot(range(1, n_trees_discrete + 1), discrete_test_errors, c="black", label="SAMME") +plt.plot( + range(1, n_trees_real + 1), + real_test_errors, + c="black", + linestyle="dashed", + label="SAMME.R", +) plt.legend() plt.ylim(0.18, 0.62) -plt.ylabel('Test Error') -plt.xlabel('Number of Trees') +plt.ylabel("Test Error") +plt.xlabel("Number of Trees") plt.subplot(132) -plt.plot(range(1, n_trees_discrete + 1), discrete_estimator_errors, - "b", label='SAMME', alpha=.5) -plt.plot(range(1, n_trees_real + 1), real_estimator_errors, - "r", label='SAMME.R', alpha=.5) +plt.plot( + range(1, n_trees_discrete + 1), + discrete_estimator_errors, + "b", + label="SAMME", + alpha=0.5, +) +plt.plot( + range(1, n_trees_real + 1), real_estimator_errors, "r", label="SAMME.R", alpha=0.5 +) plt.legend() -plt.ylabel('Error') -plt.xlabel('Number of Trees') -plt.ylim((.2, - max(real_estimator_errors.max(), - discrete_estimator_errors.max()) * 1.2)) +plt.ylabel("Error") +plt.xlabel("Number of Trees") +plt.ylim((0.2, max(real_estimator_errors.max(), discrete_estimator_errors.max()) * 1.2)) plt.xlim((-20, len(bdt_discrete) + 20)) plt.subplot(133) -plt.plot(range(1, n_trees_discrete + 1), discrete_estimator_weights, - "b", label='SAMME') +plt.plot(range(1, n_trees_discrete + 1), discrete_estimator_weights, "b", label="SAMME") plt.legend() -plt.ylabel('Weight') -plt.xlabel('Number of 
Trees') +plt.ylabel("Weight") +plt.xlabel("Number of Trees") plt.ylim((0, discrete_estimator_weights.max() * 1.2)) plt.xlim((-20, n_trees_discrete + 20)) diff --git a/examples/ensemble/plot_adaboost_regression.py b/examples/ensemble/plot_adaboost_regression.py index 659a2a5944ea6..0c3f01299b06e 100644 --- a/examples/ensemble/plot_adaboost_regression.py +++ b/examples/ensemble/plot_adaboost_regression.py @@ -32,8 +32,9 @@ # Fit regression model regr_1 = DecisionTreeRegressor(max_depth=4) -regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), - n_estimators=300, random_state=rng) +regr_2 = AdaBoostRegressor( + DecisionTreeRegressor(max_depth=4), n_estimators=300, random_state=rng +) regr_1.fit(X, y) regr_2.fit(X, y) diff --git a/examples/ensemble/plot_adaboost_twoclass.py b/examples/ensemble/plot_adaboost_twoclass.py index edb4cbb1a97b3..d22f14cf7c8c9 100644 --- a/examples/ensemble/plot_adaboost_twoclass.py +++ b/examples/ensemble/plot_adaboost_twoclass.py @@ -31,19 +31,19 @@ # Construct dataset -X1, y1 = make_gaussian_quantiles(cov=2., - n_samples=200, n_features=2, - n_classes=2, random_state=1) -X2, y2 = make_gaussian_quantiles(mean=(3, 3), cov=1.5, - n_samples=300, n_features=2, - n_classes=2, random_state=1) +X1, y1 = make_gaussian_quantiles( + cov=2.0, n_samples=200, n_features=2, n_classes=2, random_state=1 +) +X2, y2 = make_gaussian_quantiles( + mean=(3, 3), cov=1.5, n_samples=300, n_features=2, n_classes=2, random_state=1 +) X = np.concatenate((X1, X2)) -y = np.concatenate((y1, - y2 + 1)) +y = np.concatenate((y1, -y2 + 1)) # Create and fit an AdaBoosted decision tree -bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), - algorithm="SAMME", - n_estimators=200) +bdt = AdaBoostClassifier( + DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200 +) bdt.fit(X, y) @@ -57,8 +57,9 @@ plt.subplot(121) x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 -xx, yy = 
np.meshgrid(np.arange(x_min, x_max, plot_step), - np.arange(y_min, y_max, plot_step)) +xx, yy = np.meshgrid( + np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step) +) Z = bdt.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) @@ -68,35 +69,42 @@ # Plot the training points for i, n, c in zip(range(2), class_names, plot_colors): idx = np.where(y == i) - plt.scatter(X[idx, 0], X[idx, 1], - c=c, cmap=plt.cm.Paired, - s=20, edgecolor='k', - label="Class %s" % n) + plt.scatter( + X[idx, 0], + X[idx, 1], + c=c, + cmap=plt.cm.Paired, + s=20, + edgecolor="k", + label="Class %s" % n, + ) plt.xlim(x_min, x_max) plt.ylim(y_min, y_max) -plt.legend(loc='upper right') -plt.xlabel('x') -plt.ylabel('y') -plt.title('Decision Boundary') +plt.legend(loc="upper right") +plt.xlabel("x") +plt.ylabel("y") +plt.title("Decision Boundary") # Plot the two-class decision scores twoclass_output = bdt.decision_function(X) plot_range = (twoclass_output.min(), twoclass_output.max()) plt.subplot(122) for i, n, c in zip(range(2), class_names, plot_colors): - plt.hist(twoclass_output[y == i], - bins=10, - range=plot_range, - facecolor=c, - label='Class %s' % n, - alpha=.5, - edgecolor='k') + plt.hist( + twoclass_output[y == i], + bins=10, + range=plot_range, + facecolor=c, + label="Class %s" % n, + alpha=0.5, + edgecolor="k", + ) x1, x2, y1, y2 = plt.axis() plt.axis((x1, x2, y1, y2 * 1.2)) -plt.legend(loc='upper right') -plt.ylabel('Samples') -plt.xlabel('Score') -plt.title('Decision Scores') +plt.legend(loc="upper right") +plt.ylabel("Samples") +plt.xlabel("Score") +plt.title("Decision Scores") plt.tight_layout() plt.subplots_adjust(wspace=0.35) diff --git a/examples/ensemble/plot_bias_variance.py b/examples/ensemble/plot_bias_variance.py index 0af239e197cf0..f78a200a41c83 100644 --- a/examples/ensemble/plot_bias_variance.py +++ b/examples/ensemble/plot_bias_variance.py @@ -73,18 +73,20 @@ from sklearn.tree import DecisionTreeRegressor # Settings -n_repeat = 50 # Number 
of iterations for computing expectations -n_train = 50 # Size of the training set -n_test = 1000 # Size of the test set -noise = 0.1 # Standard deviation of the noise +n_repeat = 50 # Number of iterations for computing expectations +n_train = 50 # Size of the training set +n_test = 1000 # Size of the test set +noise = 0.1 # Standard deviation of the noise np.random.seed(0) # Change this for exploring the bias-variance decomposition of other # estimators. This should work well for estimators with high variance (e.g., # decision trees or KNN), but poorly for estimators with low variance (e.g., # linear models). -estimators = [("Tree", DecisionTreeRegressor()), - ("Bagging(Tree)", BaggingRegressor(DecisionTreeRegressor()))] +estimators = [ + ("Tree", DecisionTreeRegressor()), + ("Bagging(Tree)", BaggingRegressor(DecisionTreeRegressor())), +] n_estimators = len(estimators) @@ -93,7 +95,7 @@ def f(x): x = x.ravel() - return np.exp(-x ** 2) + 1.5 * np.exp(-(x - 2) ** 2) + return np.exp(-(x ** 2)) + 1.5 * np.exp(-((x - 2) ** 2)) def generate(n_samples, noise, n_repeat=1): @@ -141,18 +143,18 @@ def generate(n_samples, noise, n_repeat=1): for j in range(n_repeat): y_error += (y_test[:, j] - y_predict[:, i]) ** 2 - y_error /= (n_repeat * n_repeat) + y_error /= n_repeat * n_repeat y_noise = np.var(y_test, axis=1) y_bias = (f(X_test) - np.mean(y_predict, axis=1)) ** 2 y_var = np.var(y_predict, axis=1) - print("{0}: {1:.4f} (error) = {2:.4f} (bias^2) " - " + {3:.4f} (var) + {4:.4f} (noise)".format(name, - np.mean(y_error), - np.mean(y_bias), - np.mean(y_var), - np.mean(y_noise))) + print( + "{0}: {1:.4f} (error) = {2:.4f} (bias^2) " + " + {3:.4f} (var) + {4:.4f} (noise)".format( + name, np.mean(y_error), np.mean(y_bias), np.mean(y_var), np.mean(y_noise) + ) + ) # Plot figures plt.subplot(2, n_estimators, n + 1) @@ -165,14 +167,13 @@ def generate(n_samples, noise, n_repeat=1): else: plt.plot(X_test, y_predict[:, i], "r", alpha=0.05) - plt.plot(X_test, np.mean(y_predict, axis=1), 
"c", - label=r"$\mathbb{E}_{LS} \^y(x)$") + plt.plot(X_test, np.mean(y_predict, axis=1), "c", label=r"$\mathbb{E}_{LS} \^y(x)$") plt.xlim([-5, 5]) plt.title(name) if n == n_estimators - 1: - plt.legend(loc=(1.1, .5)) + plt.legend(loc=(1.1, 0.5)) plt.subplot(2, n_estimators, n_estimators + n + 1) plt.plot(X_test, y_error, "r", label="$error(x)$") @@ -185,7 +186,7 @@ def generate(n_samples, noise, n_repeat=1): if n == n_estimators - 1: - plt.legend(loc=(1.1, .5)) + plt.legend(loc=(1.1, 0.5)) -plt.subplots_adjust(right=.75) +plt.subplots_adjust(right=0.75) plt.show() diff --git a/examples/ensemble/plot_ensemble_oob.py b/examples/ensemble/plot_ensemble_oob.py index e7b37f212177c..5b798eece8667 100644 --- a/examples/ensemble/plot_ensemble_oob.py +++ b/examples/ensemble/plot_ensemble_oob.py @@ -36,26 +36,45 @@ RANDOM_STATE = 123 # Generate a binary classification dataset. -X, y = make_classification(n_samples=500, n_features=25, - n_clusters_per_class=1, n_informative=15, - random_state=RANDOM_STATE) +X, y = make_classification( + n_samples=500, + n_features=25, + n_clusters_per_class=1, + n_informative=15, + random_state=RANDOM_STATE, +) # NOTE: Setting the `warm_start` construction parameter to `True` disables # support for parallelized ensembles but is necessary for tracking the OOB # error trajectory during training. 
ensemble_clfs = [ - ("RandomForestClassifier, max_features='sqrt'", - RandomForestClassifier(warm_start=True, oob_score=True, - max_features="sqrt", - random_state=RANDOM_STATE)), - ("RandomForestClassifier, max_features='log2'", - RandomForestClassifier(warm_start=True, max_features='log2', - oob_score=True, - random_state=RANDOM_STATE)), - ("RandomForestClassifier, max_features=None", - RandomForestClassifier(warm_start=True, max_features=None, - oob_score=True, - random_state=RANDOM_STATE)) + ( + "RandomForestClassifier, max_features='sqrt'", + RandomForestClassifier( + warm_start=True, + oob_score=True, + max_features="sqrt", + random_state=RANDOM_STATE, + ), + ), + ( + "RandomForestClassifier, max_features='log2'", + RandomForestClassifier( + warm_start=True, + max_features="log2", + oob_score=True, + random_state=RANDOM_STATE, + ), + ), + ( + "RandomForestClassifier, max_features=None", + RandomForestClassifier( + warm_start=True, + max_features=None, + oob_score=True, + random_state=RANDOM_STATE, + ), + ), ] # Map a classifier name to a list of (, ) pairs. diff --git a/examples/ensemble/plot_forest_importances.py b/examples/ensemble/plot_forest_importances.py index 7b75b92a1f0e0..9d19bbf907904 100644 --- a/examples/ensemble/plot_forest_importances.py +++ b/examples/ensemble/plot_forest_importances.py @@ -25,16 +25,22 @@ from sklearn.model_selection import train_test_split X, y = make_classification( - n_samples=1000, n_features=10, n_informative=3, n_redundant=0, - n_repeated=0, n_classes=2, random_state=0, shuffle=False) -X_train, X_test, y_train, y_test = train_test_split( - X, y, stratify=y, random_state=42) + n_samples=1000, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + n_classes=2, + random_state=0, + shuffle=False, +) +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) # %% # A random forest classifier will be fitted to compute the feature importances. 
from sklearn.ensemble import RandomForestClassifier -feature_names = [f'feature {i}' for i in range(X.shape[1])] +feature_names = [f"feature {i}" for i in range(X.shape[1])] forest = RandomForestClassifier(random_state=0) forest.fit(X_train, y_train) @@ -54,16 +60,15 @@ start_time = time.time() importances = forest.feature_importances_ -std = np.std([ - tree.feature_importances_ for tree in forest.estimators_], axis=0) +std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0) elapsed_time = time.time() - start_time -print(f"Elapsed time to compute the importances: " - f"{elapsed_time:.3f} seconds") +print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds") # %% # Let's plot the impurity-based importance. import pandas as pd + forest_importances = pd.Series(importances, index=feature_names) fig, ax = plt.subplots() @@ -84,10 +89,10 @@ start_time = time.time() result = permutation_importance( - forest, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2) + forest, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2 +) elapsed_time = time.time() - start_time -print(f"Elapsed time to compute the importances: " - f"{elapsed_time:.3f} seconds") +print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds") forest_importances = pd.Series(result.importances_mean, index=feature_names) diff --git a/examples/ensemble/plot_forest_importances_faces.py b/examples/ensemble/plot_forest_importances_faces.py index ff2ec6f67ed99..8bf265f345be8 100644 --- a/examples/ensemble/plot_forest_importances_faces.py +++ b/examples/ensemble/plot_forest_importances_faces.py @@ -44,8 +44,7 @@ # A random forest classifier will be fitted to compute the feature importances. 
from sklearn.ensemble import RandomForestClassifier -forest = RandomForestClassifier( - n_estimators=750, n_jobs=n_jobs, random_state=42) +forest = RandomForestClassifier(n_estimators=750, n_jobs=n_jobs, random_state=42) forest.fit(X, y) @@ -68,8 +67,7 @@ importances = forest.feature_importances_ elapsed_time = time.time() - start_time -print(f"Elapsed time to compute the importances: " - f"{elapsed_time:.3f} seconds") +print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds") imp_reshaped = importances.reshape(img_shape) plt.matshow(imp_reshaped, cmap=plt.cm.hot) plt.title("Pixel importances using impurity values") diff --git a/examples/ensemble/plot_forest_iris.py b/examples/ensemble/plot_forest_iris.py index 81cd54a9bb4d3..b2e95ef2ecc81 100644 --- a/examples/ensemble/plot_forest_iris.py +++ b/examples/ensemble/plot_forest_iris.py @@ -47,8 +47,11 @@ from matplotlib.colors import ListedColormap from sklearn.datasets import load_iris -from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, - AdaBoostClassifier) +from sklearn.ensemble import ( + RandomForestClassifier, + ExtraTreesClassifier, + AdaBoostClassifier, +) from sklearn.tree import DecisionTreeClassifier # Parameters @@ -64,11 +67,12 @@ plot_idx = 1 -models = [DecisionTreeClassifier(max_depth=None), - RandomForestClassifier(n_estimators=n_estimators), - ExtraTreesClassifier(n_estimators=n_estimators), - AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), - n_estimators=n_estimators)] +models = [ + DecisionTreeClassifier(max_depth=None), + RandomForestClassifier(n_estimators=n_estimators), + ExtraTreesClassifier(n_estimators=n_estimators), + AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=n_estimators), +] for pair in ([0, 1], [0, 2], [2, 3]): for model in models: @@ -94,15 +98,12 @@ scores = model.score(X, y) # Create a title for each column and the console by using str() and # slicing away useless parts of the string - model_title = 
str(type(model)).split( - ".")[-1][:-2][:-len("Classifier")] + model_title = str(type(model)).split(".")[-1][:-2][: -len("Classifier")] model_details = model_title if hasattr(model, "estimators_"): - model_details += " with {} estimators".format( - len(model.estimators_)) - print(model_details + " with features", pair, - "has a score of", scores) + model_details += " with {} estimators".format(len(model.estimators_)) + print(model_details + " with features", pair, "has a score of", scores) plt.subplot(3, 4, plot_idx) if plot_idx <= len(models): @@ -113,8 +114,9 @@ # filled contour plot x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 - xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step), - np.arange(y_min, y_max, plot_step)) + xx, yy = np.meshgrid( + np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step) + ) # Plot either a single DecisionTreeClassifier or alpha blend the # decision surfaces of the ensemble of classifiers @@ -139,19 +141,30 @@ # black outline xx_coarser, yy_coarser = np.meshgrid( np.arange(x_min, x_max, plot_step_coarser), - np.arange(y_min, y_max, plot_step_coarser)) - Z_points_coarser = model.predict(np.c_[xx_coarser.ravel(), - yy_coarser.ravel()] - ).reshape(xx_coarser.shape) - cs_points = plt.scatter(xx_coarser, yy_coarser, s=15, - c=Z_points_coarser, cmap=cmap, - edgecolors="none") + np.arange(y_min, y_max, plot_step_coarser), + ) + Z_points_coarser = model.predict( + np.c_[xx_coarser.ravel(), yy_coarser.ravel()] + ).reshape(xx_coarser.shape) + cs_points = plt.scatter( + xx_coarser, + yy_coarser, + s=15, + c=Z_points_coarser, + cmap=cmap, + edgecolors="none", + ) # Plot the training points, these are clustered together and have a # black outline - plt.scatter(X[:, 0], X[:, 1], c=y, - cmap=ListedColormap(['r', 'y', 'b']), - edgecolor='k', s=20) + plt.scatter( + X[:, 0], + X[:, 1], + c=y, + cmap=ListedColormap(["r", "y", "b"]), + edgecolor="k", + s=20, + ) plot_idx += 1 # 
move on to the next plot in sequence plt.suptitle("Classifiers on feature subsets of the Iris dataset", fontsize=12) diff --git a/examples/ensemble/plot_gradient_boosting_categorical.py b/examples/ensemble/plot_gradient_boosting_categorical.py index 876a1ca21ec4c..3bb406a0ffe86 100644 --- a/examples/ensemble/plot_gradient_boosting_categorical.py +++ b/examples/ensemble/plot_gradient_boosting_categorical.py @@ -32,8 +32,8 @@ X, y = fetch_openml(data_id=41211, as_frame=True, return_X_y=True) -n_categorical_features = (X.dtypes == 'category').sum() -n_numerical_features = (X.dtypes == 'float').sum() +n_categorical_features = (X.dtypes == "category").sum() +n_numerical_features = (X.dtypes == "float").sum() print(f"Number of samples: {X.shape[0]}") print(f"Number of features: {X.shape[1]}") print(f"Number of categorical features: {n_categorical_features}") @@ -51,10 +51,9 @@ from sklearn.compose import make_column_selector dropper = make_column_transformer( - ('drop', make_column_selector(dtype_include='category')), - remainder='passthrough') -hist_dropped = make_pipeline(dropper, - HistGradientBoostingRegressor(random_state=42)) + ("drop", make_column_selector(dtype_include="category")), remainder="passthrough" +) +hist_dropped = make_pipeline(dropper, HistGradientBoostingRegressor(random_state=42)) # %% # Gradient boosting estimator with one-hot encoding @@ -65,12 +64,16 @@ from sklearn.preprocessing import OneHotEncoder one_hot_encoder = make_column_transformer( - (OneHotEncoder(sparse=False, handle_unknown='ignore'), - make_column_selector(dtype_include='category')), - remainder='passthrough') + ( + OneHotEncoder(sparse=False, handle_unknown="ignore"), + make_column_selector(dtype_include="category"), + ), + remainder="passthrough", +) -hist_one_hot = make_pipeline(one_hot_encoder, - HistGradientBoostingRegressor(random_state=42)) +hist_one_hot = make_pipeline( + one_hot_encoder, HistGradientBoostingRegressor(random_state=42) +) # %% # Gradient boosting estimator 
with ordinal encoding @@ -83,12 +86,16 @@ import numpy as np ordinal_encoder = make_column_transformer( - (OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan), - make_column_selector(dtype_include='category')), - remainder='passthrough') + ( + OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan), + make_column_selector(dtype_include="category"), + ), + remainder="passthrough", +) -hist_ordinal = make_pipeline(ordinal_encoder, - HistGradientBoostingRegressor(random_state=42)) +hist_ordinal = make_pipeline( + ordinal_encoder, HistGradientBoostingRegressor(random_state=42) +) # %% # Gradient boosting estimator with native categorical support @@ -107,12 +114,12 @@ # The ordinal encoder will first output the categorical features, and then the # continuous (passed-through) features -categorical_mask = ([True] * n_categorical_features + - [False] * n_numerical_features) +categorical_mask = [True] * n_categorical_features + [False] * n_numerical_features hist_native = make_pipeline( ordinal_encoder, - HistGradientBoostingRegressor(random_state=42, - categorical_features=categorical_mask) + HistGradientBoostingRegressor( + random_state=42, categorical_features=categorical_mask + ), ) @@ -136,20 +143,33 @@ def plot_results(figure_title): fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8)) - plot_info = [('fit_time', 'Fit times (s)', ax1, None), - ('test_score', 'Mean Absolute Percentage Error', ax2, - (0, 0.20))] + plot_info = [ + ("fit_time", "Fit times (s)", ax1, None), + ("test_score", "Mean Absolute Percentage Error", ax2, (0, 0.20)), + ] x, width = np.arange(4), 0.9 for key, title, ax, y_limit in plot_info: - items = [dropped_result[key], one_hot_result[key], ordinal_result[key], - native_result[key]] - ax.bar(x, [np.mean(np.abs(item)) for item in items], - width, yerr=[np.std(item) for item in items], - color=['C0', 'C1', 'C2', 'C3']) - ax.set(xlabel='Model', title=title, xticks=x, - xticklabels=["Dropped", "One Hot", 
"Ordinal", "Native"], - ylim=y_limit) + items = [ + dropped_result[key], + one_hot_result[key], + ordinal_result[key], + native_result[key], + ] + ax.bar( + x, + [np.mean(np.abs(item)) for item in items], + width, + yerr=[np.std(item) for item in items], + color=["C0", "C1", "C2", "C3"], + ) + ax.set( + xlabel="Model", + title=title, + xticks=x, + xticklabels=["Dropped", "One Hot", "Ordinal", "Native"], + ylim=y_limit, + ) fig.suptitle(figure_title) @@ -194,8 +214,10 @@ def plot_results(figure_title): # of trees and the depth of each tree. for pipe in (hist_dropped, hist_one_hot, hist_ordinal, hist_native): - pipe.set_params(histgradientboostingregressor__max_depth=3, - histgradientboostingregressor__max_iter=15) + pipe.set_params( + histgradientboostingregressor__max_depth=3, + histgradientboostingregressor__max_iter=15, + ) dropped_result = cross_validate(hist_dropped, X, y, cv=3, scoring=scoring) one_hot_result = cross_validate(hist_one_hot, X, y, cv=3, scoring=scoring) diff --git a/examples/ensemble/plot_gradient_boosting_early_stopping.py b/examples/ensemble/plot_gradient_boosting_early_stopping.py index 6f38e57a15ca1..cc4408b6dc255 100644 --- a/examples/ensemble/plot_gradient_boosting_early_stopping.py +++ b/examples/ensemble/plot_gradient_boosting_early_stopping.py @@ -49,7 +49,7 @@ data_list = [datasets.load_iris(), datasets.load_digits()] data_list = [(d.data, d.target) for d in data_list] data_list += [datasets.make_hastie_10_2()] -names = ['Iris Data', 'Digits Data', 'Hastie Data'] +names = ["Iris Data", "Digits Data", "Hastie Data"] n_gb = [] score_gb = [] @@ -61,17 +61,20 @@ n_estimators = 500 for X, y in data_list: - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, - random_state=0) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=0 + ) # We specify that if the scores don't improve by at least 0.01 for the last # 10 stages, stop fitting additional stages - gbes = 
ensemble.GradientBoostingClassifier(n_estimators=n_estimators, - validation_fraction=0.2, - n_iter_no_change=5, tol=0.01, - random_state=0) - gb = ensemble.GradientBoostingClassifier(n_estimators=n_estimators, - random_state=0) + gbes = ensemble.GradientBoostingClassifier( + n_estimators=n_estimators, + validation_fraction=0.2, + n_iter_no_change=5, + tol=0.01, + random_state=0, + ) + gb = ensemble.GradientBoostingClassifier(n_estimators=n_estimators, random_state=0) start = time.time() gb.fit(X_train, y_train) time_gb.append(time.time() - start) @@ -97,10 +100,12 @@ plt.figure(figsize=(9, 5)) -bar1 = plt.bar(index, score_gb, bar_width, label='Without early stopping', - color='crimson') -bar2 = plt.bar(index + bar_width, score_gbes, bar_width, - label='With early stopping', color='coral') +bar1 = plt.bar( + index, score_gb, bar_width, label="Without early stopping", color="crimson" +) +bar2 = plt.bar( + index + bar_width, score_gbes, bar_width, label="With early stopping", color="coral" +) plt.xticks(index + bar_width, names) plt.yticks(np.arange(0, 1.3, 0.1)) @@ -111,20 +116,24 @@ def autolabel(rects, n_estimators): Attach a text label above each bar displaying n_estimators of each model """ for i, rect in enumerate(rects): - plt.text(rect.get_x() + rect.get_width() / 2., - 1.05 * rect.get_height(), 'n_est=%d' % n_estimators[i], - ha='center', va='bottom') + plt.text( + rect.get_x() + rect.get_width() / 2.0, + 1.05 * rect.get_height(), + "n_est=%d" % n_estimators[i], + ha="center", + va="bottom", + ) autolabel(bar1, n_gb) autolabel(bar2, n_gbes) plt.ylim([0, 1.3]) -plt.legend(loc='best') +plt.legend(loc="best") plt.grid(True) -plt.xlabel('Datasets') -plt.ylabel('Test score') +plt.xlabel("Datasets") +plt.ylabel("Test score") plt.show() @@ -135,10 +144,12 @@ def autolabel(rects, n_estimators): plt.figure(figsize=(9, 5)) -bar1 = plt.bar(index, time_gb, bar_width, label='Without early stopping', - color='crimson') -bar2 = plt.bar(index + bar_width, time_gbes, 
bar_width, - label='With early stopping', color='coral') +bar1 = plt.bar( + index, time_gb, bar_width, label="Without early stopping", color="crimson" +) +bar2 = plt.bar( + index + bar_width, time_gbes, bar_width, label="With early stopping", color="coral" +) max_y = np.amax(np.maximum(time_gb, time_gbes)) @@ -149,10 +160,10 @@ def autolabel(rects, n_estimators): autolabel(bar2, n_gbes) plt.ylim([0, 1.3 * max_y]) -plt.legend(loc='best') +plt.legend(loc="best") plt.grid(True) -plt.xlabel('Datasets') -plt.ylabel('Fit Time') +plt.xlabel("Datasets") +plt.ylabel("Fit Time") plt.show() diff --git a/examples/ensemble/plot_gradient_boosting_oob.py b/examples/ensemble/plot_gradient_boosting_oob.py index c3b9321f166be..ce13eb4398403 100644 --- a/examples/ensemble/plot_gradient_boosting_oob.py +++ b/examples/ensemble/plot_gradient_boosting_oob.py @@ -51,24 +51,29 @@ X = np.c_[x1, x2, x3] X = X.astype(np.float32) -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, - random_state=9) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=9) # Fit classifier with out-of-bag estimates -params = {'n_estimators': 1200, 'max_depth': 3, 'subsample': 0.5, - 'learning_rate': 0.01, 'min_samples_leaf': 1, 'random_state': 3} +params = { + "n_estimators": 1200, + "max_depth": 3, + "subsample": 0.5, + "learning_rate": 0.01, + "min_samples_leaf": 1, + "random_state": 3, +} clf = ensemble.GradientBoostingClassifier(**params) clf.fit(X_train, y_train) acc = clf.score(X_test, y_test) print("Accuracy: {:.4f}".format(acc)) -n_estimators = params['n_estimators'] +n_estimators = params["n_estimators"] x = np.arange(n_estimators) + 1 def heldout_score(clf, X_test, y_test): - """compute deviance scores on ``X_test`` and ``y_test``. 
""" + """compute deviance scores on ``X_test`` and ``y_test``.""" score = np.zeros((n_estimators,), dtype=np.float64) for i, y_pred in enumerate(clf.staged_decision_function(X_test)): score[i] = clf.loss_(y_test, y_pred) @@ -112,26 +117,26 @@ def cv_estimate(n_splits=None): cv_color = list(map(lambda x: x / 256.0, (253, 192, 134))) # plot curves and vertical lines for best iterations -plt.plot(x, cumsum, label='OOB loss', color=oob_color) -plt.plot(x, test_score, label='Test loss', color=test_color) -plt.plot(x, cv_score, label='CV loss', color=cv_color) +plt.plot(x, cumsum, label="OOB loss", color=oob_color) +plt.plot(x, test_score, label="Test loss", color=test_color) +plt.plot(x, cv_score, label="CV loss", color=cv_color) plt.axvline(x=oob_best_iter, color=oob_color) plt.axvline(x=test_best_iter, color=test_color) plt.axvline(x=cv_best_iter, color=cv_color) # add three vertical lines to xticks xticks = plt.xticks() -xticks_pos = np.array(xticks[0].tolist() + - [oob_best_iter, cv_best_iter, test_best_iter]) -xticks_label = np.array(list(map(lambda t: int(t), xticks[0])) + - ['OOB', 'CV', 'Test']) +xticks_pos = np.array( + xticks[0].tolist() + [oob_best_iter, cv_best_iter, test_best_iter] +) +xticks_label = np.array(list(map(lambda t: int(t), xticks[0])) + ["OOB", "CV", "Test"]) ind = np.argsort(xticks_pos) xticks_pos = xticks_pos[ind] xticks_label = xticks_label[ind] plt.xticks(xticks_pos, xticks_label) -plt.legend(loc='upper right') -plt.ylabel('normalized loss') -plt.xlabel('number of iterations') +plt.legend(loc="upper right") +plt.ylabel("normalized loss") +plt.xlabel("number of iterations") plt.show() diff --git a/examples/ensemble/plot_gradient_boosting_quantile.py b/examples/ensemble/plot_gradient_boosting_quantile.py index 67e208ece0b06..93bc70038d3f6 100644 --- a/examples/ensemble/plot_gradient_boosting_quantile.py +++ b/examples/ensemble/plot_gradient_boosting_quantile.py @@ -66,14 +66,13 @@ def f(x): min_samples_split=9, ) for alpha in [0.05, 0.5, 
0.95]: - gbr = GradientBoostingRegressor(loss='quantile', alpha=alpha, - **common_params) + gbr = GradientBoostingRegressor(loss="quantile", alpha=alpha, **common_params) all_models["q %1.2f" % alpha] = gbr.fit(X_train, y_train) # %% # For the sake of comparison, we also fit a baseline model trained with the # usual (mean) squared error (MSE). -gbr_ls = GradientBoostingRegressor(loss='squared_error', **common_params) +gbr_ls = GradientBoostingRegressor(loss="squared_error", **common_params) all_models["mse"] = gbr_ls.fit(X_train, y_train) # %% @@ -88,24 +87,25 @@ def f(x): import matplotlib.pyplot as plt -y_pred = all_models['mse'].predict(xx) -y_lower = all_models['q 0.05'].predict(xx) -y_upper = all_models['q 0.95'].predict(xx) -y_med = all_models['q 0.50'].predict(xx) +y_pred = all_models["mse"].predict(xx) +y_lower = all_models["q 0.05"].predict(xx) +y_upper = all_models["q 0.95"].predict(xx) +y_med = all_models["q 0.50"].predict(xx) fig = plt.figure(figsize=(10, 10)) -plt.plot(xx, f(xx), 'g:', linewidth=3, label=r'$f(x) = x\,\sin(x)$') -plt.plot(X_test, y_test, 'b.', markersize=10, label='Test observations') -plt.plot(xx, y_med, 'r-', label='Predicted median', color="orange") -plt.plot(xx, y_pred, 'r-', label='Predicted mean') -plt.plot(xx, y_upper, 'k-') -plt.plot(xx, y_lower, 'k-') -plt.fill_between(xx.ravel(), y_lower, y_upper, alpha=0.4, - label='Predicted 90% interval') -plt.xlabel('$x$') -plt.ylabel('$f(x)$') +plt.plot(xx, f(xx), "g:", linewidth=3, label=r"$f(x) = x\,\sin(x)$") +plt.plot(X_test, y_test, "b.", markersize=10, label="Test observations") +plt.plot(xx, y_med, "r-", label="Predicted median", color="orange") +plt.plot(xx, y_pred, "r-", label="Predicted mean") +plt.plot(xx, y_upper, "k-") +plt.plot(xx, y_lower, "k-") +plt.fill_between( + xx.ravel(), y_lower, y_upper, alpha=0.4, label="Predicted 90% interval" +) +plt.xlabel("$x$") +plt.ylabel("$f(x)$") plt.ylim(-10, 25) -plt.legend(loc='upper left') +plt.legend(loc="upper left") plt.show() # %% 
@@ -129,21 +129,19 @@ def f(x): def highlight_min(x): x_min = x.min() - return ['font-weight: bold' if v == x_min else '' - for v in x] + return ["font-weight: bold" if v == x_min else "" for v in x] results = [] for name, gbr in sorted(all_models.items()): - metrics = {'model': name} + metrics = {"model": name} y_pred = gbr.predict(X_train) for alpha in [0.05, 0.5, 0.95]: - metrics["pbl=%1.2f" % alpha] = mean_pinball_loss( - y_train, y_pred, alpha=alpha) - metrics['MSE'] = mean_squared_error(y_train, y_pred) + metrics["pbl=%1.2f" % alpha] = mean_pinball_loss(y_train, y_pred, alpha=alpha) + metrics["MSE"] = mean_squared_error(y_train, y_pred) results.append(metrics) -pd.DataFrame(results).set_index('model').style.apply(highlight_min) +pd.DataFrame(results).set_index("model").style.apply(highlight_min) # %% # One column shows all models evaluated by the same metric. The minimum number @@ -163,15 +161,14 @@ def highlight_min(x): # We then do the same on the test set. results = [] for name, gbr in sorted(all_models.items()): - metrics = {'model': name} + metrics = {"model": name} y_pred = gbr.predict(X_test) for alpha in [0.05, 0.5, 0.95]: - metrics["pbl=%1.2f" % alpha] = mean_pinball_loss( - y_test, y_pred, alpha=alpha) - metrics['MSE'] = mean_squared_error(y_test, y_pred) + metrics["pbl=%1.2f" % alpha] = mean_pinball_loss(y_test, y_pred, alpha=alpha) + metrics["MSE"] = mean_squared_error(y_test, y_pred) results.append(metrics) -pd.DataFrame(results).set_index('model').style.apply(highlight_min) +pd.DataFrame(results).set_index("model").style.apply(highlight_min) # %% @@ -199,16 +196,18 @@ def coverage_fraction(y, y_low, y_high): return np.mean(np.logical_and(y >= y_low, y <= y_high)) -coverage_fraction(y_train, - all_models['q 0.05'].predict(X_train), - all_models['q 0.95'].predict(X_train)) +coverage_fraction( + y_train, + all_models["q 0.05"].predict(X_train), + all_models["q 0.95"].predict(X_train), +) # %% # On the training set the calibration is very close to 
the expected coverage # value for a 90% confidence interval. -coverage_fraction(y_test, - all_models['q 0.05'].predict(X_test), - all_models['q 0.95'].predict(X_test)) +coverage_fraction( + y_test, all_models["q 0.05"].predict(X_test), all_models["q 0.95"].predict(X_test) +) # %% @@ -298,16 +297,17 @@ def coverage_fraction(y, y_low, y_high): y_upper = search_95p.predict(xx) fig = plt.figure(figsize=(10, 10)) -plt.plot(xx, f(xx), 'g:', linewidth=3, label=r'$f(x) = x\,\sin(x)$') -plt.plot(X_test, y_test, 'b.', markersize=10, label='Test observations') -plt.plot(xx, y_upper, 'k-') -plt.plot(xx, y_lower, 'k-') -plt.fill_between(xx.ravel(), y_lower, y_upper, alpha=0.4, - label='Predicted 90% interval') -plt.xlabel('$x$') -plt.ylabel('$f(x)$') +plt.plot(xx, f(xx), "g:", linewidth=3, label=r"$f(x) = x\,\sin(x)$") +plt.plot(X_test, y_test, "b.", markersize=10, label="Test observations") +plt.plot(xx, y_upper, "k-") +plt.plot(xx, y_lower, "k-") +plt.fill_between( + xx.ravel(), y_lower, y_upper, alpha=0.4, label="Predicted 90% interval" +) +plt.xlabel("$x$") +plt.ylabel("$f(x)$") plt.ylim(-10, 25) -plt.legend(loc='upper left') +plt.legend(loc="upper left") plt.title("Prediction with tuned hyper-parameters") plt.show() @@ -317,13 +317,9 @@ def coverage_fraction(y, y_low, y_high): # # We now quantitatively evaluate the joint-calibration of the pair of # estimators: -coverage_fraction(y_train, - search_05p.predict(X_train), - search_95p.predict(X_train)) +coverage_fraction(y_train, search_05p.predict(X_train), search_95p.predict(X_train)) # %% -coverage_fraction(y_test, - search_05p.predict(X_test), - search_95p.predict(X_test)) +coverage_fraction(y_test, search_05p.predict(X_test), search_95p.predict(X_test)) # %% # The calibration of the tuned pair is sadly not better on the test set: the # width of the estimated confidence interval is still too narrow. 
diff --git a/examples/ensemble/plot_gradient_boosting_regression.py b/examples/ensemble/plot_gradient_boosting_regression.py index 3722f4bf2066f..c258dc13babaf 100644 --- a/examples/ensemble/plot_gradient_boosting_regression.py +++ b/examples/ensemble/plot_gradient_boosting_regression.py @@ -61,13 +61,16 @@ # :class:`~sklearn.ensemble.GradientBoostingRegressor` ). X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.1, random_state=13) + X, y, test_size=0.1, random_state=13 +) -params = {'n_estimators': 500, - 'max_depth': 4, - 'min_samples_split': 5, - 'learning_rate': 0.01, - 'loss': 'squared_error'} +params = { + "n_estimators": 500, + "max_depth": 4, + "min_samples_split": 5, + "learning_rate": 0.01, + "loss": "squared_error", +} # %% # Fit regression model @@ -89,20 +92,25 @@ # Finally, we will visualize the results. To do that we will first compute the # test set deviance and then plot it against boosting iterations. -test_score = np.zeros((params['n_estimators'],), dtype=np.float64) +test_score = np.zeros((params["n_estimators"],), dtype=np.float64) for i, y_pred in enumerate(reg.staged_predict(X_test)): test_score[i] = reg.loss_(y_test, y_pred) fig = plt.figure(figsize=(6, 6)) plt.subplot(1, 1, 1) -plt.title('Deviance') -plt.plot(np.arange(params['n_estimators']) + 1, reg.train_score_, 'b-', - label='Training Set Deviance') -plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-', - label='Test Set Deviance') -plt.legend(loc='upper right') -plt.xlabel('Boosting Iterations') -plt.ylabel('Deviance') +plt.title("Deviance") +plt.plot( + np.arange(params["n_estimators"]) + 1, + reg.train_score_, + "b-", + label="Training Set Deviance", +) +plt.plot( + np.arange(params["n_estimators"]) + 1, test_score, "r-", label="Test Set Deviance" +) +plt.legend(loc="upper right") +plt.xlabel("Boosting Iterations") +plt.ylabel("Deviance") fig.tight_layout() plt.show() @@ -123,19 +131,23 @@ feature_importance = reg.feature_importances_ sorted_idx = 
np.argsort(feature_importance) -pos = np.arange(sorted_idx.shape[0]) + .5 +pos = np.arange(sorted_idx.shape[0]) + 0.5 fig = plt.figure(figsize=(12, 6)) plt.subplot(1, 2, 1) -plt.barh(pos, feature_importance[sorted_idx], align='center') +plt.barh(pos, feature_importance[sorted_idx], align="center") plt.yticks(pos, np.array(diabetes.feature_names)[sorted_idx]) -plt.title('Feature Importance (MDI)') +plt.title("Feature Importance (MDI)") -result = permutation_importance(reg, X_test, y_test, n_repeats=10, - random_state=42, n_jobs=2) +result = permutation_importance( + reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2 +) sorted_idx = result.importances_mean.argsort() plt.subplot(1, 2, 2) -plt.boxplot(result.importances[sorted_idx].T, - vert=False, labels=np.array(diabetes.feature_names)[sorted_idx]) +plt.boxplot( + result.importances[sorted_idx].T, + vert=False, + labels=np.array(diabetes.feature_names)[sorted_idx], +) plt.title("Permutation Importance (test set)") fig.tight_layout() plt.show() diff --git a/examples/ensemble/plot_gradient_boosting_regularization.py b/examples/ensemble/plot_gradient_boosting_regularization.py index 6f2eb893ebe0f..5947233d60860 100644 --- a/examples/ensemble/plot_gradient_boosting_regularization.py +++ b/examples/ensemble/plot_gradient_boosting_regularization.py @@ -41,21 +41,31 @@ X_train, X_test = X[:2000], X[2000:] y_train, y_test = y[:2000], y[2000:] -original_params = {'n_estimators': 1000, 'max_leaf_nodes': 4, - 'max_depth': None, 'random_state': 2, 'min_samples_split': 5} +original_params = { + "n_estimators": 1000, + "max_leaf_nodes": 4, + "max_depth": None, + "random_state": 2, + "min_samples_split": 5, +} plt.figure() -for label, color, setting in [('No shrinkage', 'orange', - {'learning_rate': 1.0, 'subsample': 1.0}), - ('learning_rate=0.1', 'turquoise', - {'learning_rate': 0.1, 'subsample': 1.0}), - ('subsample=0.5', 'blue', - {'learning_rate': 1.0, 'subsample': 0.5}), - ('learning_rate=0.1, subsample=0.5', 'gray', 
- {'learning_rate': 0.1, 'subsample': 0.5}), - ('learning_rate=0.1, max_features=2', 'magenta', - {'learning_rate': 0.1, 'max_features': 2})]: +for label, color, setting in [ + ("No shrinkage", "orange", {"learning_rate": 1.0, "subsample": 1.0}), + ("learning_rate=0.1", "turquoise", {"learning_rate": 0.1, "subsample": 1.0}), + ("subsample=0.5", "blue", {"learning_rate": 1.0, "subsample": 0.5}), + ( + "learning_rate=0.1, subsample=0.5", + "gray", + {"learning_rate": 0.1, "subsample": 0.5}, + ), + ( + "learning_rate=0.1, max_features=2", + "magenta", + {"learning_rate": 0.1, "max_features": 2}, + ), +]: params = dict(original_params) params.update(setting) @@ -63,17 +73,22 @@ clf.fit(X_train, y_train) # compute test set deviance - test_deviance = np.zeros((params['n_estimators'],), dtype=np.float64) + test_deviance = np.zeros((params["n_estimators"],), dtype=np.float64) for i, y_pred in enumerate(clf.staged_decision_function(X_test)): # clf.loss_ assumes that y_test[i] in {0, 1} test_deviance[i] = clf.loss_(y_test, y_pred) - plt.plot((np.arange(test_deviance.shape[0]) + 1)[::5], test_deviance[::5], - '-', color=color, label=label) - -plt.legend(loc='upper left') -plt.xlabel('Boosting Iterations') -plt.ylabel('Test Set Deviance') + plt.plot( + (np.arange(test_deviance.shape[0]) + 1)[::5], + test_deviance[::5], + "-", + color=color, + label=label, + ) + +plt.legend(loc="upper left") +plt.xlabel("Boosting Iterations") +plt.ylabel("Test Set Deviance") plt.show() diff --git a/examples/ensemble/plot_isolation_forest.py b/examples/ensemble/plot_isolation_forest.py index 5370f3af3ef97..12a2bfd846279 100644 --- a/examples/ensemble/plot_isolation_forest.py +++ b/examples/ensemble/plot_isolation_forest.py @@ -54,17 +54,15 @@ plt.title("IsolationForest") plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r) -b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', - s=20, edgecolor='k') -b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green', - s=20, edgecolor='k') -c = 
plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red', - s=20, edgecolor='k') -plt.axis('tight') +b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c="white", s=20, edgecolor="k") +b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c="green", s=20, edgecolor="k") +c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c="red", s=20, edgecolor="k") +plt.axis("tight") plt.xlim((-5, 5)) plt.ylim((-5, 5)) -plt.legend([b1, b2, c], - ["training observations", - "new regular observations", "new abnormal observations"], - loc="upper left") +plt.legend( + [b1, b2, c], + ["training observations", "new regular observations", "new abnormal observations"], + loc="upper left", +) plt.show() diff --git a/examples/ensemble/plot_monotonic_constraints.py b/examples/ensemble/plot_monotonic_constraints.py index 6146a3bb72db1..1039f69b6fbe9 100644 --- a/examples/ensemble/plot_monotonic_constraints.py +++ b/examples/ensemble/plot_monotonic_constraints.py @@ -33,9 +33,7 @@ f_1 = rng.rand(n_samples) # negative correlation with y X = np.c_[f_0, f_1] noise = rng.normal(loc=0.0, scale=0.01, size=n_samples) -y = (5 * f_0 + np.sin(10 * np.pi * f_0) - - 5 * f_1 - np.cos(10 * np.pi * f_1) + - noise) +y = 5 * f_0 + np.sin(10 * np.pi * f_0) - 5 * f_1 - np.cos(10 * np.pi * f_1) + noise fig, ax = plt.subplots() diff --git a/examples/ensemble/plot_random_forest_embedding.py b/examples/ensemble/plot_random_forest_embedding.py index 4d0ccd4502c31..339dab440502d 100644 --- a/examples/ensemble/plot_random_forest_embedding.py +++ b/examples/ensemble/plot_random_forest_embedding.py @@ -57,23 +57,24 @@ fig = plt.figure(figsize=(9, 8)) ax = plt.subplot(221) -ax.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolor='k') +ax.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolor="k") ax.set_title("Original Data (2d)") ax.set_xticks(()) ax.set_yticks(()) ax = plt.subplot(222) -ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, s=50, edgecolor='k') -ax.set_title("Truncated SVD reduction (2d) of transformed data (%dd)" % - 
X_transformed.shape[1]) +ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, s=50, edgecolor="k") +ax.set_title( + "Truncated SVD reduction (2d) of transformed data (%dd)" % X_transformed.shape[1] +) ax.set_xticks(()) ax.set_yticks(()) # Plot the decision in original space. For that, we will assign a color # to each point in the mesh [x_min, x_max]x[y_min, y_max]. -h = .01 -x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 -y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 +h = 0.01 +x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5 +y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) # transform grid using RandomTreesEmbedding @@ -83,7 +84,7 @@ ax = plt.subplot(223) ax.set_title("Naive Bayes on Transformed data") ax.pcolormesh(xx, yy, y_grid_pred.reshape(xx.shape)) -ax.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolor='k') +ax.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolor="k") ax.set_ylim(-1.4, 1.4) ax.set_xlim(-1.4, 1.4) ax.set_xticks(()) @@ -95,7 +96,7 @@ ax = plt.subplot(224) ax.set_title("ExtraTrees predictions") ax.pcolormesh(xx, yy, y_grid_pred.reshape(xx.shape)) -ax.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolor='k') +ax.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolor="k") ax.set_ylim(-1.4, 1.4) ax.set_xlim(-1.4, 1.4) ax.set_xticks(()) diff --git a/examples/ensemble/plot_random_forest_regression_multioutput.py b/examples/ensemble/plot_random_forest_regression_multioutput.py index 8b7803361a60a..220f759ba40b1 100644 --- a/examples/ensemble/plot_random_forest_regression_multioutput.py +++ b/examples/ensemble/plot_random_forest_regression_multioutput.py @@ -37,19 +37,19 @@ rng = np.random.RandomState(1) X = np.sort(200 * rng.rand(600, 1) - 100, axis=0) y = np.array([np.pi * np.sin(X).ravel(), np.pi * np.cos(X).ravel()]).T -y += (0.5 - rng.rand(*y.shape)) +y += 0.5 - rng.rand(*y.shape) X_train, X_test, y_train, y_test = train_test_split( - X, y, train_size=400, 
test_size=200, random_state=4) + X, y, train_size=400, test_size=200, random_state=4 +) max_depth = 30 -regr_multirf = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, - max_depth=max_depth, - random_state=0)) +regr_multirf = MultiOutputRegressor( + RandomForestRegressor(n_estimators=100, max_depth=max_depth, random_state=0) +) regr_multirf.fit(X_train, y_train) -regr_rf = RandomForestRegressor(n_estimators=100, max_depth=max_depth, - random_state=2) +regr_rf = RandomForestRegressor(n_estimators=100, max_depth=max_depth, random_state=2) regr_rf.fit(X_train, y_train) # Predict on new data @@ -60,14 +60,35 @@ plt.figure() s = 50 a = 0.4 -plt.scatter(y_test[:, 0], y_test[:, 1], edgecolor='k', - c="navy", s=s, marker="s", alpha=a, label="Data") -plt.scatter(y_multirf[:, 0], y_multirf[:, 1], edgecolor='k', - c="cornflowerblue", s=s, alpha=a, - label="Multi RF score=%.2f" % regr_multirf.score(X_test, y_test)) -plt.scatter(y_rf[:, 0], y_rf[:, 1], edgecolor='k', - c="c", s=s, marker="^", alpha=a, - label="RF score=%.2f" % regr_rf.score(X_test, y_test)) +plt.scatter( + y_test[:, 0], + y_test[:, 1], + edgecolor="k", + c="navy", + s=s, + marker="s", + alpha=a, + label="Data", +) +plt.scatter( + y_multirf[:, 0], + y_multirf[:, 1], + edgecolor="k", + c="cornflowerblue", + s=s, + alpha=a, + label="Multi RF score=%.2f" % regr_multirf.score(X_test, y_test), +) +plt.scatter( + y_rf[:, 0], + y_rf[:, 1], + edgecolor="k", + c="c", + s=s, + marker="^", + alpha=a, + label="RF score=%.2f" % regr_rf.score(X_test, y_test), +) plt.xlim([-6, 6]) plt.ylim([-6, 6]) plt.xlabel("target 1") diff --git a/examples/ensemble/plot_stack_predictors.py b/examples/ensemble/plot_stack_predictors.py index afa48c62d8d0b..1b48d50f2c40f 100644 --- a/examples/ensemble/plot_stack_predictors.py +++ b/examples/ensemble/plot_stack_predictors.py @@ -23,7 +23,8 @@ print(__doc__) from sklearn import set_config -set_config(display='diagram') + +set_config(display="diagram") # %% # Download the dataset @@ 
-54,11 +55,28 @@ def load_ames_housing(): X = df.data y = df.target - features = ['YrSold', 'HeatingQC', 'Street', 'YearRemodAdd', 'Heating', - 'MasVnrType', 'BsmtUnfSF', 'Foundation', 'MasVnrArea', - 'MSSubClass', 'ExterQual', 'Condition2', 'GarageCars', - 'GarageType', 'OverallQual', 'TotalBsmtSF', 'BsmtFinSF1', - 'HouseStyle', 'MiscFeature', 'MoSold'] + features = [ + "YrSold", + "HeatingQC", + "Street", + "YearRemodAdd", + "Heating", + "MasVnrType", + "BsmtUnfSF", + "Foundation", + "MasVnrArea", + "MSSubClass", + "ExterQual", + "Condition2", + "GarageCars", + "GarageType", + "OverallQual", + "TotalBsmtSF", + "BsmtFinSF1", + "HouseStyle", + "MiscFeature", + "MoSold", + ] X = X[features] X, y = shuffle(X, y, random_state=0) @@ -105,11 +123,13 @@ def load_ames_housing(): from sklearn.preprocessing import OrdinalEncoder cat_tree_processor = OrdinalEncoder( - handle_unknown="use_encoded_value", unknown_value=-1) + handle_unknown="use_encoded_value", unknown_value=-1 +) num_tree_processor = SimpleImputer(strategy="mean", add_indicator=True) tree_preprocessor = make_column_transformer( - (num_tree_processor, num_selector), (cat_tree_processor, cat_selector)) + (num_tree_processor, num_selector), (cat_tree_processor, cat_selector) +) tree_preprocessor # %% @@ -121,10 +141,12 @@ def load_ames_housing(): cat_linear_processor = OneHotEncoder(handle_unknown="ignore") num_linear_processor = make_pipeline( - StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True)) + StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True) +) linear_preprocessor = make_column_transformer( - (num_linear_processor, num_selector), (cat_linear_processor, cat_selector)) + (num_linear_processor, num_selector), (cat_linear_processor, cat_selector) +) linear_preprocessor # %% @@ -155,27 +177,28 @@ def load_ames_housing(): # %% from sklearn.ensemble import RandomForestRegressor -rf_pipeline = make_pipeline( - tree_preprocessor, RandomForestRegressor(random_state=42)) 
+rf_pipeline = make_pipeline(tree_preprocessor, RandomForestRegressor(random_state=42)) rf_pipeline # %% from sklearn.ensemble import HistGradientBoostingRegressor gbdt_pipeline = make_pipeline( - tree_preprocessor, HistGradientBoostingRegressor(random_state=0)) + tree_preprocessor, HistGradientBoostingRegressor(random_state=0) +) gbdt_pipeline # %% from sklearn.ensemble import StackingRegressor from sklearn.linear_model import RidgeCV -estimators = [('Random Forest', rf_pipeline), - ('Lasso', lasso_pipeline), - ('Gradient Boosting', gbdt_pipeline)] +estimators = [ + ("Random Forest", rf_pipeline), + ("Lasso", lasso_pipeline), + ("Gradient Boosting", gbdt_pipeline), +] -stacking_regressor = StackingRegressor( - estimators=estimators, final_estimator=RidgeCV()) +stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=RidgeCV()) stacking_regressor # %% @@ -197,52 +220,58 @@ def load_ames_housing(): def plot_regression_results(ax, y_true, y_pred, title, scores, elapsed_time): """Scatter plot of the predicted vs true targets.""" - ax.plot([y_true.min(), y_true.max()], - [y_true.min(), y_true.max()], - '--r', linewidth=2) + ax.plot( + [y_true.min(), y_true.max()], [y_true.min(), y_true.max()], "--r", linewidth=2 + ) ax.scatter(y_true, y_pred, alpha=0.2) - ax.spines['top'].set_visible(False) - ax.spines['right'].set_visible(False) + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) ax.get_xaxis().tick_bottom() ax.get_yaxis().tick_left() - ax.spines['left'].set_position(('outward', 10)) - ax.spines['bottom'].set_position(('outward', 10)) + ax.spines["left"].set_position(("outward", 10)) + ax.spines["bottom"].set_position(("outward", 10)) ax.set_xlim([y_true.min(), y_true.max()]) ax.set_ylim([y_true.min(), y_true.max()]) - ax.set_xlabel('Measured') - ax.set_ylabel('Predicted') - extra = plt.Rectangle((0, 0), 0, 0, fc="w", fill=False, - edgecolor='none', linewidth=0) - ax.legend([extra], [scores], loc='upper left') - title = 
title + '\n Evaluation in {:.2f} seconds'.format(elapsed_time) + ax.set_xlabel("Measured") + ax.set_ylabel("Predicted") + extra = plt.Rectangle( + (0, 0), 0, 0, fc="w", fill=False, edgecolor="none", linewidth=0 + ) + ax.legend([extra], [scores], loc="upper left") + title = title + "\n Evaluation in {:.2f} seconds".format(elapsed_time) ax.set_title(title) fig, axs = plt.subplots(2, 2, figsize=(9, 7)) axs = np.ravel(axs) -for ax, (name, est) in zip(axs, estimators + [('Stacking Regressor', - stacking_regressor)]): +for ax, (name, est) in zip( + axs, estimators + [("Stacking Regressor", stacking_regressor)] +): start_time = time.time() - score = cross_validate(est, X, y, - scoring=['r2', 'neg_mean_absolute_error'], - n_jobs=-1, verbose=0) + score = cross_validate( + est, X, y, scoring=["r2", "neg_mean_absolute_error"], n_jobs=-1, verbose=0 + ) elapsed_time = time.time() - start_time y_pred = cross_val_predict(est, X, y, n_jobs=-1, verbose=0) plot_regression_results( - ax, y, y_pred, + ax, + y, + y_pred, name, - (r'$R^2={:.2f} \pm {:.2f}$' + '\n' + r'$MAE={:.2f} \pm {:.2f}$') - .format(np.mean(score['test_r2']), - np.std(score['test_r2']), - -np.mean(score['test_neg_mean_absolute_error']), - np.std(score['test_neg_mean_absolute_error'])), - elapsed_time) - -plt.suptitle('Single predictors versus stacked predictors') + (r"$R^2={:.2f} \pm {:.2f}$" + "\n" + r"$MAE={:.2f} \pm {:.2f}$").format( + np.mean(score["test_r2"]), + np.std(score["test_r2"]), + -np.mean(score["test_neg_mean_absolute_error"]), + np.std(score["test_neg_mean_absolute_error"]), + ), + elapsed_time, + ) + +plt.suptitle("Single predictors versus stacked predictors") plt.tight_layout() plt.subplots_adjust(top=0.9) plt.show() diff --git a/examples/ensemble/plot_voting_decision_regions.py b/examples/ensemble/plot_voting_decision_regions.py index fdfda74947f5f..7b2cb278c9035 100644 --- a/examples/ensemble/plot_voting_decision_regions.py +++ b/examples/ensemble/plot_voting_decision_regions.py @@ -43,10 +43,12 
@@ # Training classifiers clf1 = DecisionTreeClassifier(max_depth=4) clf2 = KNeighborsClassifier(n_neighbors=7) -clf3 = SVC(gamma=.1, kernel='rbf', probability=True) -eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), - ('svc', clf3)], - voting='soft', weights=[2, 1, 2]) +clf3 = SVC(gamma=0.1, kernel="rbf", probability=True) +eclf = VotingClassifier( + estimators=[("dt", clf1), ("knn", clf2), ("svc", clf3)], + voting="soft", + weights=[2, 1, 2], +) clf1.fit(X, y) clf2.fit(X, y) @@ -56,22 +58,21 @@ # Plotting decision regions x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 -xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), - np.arange(y_min, y_max, 0.1)) +xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) -f, axarr = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(10, 8)) +f, axarr = plt.subplots(2, 2, sharex="col", sharey="row", figsize=(10, 8)) -for idx, clf, tt in zip(product([0, 1], [0, 1]), - [clf1, clf2, clf3, eclf], - ['Decision Tree (depth=4)', 'KNN (k=7)', - 'Kernel SVM', 'Soft Voting']): +for idx, clf, tt in zip( + product([0, 1], [0, 1]), + [clf1, clf2, clf3, eclf], + ["Decision Tree (depth=4)", "KNN (k=7)", "Kernel SVM", "Soft Voting"], +): Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.4) - axarr[idx[0], idx[1]].scatter(X[:, 0], X[:, 1], c=y, - s=20, edgecolor='k') + axarr[idx[0], idx[1]].scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k") axarr[idx[0], idx[1]].set_title(tt) plt.show() diff --git a/examples/ensemble/plot_voting_probas.py b/examples/ensemble/plot_voting_probas.py index 1e65e7d725964..311539f36fc75 100644 --- a/examples/ensemble/plot_voting_probas.py +++ b/examples/ensemble/plot_voting_probas.py @@ -38,9 +38,11 @@ X = np.array([[-1.0, -1.0], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]]) y = np.array([1, 1, 2, 2]) -eclf = VotingClassifier(estimators=[('lr', clf1), 
('rf', clf2), ('gnb', clf3)], - voting='soft', - weights=[1, 1, 5]) +eclf = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], + voting="soft", + weights=[1, 1, 5], +) # predict class probabilities for all classifiers probas = [c.fit(X, y).predict_proba(X) for c in (clf1, clf2, clf3, eclf)] @@ -59,28 +61,36 @@ fig, ax = plt.subplots() # bars for classifier 1-3 -p1 = ax.bar(ind, np.hstack(([class1_1[:-1], [0]])), width, - color='green', edgecolor='k') -p2 = ax.bar(ind + width, np.hstack(([class2_1[:-1], [0]])), width, - color='lightgreen', edgecolor='k') +p1 = ax.bar(ind, np.hstack(([class1_1[:-1], [0]])), width, color="green", edgecolor="k") +p2 = ax.bar( + ind + width, + np.hstack(([class2_1[:-1], [0]])), + width, + color="lightgreen", + edgecolor="k", +) # bars for VotingClassifier -p3 = ax.bar(ind, [0, 0, 0, class1_1[-1]], width, - color='blue', edgecolor='k') -p4 = ax.bar(ind + width, [0, 0, 0, class2_1[-1]], width, - color='steelblue', edgecolor='k') +p3 = ax.bar(ind, [0, 0, 0, class1_1[-1]], width, color="blue", edgecolor="k") +p4 = ax.bar( + ind + width, [0, 0, 0, class2_1[-1]], width, color="steelblue", edgecolor="k" +) # plot annotations -plt.axvline(2.8, color='k', linestyle='dashed') +plt.axvline(2.8, color="k", linestyle="dashed") ax.set_xticks(ind + width) -ax.set_xticklabels(['LogisticRegression\nweight 1', - 'GaussianNB\nweight 1', - 'RandomForestClassifier\nweight 5', - 'VotingClassifier\n(average probabilities)'], - rotation=40, - ha='right') +ax.set_xticklabels( + [ + "LogisticRegression\nweight 1", + "GaussianNB\nweight 1", + "RandomForestClassifier\nweight 5", + "VotingClassifier\n(average probabilities)", + ], + rotation=40, + ha="right", +) plt.ylim([0, 1]) -plt.title('Class probabilities for sample 1 by different classifiers') -plt.legend([p1[0], p2[0]], ['class 1', 'class 2'], loc='upper left') +plt.title("Class probabilities for sample 1 by different classifiers") +plt.legend([p1[0], p2[0]], ["class 1", "class 2"], 
loc="upper left") plt.tight_layout() plt.show() diff --git a/examples/ensemble/plot_voting_regressor.py b/examples/ensemble/plot_voting_regressor.py index 106efbac2461d..bc0c0b059e9e6 100644 --- a/examples/ensemble/plot_voting_regressor.py +++ b/examples/ensemble/plot_voting_regressor.py @@ -51,7 +51,7 @@ reg2.fit(X, y) reg3.fit(X, y) -ereg = VotingRegressor([('gb', reg1), ('rf', reg2), ('lr', reg3)]) +ereg = VotingRegressor([("gb", reg1), ("rf", reg2), ("lr", reg3)]) ereg.fit(X, y) # %% @@ -75,16 +75,15 @@ # prediction made by :class:`~ensemble.VotingRegressor`. plt.figure() -plt.plot(pred1, 'gd', label='GradientBoostingRegressor') -plt.plot(pred2, 'b^', label='RandomForestRegressor') -plt.plot(pred3, 'ys', label='LinearRegression') -plt.plot(pred4, 'r*', ms=10, label='VotingRegressor') - -plt.tick_params(axis='x', which='both', bottom=False, top=False, - labelbottom=False) -plt.ylabel('predicted') -plt.xlabel('training samples') +plt.plot(pred1, "gd", label="GradientBoostingRegressor") +plt.plot(pred2, "b^", label="RandomForestRegressor") +plt.plot(pred3, "ys", label="LinearRegression") +plt.plot(pred4, "r*", ms=10, label="VotingRegressor") + +plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False) +plt.ylabel("predicted") +plt.xlabel("training samples") plt.legend(loc="best") -plt.title('Regressor predictions and their average') +plt.title("Regressor predictions and their average") plt.show() diff --git a/examples/exercises/plot_cv_diabetes.py b/examples/exercises/plot_cv_diabetes.py index d71abd8fe6455..aa836111f081d 100644 --- a/examples/exercises/plot_cv_diabetes.py +++ b/examples/exercises/plot_cv_diabetes.py @@ -27,28 +27,28 @@ lasso = Lasso(random_state=0, max_iter=10000) alphas = np.logspace(-4, -0.5, 30) -tuned_parameters = [{'alpha': alphas}] +tuned_parameters = [{"alpha": alphas}] n_folds = 5 clf = GridSearchCV(lasso, tuned_parameters, cv=n_folds, refit=False) clf.fit(X, y) -scores = clf.cv_results_['mean_test_score'] 
-scores_std = clf.cv_results_['std_test_score'] +scores = clf.cv_results_["mean_test_score"] +scores_std = clf.cv_results_["std_test_score"] plt.figure().set_size_inches(8, 6) plt.semilogx(alphas, scores) # plot error lines showing +/- std. errors of the scores std_error = scores_std / np.sqrt(n_folds) -plt.semilogx(alphas, scores + std_error, 'b--') -plt.semilogx(alphas, scores - std_error, 'b--') +plt.semilogx(alphas, scores + std_error, "b--") +plt.semilogx(alphas, scores - std_error, "b--") # alpha=0.2 controls the translucency of the fill color plt.fill_between(alphas, scores + std_error, scores - std_error, alpha=0.2) -plt.ylabel('CV score +/- std error') -plt.xlabel('alpha') -plt.axhline(np.max(scores), linestyle='--', color='.5') +plt.ylabel("CV score +/- std error") +plt.xlabel("alpha") +plt.axhline(np.max(scores), linestyle="--", color=".5") plt.xlim([alphas[0], alphas[-1]]) # ############################################################################# @@ -62,15 +62,17 @@ lasso_cv = LassoCV(alphas=alphas, random_state=0, max_iter=10000) k_fold = KFold(3) -print("Answer to the bonus question:", - "how much can you trust the selection of alpha?") +print("Answer to the bonus question:", "how much can you trust the selection of alpha?") print() print("Alpha parameters maximising the generalization score on different") print("subsets of the data:") for k, (train, test) in enumerate(k_fold.split(X, y)): lasso_cv.fit(X[train], y[train]) - print("[fold {0}] alpha: {1:.5f}, score: {2:.5f}". 
- format(k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test]))) + print( + "[fold {0}] alpha: {1:.5f}, score: {2:.5f}".format( + k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test]) + ) + ) print() print("Answer: Not very much since we obtained different alphas for different") print("subsets of the data and moreover, the scores for these alphas differ") diff --git a/examples/exercises/plot_cv_digits.py b/examples/exercises/plot_cv_digits.py index 4f4ef8cc761e6..f350444395be7 100644 --- a/examples/exercises/plot_cv_digits.py +++ b/examples/exercises/plot_cv_digits.py @@ -17,7 +17,7 @@ X, y = datasets.load_digits(return_X_y=True) -svc = svm.SVC(kernel='linear') +svc = svm.SVC(kernel="linear") C_s = np.logspace(-10, 0, 10) scores = list() @@ -30,13 +30,14 @@ # Do the plotting import matplotlib.pyplot as plt + plt.figure() plt.semilogx(C_s, scores) -plt.semilogx(C_s, np.array(scores) + np.array(scores_std), 'b--') -plt.semilogx(C_s, np.array(scores) - np.array(scores_std), 'b--') +plt.semilogx(C_s, np.array(scores) + np.array(scores_std), "b--") +plt.semilogx(C_s, np.array(scores) - np.array(scores_std), "b--") locs, labels = plt.yticks() plt.yticks(locs, list(map(lambda x: "%g" % x, locs))) -plt.ylabel('CV score') -plt.xlabel('Parameter C') +plt.ylabel("CV score") +plt.xlabel("Parameter C") plt.ylim(0, 1.1) plt.show() diff --git a/examples/exercises/plot_digits_classification_exercise.py b/examples/exercises/plot_digits_classification_exercise.py index f5f01687d03eb..638631f80d066 100644 --- a/examples/exercises/plot_digits_classification_exercise.py +++ b/examples/exercises/plot_digits_classification_exercise.py @@ -19,14 +19,16 @@ n_samples = len(X_digits) -X_train = X_digits[:int(.9 * n_samples)] -y_train = y_digits[:int(.9 * n_samples)] -X_test = X_digits[int(.9 * n_samples):] -y_test = y_digits[int(.9 * n_samples):] +X_train = X_digits[: int(0.9 * n_samples)] +y_train = y_digits[: int(0.9 * n_samples)] +X_test = X_digits[int(0.9 * n_samples) :] +y_test = 
y_digits[int(0.9 * n_samples) :] knn = neighbors.KNeighborsClassifier() logistic = linear_model.LogisticRegression(max_iter=1000) -print('KNN score: %f' % knn.fit(X_train, y_train).score(X_test, y_test)) -print('LogisticRegression score: %f' - % logistic.fit(X_train, y_train).score(X_test, y_test)) +print("KNN score: %f" % knn.fit(X_train, y_train).score(X_test, y_test)) +print( + "LogisticRegression score: %f" + % logistic.fit(X_train, y_train).score(X_test, y_test) +) diff --git a/examples/exercises/plot_iris_exercise.py b/examples/exercises/plot_iris_exercise.py index 39723a2271f5d..98586c311697e 100644 --- a/examples/exercises/plot_iris_exercise.py +++ b/examples/exercises/plot_iris_exercise.py @@ -29,26 +29,28 @@ X = X[order] y = y[order].astype(float) -X_train = X[:int(.9 * n_sample)] -y_train = y[:int(.9 * n_sample)] -X_test = X[int(.9 * n_sample):] -y_test = y[int(.9 * n_sample):] +X_train = X[: int(0.9 * n_sample)] +y_train = y[: int(0.9 * n_sample)] +X_test = X[int(0.9 * n_sample) :] +y_test = y[int(0.9 * n_sample) :] # fit the model -for kernel in ('linear', 'rbf', 'poly'): +for kernel in ("linear", "rbf", "poly"): clf = svm.SVC(kernel=kernel, gamma=10) clf.fit(X_train, y_train) plt.figure() plt.clf() - plt.scatter(X[:, 0], X[:, 1], c=y, zorder=10, cmap=plt.cm.Paired, - edgecolor='k', s=20) + plt.scatter( + X[:, 0], X[:, 1], c=y, zorder=10, cmap=plt.cm.Paired, edgecolor="k", s=20 + ) # Circle out the test data - plt.scatter(X_test[:, 0], X_test[:, 1], s=80, facecolors='none', - zorder=10, edgecolor='k') + plt.scatter( + X_test[:, 0], X_test[:, 1], s=80, facecolors="none", zorder=10, edgecolor="k" + ) - plt.axis('tight') + plt.axis("tight") x_min = X[:, 0].min() x_max = X[:, 0].max() y_min = X[:, 1].min() @@ -60,8 +62,14 @@ # Put the result into a color plot Z = Z.reshape(XX.shape) plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired) - plt.contour(XX, YY, Z, colors=['k', 'k', 'k'], - linestyles=['--', '-', '--'], levels=[-.5, 0, .5]) + plt.contour( + XX, + 
YY, + Z, + colors=["k", "k", "k"], + linestyles=["--", "-", "--"], + levels=[-0.5, 0, 0.5], + ) plt.title(kernel) plt.show() diff --git a/examples/feature_selection/plot_f_test_vs_mi.py b/examples/feature_selection/plot_f_test_vs_mi.py index d9359380bfa96..d6fef3e62da0d 100644 --- a/examples/feature_selection/plot_f_test_vs_mi.py +++ b/examples/feature_selection/plot_f_test_vs_mi.py @@ -40,10 +40,9 @@ plt.figure(figsize=(15, 5)) for i in range(3): plt.subplot(1, 3, i + 1) - plt.scatter(X[:, i], y, edgecolor='black', s=20) + plt.scatter(X[:, i], y, edgecolor="black", s=20) plt.xlabel("$x_{}$".format(i + 1), fontsize=14) if i == 0: plt.ylabel("$y$", fontsize=14) - plt.title("F-test={:.2f}, MI={:.2f}".format(f_test[i], mi[i]), - fontsize=16) + plt.title("F-test={:.2f}, MI={:.2f}".format(f_test[i], mi[i]), fontsize=16) plt.show() diff --git a/examples/feature_selection/plot_feature_selection.py b/examples/feature_selection/plot_feature_selection.py index 1e4ef6a81bba8..a68019568e015 100644 --- a/examples/feature_selection/plot_feature_selection.py +++ b/examples/feature_selection/plot_feature_selection.py @@ -44,9 +44,7 @@ X = np.hstack((X, E)) # Split dataset to select feature and evaluate the classifier -X_train, X_test, y_train, y_test = train_test_split( - X, y, stratify=y, random_state=0 -) +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0) plt.figure(1) plt.clf() @@ -61,38 +59,47 @@ selector.fit(X_train, y_train) scores = -np.log10(selector.pvalues_) scores /= scores.max() -plt.bar(X_indices - .45, scores, width=.2, - label=r'Univariate score ($-Log(p_{value})$)') +plt.bar( + X_indices - 0.45, scores, width=0.2, label=r"Univariate score ($-Log(p_{value})$)" +) # ############################################################################# # Compare to the weights of an SVM clf = make_pipeline(MinMaxScaler(), LinearSVC()) clf.fit(X_train, y_train) -print('Classification accuracy without selecting features: {:.3f}' - 
.format(clf.score(X_test, y_test))) +print( + "Classification accuracy without selecting features: {:.3f}".format( + clf.score(X_test, y_test) + ) +) svm_weights = np.abs(clf[-1].coef_).sum(axis=0) svm_weights /= svm_weights.sum() -plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight') +plt.bar(X_indices - 0.25, svm_weights, width=0.2, label="SVM weight") -clf_selected = make_pipeline( - SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC() -) +clf_selected = make_pipeline(SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC()) clf_selected.fit(X_train, y_train) -print('Classification accuracy after univariate feature selection: {:.3f}' - .format(clf_selected.score(X_test, y_test))) +print( + "Classification accuracy after univariate feature selection: {:.3f}".format( + clf_selected.score(X_test, y_test) + ) +) svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0) svm_weights_selected /= svm_weights_selected.sum() -plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected, - width=.2, label='SVM weights after selection') +plt.bar( + X_indices[selector.get_support()] - 0.05, + svm_weights_selected, + width=0.2, + label="SVM weights after selection", +) plt.title("Comparing feature selection") -plt.xlabel('Feature number') +plt.xlabel("Feature number") plt.yticks(()) -plt.axis('tight') -plt.legend(loc='upper right') +plt.axis("tight") +plt.legend(loc="upper right") plt.show() diff --git a/examples/feature_selection/plot_feature_selection_pipeline.py b/examples/feature_selection/plot_feature_selection_pipeline.py index 871c894ee0711..a9a426a0e7b47 100644 --- a/examples/feature_selection/plot_feature_selection_pipeline.py +++ b/examples/feature_selection/plot_feature_selection_pipeline.py @@ -11,7 +11,8 @@ print(__doc__) from sklearn import set_config -set_config(display='diagram') + +set_config(display="diagram") # %% # We will start by generating a binary classification dataset. 
Subsequently, we @@ -21,8 +22,13 @@ from sklearn.model_selection import train_test_split X, y = make_classification( - n_features=20, n_informative=3, n_redundant=0, n_classes=2, - n_clusters_per_class=2, random_state=42) + n_features=20, + n_informative=3, + n_redundant=0, + n_classes=2, + n_clusters_per_class=2, + random_state=42, +) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) # %% diff --git a/examples/feature_selection/plot_rfe_with_cross_validation.py b/examples/feature_selection/plot_rfe_with_cross_validation.py index 71acf5eace22b..160fd5a826376 100644 --- a/examples/feature_selection/plot_rfe_with_cross_validation.py +++ b/examples/feature_selection/plot_rfe_with_cross_validation.py @@ -15,18 +15,29 @@ from sklearn.datasets import make_classification # Build a classification task using 3 informative features -X, y = make_classification(n_samples=1000, n_features=25, n_informative=3, - n_redundant=2, n_repeated=0, n_classes=8, - n_clusters_per_class=1, random_state=0) +X, y = make_classification( + n_samples=1000, + n_features=25, + n_informative=3, + n_redundant=2, + n_repeated=0, + n_classes=8, + n_clusters_per_class=1, + random_state=0, +) # Create the RFE object and compute a cross-validated score. 
svc = SVC(kernel="linear") # The "accuracy" scoring shows the proportion of correct classifications min_features_to_select = 1 # Minimum number of features to consider -rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2), - scoring='accuracy', - min_features_to_select=min_features_to_select) +rfecv = RFECV( + estimator=svc, + step=1, + cv=StratifiedKFold(2), + scoring="accuracy", + min_features_to_select=min_features_to_select, +) rfecv.fit(X, y) print("Optimal number of features : %d" % rfecv.n_features_) @@ -35,7 +46,8 @@ plt.figure() plt.xlabel("Number of features selected") plt.ylabel("Cross validation score (accuracy)") -plt.plot(range(min_features_to_select, - len(rfecv.grid_scores_) + min_features_to_select), - rfecv.grid_scores_) +plt.plot( + range(min_features_to_select, len(rfecv.grid_scores_) + min_features_to_select), + rfecv.grid_scores_, +) plt.show() diff --git a/examples/feature_selection/plot_select_from_model_diabetes.py b/examples/feature_selection/plot_select_from_model_diabetes.py index fdc57dc867f9e..353774acd113e 100644 --- a/examples/feature_selection/plot_select_from_model_diabetes.py +++ b/examples/feature_selection/plot_select_from_model_diabetes.py @@ -77,8 +77,7 @@ tic = time() sfm = SelectFromModel(lasso, threshold=threshold).fit(X, y) toc = time() -print("Features selected by SelectFromModel: " - f"{feature_names[sfm.get_support()]}") +print(f"Features selected by SelectFromModel: {feature_names[sfm.get_support()]}") print(f"Done in {toc - tic:.3f}s") # %% @@ -100,20 +99,26 @@ from sklearn.feature_selection import SequentialFeatureSelector tic_fwd = time() -sfs_forward = SequentialFeatureSelector(lasso, n_features_to_select=2, - direction='forward').fit(X, y) +sfs_forward = SequentialFeatureSelector( + lasso, n_features_to_select=2, direction="forward" +).fit(X, y) toc_fwd = time() tic_bwd = time() -sfs_backward = SequentialFeatureSelector(lasso, n_features_to_select=2, - direction='backward').fit(X, y) +sfs_backward = 
SequentialFeatureSelector( + lasso, n_features_to_select=2, direction="backward" +).fit(X, y) toc_bwd = time() -print("Features selected by forward sequential selection: " - f"{feature_names[sfs_forward.get_support()]}") +print( + "Features selected by forward sequential selection: " + f"{feature_names[sfs_forward.get_support()]}" +) print(f"Done in {toc_fwd - tic_fwd:.3f}s") -print("Features selected by backward sequential selection: " - f"{feature_names[sfs_backward.get_support()]}") +print( + "Features selected by backward sequential selection: " + f"{feature_names[sfs_backward.get_support()]}" +) print(f"Done in {toc_bwd - tic_bwd:.3f}s") # %% diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py index 1eb771673b0d6..dd60fa2d6dda8 100644 --- a/examples/gaussian_process/plot_compare_gpr_krr.py +++ b/examples/gaussian_process/plot_compare_gpr_krr.py @@ -69,17 +69,20 @@ y += 3 * (0.5 - rng.rand(X.shape[0])) # add noise # Fit KernelRidge with parameter selection based on 5-fold cross validation -param_grid = {"alpha": [1e0, 1e-1, 1e-2, 1e-3], - "kernel": [ExpSineSquared(l, p) - for l in np.logspace(-2, 2, 10) - for p in np.logspace(0, 2, 10)]} +param_grid = { + "alpha": [1e0, 1e-1, 1e-2, 1e-3], + "kernel": [ + ExpSineSquared(l, p) + for l in np.logspace(-2, 2, 10) + for p in np.logspace(0, 2, 10) + ], +} kr = GridSearchCV(KernelRidge(), param_grid=param_grid) stime = time.time() kr.fit(X, y) print("Time for KRR fitting: %.3f" % (time.time() - stime)) -gp_kernel = ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1)) \ - + WhiteKernel(1e-1) +gp_kernel = ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1)) + WhiteKernel(1e-1) gpr = GaussianProcessRegressor(kernel=gp_kernel) stime = time.time() gpr.fit(X, y) @@ -98,24 +101,22 @@ stime = time.time() y_gpr, y_std = gpr.predict(X_plot, return_std=True) -print("Time for GPR prediction with standard-deviation: %.3f" - % (time.time() - stime)) +print("Time 
for GPR prediction with standard-deviation: %.3f" % (time.time() - stime)) # Plot results plt.figure(figsize=(10, 5)) lw = 2 -plt.scatter(X, y, c='k', label='data') -plt.plot(X_plot, np.sin(X_plot), color='navy', lw=lw, label='True') -plt.plot(X_plot, y_kr, color='turquoise', lw=lw, - label='KRR (%s)' % kr.best_params_) -plt.plot(X_plot, y_gpr, color='darkorange', lw=lw, - label='GPR (%s)' % gpr.kernel_) -plt.fill_between(X_plot[:, 0], y_gpr - y_std, y_gpr + y_std, color='darkorange', - alpha=0.2) -plt.xlabel('data') -plt.ylabel('target') +plt.scatter(X, y, c="k", label="data") +plt.plot(X_plot, np.sin(X_plot), color="navy", lw=lw, label="True") +plt.plot(X_plot, y_kr, color="turquoise", lw=lw, label="KRR (%s)" % kr.best_params_) +plt.plot(X_plot, y_gpr, color="darkorange", lw=lw, label="GPR (%s)" % gpr.kernel_) +plt.fill_between( + X_plot[:, 0], y_gpr - y_std, y_gpr + y_std, color="darkorange", alpha=0.2 +) +plt.xlabel("data") +plt.ylabel("target") plt.xlim(0, 20) plt.ylim(-4, 4) -plt.title('GPR versus Kernel Ridge') -plt.legend(loc="best", scatterpoints=1, prop={'size': 8}) +plt.title("GPR versus Kernel Ridge") +plt.legend(loc="best", scatterpoints=1, prop={"size": 8}) plt.show() diff --git a/examples/gaussian_process/plot_gpc.py b/examples/gaussian_process/plot_gpc.py index 1d00b9f330400..ab22134e2048c 100644 --- a/examples/gaussian_process/plot_gpc.py +++ b/examples/gaussian_process/plot_gpc.py @@ -42,37 +42,58 @@ y = np.array(X[:, 0] > 2.5, dtype=int) # Specify Gaussian Processes with fixed and optimized hyperparameters -gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0), - optimizer=None) +gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0), optimizer=None) gp_fix.fit(X[:train_size], y[:train_size]) gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0)) gp_opt.fit(X[:train_size], y[:train_size]) -print("Log Marginal Likelihood (initial): %.3f" - % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta)) 
-print("Log Marginal Likelihood (optimized): %.3f" - % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta)) - -print("Accuracy: %.3f (initial) %.3f (optimized)" - % (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])), - accuracy_score(y[:train_size], gp_opt.predict(X[:train_size])))) -print("Log-loss: %.3f (initial) %.3f (optimized)" - % (log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]), - log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1]))) +print( + "Log Marginal Likelihood (initial): %.3f" + % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta) +) +print( + "Log Marginal Likelihood (optimized): %.3f" + % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta) +) + +print( + "Accuracy: %.3f (initial) %.3f (optimized)" + % ( + accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])), + accuracy_score(y[:train_size], gp_opt.predict(X[:train_size])), + ) +) +print( + "Log-loss: %.3f (initial) %.3f (optimized)" + % ( + log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]), + log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1]), + ) +) # Plot posteriors plt.figure() -plt.scatter(X[:train_size, 0], y[:train_size], c='k', label="Train data", - edgecolors=(0, 0, 0)) -plt.scatter(X[train_size:, 0], y[train_size:], c='g', label="Test data", - edgecolors=(0, 0, 0)) +plt.scatter( + X[:train_size, 0], y[:train_size], c="k", label="Train data", edgecolors=(0, 0, 0) +) +plt.scatter( + X[train_size:, 0], y[train_size:], c="g", label="Test data", edgecolors=(0, 0, 0) +) X_ = np.linspace(0, 5, 100) -plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis])[:, 1], 'r', - label="Initial kernel: %s" % gp_fix.kernel_) -plt.plot(X_, gp_opt.predict_proba(X_[:, np.newaxis])[:, 1], 'b', - label="Optimized kernel: %s" % gp_opt.kernel_) +plt.plot( + X_, + gp_fix.predict_proba(X_[:, np.newaxis])[:, 1], + "r", + label="Initial kernel: %s" % gp_fix.kernel_, +) +plt.plot( + X_, + gp_opt.predict_proba(X_[:, 
np.newaxis])[:, 1], + "b", + label="Optimized kernel: %s" % gp_opt.kernel_, +) plt.xlabel("Feature") plt.ylabel("Class 1 probability") plt.xlim(0, 5) @@ -84,13 +105,20 @@ theta0 = np.logspace(0, 8, 30) theta1 = np.logspace(-1, 1, 29) Theta0, Theta1 = np.meshgrid(theta0, theta1) -LML = [[gp_opt.log_marginal_likelihood(np.log([Theta0[i, j], Theta1[i, j]])) - for i in range(Theta0.shape[0])] for j in range(Theta0.shape[1])] +LML = [ + [ + gp_opt.log_marginal_likelihood(np.log([Theta0[i, j], Theta1[i, j]])) + for i in range(Theta0.shape[0]) + ] + for j in range(Theta0.shape[1]) +] LML = np.array(LML).T -plt.plot(np.exp(gp_fix.kernel_.theta)[0], np.exp(gp_fix.kernel_.theta)[1], - 'ko', zorder=10) -plt.plot(np.exp(gp_opt.kernel_.theta)[0], np.exp(gp_opt.kernel_.theta)[1], - 'ko', zorder=10) +plt.plot( + np.exp(gp_fix.kernel_.theta)[0], np.exp(gp_fix.kernel_.theta)[1], "ko", zorder=10 +) +plt.plot( + np.exp(gp_opt.kernel_.theta)[0], np.exp(gp_opt.kernel_.theta)[1], "ko", zorder=10 +) plt.pcolor(Theta0, Theta1, LML) plt.xscale("log") plt.yscale("log") diff --git a/examples/gaussian_process/plot_gpc_iris.py b/examples/gaussian_process/plot_gpc_iris.py index fe1030131709e..4aa4121664ece 100644 --- a/examples/gaussian_process/plot_gpc_iris.py +++ b/examples/gaussian_process/plot_gpc_iris.py @@ -21,7 +21,7 @@ X = iris.data[:, :2] # we only take the first two features. 
y = np.array(iris.target, dtype=int) -h = .02 # step size in the mesh +h = 0.02 # step size in the mesh kernel = 1.0 * RBF([1.0]) gpc_rbf_isotropic = GaussianProcessClassifier(kernel=kernel).fit(X, y) @@ -31,8 +31,7 @@ # create a mesh to plot in x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 -xx, yy = np.meshgrid(np.arange(x_min, x_max, h), - np.arange(y_min, y_max, h)) +xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) titles = ["Isotropic RBF", "Anisotropic RBF"] plt.figure(figsize=(10, 5)) @@ -48,16 +47,16 @@ plt.imshow(Z, extent=(x_min, x_max, y_min, y_max), origin="lower") # Plot also the training points - plt.scatter(X[:, 0], X[:, 1], c=np.array(["r", "g", "b"])[y], - edgecolors=(0, 0, 0)) - plt.xlabel('Sepal length') - plt.ylabel('Sepal width') + plt.scatter(X[:, 0], X[:, 1], c=np.array(["r", "g", "b"])[y], edgecolors=(0, 0, 0)) + plt.xlabel("Sepal length") + plt.ylabel("Sepal width") plt.xlim(xx.min(), xx.max()) plt.ylim(yy.min(), yy.max()) plt.xticks(()) plt.yticks(()) - plt.title("%s, LML: %.3f" % - (titles[i], clf.log_marginal_likelihood(clf.kernel_.theta))) + plt.title( + "%s, LML: %.3f" % (titles[i], clf.log_marginal_likelihood(clf.kernel_.theta)) + ) plt.tight_layout() plt.show() diff --git a/examples/gaussian_process/plot_gpc_isoprobability.py b/examples/gaussian_process/plot_gpc_isoprobability.py index 7b2a14cf4fc41..cd0fd740e2fc3 100644 --- a/examples/gaussian_process/plot_gpc_isoprobability.py +++ b/examples/gaussian_process/plot_gpc_isoprobability.py @@ -31,18 +31,22 @@ def g(x): """The function to predict (classification will then consist in predicting whether g(x) <= 0 or not)""" - return 5. - x[:, 1] - .5 * x[:, 0] ** 2. 
+ return 5.0 - x[:, 1] - 0.5 * x[:, 0] ** 2.0 # Design of experiments -X = np.array([[-4.61611719, -6.00099547], - [4.10469096, 5.32782448], - [0.00000000, -0.50000000], - [-6.17289014, -4.6984743], - [1.3109306, -6.93271427], - [-5.03823144, 3.10584743], - [-2.87600388, 6.74310541], - [5.21301203, 4.26386883]]) +X = np.array( + [ + [-4.61611719, -6.00099547], + [4.10469096, 5.32782448], + [0.00000000, -0.50000000], + [-6.17289014, -4.6984743], + [1.3109306, -6.93271427], + [-5.03823144, 3.10584743], + [-2.87600388, 6.74310541], + [5.21301203, 4.26386883], + ] +) # Observations y = np.array(g(X) > 0, dtype=int) @@ -55,8 +59,7 @@ def g(x): # Evaluate real function and the predicted probability res = 50 -x1, x2 = np.meshgrid(np.linspace(- lim, lim, res), - np.linspace(- lim, lim, res)) +x1, x2 = np.meshgrid(np.linspace(-lim, lim, res), np.linspace(-lim, lim, res)) xx = np.vstack([x1.reshape(x1.size), x2.reshape(x2.size)]).T y_true = g(xx) @@ -67,37 +70,33 @@ def g(x): # Plot the probabilistic classification iso-values fig = plt.figure(1) ax = fig.gca() -ax.axes.set_aspect('equal') +ax.axes.set_aspect("equal") plt.xticks([]) plt.yticks([]) ax.set_xticklabels([]) ax.set_yticklabels([]) -plt.xlabel('$x_1$') -plt.ylabel('$x_2$') - -cax = plt.imshow(y_prob, cmap=cm.gray_r, alpha=0.8, - extent=(-lim, lim, -lim, lim)) -norm = plt.matplotlib.colors.Normalize(vmin=0., vmax=0.9) -cb = plt.colorbar(cax, ticks=[0., 0.2, 0.4, 0.6, 0.8, 1.], norm=norm) -cb.set_label(r'${\rm \mathbb{P}}\left[\widehat{G}(\mathbf{x}) \leq 0\right]$') +plt.xlabel("$x_1$") +plt.ylabel("$x_2$") + +cax = plt.imshow(y_prob, cmap=cm.gray_r, alpha=0.8, extent=(-lim, lim, -lim, lim)) +norm = plt.matplotlib.colors.Normalize(vmin=0.0, vmax=0.9) +cb = plt.colorbar(cax, ticks=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0], norm=norm) +cb.set_label(r"${\rm \mathbb{P}}\left[\widehat{G}(\mathbf{x}) \leq 0\right]$") plt.clim(0, 1) -plt.plot(X[y <= 0, 0], X[y <= 0, 1], 'r.', markersize=12) +plt.plot(X[y <= 0, 0], X[y <= 0, 1], "r.", 
markersize=12) -plt.plot(X[y > 0, 0], X[y > 0, 1], 'b.', markersize=12) +plt.plot(X[y > 0, 0], X[y > 0, 1], "b.", markersize=12) -plt.contour(x1, x2, y_true, [0.], colors='k', linestyles='dashdot') +plt.contour(x1, x2, y_true, [0.0], colors="k", linestyles="dashdot") -cs = plt.contour(x1, x2, y_prob, [0.666], colors='b', - linestyles='solid') +cs = plt.contour(x1, x2, y_prob, [0.666], colors="b", linestyles="solid") plt.clabel(cs, fontsize=11) -cs = plt.contour(x1, x2, y_prob, [0.5], colors='k', - linestyles='dashed') +cs = plt.contour(x1, x2, y_prob, [0.5], colors="k", linestyles="dashed") plt.clabel(cs, fontsize=11) -cs = plt.contour(x1, x2, y_prob, [0.334], colors='r', - linestyles='solid') +cs = plt.contour(x1, x2, y_prob, [0.334], colors="r", linestyles="solid") plt.clabel(cs, fontsize=11) plt.show() diff --git a/examples/gaussian_process/plot_gpc_xor.py b/examples/gaussian_process/plot_gpc_xor.py index 04f014e13e8ae..011f36ada1021 100644 --- a/examples/gaussian_process/plot_gpc_xor.py +++ b/examples/gaussian_process/plot_gpc_xor.py @@ -22,15 +22,14 @@ from sklearn.gaussian_process.kernels import RBF, DotProduct -xx, yy = np.meshgrid(np.linspace(-3, 3, 50), - np.linspace(-3, 3, 50)) +xx, yy = np.meshgrid(np.linspace(-3, 3, 50), np.linspace(-3, 3, 50)) rng = np.random.RandomState(0) X = rng.randn(200, 2) Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0) # fit the model plt.figure(figsize=(10, 5)) -kernels = [1.0 * RBF(length_scale=1.0), 1.0 * DotProduct(sigma_0=1.0)**2] +kernels = [1.0 * RBF(length_scale=1.0), 1.0 * DotProduct(sigma_0=1.0) ** 2] for i, kernel in enumerate(kernels): clf = GaussianProcessClassifier(kernel=kernel, warm_start=True).fit(X, Y) @@ -39,20 +38,25 @@ Z = Z.reshape(xx.shape) plt.subplot(1, 2, i + 1) - image = plt.imshow(Z, interpolation='nearest', - extent=(xx.min(), xx.max(), yy.min(), yy.max()), - aspect='auto', origin='lower', cmap=plt.cm.PuOr_r) - contours = plt.contour(xx, yy, Z, levels=[0.5], linewidths=2, - colors=['k']) - 
plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired, - edgecolors=(0, 0, 0)) + image = plt.imshow( + Z, + interpolation="nearest", + extent=(xx.min(), xx.max(), yy.min(), yy.max()), + aspect="auto", + origin="lower", + cmap=plt.cm.PuOr_r, + ) + contours = plt.contour(xx, yy, Z, levels=[0.5], linewidths=2, colors=["k"]) + plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired, edgecolors=(0, 0, 0)) plt.xticks(()) plt.yticks(()) plt.axis([-3, 3, -3, 3]) plt.colorbar(image) - plt.title("%s\n Log-Marginal-Likelihood:%.3f" - % (clf.kernel_, clf.log_marginal_likelihood(clf.kernel_.theta)), - fontsize=12) + plt.title( + "%s\n Log-Marginal-Likelihood:%.3f" + % (clf.kernel_, clf.log_marginal_likelihood(clf.kernel_.theta)), + fontsize=12, + ) plt.tight_layout() plt.show() diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py index 2cc751438cbd4..7afe41d77cc63 100644 --- a/examples/gaussian_process/plot_gpr_co2.py +++ b/examples/gaussian_process/plot_gpr_co2.py @@ -67,8 +67,12 @@ from matplotlib import pyplot as plt from sklearn.datasets import fetch_openml from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels \ - import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared +from sklearn.gaussian_process.kernels import ( + RBF, + WhiteKernel, + RationalQuadratic, + ExpSineSquared, +) print(__doc__) @@ -102,52 +106,54 @@ def load_mauna_loa_atmospheric_co2(): X, y = load_mauna_loa_atmospheric_co2() # Kernel with parameters given in GPML book -k1 = 66.0**2 * RBF(length_scale=67.0) # long term smooth rising trend -k2 = 2.4**2 * RBF(length_scale=90.0) \ - * ExpSineSquared(length_scale=1.3, periodicity=1.0) # seasonal component +k1 = 66.0 ** 2 * RBF(length_scale=67.0) # long term smooth rising trend +k2 = ( + 2.4 ** 2 + * RBF(length_scale=90.0) + * ExpSineSquared(length_scale=1.3, periodicity=1.0) +) # seasonal component # medium term irregularity -k3 = 0.66**2 \ - * 
RationalQuadratic(length_scale=1.2, alpha=0.78) -k4 = 0.18**2 * RBF(length_scale=0.134) \ - + WhiteKernel(noise_level=0.19**2) # noise terms +k3 = 0.66 ** 2 * RationalQuadratic(length_scale=1.2, alpha=0.78) +k4 = 0.18 ** 2 * RBF(length_scale=0.134) + WhiteKernel( + noise_level=0.19 ** 2 +) # noise terms kernel_gpml = k1 + k2 + k3 + k4 -gp = GaussianProcessRegressor(kernel=kernel_gpml, alpha=0, - optimizer=None, normalize_y=True) +gp = GaussianProcessRegressor( + kernel=kernel_gpml, alpha=0, optimizer=None, normalize_y=True +) gp.fit(X, y) print("GPML kernel: %s" % gp.kernel_) -print("Log-marginal-likelihood: %.3f" - % gp.log_marginal_likelihood(gp.kernel_.theta)) +print("Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.kernel_.theta)) # Kernel with optimized parameters -k1 = 50.0**2 * RBF(length_scale=50.0) # long term smooth rising trend -k2 = 2.0**2 * RBF(length_scale=100.0) \ - * ExpSineSquared(length_scale=1.0, periodicity=1.0, - periodicity_bounds="fixed") # seasonal component +k1 = 50.0 ** 2 * RBF(length_scale=50.0) # long term smooth rising trend +k2 = ( + 2.0 ** 2 + * RBF(length_scale=100.0) + * ExpSineSquared(length_scale=1.0, periodicity=1.0, periodicity_bounds="fixed") +) # seasonal component # medium term irregularities -k3 = 0.5**2 * RationalQuadratic(length_scale=1.0, alpha=1.0) -k4 = 0.1**2 * RBF(length_scale=0.1) \ - + WhiteKernel(noise_level=0.1**2, - noise_level_bounds=(1e-5, np.inf)) # noise terms +k3 = 0.5 ** 2 * RationalQuadratic(length_scale=1.0, alpha=1.0) +k4 = 0.1 ** 2 * RBF(length_scale=0.1) + WhiteKernel( + noise_level=0.1 ** 2, noise_level_bounds=(1e-5, np.inf) +) # noise terms kernel = k1 + k2 + k3 + k4 -gp = GaussianProcessRegressor(kernel=kernel, alpha=0, - normalize_y=True) +gp = GaussianProcessRegressor(kernel=kernel, alpha=0, normalize_y=True) gp.fit(X, y) print("\nLearned kernel: %s" % gp.kernel_) -print("Log-marginal-likelihood: %.3f" - % gp.log_marginal_likelihood(gp.kernel_.theta)) +print("Log-marginal-likelihood: 
%.3f" % gp.log_marginal_likelihood(gp.kernel_.theta)) X_ = np.linspace(X.min(), X.max() + 30, 1000)[:, np.newaxis] y_pred, y_std = gp.predict(X_, return_std=True) # Illustration -plt.scatter(X, y, c='k') +plt.scatter(X, y, c="k") plt.plot(X_, y_pred) -plt.fill_between(X_[:, 0], y_pred - y_std, y_pred + y_std, - alpha=0.5, color='k') +plt.fill_between(X_[:, 0], y_pred - y_std, y_pred + y_std, alpha=0.5, color="k") plt.xlim(X_.min(), X_.max()) plt.xlabel("Year") plt.ylabel(r"CO$_2$ in ppm") diff --git a/examples/gaussian_process/plot_gpr_noisy.py b/examples/gaussian_process/plot_gpr_noisy.py index 5f8ce2cd0fe96..0bba4827cd685 100644 --- a/examples/gaussian_process/plot_gpr_noisy.py +++ b/examples/gaussian_process/plot_gpr_noisy.py @@ -36,40 +36,50 @@ # First run plt.figure() -kernel = 1.0 * RBF(length_scale=100.0, length_scale_bounds=(1e-2, 1e3)) \ - + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1)) -gp = GaussianProcessRegressor(kernel=kernel, - alpha=0.0).fit(X, y) +kernel = 1.0 * RBF(length_scale=100.0, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel( + noise_level=1, noise_level_bounds=(1e-10, 1e1) +) +gp = GaussianProcessRegressor(kernel=kernel, alpha=0.0).fit(X, y) X_ = np.linspace(0, 5, 100) y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True) -plt.plot(X_, y_mean, 'k', lw=3, zorder=9) -plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)), - y_mean + np.sqrt(np.diag(y_cov)), - alpha=0.5, color='k') -plt.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9) -plt.scatter(X[:, 0], y, c='r', s=50, zorder=10, edgecolors=(0, 0, 0)) -plt.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s" - % (kernel, gp.kernel_, - gp.log_marginal_likelihood(gp.kernel_.theta))) +plt.plot(X_, y_mean, "k", lw=3, zorder=9) +plt.fill_between( + X_, + y_mean - np.sqrt(np.diag(y_cov)), + y_mean + np.sqrt(np.diag(y_cov)), + alpha=0.5, + color="k", +) +plt.plot(X_, 0.5 * np.sin(3 * X_), "r", lw=3, zorder=9) +plt.scatter(X[:, 0], y, c="r", s=50, zorder=10, 
edgecolors=(0, 0, 0)) +plt.title( + "Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s" + % (kernel, gp.kernel_, gp.log_marginal_likelihood(gp.kernel_.theta)) +) plt.tight_layout() # Second run plt.figure() -kernel = 1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) \ - + WhiteKernel(noise_level=1e-5, noise_level_bounds=(1e-10, 1e+1)) -gp = GaussianProcessRegressor(kernel=kernel, - alpha=0.0).fit(X, y) +kernel = 1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel( + noise_level=1e-5, noise_level_bounds=(1e-10, 1e1) +) +gp = GaussianProcessRegressor(kernel=kernel, alpha=0.0).fit(X, y) X_ = np.linspace(0, 5, 100) y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True) -plt.plot(X_, y_mean, 'k', lw=3, zorder=9) -plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)), - y_mean + np.sqrt(np.diag(y_cov)), - alpha=0.5, color='k') -plt.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9) -plt.scatter(X[:, 0], y, c='r', s=50, zorder=10, edgecolors=(0, 0, 0)) -plt.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s" - % (kernel, gp.kernel_, - gp.log_marginal_likelihood(gp.kernel_.theta))) +plt.plot(X_, y_mean, "k", lw=3, zorder=9) +plt.fill_between( + X_, + y_mean - np.sqrt(np.diag(y_cov)), + y_mean + np.sqrt(np.diag(y_cov)), + alpha=0.5, + color="k", +) +plt.plot(X_, 0.5 * np.sin(3 * X_), "r", lw=3, zorder=9) +plt.scatter(X[:, 0], y, c="r", s=50, zorder=10, edgecolors=(0, 0, 0)) +plt.title( + "Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s" + % (kernel, gp.kernel_, gp.log_marginal_likelihood(gp.kernel_.theta)) +) plt.tight_layout() # Plot LML landscape @@ -77,15 +87,19 @@ theta0 = np.logspace(-2, 3, 49) theta1 = np.logspace(-2, 0, 50) Theta0, Theta1 = np.meshgrid(theta0, theta1) -LML = [[gp.log_marginal_likelihood(np.log([0.36, Theta0[i, j], Theta1[i, j]])) - for i in range(Theta0.shape[0])] for j in range(Theta0.shape[1])] +LML = [ + [ + gp.log_marginal_likelihood(np.log([0.36, Theta0[i, j], Theta1[i, j]])) + for i in 
range(Theta0.shape[0]) + ] + for j in range(Theta0.shape[1]) +] LML = np.array(LML).T vmin, vmax = (-LML).min(), (-LML).max() vmax = 50 level = np.around(np.logspace(np.log10(vmin), np.log10(vmax), 50), decimals=1) -plt.contour(Theta0, Theta1, -LML, - levels=level, norm=LogNorm(vmin=vmin, vmax=vmax)) +plt.contour(Theta0, Theta1, -LML, levels=level, norm=LogNorm(vmin=vmin, vmax=vmax)) plt.colorbar() plt.xscale("log") plt.yscale("log") diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py index e11071cec5bfd..0058589f7a786 100644 --- a/examples/gaussian_process/plot_gpr_noisy_targets.py +++ b/examples/gaussian_process/plot_gpr_noisy_targets.py @@ -41,7 +41,7 @@ def f(x): # ---------------------------------------------------------------------- # First the noiseless case -X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T +X = np.atleast_2d([1.0, 3.0, 5.0, 6.0, 7.0, 8.0]).T # Observations y = f(X).ravel() @@ -63,17 +63,21 @@ def f(x): # Plot the function, the prediction and the 95% confidence interval based on # the MSE plt.figure() -plt.plot(x, f(x), 'r:', label=r'$f(x) = x\,\sin(x)$') -plt.plot(X, y, 'r.', markersize=10, label='Observations') -plt.plot(x, y_pred, 'b-', label='Prediction') -plt.fill(np.concatenate([x, x[::-1]]), - np.concatenate([y_pred - 1.9600 * sigma, - (y_pred + 1.9600 * sigma)[::-1]]), - alpha=.5, fc='b', ec='None', label='95% confidence interval') -plt.xlabel('$x$') -plt.ylabel('$f(x)$') +plt.plot(x, f(x), "r:", label=r"$f(x) = x\,\sin(x)$") +plt.plot(X, y, "r.", markersize=10, label="Observations") +plt.plot(x, y_pred, "b-", label="Prediction") +plt.fill( + np.concatenate([x, x[::-1]]), + np.concatenate([y_pred - 1.9600 * sigma, (y_pred + 1.9600 * sigma)[::-1]]), + alpha=0.5, + fc="b", + ec="None", + label="95% confidence interval", +) +plt.xlabel("$x$") +plt.ylabel("$f(x)$") plt.ylim(-10, 20) -plt.legend(loc='upper left') +plt.legend(loc="upper left") # 
---------------------------------------------------------------------- # now the noisy case @@ -87,8 +91,7 @@ def f(x): y += noise # Instantiate a Gaussian Process model -gp = GaussianProcessRegressor(kernel=kernel, alpha=dy ** 2, - n_restarts_optimizer=10) +gp = GaussianProcessRegressor(kernel=kernel, alpha=dy ** 2, n_restarts_optimizer=10) # Fit to data using Maximum Likelihood Estimation of the parameters gp.fit(X, y) @@ -99,16 +102,20 @@ def f(x): # Plot the function, the prediction and the 95% confidence interval based on # the MSE plt.figure() -plt.plot(x, f(x), 'r:', label=r'$f(x) = x\,\sin(x)$') -plt.errorbar(X.ravel(), y, dy, fmt='r.', markersize=10, label='Observations') -plt.plot(x, y_pred, 'b-', label='Prediction') -plt.fill(np.concatenate([x, x[::-1]]), - np.concatenate([y_pred - 1.9600 * sigma, - (y_pred + 1.9600 * sigma)[::-1]]), - alpha=.5, fc='b', ec='None', label='95% confidence interval') -plt.xlabel('$x$') -plt.ylabel('$f(x)$') +plt.plot(x, f(x), "r:", label=r"$f(x) = x\,\sin(x)$") +plt.errorbar(X.ravel(), y, dy, fmt="r.", markersize=10, label="Observations") +plt.plot(x, y_pred, "b-", label="Prediction") +plt.fill( + np.concatenate([x, x[::-1]]), + np.concatenate([y_pred - 1.9600 * sigma, (y_pred + 1.9600 * sigma)[::-1]]), + alpha=0.5, + fc="b", + ec="None", + label="95% confidence interval", +) +plt.xlabel("$x$") +plt.ylabel("$f(x)$") plt.ylim(-10, 20) -plt.legend(loc='upper left') +plt.legend(loc="upper left") plt.show() diff --git a/examples/gaussian_process/plot_gpr_on_structured_data.py b/examples/gaussian_process/plot_gpr_on_structured_data.py index 64a84ab38647a..59af31664cb74 100644 --- a/examples/gaussian_process/plot_gpr_on_structured_data.py +++ b/examples/gaussian_process/plot_gpr_on_structured_data.py @@ -48,44 +48,43 @@ class SequenceKernel(GenericKernelMixin, Kernel): - ''' + """ A minimal (but valid) convolutional kernel for sequences of variable - lengths.''' - def __init__(self, - baseline_similarity=0.5, - 
baseline_similarity_bounds=(1e-5, 1)): + lengths.""" + + def __init__(self, baseline_similarity=0.5, baseline_similarity_bounds=(1e-5, 1)): self.baseline_similarity = baseline_similarity self.baseline_similarity_bounds = baseline_similarity_bounds @property def hyperparameter_baseline_similarity(self): - return Hyperparameter("baseline_similarity", - "numeric", - self.baseline_similarity_bounds) + return Hyperparameter( + "baseline_similarity", "numeric", self.baseline_similarity_bounds + ) def _f(self, s1, s2): - ''' + """ kernel value between a pair of sequences - ''' - return sum([1.0 if c1 == c2 else self.baseline_similarity - for c1 in s1 - for c2 in s2]) + """ + return sum( + [1.0 if c1 == c2 else self.baseline_similarity for c1 in s1 for c2 in s2] + ) def _g(self, s1, s2): - ''' + """ kernel derivative between a pair of sequences - ''' - return sum([0.0 if c1 == c2 else 1.0 - for c1 in s1 - for c2 in s2]) + """ + return sum([0.0 if c1 == c2 else 1.0 for c1 in s1 for c2 in s2]) def __call__(self, X, Y=None, eval_gradient=False): if Y is None: Y = X if eval_gradient: - return (np.array([[self._f(x, y) for y in Y] for x in X]), - np.array([[[self._g(x, y)] for y in Y] for x in X])) + return ( + np.array([[self._f(x, y) for y in Y] for x in X]), + np.array([[[self._g(x, y)] for y in Y] for x in X]), + ) else: return np.array([[self._f(x, y) for y in Y] for x in X]) @@ -103,28 +102,28 @@ def clone_with_theta(self, theta): kernel = SequenceKernel() -''' +""" Sequence similarity matrix under the kernel =========================================== -''' +""" -X = np.array(['AGCT', 'AGC', 'AACT', 'TAA', 'AAA', 'GAACA']) +X = np.array(["AGCT", "AGC", "AACT", "TAA", "AAA", "GAACA"]) K = kernel(X) D = kernel.diag(X) plt.figure(figsize=(8, 5)) -plt.imshow(np.diag(D**-0.5).dot(K).dot(np.diag(D**-0.5))) +plt.imshow(np.diag(D ** -0.5).dot(K).dot(np.diag(D ** -0.5))) plt.xticks(np.arange(len(X)), X) plt.yticks(np.arange(len(X)), X) -plt.title('Sequence similarity under the 
kernel') +plt.title("Sequence similarity under the kernel") -''' +""" Regression ========== -''' +""" -X = np.array(['AGCT', 'AGC', 'AACT', 'TAA', 'AAA', 'GAACA']) +X = np.array(["AGCT", "AGC", "AACT", "TAA", "AAA", "GAACA"]) Y = np.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0]) training_idx = [0, 1, 3, 4] @@ -132,43 +131,58 @@ def clone_with_theta(self, theta): gp.fit(X[training_idx], Y[training_idx]) plt.figure(figsize=(8, 5)) -plt.bar(np.arange(len(X)), gp.predict(X), color='b', label='prediction') -plt.bar(training_idx, Y[training_idx], width=0.2, color='r', - alpha=1, label='training') +plt.bar(np.arange(len(X)), gp.predict(X), color="b", label="prediction") +plt.bar(training_idx, Y[training_idx], width=0.2, color="r", alpha=1, label="training") plt.xticks(np.arange(len(X)), X) -plt.title('Regression on sequences') +plt.title("Regression on sequences") plt.legend() -''' +""" Classification ============== -''' +""" -X_train = np.array(['AGCT', 'CGA', 'TAAC', 'TCG', 'CTTT', 'TGCT']) +X_train = np.array(["AGCT", "CGA", "TAAC", "TCG", "CTTT", "TGCT"]) # whether there are 'A's in the sequence Y_train = np.array([True, True, True, False, False, False]) gp = GaussianProcessClassifier(kernel) gp.fit(X_train, Y_train) -X_test = ['AAA', 'ATAG', 'CTC', 'CT', 'C'] +X_test = ["AAA", "ATAG", "CTC", "CT", "C"] Y_test = [True, True, False, False, False] plt.figure(figsize=(8, 5)) -plt.scatter(np.arange(len(X_train)), [1.0 if c else -1.0 for c in Y_train], - s=100, marker='o', edgecolor='none', facecolor=(1, 0.75, 0), - label='training') -plt.scatter(len(X_train) + np.arange(len(X_test)), - [1.0 if c else -1.0 for c in Y_test], - s=100, marker='o', edgecolor='none', facecolor='r', label='truth') -plt.scatter(len(X_train) + np.arange(len(X_test)), - [1.0 if c else -1.0 for c in gp.predict(X_test)], - s=100, marker='x', edgecolor=(0, 1.0, 0.3), linewidth=2, - label='prediction') -plt.xticks(np.arange(len(X_train) + len(X_test)), - np.concatenate((X_train, X_test))) +plt.scatter( + 
np.arange(len(X_train)), + [1.0 if c else -1.0 for c in Y_train], + s=100, + marker="o", + edgecolor="none", + facecolor=(1, 0.75, 0), + label="training", +) +plt.scatter( + len(X_train) + np.arange(len(X_test)), + [1.0 if c else -1.0 for c in Y_test], + s=100, + marker="o", + edgecolor="none", + facecolor="r", + label="truth", +) +plt.scatter( + len(X_train) + np.arange(len(X_test)), + [1.0 if c else -1.0 for c in gp.predict(X_test)], + s=100, + marker="x", + edgecolor=(0, 1.0, 0.3), + linewidth=2, + label="prediction", +) +plt.xticks(np.arange(len(X_train) + len(X_test)), np.concatenate((X_train, X_test))) plt.yticks([-1, 1], [False, True]) -plt.title('Classification on sequences') +plt.title("Classification on sequences") plt.legend() plt.show() diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 877cfd37c0067..77f015fd9ae12 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -71,10 +71,9 @@ br_estimator = BayesianRidge() score_full_data = pd.DataFrame( cross_val_score( - br_estimator, X_full, y_full, scoring='neg_mean_squared_error', - cv=N_SPLITS + br_estimator, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS ), - columns=['Full Data'] + columns=["Full Data"], ) # Add a single missing value to each row @@ -86,39 +85,35 @@ # Estimate the score after imputation (mean and median strategies) score_simple_imputer = pd.DataFrame() -for strategy in ('mean', 'median'): +for strategy in ("mean", "median"): estimator = make_pipeline( - SimpleImputer(missing_values=np.nan, strategy=strategy), - br_estimator + SimpleImputer(missing_values=np.nan, strategy=strategy), br_estimator ) score_simple_imputer[strategy] = cross_val_score( - estimator, X_missing, y_missing, scoring='neg_mean_squared_error', - cv=N_SPLITS + estimator, X_missing, y_missing, 
scoring="neg_mean_squared_error", cv=N_SPLITS ) # Estimate the score after iterative imputation of the missing values # with different estimators estimators = [ BayesianRidge(), - DecisionTreeRegressor(max_features='sqrt', random_state=0), + DecisionTreeRegressor(max_features="sqrt", random_state=0), ExtraTreesRegressor(n_estimators=10, random_state=0), - KNeighborsRegressor(n_neighbors=15) + KNeighborsRegressor(n_neighbors=15), ] score_iterative_imputer = pd.DataFrame() for impute_estimator in estimators: estimator = make_pipeline( - IterativeImputer(random_state=0, estimator=impute_estimator), - br_estimator + IterativeImputer(random_state=0, estimator=impute_estimator), br_estimator + ) + score_iterative_imputer[impute_estimator.__class__.__name__] = cross_val_score( + estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS ) - score_iterative_imputer[impute_estimator.__class__.__name__] = \ - cross_val_score( - estimator, X_missing, y_missing, scoring='neg_mean_squared_error', - cv=N_SPLITS - ) scores = pd.concat( [score_full_data, score_simple_imputer, score_iterative_imputer], - keys=['Original', 'SimpleImputer', 'IterativeImputer'], axis=1 + keys=["Original", "SimpleImputer", "IterativeImputer"], + axis=1, ) # plot california housing results @@ -126,8 +121,8 @@ means = -scores.mean() errors = scores.std() means.plot.barh(xerr=errors, ax=ax) -ax.set_title('California Housing Regression with Different Imputation Methods') -ax.set_xlabel('MSE (smaller is better)') +ax.set_title("California Housing Regression with Different Imputation Methods") +ax.set_xlabel("MSE (smaller is better)") ax.set_yticks(np.arange(means.shape[0])) ax.set_yticklabels([" w/ ".join(label) for label in means.index.tolist()]) plt.tight_layout(pad=1) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 3ea5c61427ff0..ec9d881af85b1 100644 --- a/examples/impute/plot_missing_values.py +++ 
b/examples/impute/plot_missing_values.py @@ -65,7 +65,7 @@ def add_missing_values(X_full, y_full): n_missing_samples = int(n_samples * missing_rate) missing_samples = np.zeros(n_samples, dtype=bool) - missing_samples[: n_missing_samples] = True + missing_samples[:n_missing_samples] = True rng.shuffle(missing_samples) missing_features = rng.randint(0, n_features, n_missing_samples) @@ -76,11 +76,9 @@ def add_missing_values(X_full, y_full): return X_missing, y_missing -X_miss_california, y_miss_california = add_missing_values( - X_california, y_california) +X_miss_california, y_miss_california = add_missing_values(X_california, y_california) -X_miss_diabetes, y_miss_diabetes = add_missing_values( - X_diabetes, y_diabetes) +X_miss_diabetes, y_miss_diabetes = add_missing_values(X_diabetes, y_diabetes) # %% @@ -115,9 +113,9 @@ def add_missing_values(X_full, y_full): def get_scores_for_imputer(imputer, X_missing, y_missing): estimator = make_pipeline(imputer, regressor) - impute_scores = cross_val_score(estimator, X_missing, y_missing, - scoring='neg_mean_squared_error', - cv=N_SPLITS) + impute_scores = cross_val_score( + estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS + ) return impute_scores @@ -136,16 +134,15 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): def get_full_score(X_full, y_full): - full_scores = cross_val_score(regressor, X_full, y_full, - scoring='neg_mean_squared_error', - cv=N_SPLITS) + full_scores = cross_val_score( + regressor, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS + ) return full_scores.mean(), full_scores.std() -mses_california[0], stds_california[0] = get_full_score(X_california, - y_california) +mses_california[0], stds_california[0] = get_full_score(X_california, y_california) mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes) -x_labels.append('Full data') +x_labels.append("Full data") # %% @@ -159,17 +156,20 @@ def get_full_score(X_full, y_full): def 
get_impute_zero_score(X_missing, y_missing): - imputer = SimpleImputer(missing_values=np.nan, add_indicator=True, - strategy='constant', fill_value=0) + imputer = SimpleImputer( + missing_values=np.nan, add_indicator=True, strategy="constant", fill_value=0 + ) zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return zero_impute_scores.mean(), zero_impute_scores.std() mses_california[1], stds_california[1] = get_impute_zero_score( - X_miss_california, y_miss_california) -mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(X_miss_diabetes, - y_miss_diabetes) -x_labels.append('Zero imputation') + X_miss_california, y_miss_california +) +mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score( + X_miss_diabetes, y_miss_diabetes +) +x_labels.append("Zero imputation") # %% @@ -179,6 +179,7 @@ def get_impute_zero_score(X_missing, y_missing): # :class:`~sklearn.impute.KNNImputer` imputes missing values using the weighted # or unweighted mean of the desired number of nearest neighbors. 
+ def get_impute_knn_score(X_missing, y_missing): imputer = KNNImputer(missing_values=np.nan, add_indicator=True) knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) @@ -186,10 +187,12 @@ def get_impute_knn_score(X_missing, y_missing): mses_california[2], stds_california[2] = get_impute_knn_score( - X_miss_california, y_miss_california) -mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score(X_miss_diabetes, - y_miss_diabetes) -x_labels.append('KNN Imputation') + X_miss_california, y_miss_california +) +mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score( + X_miss_diabetes, y_miss_diabetes +) +x_labels.append("KNN Imputation") # %% @@ -197,18 +200,18 @@ def get_impute_knn_score(X_missing, y_missing): # ------------------------------- # + def get_impute_mean(X_missing, y_missing): - imputer = SimpleImputer(missing_values=np.nan, strategy="mean", - add_indicator=True) + imputer = SimpleImputer(missing_values=np.nan, strategy="mean", add_indicator=True) mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return mean_impute_scores.mean(), mean_impute_scores.std() -mses_california[3], stds_california[3] = get_impute_mean(X_miss_california, - y_miss_california) -mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes, - y_miss_diabetes) -x_labels.append('Mean Imputation') +mses_california[3], stds_california[3] = get_impute_mean( + X_miss_california, y_miss_california +) +mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes, y_miss_diabetes) +x_labels.append("Mean Imputation") # %% @@ -223,21 +226,26 @@ def get_impute_mean(X_missing, y_missing): # to potentially improve performance. 
# + def get_impute_iterative(X_missing, y_missing): - imputer = IterativeImputer(missing_values=np.nan, add_indicator=True, - random_state=0, n_nearest_features=5, - sample_posterior=True) - iterative_impute_scores = get_scores_for_imputer(imputer, - X_missing, - y_missing) + imputer = IterativeImputer( + missing_values=np.nan, + add_indicator=True, + random_state=0, + n_nearest_features=5, + sample_posterior=True, + ) + iterative_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return iterative_impute_scores.mean(), iterative_impute_scores.std() mses_california[4], stds_california[4] = get_impute_iterative( - X_miss_california, y_miss_california) -mses_diabetes[4], stds_diabetes[4] = get_impute_iterative(X_miss_diabetes, - y_miss_diabetes) -x_labels.append('Iterative Imputation') + X_miss_california, y_miss_california +) +mses_diabetes[4], stds_diabetes[4] = get_impute_iterative( + X_miss_diabetes, y_miss_diabetes +) +x_labels.append("Iterative Imputation") mses_diabetes = mses_diabetes * -1 mses_california = mses_california * -1 @@ -255,34 +263,45 @@ def get_impute_iterative(X_missing, y_missing): n_bars = len(mses_diabetes) xval = np.arange(n_bars) -colors = ['r', 'g', 'b', 'orange', 'black'] +colors = ["r", "g", "b", "orange", "black"] # plot diabetes results plt.figure(figsize=(12, 6)) ax1 = plt.subplot(121) for j in xval: - ax1.barh(j, mses_diabetes[j], xerr=stds_diabetes[j], - color=colors[j], alpha=0.6, align='center') - -ax1.set_title('Imputation Techniques with Diabetes Data') -ax1.set_xlim(left=np.min(mses_diabetes) * 0.9, - right=np.max(mses_diabetes) * 1.1) + ax1.barh( + j, + mses_diabetes[j], + xerr=stds_diabetes[j], + color=colors[j], + alpha=0.6, + align="center", + ) + +ax1.set_title("Imputation Techniques with Diabetes Data") +ax1.set_xlim(left=np.min(mses_diabetes) * 0.9, right=np.max(mses_diabetes) * 1.1) ax1.set_yticks(xval) -ax1.set_xlabel('MSE') +ax1.set_xlabel("MSE") ax1.invert_yaxis() ax1.set_yticklabels(x_labels) # 
plot california dataset results ax2 = plt.subplot(122) for j in xval: - ax2.barh(j, mses_california[j], xerr=stds_california[j], - color=colors[j], alpha=0.6, align='center') - -ax2.set_title('Imputation Techniques with California Data') + ax2.barh( + j, + mses_california[j], + xerr=stds_california[j], + color=colors[j], + alpha=0.6, + align="center", + ) + +ax2.set_title("Imputation Techniques with California Data") ax2.set_yticks(xval) -ax2.set_xlabel('MSE') +ax2.set_xlabel("MSE") ax2.invert_yaxis() -ax2.set_yticklabels([''] * n_bars) +ax2.set_yticklabels([""] * n_bars) plt.show() diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index 766a01fbeb12d..c67b24326e128 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -52,30 +52,32 @@ # values). X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True) rng = np.random.RandomState(seed=42) -X['random_cat'] = rng.randint(3, size=X.shape[0]) -X['random_num'] = rng.randn(X.shape[0]) +X["random_cat"] = rng.randint(3, size=X.shape[0]) +X["random_num"] = rng.randn(X.shape[0]) -categorical_columns = ['pclass', 'sex', 'embarked', 'random_cat'] -numerical_columns = ['age', 'sibsp', 'parch', 'fare', 'random_num'] +categorical_columns = ["pclass", "sex", "embarked", "random_cat"] +numerical_columns = ["age", "sibsp", "parch", "fare", "random_num"] X = X[categorical_columns + numerical_columns] -X_train, X_test, y_train, y_test = train_test_split( - X, y, stratify=y, random_state=42) +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) -categorical_encoder = OneHotEncoder(handle_unknown='ignore') -numerical_pipe = Pipeline([ - ('imputer', SimpleImputer(strategy='mean')) -]) +categorical_encoder = OneHotEncoder(handle_unknown="ignore") +numerical_pipe = Pipeline([("imputer", SimpleImputer(strategy="mean"))]) preprocessing = ColumnTransformer( 
- [('cat', categorical_encoder, categorical_columns), - ('num', numerical_pipe, numerical_columns)]) - -rf = Pipeline([ - ('preprocess', preprocessing), - ('classifier', RandomForestClassifier(random_state=42)) -]) + [ + ("cat", categorical_encoder, categorical_columns), + ("num", numerical_pipe, numerical_columns), + ] +) + +rf = Pipeline( + [ + ("preprocess", preprocessing), + ("classifier", RandomForestClassifier(random_state=42)), + ] +) rf.fit(X_train, y_train) # %% @@ -118,13 +120,11 @@ # therefore do not reflect the ability of feature to be useful to make # predictions that generalize to the test set (when the model has enough # capacity). -ohe = (rf.named_steps['preprocess'] - .named_transformers_['cat']) +ohe = rf.named_steps["preprocess"].named_transformers_["cat"] feature_names = ohe.get_feature_names_out(categorical_columns) feature_names = np.r_[feature_names, numerical_columns] -tree_feature_importances = ( - rf.named_steps['classifier'].feature_importances_) +tree_feature_importances = rf.named_steps["classifier"].feature_importances_ sorted_idx = tree_feature_importances.argsort() y_ticks = np.arange(0, len(feature_names)) @@ -144,13 +144,15 @@ # # Also note that both random features have very low importances (close to 0) as # expected. -result = permutation_importance(rf, X_test, y_test, n_repeats=10, - random_state=42, n_jobs=2) +result = permutation_importance( + rf, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2 +) sorted_idx = result.importances_mean.argsort() fig, ax = plt.subplots() -ax.boxplot(result.importances[sorted_idx].T, - vert=False, labels=X_test.columns[sorted_idx]) +ax.boxplot( + result.importances[sorted_idx].T, vert=False, labels=X_test.columns[sorted_idx] +) ax.set_title("Permutation Importances (test set)") fig.tight_layout() plt.show() @@ -162,13 +164,15 @@ # plots is a confirmation that the RF model has enough capacity to use that # random numerical feature to overfit. 
You can further confirm this by # re-running this example with constrained RF with min_samples_leaf=10. -result = permutation_importance(rf, X_train, y_train, n_repeats=10, - random_state=42, n_jobs=2) +result = permutation_importance( + rf, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2 +) sorted_idx = result.importances_mean.argsort() fig, ax = plt.subplots() -ax.boxplot(result.importances[sorted_idx].T, - vert=False, labels=X_train.columns[sorted_idx]) +ax.boxplot( + result.importances[sorted_idx].T, vert=False, labels=X_train.columns[sorted_idx] +) ax.set_title("Permutation Importances (train set)") fig.tight_layout() plt.show() diff --git a/examples/inspection/plot_permutation_importance_multicollinear.py b/examples/inspection/plot_permutation_importance_multicollinear.py index 12fcd5ebf7bcd..23edc95583483 100644 --- a/examples/inspection/plot_permutation_importance_multicollinear.py +++ b/examples/inspection/plot_permutation_importance_multicollinear.py @@ -51,21 +51,22 @@ # computed above: some feature must be important. The permutation importance # is calculated on the training set to show how much the model relies on each # feature during training. 
-result = permutation_importance(clf, X_train, y_train, n_repeats=10, - random_state=42) +result = permutation_importance(clf, X_train, y_train, n_repeats=10, random_state=42) perm_sorted_idx = result.importances_mean.argsort() tree_importance_sorted_idx = np.argsort(clf.feature_importances_) tree_indices = np.arange(0, len(clf.feature_importances_)) + 0.5 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8)) -ax1.barh(tree_indices, - clf.feature_importances_[tree_importance_sorted_idx], height=0.7) +ax1.barh(tree_indices, clf.feature_importances_[tree_importance_sorted_idx], height=0.7) ax1.set_yticks(tree_indices) ax1.set_yticklabels(data.feature_names[tree_importance_sorted_idx]) ax1.set_ylim((0, len(clf.feature_importances_))) -ax2.boxplot(result.importances[perm_sorted_idx].T, vert=False, - labels=data.feature_names[perm_sorted_idx]) +ax2.boxplot( + result.importances[perm_sorted_idx].T, + vert=False, + labels=data.feature_names[perm_sorted_idx], +) fig.tight_layout() plt.show() @@ -82,7 +83,7 @@ corr = spearmanr(X).correlation # Ensure the correlation matrix is symmetric -corr = (corr + corr.T)/2 +corr = (corr + corr.T) / 2 np.fill_diagonal(corr, 1) # We convert the correlation matrix to a distance matrix before performing @@ -92,13 +93,13 @@ dendro = hierarchy.dendrogram( dist_linkage, labels=data.feature_names.tolist(), ax=ax1, leaf_rotation=90 ) -dendro_idx = np.arange(0, len(dendro['ivl'])) +dendro_idx = np.arange(0, len(dendro["ivl"])) -ax2.imshow(corr[dendro['leaves'], :][:, dendro['leaves']]) +ax2.imshow(corr[dendro["leaves"], :][:, dendro["leaves"]]) ax2.set_xticks(dendro_idx) ax2.set_yticks(dendro_idx) -ax2.set_xticklabels(dendro['ivl'], rotation='vertical') -ax2.set_yticklabels(dendro['ivl']) +ax2.set_xticklabels(dendro["ivl"], rotation="vertical") +ax2.set_yticklabels(dendro["ivl"]) fig.tight_layout() plt.show() @@ -108,7 +109,7 @@ # keep, select those features from our dataset, and train a new random forest. 
# The test accuracy of the new random forest did not change much compared to # the random forest trained on the complete dataset. -cluster_ids = hierarchy.fcluster(dist_linkage, 1, criterion='distance') +cluster_ids = hierarchy.fcluster(dist_linkage, 1, criterion="distance") cluster_id_to_feature_ids = defaultdict(list) for idx, cluster_id in enumerate(cluster_ids): cluster_id_to_feature_ids[cluster_id].append(idx) @@ -119,5 +120,8 @@ clf_sel = RandomForestClassifier(n_estimators=100, random_state=42) clf_sel.fit(X_train_sel, y_train) -print("Accuracy on test data with features removed: {:.2f}".format( - clf_sel.score(X_test_sel, y_test))) +print( + "Accuracy on test data with features removed: {:.2f}".format( + clf_sel.score(X_test_sel, y_test) + ) +) diff --git a/examples/kernel_approximation/plot_scalable_poly_kernels.py b/examples/kernel_approximation/plot_scalable_poly_kernels.py index 7d026dbcf16d6..c52b9fd59668d 100644 --- a/examples/kernel_approximation/plot_scalable_poly_kernels.py +++ b/examples/kernel_approximation/plot_scalable_poly_kernels.py @@ -50,9 +50,9 @@ # To actually reproduce the results in the original Tensor Sketch paper, # select 100,000 for training. 
-X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5_000, - test_size=10_000, - random_state=42) +X_train, X_test, y_train, y_test = train_test_split( + X, y, train_size=5_000, test_size=10_000, random_state=42 +) # %% # Now scale features to the range [0, 1] to match the format of the dataset in @@ -104,11 +104,15 @@ ps_lsvm_score = 0 for _ in range(n_runs): - pipeline = Pipeline(steps=[("kernel_approximator", - PolynomialCountSketch( - n_components=n_components, - degree=4)), - ("linear_classifier", LinearSVC())]) + pipeline = Pipeline( + steps=[ + ( + "kernel_approximator", + PolynomialCountSketch(n_components=n_components, degree=4), + ), + ("linear_classifier", LinearSVC()), + ] + ) start = time.time() pipeline.fit(X_train, y_train) @@ -119,10 +123,13 @@ ps_lsvm_score /= n_runs results[f"LSVM + PS({n_components})"] = { - "time": ps_lsvm_time, "score": ps_lsvm_score + "time": ps_lsvm_time, + "score": ps_lsvm_score, } - print(f"Linear SVM score on {n_components} PolynomialCountSketch " + - f"features: {ps_lsvm_score:.2f}%") + print( + f"Linear SVM score on {n_components} PolynomialCountSketch " + + f"features: {ps_lsvm_score:.2f}%" + ) # %% # Train a kernelized SVM to see how well :class:`PolynomialCountSketch` @@ -132,7 +139,7 @@ from sklearn.svm import SVC -ksvm = SVC(C=500., kernel="poly", degree=4, coef0=0, gamma=1.) 
+ksvm = SVC(C=500.0, kernel="poly", degree=4, coef0=0, gamma=1.0) start = time.time() ksvm.fit(X_train, y_train) @@ -151,23 +158,59 @@ N_COMPONENTS = [250, 500, 1000, 2000] fig, ax = plt.subplots(figsize=(7, 7)) -ax.scatter([results["LSVM"]["time"], ], [results["LSVM"]["score"], ], - label="Linear SVM", c="green", marker="^") - -ax.scatter([results["LSVM + PS(250)"]["time"], ], - [results["LSVM + PS(250)"]["score"], ], - label="Linear SVM + PolynomialCountSketch", c="blue") +ax.scatter( + [ + results["LSVM"]["time"], + ], + [ + results["LSVM"]["score"], + ], + label="Linear SVM", + c="green", + marker="^", +) + +ax.scatter( + [ + results["LSVM + PS(250)"]["time"], + ], + [ + results["LSVM + PS(250)"]["score"], + ], + label="Linear SVM + PolynomialCountSketch", + c="blue", +) for n_components in N_COMPONENTS: - ax.scatter([results[f"LSVM + PS({n_components})"]["time"], ], - [results[f"LSVM + PS({n_components})"]["score"], ], - c="blue") - ax.annotate(f"n_comp.={n_components}", - (results[f"LSVM + PS({n_components})"]["time"], - results[f"LSVM + PS({n_components})"]["score"]), - xytext=(-30, 10), textcoords="offset pixels") - -ax.scatter([results["KSVM"]["time"], ], [results["KSVM"]["score"], ], - label="Kernel SVM", c="red", marker="x") + ax.scatter( + [ + results[f"LSVM + PS({n_components})"]["time"], + ], + [ + results[f"LSVM + PS({n_components})"]["score"], + ], + c="blue", + ) + ax.annotate( + f"n_comp.={n_components}", + ( + results[f"LSVM + PS({n_components})"]["time"], + results[f"LSVM + PS({n_components})"]["score"], + ), + xytext=(-30, 10), + textcoords="offset pixels", + ) + +ax.scatter( + [ + results["KSVM"]["time"], + ], + [ + results["KSVM"]["score"], + ], + label="Kernel SVM", + c="red", + marker="x", +) ax.set_xlabel("Training time (s)") ax.set_ylabel("Accuracy (%)") diff --git a/examples/linear_model/plot_ard.py b/examples/linear_model/plot_ard.py index 177bd8ce24ad1..58529fe37a2cc 100644 --- a/examples/linear_model/plot_ard.py +++ 
b/examples/linear_model/plot_ard.py @@ -39,15 +39,15 @@ # Create Gaussian data X = np.random.randn(n_samples, n_features) # Create weights with a precision lambda_ of 4. -lambda_ = 4. +lambda_ = 4.0 w = np.zeros(n_features) # Only keep 10 weights of interest relevant_features = np.random.randint(0, n_features, 10) for i in relevant_features: - w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_)) + w[i] = stats.norm.rvs(loc=0, scale=1.0 / np.sqrt(lambda_)) # Create noise with a precision alpha of 50. -alpha_ = 50. -noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples) +alpha_ = 50.0 +noise = stats.norm.rvs(loc=0, scale=1.0 / np.sqrt(alpha_), size=n_samples) # Create the target y = np.dot(X, w) + noise @@ -64,27 +64,32 @@ # weights, and predictions with standard deviations plt.figure(figsize=(6, 5)) plt.title("Weights of the model") -plt.plot(clf.coef_, color='darkblue', linestyle='-', linewidth=2, - label="ARD estimate") -plt.plot(ols.coef_, color='yellowgreen', linestyle=':', linewidth=2, - label="OLS estimate") -plt.plot(w, color='orange', linestyle='-', linewidth=2, label="Ground truth") +plt.plot(clf.coef_, color="darkblue", linestyle="-", linewidth=2, label="ARD estimate") +plt.plot( + ols.coef_, color="yellowgreen", linestyle=":", linewidth=2, label="OLS estimate" +) +plt.plot(w, color="orange", linestyle="-", linewidth=2, label="Ground truth") plt.xlabel("Features") plt.ylabel("Values of the weights") plt.legend(loc=1) plt.figure(figsize=(6, 5)) plt.title("Histogram of the weights") -plt.hist(clf.coef_, bins=n_features, color='navy', log=True) -plt.scatter(clf.coef_[relevant_features], np.full(len(relevant_features), 5.), - color='gold', marker='o', label="Relevant features") +plt.hist(clf.coef_, bins=n_features, color="navy", log=True) +plt.scatter( + clf.coef_[relevant_features], + np.full(len(relevant_features), 5.0), + color="gold", + marker="o", + label="Relevant features", +) plt.ylabel("Features") plt.xlabel("Values of the 
weights") plt.legend(loc=1) plt.figure(figsize=(6, 5)) plt.title("Marginal log-likelihood") -plt.plot(clf.scores_, color='navy', linewidth=2) +plt.plot(clf.scores_, color="navy", linewidth=2) plt.ylabel("Score") plt.xlabel("Iterations") @@ -106,10 +111,8 @@ def f(x, noise_amount): y_plot = f(X_plot, noise_amount=0) y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True) plt.figure(figsize=(6, 5)) -plt.errorbar(X_plot, y_mean, y_std, color='navy', - label="Polynomial ARD", linewidth=2) -plt.plot(X_plot, y_plot, color='gold', linewidth=2, - label="Ground Truth") +plt.errorbar(X_plot, y_mean, y_std, color="navy", label="Polynomial ARD", linewidth=2) +plt.plot(X_plot, y_plot, color="gold", linewidth=2, label="Ground Truth") plt.ylabel("Output y") plt.xlabel("Feature X") plt.legend(loc="lower left") diff --git a/examples/linear_model/plot_bayesian_ridge.py b/examples/linear_model/plot_bayesian_ridge.py index 43925e72c591c..a9593a5b41061 100644 --- a/examples/linear_model/plot_bayesian_ridge.py +++ b/examples/linear_model/plot_bayesian_ridge.py @@ -36,15 +36,15 @@ n_samples, n_features = 100, 100 X = np.random.randn(n_samples, n_features) # Create Gaussian data # Create weights with a precision lambda_ of 4. -lambda_ = 4. +lambda_ = 4.0 w = np.zeros(n_features) # Only keep 10 weights of interest relevant_features = np.random.randint(0, n_features, 10) for i in relevant_features: - w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_)) + w[i] = stats.norm.rvs(loc=0, scale=1.0 / np.sqrt(lambda_)) # Create noise with a precision alpha of 50. -alpha_ = 50. -noise = stats.norm.rvs(loc=0, scale=1. 
/ np.sqrt(alpha_), size=n_samples) +alpha_ = 50.0 +noise = stats.norm.rvs(loc=0, scale=1.0 / np.sqrt(alpha_), size=n_samples) # Create the target y = np.dot(X, w) + noise @@ -62,27 +62,29 @@ lw = 2 plt.figure(figsize=(6, 5)) plt.title("Weights of the model") -plt.plot(clf.coef_, color='lightgreen', linewidth=lw, - label="Bayesian Ridge estimate") -plt.plot(w, color='gold', linewidth=lw, label="Ground truth") -plt.plot(ols.coef_, color='navy', linestyle='--', label="OLS estimate") +plt.plot(clf.coef_, color="lightgreen", linewidth=lw, label="Bayesian Ridge estimate") +plt.plot(w, color="gold", linewidth=lw, label="Ground truth") +plt.plot(ols.coef_, color="navy", linestyle="--", label="OLS estimate") plt.xlabel("Features") plt.ylabel("Values of the weights") plt.legend(loc="best", prop=dict(size=12)) plt.figure(figsize=(6, 5)) plt.title("Histogram of the weights") -plt.hist(clf.coef_, bins=n_features, color='gold', log=True, - edgecolor='black') -plt.scatter(clf.coef_[relevant_features], np.full(len(relevant_features), 5.), - color='navy', label="Relevant features") +plt.hist(clf.coef_, bins=n_features, color="gold", log=True, edgecolor="black") +plt.scatter( + clf.coef_[relevant_features], + np.full(len(relevant_features), 5.0), + color="navy", + label="Relevant features", +) plt.ylabel("Features") plt.xlabel("Values of the weights") plt.legend(loc="upper left") plt.figure(figsize=(6, 5)) plt.title("Marginal log-likelihood") -plt.plot(clf.scores_, color='navy', linewidth=lw) +plt.plot(clf.scores_, color="navy", linewidth=lw) plt.ylabel("Score") plt.xlabel("Iterations") @@ -104,10 +106,15 @@ def f(x, noise_amount): y_plot = f(X_plot, noise_amount=0) y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True) plt.figure(figsize=(6, 5)) -plt.errorbar(X_plot, y_mean, y_std, color='navy', - label="Polynomial Bayesian Ridge Regression", linewidth=lw) -plt.plot(X_plot, y_plot, color='gold', linewidth=lw, - label="Ground Truth") +plt.errorbar( + X_plot, + 
y_mean, + y_std, + color="navy", + label="Polynomial Bayesian Ridge Regression", + linewidth=lw, +) +plt.plot(X_plot, y_plot, color="gold", linewidth=lw, label="Ground Truth") plt.ylabel("Output y") plt.xlabel("Feature X") plt.legend(loc="lower left") diff --git a/examples/linear_model/plot_bayesian_ridge_curvefit.py b/examples/linear_model/plot_bayesian_ridge_curvefit.py index 2f4a36d47d9a6..fc4f9e6cc384c 100644 --- a/examples/linear_model/plot_bayesian_ridge_curvefit.py +++ b/examples/linear_model/plot_bayesian_ridge_curvefit.py @@ -34,16 +34,17 @@ from sklearn.linear_model import BayesianRidge -def func(x): return np.sin(2*np.pi*x) +def func(x): + return np.sin(2 * np.pi * x) # ############################################################################# # Generate sinusoidal data with noise size = 25 rng = np.random.RandomState(1234) -x_train = rng.uniform(0., 1., size) +x_train = rng.uniform(0.0, 1.0, size) y_train = func(x_train) + rng.normal(scale=0.1, size=size) -x_test = np.linspace(0., 1., 100) +x_test = np.linspace(0.0, 1.0, 100) # ############################################################################# @@ -59,9 +60,9 @@ def func(x): return np.sin(2*np.pi*x) for i, ax in enumerate(axes): # Bayesian ridge regression with different initial value pairs if i == 0: - init = [1 / np.var(y_train), 1.] 
# Default values + init = [1 / np.var(y_train), 1.0] # Default values elif i == 1: - init = [1., 1e-3] + init = [1.0, 1e-3] reg.set_params(alpha_init=init[0], lambda_init=init[1]) reg.fit(X_train, y_train) ymean, ystd = reg.predict(X_test, return_std=True) @@ -69,17 +70,18 @@ def func(x): return np.sin(2*np.pi*x) ax.plot(x_test, func(x_test), color="blue", label="sin($2\\pi x$)") ax.scatter(x_train, y_train, s=50, alpha=0.5, label="observation") ax.plot(x_test, ymean, color="red", label="predict mean") - ax.fill_between(x_test, ymean-ystd, ymean+ystd, - color="pink", alpha=0.5, label="predict std") + ax.fill_between( + x_test, ymean - ystd, ymean + ystd, color="pink", alpha=0.5, label="predict std" + ) ax.set_ylim(-1.3, 1.3) ax.legend() - title = "$\\alpha$_init$={:.2f},\\ \\lambda$_init$={}$".format( - init[0], init[1]) + title = "$\\alpha$_init$={:.2f},\\ \\lambda$_init$={}$".format(init[0], init[1]) if i == 0: title += " (Default)" ax.set_title(title, fontsize=12) text = "$\\alpha={:.1f}$\n$\\lambda={:.3f}$\n$L={:.1f}$".format( - reg.alpha_, reg.lambda_, reg.scores_[-1]) + reg.alpha_, reg.lambda_, reg.scores_[-1] + ) ax.text(0.05, -1.0, text, fontsize=12) plt.tight_layout() diff --git a/examples/linear_model/plot_elastic_net_precomputed_gram_matrix_with_weighted_samples.py b/examples/linear_model/plot_elastic_net_precomputed_gram_matrix_with_weighted_samples.py index 852ea545c5fd6..977759bc66b4c 100644 --- a/examples/linear_model/plot_elastic_net_precomputed_gram_matrix_with_weighted_samples.py +++ b/examples/linear_model/plot_elastic_net_precomputed_gram_matrix_with_weighted_samples.py @@ -38,7 +38,7 @@ # weights, we must first center the design matrix, and rescale it by the # normalized weights prior to computing the gram matrix. 
X_offset = np.average(X, axis=0, weights=normalized_weights) -X_centered = (X - np.average(X, axis=0, weights=normalized_weights)) +X_centered = X - np.average(X, axis=0, weights=normalized_weights) X_scaled = X_centered * np.sqrt(normalized_weights)[:, np.newaxis] gram = np.dot(X_scaled.T, X_scaled) diff --git a/examples/linear_model/plot_huber_vs_ridge.py b/examples/linear_model/plot_huber_vs_ridge.py index e5f71cc861d88..d3137f3ac26cb 100644 --- a/examples/linear_model/plot_huber_vs_ridge.py +++ b/examples/linear_model/plot_huber_vs_ridge.py @@ -25,22 +25,23 @@ # Generate toy data. rng = np.random.RandomState(0) -X, y = make_regression(n_samples=20, n_features=1, random_state=0, noise=4.0, - bias=100.0) +X, y = make_regression( + n_samples=20, n_features=1, random_state=0, noise=4.0, bias=100.0 +) # Add four strong outliers to the dataset. X_outliers = rng.normal(0, 0.5, size=(4, 1)) y_outliers = rng.normal(0, 2.0, size=4) -X_outliers[:2, :] += X.max() + X.mean() / 4. -X_outliers[2:, :] += X.min() - X.mean() / 4. -y_outliers[:2] += y.min() - y.mean() / 4. -y_outliers[2:] += y.max() + y.mean() / 4. +X_outliers[:2, :] += X.max() + X.mean() / 4.0 +X_outliers[2:, :] += X.min() - X.mean() / 4.0 +y_outliers[:2] += y.min() - y.mean() / 4.0 +y_outliers[2:] += y.max() + y.mean() / 4.0 X = np.vstack((X, X_outliers)) y = np.concatenate((y, y_outliers)) -plt.plot(X, y, 'b.') +plt.plot(X, y, "b.") # Fit the huber regressor over a series of epsilon values. 
-colors = ['r-', 'b-', 'y-', 'm-'] +colors = ["r-", "b-", "y-", "m-"] x = np.linspace(X.min(), X.max(), 7) epsilon_values = [1, 1.5, 1.75, 1.9] @@ -55,7 +56,7 @@ ridge.fit(X, y) coef_ridge = ridge.coef_ coef_ = ridge.coef_ * x + ridge.intercept_ -plt.plot(x, coef_, 'g-', label="ridge regression") +plt.plot(x, coef_, "g-", label="ridge regression") plt.title("Comparison of HuberRegressor vs Ridge") plt.xlabel("X") diff --git a/examples/linear_model/plot_iris_logistic.py b/examples/linear_model/plot_iris_logistic.py index b04094a647e90..63853198f2adc 100644 --- a/examples/linear_model/plot_iris_logistic.py +++ b/examples/linear_model/plot_iris_logistic.py @@ -34,9 +34,9 @@ # Plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, x_max]x[y_min, y_max]. -x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 -y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 -h = .02 # step size in the mesh +x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5 +y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5 +h = 0.02 # step size in the mesh xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()]) @@ -46,9 +46,9 @@ plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired) # Plot also the training points -plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired) -plt.xlabel('Sepal length') -plt.ylabel('Sepal width') +plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors="k", cmap=plt.cm.Paired) +plt.xlabel("Sepal length") +plt.ylabel("Sepal width") plt.xlim(xx.min(), xx.max()) plt.ylim(yy.min(), yy.max()) diff --git a/examples/linear_model/plot_lasso_and_elasticnet.py b/examples/linear_model/plot_lasso_and_elasticnet.py index 8934ddc76395c..6a4b481048d8b 100644 --- a/examples/linear_model/plot_lasso_and_elasticnet.py +++ b/examples/linear_model/plot_lasso_and_elasticnet.py @@ -33,8 +33,8 @@ # Split data in train set and test set n_samples = X.shape[0] -X_train, y_train 
= X[:n_samples // 2], y[:n_samples // 2] -X_test, y_test = X[n_samples // 2:], y[n_samples // 2:] +X_train, y_train = X[: n_samples // 2], y[: n_samples // 2] +X_test, y_test = X[n_samples // 2 :], y[n_samples // 2 :] # ############################################################################# # Lasso @@ -59,18 +59,32 @@ print(enet) print("r^2 on test data : %f" % r2_score_enet) -m, s, _ = plt.stem(np.where(enet.coef_)[0], enet.coef_[enet.coef_ != 0], - markerfmt='x', label='Elastic net coefficients', - use_line_collection=True) +m, s, _ = plt.stem( + np.where(enet.coef_)[0], + enet.coef_[enet.coef_ != 0], + markerfmt="x", + label="Elastic net coefficients", + use_line_collection=True, +) plt.setp([m, s], color="#2ca02c") -m, s, _ = plt.stem(np.where(lasso.coef_)[0], lasso.coef_[lasso.coef_ != 0], - markerfmt='x', label='Lasso coefficients', - use_line_collection=True) -plt.setp([m, s], color='#ff7f0e') -plt.stem(np.where(coef)[0], coef[coef != 0], label='true coefficients', - markerfmt='bx', use_line_collection=True) - -plt.legend(loc='best') -plt.title("Lasso $R^2$: %.3f, Elastic Net $R^2$: %.3f" - % (r2_score_lasso, r2_score_enet)) +m, s, _ = plt.stem( + np.where(lasso.coef_)[0], + lasso.coef_[lasso.coef_ != 0], + markerfmt="x", + label="Lasso coefficients", + use_line_collection=True, +) +plt.setp([m, s], color="#ff7f0e") +plt.stem( + np.where(coef)[0], + coef[coef != 0], + label="true coefficients", + markerfmt="bx", + use_line_collection=True, +) + +plt.legend(loc="best") +plt.title( + "Lasso $R^2$: %.3f, Elastic Net $R^2$: %.3f" % (r2_score_lasso, r2_score_enet) +) plt.show() diff --git a/examples/linear_model/plot_lasso_dense_vs_sparse_data.py b/examples/linear_model/plot_lasso_dense_vs_sparse_data.py index ba8f4f7bb4089..a7e8c22042459 100644 --- a/examples/linear_model/plot_lasso_dense_vs_sparse_data.py +++ b/examples/linear_model/plot_lasso_dense_vs_sparse_data.py @@ -36,8 +36,10 @@ dense_lasso.fit(X, y) print("Dense Lasso done in %fs" % (time() - t0)) 
-print("Distance between coefficients : %s" - % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)) +print( + "Distance between coefficients : %s" + % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_) +) # ############################################################################# # The two Lasso implementations on Sparse data @@ -62,5 +64,7 @@ dense_lasso.fit(Xs.toarray(), y) print("Dense Lasso done in %fs" % (time() - t0)) -print("Distance between coefficients : %s" - % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)) +print( + "Distance between coefficients : %s" + % linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_) +) diff --git a/examples/linear_model/plot_lasso_lars.py b/examples/linear_model/plot_lasso_lars.py index 06d4c94bbed70..2ebeb46037072 100644 --- a/examples/linear_model/plot_lasso_lars.py +++ b/examples/linear_model/plot_lasso_lars.py @@ -25,16 +25,16 @@ X, y = datasets.load_diabetes(return_X_y=True) print("Computing regularization path using the LARS ...") -_, _, coefs = linear_model.lars_path(X, y, method='lasso', verbose=True) +_, _, coefs = linear_model.lars_path(X, y, method="lasso", verbose=True) xx = np.sum(np.abs(coefs.T), axis=1) xx /= xx[-1] plt.plot(xx, coefs.T) ymin, ymax = plt.ylim() -plt.vlines(xx, ymin, ymax, linestyle='dashed') -plt.xlabel('|coef| / max|coef|') -plt.ylabel('Coefficients') -plt.title('LASSO Path') -plt.axis('tight') +plt.vlines(xx, ymin, ymax, linestyle="dashed") +plt.xlabel("|coef| / max|coef|") +plt.ylabel("Coefficients") +plt.title("LASSO Path") +plt.axis("tight") plt.show() diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index 845a86c3bda4a..2bbc4a9e8d921 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -68,33 +68,42 @@ # ############################################################################# # LassoLarsIC: least angle regression with BIC/AIC criterion 
-model_bic = LassoLarsIC(criterion='bic', normalize=False) +model_bic = LassoLarsIC(criterion="bic", normalize=False) t1 = time.time() model_bic.fit(X, y) t_bic = time.time() - t1 alpha_bic_ = model_bic.alpha_ -model_aic = LassoLarsIC(criterion='aic', normalize=False) +model_aic = LassoLarsIC(criterion="aic", normalize=False) model_aic.fit(X, y) alpha_aic_ = model_aic.alpha_ def plot_ic_criterion(model, name, color): criterion_ = model.criterion_ - plt.semilogx(model.alphas_ + EPSILON, criterion_, '--', color=color, - linewidth=3, label='%s criterion' % name) - plt.axvline(model.alpha_ + EPSILON, color=color, linewidth=3, - label='alpha: %s estimate' % name) - plt.xlabel(r'$\alpha$') - plt.ylabel('criterion') + plt.semilogx( + model.alphas_ + EPSILON, + criterion_, + "--", + color=color, + linewidth=3, + label="%s criterion" % name, + ) + plt.axvline( + model.alpha_ + EPSILON, + color=color, + linewidth=3, + label="alpha: %s estimate" % name, + ) + plt.xlabel(r"$\alpha$") + plt.ylabel("criterion") plt.figure() -plot_ic_criterion(model_aic, 'AIC', 'b') -plot_ic_criterion(model_bic, 'BIC', 'r') +plot_ic_criterion(model_aic, "AIC", "b") +plot_ic_criterion(model_bic, "BIC", "r") plt.legend() -plt.title('Information-criterion for model selection (training time %.3fs)' - % t_bic) +plt.title("Information-criterion for model selection (training time %.3fs)" % t_bic) # ############################################################################# # LassoCV: coordinate descent @@ -108,19 +117,27 @@ def plot_ic_criterion(model, name, color): # Display results plt.figure() ymin, ymax = 2300, 3800 -plt.semilogx(model.alphas_ + EPSILON, model.mse_path_, ':') -plt.plot(model.alphas_ + EPSILON, model.mse_path_.mean(axis=-1), 'k', - label='Average across the folds', linewidth=2) -plt.axvline(model.alpha_ + EPSILON, linestyle='--', color='k', - label='alpha: CV estimate') +plt.semilogx(model.alphas_ + EPSILON, model.mse_path_, ":") +plt.plot( + model.alphas_ + EPSILON, + 
model.mse_path_.mean(axis=-1), + "k", + label="Average across the folds", + linewidth=2, +) +plt.axvline( + model.alpha_ + EPSILON, linestyle="--", color="k", label="alpha: CV estimate" +) plt.legend() -plt.xlabel(r'$\alpha$') -plt.ylabel('Mean square error') -plt.title('Mean square error on each fold: coordinate descent ' - '(train time: %.2fs)' % t_lasso_cv) -plt.axis('tight') +plt.xlabel(r"$\alpha$") +plt.ylabel("Mean square error") +plt.title( + "Mean square error on each fold: coordinate descent (train time: %.2fs)" + % t_lasso_cv +) +plt.axis("tight") plt.ylim(ymin, ymax) # ############################################################################# @@ -134,18 +151,21 @@ def plot_ic_criterion(model, name, color): # Display results plt.figure() -plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_, ':') -plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_.mean(axis=-1), 'k', - label='Average across the folds', linewidth=2) -plt.axvline(model.alpha_, linestyle='--', color='k', - label='alpha CV') +plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_, ":") +plt.semilogx( + model.cv_alphas_ + EPSILON, + model.mse_path_.mean(axis=-1), + "k", + label="Average across the folds", + linewidth=2, +) +plt.axvline(model.alpha_, linestyle="--", color="k", label="alpha CV") plt.legend() -plt.xlabel(r'$\alpha$') -plt.ylabel('Mean square error') -plt.title('Mean square error on each fold: Lars (train time: %.2fs)' - % t_lasso_lars_cv) -plt.axis('tight') +plt.xlabel(r"$\alpha$") +plt.ylabel("Mean square error") +plt.title("Mean square error on each fold: Lars (train time: %.2fs)" % t_lasso_lars_cv) +plt.axis("tight") plt.ylim(ymin, ymax) plt.show() diff --git a/examples/linear_model/plot_logistic.py b/examples/linear_model/plot_logistic.py index 6b843007624a9..49806a775328f 100644 --- a/examples/linear_model/plot_logistic.py +++ b/examples/linear_model/plot_logistic.py @@ -31,7 +31,7 @@ X = np.random.normal(size=n_samples) y = (X > 0).astype(float) X[X > 0] *= 4 
-X += .3 * np.random.normal(size=n_samples) +X += 0.3 * np.random.normal(size=n_samples) X = X[:, np.newaxis] @@ -42,24 +42,27 @@ # and plot the result plt.figure(1, figsize=(4, 3)) plt.clf() -plt.scatter(X.ravel(), y, color='black', zorder=20) +plt.scatter(X.ravel(), y, color="black", zorder=20) X_test = np.linspace(-5, 10, 300) loss = expit(X_test * clf.coef_ + clf.intercept_).ravel() -plt.plot(X_test, loss, color='red', linewidth=3) +plt.plot(X_test, loss, color="red", linewidth=3) ols = linear_model.LinearRegression() ols.fit(X, y) plt.plot(X_test, ols.coef_ * X_test + ols.intercept_, linewidth=1) -plt.axhline(.5, color='.5') +plt.axhline(0.5, color=".5") -plt.ylabel('y') -plt.xlabel('X') +plt.ylabel("y") +plt.xlabel("X") plt.xticks(range(-5, 10)) plt.yticks([0, 0.5, 1]) -plt.ylim(-.25, 1.25) +plt.ylim(-0.25, 1.25) plt.xlim(-4, 10) -plt.legend(('Logistic Regression Model', 'Linear Regression Model'), - loc="lower right", fontsize='small') +plt.legend( + ("Logistic Regression Model", "Linear Regression Model"), + loc="lower right", + fontsize="small", +) plt.tight_layout() plt.show() diff --git a/examples/linear_model/plot_logistic_l1_l2_sparsity.py b/examples/linear_model/plot_logistic_l1_l2_sparsity.py index 3e518e8ec1e7a..bd42966604075 100644 --- a/examples/linear_model/plot_logistic_l1_l2_sparsity.py +++ b/examples/linear_model/plot_logistic_l1_l2_sparsity.py @@ -42,10 +42,11 @@ # Set regularization parameter for i, (C, axes_row) in enumerate(zip((1, 0.1, 0.01), axes)): # turn down tolerance for short training time - clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01, solver='saga') - clf_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01, solver='saga') - clf_en_LR = LogisticRegression(C=C, penalty='elasticnet', solver='saga', - l1_ratio=l1_ratio, tol=0.01) + clf_l1_LR = LogisticRegression(C=C, penalty="l1", tol=0.01, solver="saga") + clf_l2_LR = LogisticRegression(C=C, penalty="l2", tol=0.01, solver="saga") + clf_en_LR = LogisticRegression( + 
C=C, penalty="elasticnet", solver="saga", l1_ratio=l1_ratio, tol=0.01 + ) clf_l1_LR.fit(X, y) clf_l2_LR.fit(X, y) clf_en_LR.fit(X, y) @@ -63,15 +64,13 @@ print("C=%.2f" % C) print("{:<40} {:.2f}%".format("Sparsity with L1 penalty:", sparsity_l1_LR)) - print("{:<40} {:.2f}%".format("Sparsity with Elastic-Net penalty:", - sparsity_en_LR)) + print("{:<40} {:.2f}%".format("Sparsity with Elastic-Net penalty:", sparsity_en_LR)) print("{:<40} {:.2f}%".format("Sparsity with L2 penalty:", sparsity_l2_LR)) - print("{:<40} {:.2f}".format("Score with L1 penalty:", - clf_l1_LR.score(X, y))) - print("{:<40} {:.2f}".format("Score with Elastic-Net penalty:", - clf_en_LR.score(X, y))) - print("{:<40} {:.2f}".format("Score with L2 penalty:", - clf_l2_LR.score(X, y))) + print("{:<40} {:.2f}".format("Score with L1 penalty:", clf_l1_LR.score(X, y))) + print( + "{:<40} {:.2f}".format("Score with Elastic-Net penalty:", clf_en_LR.score(X, y)) + ) + print("{:<40} {:.2f}".format("Score with L2 penalty:", clf_l2_LR.score(X, y))) if i == 0: axes_row[0].set_title("L1 penalty") @@ -79,11 +78,16 @@ axes_row[2].set_title("L2 penalty") for ax, coefs in zip(axes_row, [coef_l1_LR, coef_en_LR, coef_l2_LR]): - ax.imshow(np.abs(coefs.reshape(8, 8)), interpolation='nearest', - cmap='binary', vmax=1, vmin=0) + ax.imshow( + np.abs(coefs.reshape(8, 8)), + interpolation="nearest", + cmap="binary", + vmax=1, + vmin=0, + ) ax.set_xticks(()) ax.set_yticks(()) - axes_row[0].set_ylabel('C = %s' % C) + axes_row[0].set_ylabel("C = %s" % C) plt.show() diff --git a/examples/linear_model/plot_logistic_multinomial.py b/examples/linear_model/plot_logistic_multinomial.py index 518a2aeade61c..ec22777f71e8a 100644 --- a/examples/linear_model/plot_logistic_multinomial.py +++ b/examples/linear_model/plot_logistic_multinomial.py @@ -22,19 +22,19 @@ transformation = [[0.4, 0.2], [-0.4, 1.2]] X = np.dot(X, transformation) -for multi_class in ('multinomial', 'ovr'): - clf = LogisticRegression(solver='sag', max_iter=100, 
random_state=42, - multi_class=multi_class).fit(X, y) +for multi_class in ("multinomial", "ovr"): + clf = LogisticRegression( + solver="sag", max_iter=100, random_state=42, multi_class=multi_class + ).fit(X, y) # print the training scores print("training score : %.3f (%s)" % (clf.score(X, y), multi_class)) # create a mesh to plot in - h = .02 # step size in the mesh + h = 0.02 # step size in the mesh x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 - xx, yy = np.meshgrid(np.arange(x_min, x_max, h), - np.arange(y_min, y_max, h)) + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) # Plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, x_max]x[y_min, y_max]. @@ -44,14 +44,15 @@ plt.figure() plt.contourf(xx, yy, Z, cmap=plt.cm.Paired) plt.title("Decision surface of LogisticRegression (%s)" % multi_class) - plt.axis('tight') + plt.axis("tight") # Plot also the training points colors = "bry" for i, color in zip(clf.classes_, colors): idx = np.where(y == i) - plt.scatter(X[idx, 0], X[idx, 1], c=color, cmap=plt.cm.Paired, - edgecolor='black', s=20) + plt.scatter( + X[idx, 0], X[idx, 1], c=color, cmap=plt.cm.Paired, edgecolor="black", s=20 + ) # Plot the three one-against-all classifiers xmin, xmax = plt.xlim() @@ -62,8 +63,8 @@ def plot_hyperplane(c, color): def line(x0): return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1] - plt.plot([xmin, xmax], [line(xmin), line(xmax)], - ls="--", color=color) + + plt.plot([xmin, xmax], [line(xmin), line(xmax)], ls="--", color=color) for i, color in zip(clf.classes_, colors): plot_hyperplane(i, color) diff --git a/examples/linear_model/plot_logistic_path.py b/examples/linear_model/plot_logistic_path.py index 7aead065f3445..eacb74f22cf75 100644 --- a/examples/linear_model/plot_logistic_path.py +++ b/examples/linear_model/plot_logistic_path.py @@ -50,15 +50,19 @@ # 
############################################################################# # Demo path functions -cs = l1_min_c(X, y, loss='log') * np.logspace(0, 7, 16) +cs = l1_min_c(X, y, loss="log") * np.logspace(0, 7, 16) print("Computing regularization path ...") start = time() -clf = linear_model.LogisticRegression(penalty='l1', solver='liblinear', - tol=1e-6, max_iter=int(1e6), - warm_start=True, - intercept_scaling=10000.) +clf = linear_model.LogisticRegression( + penalty="l1", + solver="liblinear", + tol=1e-6, + max_iter=int(1e6), + warm_start=True, + intercept_scaling=10000.0, +) coefs_ = [] for c in cs: clf.set_params(C=c) @@ -67,10 +71,10 @@ print("This took %0.3fs" % (time() - start)) coefs_ = np.array(coefs_) -plt.plot(np.log10(cs), coefs_, marker='o') +plt.plot(np.log10(cs), coefs_, marker="o") ymin, ymax = plt.ylim() -plt.xlabel('log(C)') -plt.ylabel('Coefficients') -plt.title('Logistic Regression Path') -plt.axis('tight') +plt.xlabel("log(C)") +plt.ylabel("Coefficients") +plt.title("Logistic Regression Path") +plt.axis("tight") plt.show() diff --git a/examples/linear_model/plot_multi_task_lasso_support.py b/examples/linear_model/plot_multi_task_lasso_support.py index c7a9536383bc2..3802971ccf345 100644 --- a/examples/linear_model/plot_multi_task_lasso_support.py +++ b/examples/linear_model/plot_multi_task_lasso_support.py @@ -31,39 +31,43 @@ coef = np.zeros((n_tasks, n_features)) times = np.linspace(0, 2 * np.pi, n_tasks) for k in range(n_relevant_features): - coef[:, k] = np.sin((1. 
+ rng.randn(1)) * times + 3 * rng.randn(1)) + coef[:, k] = np.sin((1.0 + rng.randn(1)) * times + 3 * rng.randn(1)) X = rng.randn(n_samples, n_features) Y = np.dot(X, coef.T) + rng.randn(n_samples, n_tasks) coef_lasso_ = np.array([Lasso(alpha=0.5).fit(X, y).coef_ for y in Y.T]) -coef_multi_task_lasso_ = MultiTaskLasso(alpha=1.).fit(X, Y).coef_ +coef_multi_task_lasso_ = MultiTaskLasso(alpha=1.0).fit(X, Y).coef_ # ############################################################################# # Plot support and time series fig = plt.figure(figsize=(8, 5)) plt.subplot(1, 2, 1) plt.spy(coef_lasso_) -plt.xlabel('Feature') -plt.ylabel('Time (or Task)') -plt.text(10, 5, 'Lasso') +plt.xlabel("Feature") +plt.ylabel("Time (or Task)") +plt.text(10, 5, "Lasso") plt.subplot(1, 2, 2) plt.spy(coef_multi_task_lasso_) -plt.xlabel('Feature') -plt.ylabel('Time (or Task)') -plt.text(10, 5, 'MultiTaskLasso') -fig.suptitle('Coefficient non-zero location') +plt.xlabel("Feature") +plt.ylabel("Time (or Task)") +plt.text(10, 5, "MultiTaskLasso") +fig.suptitle("Coefficient non-zero location") feature_to_plot = 0 plt.figure() lw = 2 -plt.plot(coef[:, feature_to_plot], color='seagreen', linewidth=lw, - label='Ground truth') -plt.plot(coef_lasso_[:, feature_to_plot], color='cornflowerblue', linewidth=lw, - label='Lasso') -plt.plot(coef_multi_task_lasso_[:, feature_to_plot], color='gold', linewidth=lw, - label='MultiTaskLasso') -plt.legend(loc='upper center') -plt.axis('tight') +plt.plot(coef[:, feature_to_plot], color="seagreen", linewidth=lw, label="Ground truth") +plt.plot( + coef_lasso_[:, feature_to_plot], color="cornflowerblue", linewidth=lw, label="Lasso" +) +plt.plot( + coef_multi_task_lasso_[:, feature_to_plot], + color="gold", + linewidth=lw, + label="MultiTaskLasso", +) +plt.legend(loc="upper center") +plt.axis("tight") plt.ylim([-1.1, 1.1]) plt.show() diff --git a/examples/linear_model/plot_nnls.py b/examples/linear_model/plot_nnls.py index 56f357c4214a6..02a6dade30cae 100644 --- 
a/examples/linear_model/plot_nnls.py +++ b/examples/linear_model/plot_nnls.py @@ -24,7 +24,7 @@ y = np.dot(X, true_coef) # Add some noise -y += 5 * np.random.normal(size=(n_samples, )) +y += 5 * np.random.normal(size=(n_samples,)) # %% # Split the data in train set and test set @@ -62,6 +62,6 @@ low_y, high_y = ax.get_ylim() low = max(low_x, low_y) high = min(high_x, high_y) -ax.plot([low, high], [low, high], ls="--", c=".3", alpha=.5) +ax.plot([low, high], [low, high], ls="--", c=".3", alpha=0.5) ax.set_xlabel("OLS regression coefficients", fontweight="bold") ax.set_ylabel("NNLS regression coefficients", fontweight="bold") diff --git a/examples/linear_model/plot_ols.py b/examples/linear_model/plot_ols.py index 5f8c0079c3582..567596e82a8f1 100644 --- a/examples/linear_model/plot_ols.py +++ b/examples/linear_model/plot_ols.py @@ -51,17 +51,15 @@ diabetes_y_pred = regr.predict(diabetes_X_test) # The coefficients -print('Coefficients: \n', regr.coef_) +print("Coefficients: \n", regr.coef_) # The mean squared error -print('Mean squared error: %.2f' - % mean_squared_error(diabetes_y_test, diabetes_y_pred)) +print("Mean squared error: %.2f" % mean_squared_error(diabetes_y_test, diabetes_y_pred)) # The coefficient of determination: 1 is perfect prediction -print('Coefficient of determination: %.2f' - % r2_score(diabetes_y_test, diabetes_y_pred)) +print("Coefficient of determination: %.2f" % r2_score(diabetes_y_test, diabetes_y_pred)) # Plot outputs -plt.scatter(diabetes_X_test, diabetes_y_test, color='black') -plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3) +plt.scatter(diabetes_X_test, diabetes_y_test, color="black") +plt.plot(diabetes_X_test, diabetes_y_pred, color="blue", linewidth=3) plt.xticks(()) plt.yticks(()) diff --git a/examples/linear_model/plot_ols_3d.py b/examples/linear_model/plot_ols_3d.py index aa34b3b680202..34b5a76954730 100644 --- a/examples/linear_model/plot_ols_3d.py +++ b/examples/linear_model/plot_ols_3d.py @@ -44,16 +44,18 @@ 
def plot_figs(fig_num, elev, azim, X_train, clf): plt.clf() ax = Axes3D(fig, elev=elev, azim=azim) - ax.scatter(X_train[:, 0], X_train[:, 1], y_train, c='k', marker='+') - ax.plot_surface(np.array([[-.1, -.1], [.15, .15]]), - np.array([[-.1, .15], [-.1, .15]]), - clf.predict(np.array([[-.1, -.1, .15, .15], - [-.1, .15, -.1, .15]]).T - ).reshape((2, 2)), - alpha=.5) - ax.set_xlabel('X_1') - ax.set_ylabel('X_2') - ax.set_zlabel('Y') + ax.scatter(X_train[:, 0], X_train[:, 1], y_train, c="k", marker="+") + ax.plot_surface( + np.array([[-0.1, -0.1], [0.15, 0.15]]), + np.array([[-0.1, 0.15], [-0.1, 0.15]]), + clf.predict( + np.array([[-0.1, -0.1, 0.15, 0.15], [-0.1, 0.15, -0.1, 0.15]]).T + ).reshape((2, 2)), + alpha=0.5, + ) + ax.set_xlabel("X_1") + ax.set_ylabel("X_2") + ax.set_zlabel("Y") ax.w_xaxis.set_ticklabels([]) ax.w_yaxis.set_ticklabels([]) ax.w_zaxis.set_ticklabels([]) @@ -64,11 +66,11 @@ def plot_figs(fig_num, elev, azim, X_train, clf): azim = -110 plot_figs(1, elev, azim, X_train, ols) -elev = -.5 +elev = -0.5 azim = 0 plot_figs(2, elev, azim, X_train, ols) -elev = -.5 +elev = -0.5 azim = 90 plot_figs(3, elev, azim, X_train, ols) diff --git a/examples/linear_model/plot_ols_ridge_variance.py b/examples/linear_model/plot_ols_ridge_variance.py index ba5f65575f927..e94979077a230 100644 --- a/examples/linear_model/plot_ols_ridge_variance.py +++ b/examples/linear_model/plot_ols_ridge_variance.py @@ -33,34 +33,35 @@ from sklearn import linear_model -X_train = np.c_[.5, 1].T -y_train = [.5, 1] +X_train = np.c_[0.5, 1].T +y_train = [0.5, 1] X_test = np.c_[0, 2].T np.random.seed(0) -classifiers = dict(ols=linear_model.LinearRegression(), - ridge=linear_model.Ridge(alpha=.1)) +classifiers = dict( + ols=linear_model.LinearRegression(), ridge=linear_model.Ridge(alpha=0.1) +) for name, clf in classifiers.items(): fig, ax = plt.subplots(figsize=(4, 3)) for _ in range(6): - this_X = .1 * np.random.normal(size=(2, 1)) + X_train + this_X = 0.1 * np.random.normal(size=(2, 1)) + 
X_train clf.fit(this_X, y_train) - ax.plot(X_test, clf.predict(X_test), color='gray') - ax.scatter(this_X, y_train, s=3, c='gray', marker='o', zorder=10) + ax.plot(X_test, clf.predict(X_test), color="gray") + ax.scatter(this_X, y_train, s=3, c="gray", marker="o", zorder=10) clf.fit(X_train, y_train) - ax.plot(X_test, clf.predict(X_test), linewidth=2, color='blue') - ax.scatter(X_train, y_train, s=30, c='red', marker='+', zorder=10) + ax.plot(X_test, clf.predict(X_test), linewidth=2, color="blue") + ax.scatter(X_train, y_train, s=30, c="red", marker="+", zorder=10) ax.set_title(name) ax.set_xlim(0, 2) ax.set_ylim((0, 1.6)) - ax.set_xlabel('X') - ax.set_ylabel('y') + ax.set_xlabel("X") + ax.set_ylabel("y") fig.tight_layout() diff --git a/examples/linear_model/plot_omp.py b/examples/linear_model/plot_omp.py index 6052942fe9f48..2431ca10f5fb7 100644 --- a/examples/linear_model/plot_omp.py +++ b/examples/linear_model/plot_omp.py @@ -22,13 +22,15 @@ # y = Xw # |x|_0 = n_nonzero_coefs -y, X, w = make_sparse_coded_signal(n_samples=1, - n_components=n_components, - n_features=n_features, - n_nonzero_coefs=n_nonzero_coefs, - random_state=0) +y, X, w = make_sparse_coded_signal( + n_samples=1, + n_components=n_components, + n_features=n_features, + n_nonzero_coefs=n_nonzero_coefs, + random_state=0, +) -idx, = w.nonzero() +(idx,) = w.nonzero() # distort the clean signal y_noisy = y + 0.05 * np.random.randn(len(y)) @@ -41,11 +43,10 @@ plt.stem(idx, w[idx], use_line_collection=True) # plot the noise-free reconstruction -omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_nonzero_coefs, - normalize=False) +omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_nonzero_coefs, normalize=False) omp.fit(X, y) coef = omp.coef_ -idx_r, = coef.nonzero() +(idx_r,) = coef.nonzero() plt.subplot(4, 1, 2) plt.xlim(0, 512) plt.title("Recovered signal from noise-free measurements") @@ -54,7 +55,7 @@ # plot the noisy reconstruction omp.fit(X, y_noisy) coef = omp.coef_ -idx_r, = coef.nonzero() +(idx_r,) 
= coef.nonzero() plt.subplot(4, 1, 3) plt.xlim(0, 512) plt.title("Recovered signal from noisy measurements") @@ -64,13 +65,12 @@ omp_cv = OrthogonalMatchingPursuitCV(normalize=False) omp_cv.fit(X, y_noisy) coef = omp_cv.coef_ -idx_r, = coef.nonzero() +(idx_r,) = coef.nonzero() plt.subplot(4, 1, 4) plt.xlim(0, 512) plt.title("Recovered signal from noisy measurements with CV") plt.stem(idx_r, coef[idx_r], use_line_collection=True) plt.subplots_adjust(0.06, 0.04, 0.94, 0.90, 0.20, 0.38) -plt.suptitle('Sparse signal recovery with Orthogonal Matching Pursuit', - fontsize=16) +plt.suptitle("Sparse signal recovery with Orthogonal Matching Pursuit", fontsize=16) plt.show() diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 570baee9e1f67..7c9abd261937d 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -70,12 +70,15 @@ df["Frequency"] = df["ClaimNb"] / df["Exposure"] -print("Average Frequency = {}" - .format(np.average(df["Frequency"], weights=df["Exposure"]))) +print( + "Average Frequency = {}".format(np.average(df["Frequency"], weights=df["Exposure"])) +) -print("Fraction of exposure with zero claims = {0:.1%}" - .format(df.loc[df["ClaimNb"] == 0, "Exposure"].sum() / - df["Exposure"].sum())) +print( + "Fraction of exposure with zero claims = {0:.1%}".format( + df.loc[df["ClaimNb"] == 0, "Exposure"].sum() / df["Exposure"].sum() + ) +) fig, (ax0, ax1, ax2) = plt.subplots(ncols=3, figsize=(16, 4)) ax0.set_title("Number of claims") @@ -100,20 +103,19 @@ log_scale_transformer = make_pipeline( - FunctionTransformer(np.log, validate=False), - StandardScaler() + FunctionTransformer(np.log, validate=False), StandardScaler() ) linear_model_preprocessor = ColumnTransformer( [ - ("passthrough_numeric", "passthrough", - ["BonusMalus"]), - ("binned_numeric", KBinsDiscretizer(n_bins=10), - 
["VehAge", "DrivAge"]), - ("log_scaled_numeric", log_scale_transformer, - ["Density"]), - ("onehot_categorical", OneHotEncoder(), - ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), + ("passthrough_numeric", "passthrough", ["BonusMalus"]), + ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ("log_scaled_numeric", log_scale_transformer, ["Density"]), + ( + "onehot_categorical", + OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"], + ), ], remainder="drop", ) @@ -137,11 +139,12 @@ df_train, df_test = train_test_split(df, test_size=0.33, random_state=0) -dummy = Pipeline([ - ("preprocessor", linear_model_preprocessor), - ("regressor", DummyRegressor(strategy='mean')), -]).fit(df_train, df_train["Frequency"], - regressor__sample_weight=df_train["Exposure"]) +dummy = Pipeline( + [ + ("preprocessor", linear_model_preprocessor), + ("regressor", DummyRegressor(strategy="mean")), + ] +).fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"]) ############################################################################## @@ -157,26 +160,38 @@ def score_estimator(estimator, df_test): """Score an estimator on the test set.""" y_pred = estimator.predict(df_test) - print("MSE: %.3f" % - mean_squared_error(df_test["Frequency"], y_pred, - sample_weight=df_test["Exposure"])) - print("MAE: %.3f" % - mean_absolute_error(df_test["Frequency"], y_pred, - sample_weight=df_test["Exposure"])) + print( + "MSE: %.3f" + % mean_squared_error( + df_test["Frequency"], y_pred, sample_weight=df_test["Exposure"] + ) + ) + print( + "MAE: %.3f" + % mean_absolute_error( + df_test["Frequency"], y_pred, sample_weight=df_test["Exposure"] + ) + ) # Ignore non-positive predictions, as they are invalid for # the Poisson deviance. 
mask = y_pred > 0 if (~mask).any(): n_masked, n_samples = (~mask).sum(), mask.shape[0] - print(f"WARNING: Estimator yields invalid, non-positive predictions " - f" for {n_masked} samples out of {n_samples}. These predictions " - f"are ignored when computing the Poisson deviance.") + print( + "WARNING: Estimator yields invalid, non-positive predictions " + f" for {n_masked} samples out of {n_samples}. These predictions " + "are ignored when computing the Poisson deviance." + ) - print("mean Poisson deviance: %.3f" % - mean_poisson_deviance(df_test["Frequency"][mask], - y_pred[mask], - sample_weight=df_test["Exposure"][mask])) + print( + "mean Poisson deviance: %.3f" + % mean_poisson_deviance( + df_test["Frequency"][mask], + y_pred[mask], + sample_weight=df_test["Exposure"][mask], + ) + ) print("Constant mean frequency evaluation:") @@ -194,11 +209,12 @@ def score_estimator(estimator, df_test): from sklearn.linear_model import Ridge -ridge_glm = Pipeline([ - ("preprocessor", linear_model_preprocessor), - ("regressor", Ridge(alpha=1e-6)), -]).fit(df_train, df_train["Frequency"], - regressor__sample_weight=df_train["Exposure"]) +ridge_glm = Pipeline( + [ + ("preprocessor", linear_model_preprocessor), + ("regressor", Ridge(alpha=1e-6)), + ] +).fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"]) # %% # The Poisson deviance cannot be computed on non-positive values predicted by @@ -227,12 +243,15 @@ def score_estimator(estimator, df_test): n_samples = df_train.shape[0] -poisson_glm = Pipeline([ - ("preprocessor", linear_model_preprocessor), - ("regressor", PoissonRegressor(alpha=1e-12, max_iter=300)) -]) -poisson_glm.fit(df_train, df_train["Frequency"], - regressor__sample_weight=df_train["Exposure"]) +poisson_glm = Pipeline( + [ + ("preprocessor", linear_model_preprocessor), + ("regressor", PoissonRegressor(alpha=1e-12, max_iter=300)), + ] +) +poisson_glm.fit( + df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"] 
+) print("PoissonRegressor evaluation:") score_estimator(poisson_glm, df_test) @@ -264,20 +283,27 @@ def score_estimator(estimator, df_test): tree_preprocessor = ColumnTransformer( [ - ("categorical", OrdinalEncoder(), - ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), - ("numeric", "passthrough", - ["VehAge", "DrivAge", "BonusMalus", "Density"]), + ( + "categorical", + OrdinalEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"], + ), + ("numeric", "passthrough", ["VehAge", "DrivAge", "BonusMalus", "Density"]), ], remainder="drop", ) -poisson_gbrt = Pipeline([ - ("preprocessor", tree_preprocessor), - ("regressor", HistGradientBoostingRegressor(loss="poisson", - max_leaf_nodes=128)), -]) -poisson_gbrt.fit(df_train, df_train["Frequency"], - regressor__sample_weight=df_train["Exposure"]) +poisson_gbrt = Pipeline( + [ + ("preprocessor", tree_preprocessor), + ( + "regressor", + HistGradientBoostingRegressor(loss="poisson", max_leaf_nodes=128), + ), + ] +) +poisson_gbrt.fit( + df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"] +) print("Poisson Gradient Boosted Trees evaluation:") score_estimator(poisson_gbrt, df_test) @@ -298,14 +324,11 @@ def score_estimator(estimator, df_test): fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(16, 6), sharey=True) fig.subplots_adjust(bottom=0.2) n_bins = 20 -for row_idx, label, df in zip(range(2), - ["train", "test"], - [df_train, df_test]): - df["Frequency"].hist(bins=np.linspace(-1, 30, n_bins), - ax=axes[row_idx, 0]) +for row_idx, label, df in zip(range(2), ["train", "test"], [df_train, df_test]): + df["Frequency"].hist(bins=np.linspace(-1, 30, n_bins), ax=axes[row_idx, 0]) axes[row_idx, 0].set_title("Data") - axes[row_idx, 0].set_yscale('log') + axes[row_idx, 0].set_yscale("log") axes[row_idx, 0].set_xlabel("y (observed Frequency)") axes[row_idx, 0].set_ylim([1e1, 5e5]) axes[row_idx, 0].set_ylabel(label + " samples") @@ -313,12 +336,13 @@ def score_estimator(estimator, df_test): for 
idx, model in enumerate([ridge_glm, poisson_glm, poisson_gbrt]): y_pred = model.predict(df) - pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), - ax=axes[row_idx, idx+1]) + pd.Series(y_pred).hist( + bins=np.linspace(-1, 4, n_bins), ax=axes[row_idx, idx + 1] + ) axes[row_idx, idx + 1].set( title=model[-1].__class__.__name__, - yscale='log', - xlabel="y_pred (predicted expected Frequency)" + yscale="log", + xlabel="y_pred (predicted expected Frequency)", ) plt.tight_layout() @@ -361,8 +385,7 @@ def score_estimator(estimator, df_test): from sklearn.utils import gen_even_slices -def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, - n_bins=100): +def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100): """Compare predictions and observations for bins ordered by y_pred. We order the samples by ``y_pred`` and split it in bins. @@ -389,19 +412,14 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, average y_pred for each bin """ idx_sort = np.argsort(y_pred) - bin_centers = np.arange(0, 1, 1/n_bins) + 0.5/n_bins + bin_centers = np.arange(0, 1, 1 / n_bins) + 0.5 / n_bins y_pred_bin = np.zeros(n_bins) y_true_bin = np.zeros(n_bins) for n, sl in enumerate(gen_even_slices(len(y_true), n_bins)): weights = sample_weight[idx_sort][sl] - y_pred_bin[n] = np.average( - y_pred[idx_sort][sl], weights=weights - ) - y_true_bin[n] = np.average( - y_true[idx_sort][sl], - weights=weights - ) + y_pred_bin[n] = np.average(y_pred[idx_sort][sl], weights=weights) + y_true_bin[n] = np.average(y_true[idx_sort][sl], weights=weights) return bin_centers, y_true_bin, y_pred_bin @@ -409,27 +427,26 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 8)) plt.subplots_adjust(wspace=0.3) -for axi, model in zip(ax.ravel(), [ridge_glm, poisson_glm, poisson_gbrt, - dummy]): +for axi, model in zip(ax.ravel(), [ridge_glm, poisson_glm, poisson_gbrt, dummy]): y_pred = 
model.predict(df_test) y_true = df_test["Frequency"].values exposure = df_test["Exposure"].values q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( - y_true, y_pred, sample_weight=exposure, n_bins=10) + y_true, y_pred, sample_weight=exposure, n_bins=10 + ) # Name of the model after the estimator used in the last step of the # pipeline. - print(f"Predicted number of claims by {model[-1]}: " - f"{np.sum(y_pred * exposure):.1f}") + print(f"Predicted number of claims by {model[-1]}: {np.sum(y_pred * exposure):.1f}") - axi.plot(q, y_pred_seg, marker='x', linestyle="--", label="predictions") - axi.plot(q, y_true_seg, marker='o', linestyle="--", label="observations") + axi.plot(q, y_pred_seg, marker="x", linestyle="--", label="predictions") + axi.plot(q, y_true_seg, marker="o", linestyle="--", label="observations") axi.set_xlim(0, 1.0) axi.set_ylim(0, 0.5) axi.set( title=model[-1], - xlabel='Fraction of samples sorted by y_pred', - ylabel='Mean Frequency (y_pred)' + xlabel="Fraction of samples sorted by y_pred", + ylabel="Mean Frequency (y_pred)", ) axi.legend() plt.tight_layout() @@ -489,27 +506,27 @@ def lorenz_curve(y_true, y_pred, exposure): for model in [dummy, ridge_glm, poisson_glm, poisson_gbrt]: y_pred = model.predict(df_test) - cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"], y_pred, - df_test["Exposure"]) + cum_exposure, cum_claims = lorenz_curve( + df_test["Frequency"], y_pred, df_test["Exposure"] + ) gini = 1 - 2 * auc(cum_exposure, cum_claims) label = "{} (Gini: {:.2f})".format(model[-1], gini) ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) # Oracle model: y_pred == y_test -cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"], - df_test["Frequency"], - df_test["Exposure"]) +cum_exposure, cum_claims = lorenz_curve( + df_test["Frequency"], df_test["Frequency"], df_test["Exposure"] +) gini = 1 - 2 * auc(cum_exposure, cum_claims) label = "Oracle (Gini: {:.2f})".format(gini) ax.plot(cum_exposure, cum_claims, 
linestyle="-.", color="gray", label=label) # Random Baseline -ax.plot([0, 1], [0, 1], linestyle="--", color="black", - label="Random baseline") +ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline") ax.set( title="Lorenz curves by model", - xlabel='Cumulative proportion of exposure (from safest to riskiest)', - ylabel='Cumulative proportion of claims' + xlabel="Cumulative proportion of exposure (from safest to riskiest)", + ylabel="Cumulative proportion of claims", ) ax.legend(loc="upper left") diff --git a/examples/linear_model/plot_polynomial_interpolation.py b/examples/linear_model/plot_polynomial_interpolation.py index c6cd1f9d591bd..db1666168c88e 100644 --- a/examples/linear_model/plot_polynomial_interpolation.py +++ b/examples/linear_model/plot_polynomial_interpolation.py @@ -54,6 +54,7 @@ # We start by defining a function that we intent to approximate and prepare # plotting it. + def f(x): """Function to be approximated by polynomial interpolation.""" return x * np.sin(x) @@ -81,9 +82,9 @@ def f(x): # plot function lw = 2 fig, ax = plt.subplots() -ax.set_prop_cycle(color=[ - "black", "teal", "yellowgreen", "gold", "darkorange", "tomato" -]) +ax.set_prop_cycle( + color=["black", "teal", "yellowgreen", "gold", "darkorange", "tomato"] +) ax.plot(x_plot, f(x_plot), linewidth=lw, label="ground truth") # plot training points @@ -97,13 +98,12 @@ def f(x): ax.plot(x_plot, y_plot, label=f"degree {degree}") # B-spline with 4 + 3 - 1 = 6 basis functions -model = make_pipeline(SplineTransformer(n_knots=4, degree=3), - Ridge(alpha=1e-3)) +model = make_pipeline(SplineTransformer(n_knots=4, degree=3), Ridge(alpha=1e-3)) model.fit(X_train, y_train) y_plot = model.predict(X_plot) ax.plot(x_plot, y_plot, label="B-spline") -ax.legend(loc='lower center') +ax.legend(loc="lower center") ax.set_ylim(-20, 10) plt.show() @@ -133,7 +133,7 @@ def f(x): # plot knots of spline knots = splt.bsplines_[0].t -axes[1].vlines(knots[3:-3], ymin=0, ymax=0.8, 
linestyles='dashed') +axes[1].vlines(knots[3:-3], ymin=0, ymax=0.8, linestyles="dashed") plt.show() # %% @@ -187,12 +187,15 @@ def g(x): ax.scatter(x_train, y_train, label="training points") for transformer, label in [ - (SplineTransformer(degree=3, n_knots=10), "spline"), - (SplineTransformer( - degree=3, - knots=np.linspace(0, 2 * np.pi, 10)[:, None], - extrapolation="periodic" - ), "periodic spline") + (SplineTransformer(degree=3, n_knots=10), "spline"), + ( + SplineTransformer( + degree=3, + knots=np.linspace(0, 2 * np.pi, 10)[:, None], + extrapolation="periodic", + ), + "periodic spline", + ), ]: model = make_pipeline(transformer, Ridge(alpha=1e-3)) model.fit(X_train, y_train) @@ -205,11 +208,9 @@ def g(x): # %% We again plot the underlying splines. fig, ax = plt.subplots() knots = np.linspace(0, 2 * np.pi, 4) -splt = SplineTransformer( - knots=knots[:, None], - degree=3, - extrapolation="periodic" -).fit(X_train) +splt = SplineTransformer(knots=knots[:, None], degree=3, extrapolation="periodic").fit( + X_train +) ax.plot(x_plot_ext, splt.transform(X_plot_ext)) ax.legend(ax.lines, [f"spline {n}" for n in range(3)]) plt.show() diff --git a/examples/linear_model/plot_quantile_regression.py b/examples/linear_model/plot_quantile_regression.py index e1f1d484bf6b5..9fae4720499af 100644 --- a/examples/linear_model/plot_quantile_regression.py +++ b/examples/linear_model/plot_quantile_regression.py @@ -41,9 +41,7 @@ # # - in the first case, a heteroscedastic Normal noise is added; # - in the second case, an asymmetric Pareto noise is added. -y_normal = y_true_mean + rng.normal( - loc=0, scale=0.5 + 0.5 * x, size=x.shape[0] -) +y_normal = y_true_mean + rng.normal(loc=0, scale=0.5 + 0.5 * x, size=x.shape[0]) a = 5 y_pareto = y_true_mean + 10 * (rng.pareto(a, size=x.shape[0]) - 1 / (a - 1)) @@ -52,21 +50,15 @@ # residuals `y - mean(y)`. 
import matplotlib.pyplot as plt -_, axs = plt.subplots( - nrows=2, ncols=2, figsize=(15, 11), sharex="row", sharey="row" -) +_, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 11), sharex="row", sharey="row") axs[0, 0].plot(x, y_true_mean, label="True mean") -axs[0, 0].scatter( - x, y_normal, color="black", alpha=0.5, label="Observations" -) +axs[0, 0].scatter(x, y_normal, color="black", alpha=0.5, label="Observations") axs[1, 0].hist(y_true_mean - y_normal, edgecolor="black") axs[0, 1].plot(x, y_true_mean, label="True mean") -axs[0, 1].scatter( - x, y_pareto, color="black", alpha=0.5, label="Observations" -) +axs[0, 1].scatter(x, y_pareto, color="black", alpha=0.5, label="Observations") axs[1, 1].hist(y_true_mean - y_pareto, edgecolor="black") axs[0, 0].set_title("Dataset with heteroscedastic Normal distributed targets") @@ -74,9 +66,7 @@ axs[1, 0].set_title( "Residuals distribution for heteroscedastic Normal distributed targets" ) -axs[1, 1].set_title( - "Residuals distribution for asymmetric Pareto distributed target" -) +axs[1, 1].set_title("Residuals distribution for asymmetric Pareto distributed target") axs[0, 0].legend() axs[0, 1].legend() axs[0, 0].set_ylabel("y") diff --git a/examples/linear_model/plot_ransac.py b/examples/linear_model/plot_ransac.py index 0bafe4ee4a394..0c4070daf2fe9 100644 --- a/examples/linear_model/plot_ransac.py +++ b/examples/linear_model/plot_ransac.py @@ -17,9 +17,14 @@ n_outliers = 50 -X, y, coef = datasets.make_regression(n_samples=n_samples, n_features=1, - n_informative=1, noise=10, - coef=True, random_state=0) +X, y, coef = datasets.make_regression( + n_samples=n_samples, + n_features=1, + n_informative=1, + noise=10, + coef=True, + random_state=0, +) # Add outlier data np.random.seed(0) @@ -46,14 +51,21 @@ print(coef, lr.coef_, ransac.estimator_.coef_) lw = 2 -plt.scatter(X[inlier_mask], y[inlier_mask], color='yellowgreen', marker='.', - label='Inliers') -plt.scatter(X[outlier_mask], y[outlier_mask], color='gold', 
marker='.', - label='Outliers') -plt.plot(line_X, line_y, color='navy', linewidth=lw, label='Linear regressor') -plt.plot(line_X, line_y_ransac, color='cornflowerblue', linewidth=lw, - label='RANSAC regressor') -plt.legend(loc='lower right') +plt.scatter( + X[inlier_mask], y[inlier_mask], color="yellowgreen", marker=".", label="Inliers" +) +plt.scatter( + X[outlier_mask], y[outlier_mask], color="gold", marker=".", label="Outliers" +) +plt.plot(line_X, line_y, color="navy", linewidth=lw, label="Linear regressor") +plt.plot( + line_X, + line_y_ransac, + color="cornflowerblue", + linewidth=lw, + label="RANSAC regressor", +) +plt.legend(loc="lower right") plt.xlabel("Input") plt.ylabel("Response") plt.show() diff --git a/examples/linear_model/plot_ridge_coeffs.py b/examples/linear_model/plot_ridge_coeffs.py index 3f3e574708d48..0dd395baf5fcd 100644 --- a/examples/linear_model/plot_ridge_coeffs.py +++ b/examples/linear_model/plot_ridge_coeffs.py @@ -50,8 +50,9 @@ clf = Ridge() -X, y, w = make_regression(n_samples=10, n_features=10, coef=True, - random_state=1, bias=3.5) +X, y, w = make_regression( + n_samples=10, n_features=10, coef=True, random_state=1, bias=3.5 +) coefs = [] errors = [] @@ -71,19 +72,19 @@ plt.subplot(121) ax = plt.gca() ax.plot(alphas, coefs) -ax.set_xscale('log') -plt.xlabel('alpha') -plt.ylabel('weights') -plt.title('Ridge coefficients as a function of the regularization') -plt.axis('tight') +ax.set_xscale("log") +plt.xlabel("alpha") +plt.ylabel("weights") +plt.title("Ridge coefficients as a function of the regularization") +plt.axis("tight") plt.subplot(122) ax = plt.gca() ax.plot(alphas, errors) -ax.set_xscale('log') -plt.xlabel('alpha') -plt.ylabel('error') -plt.title('Coefficient error as a function of the regularization') -plt.axis('tight') +ax.set_xscale("log") +plt.xlabel("alpha") +plt.ylabel("error") +plt.title("Coefficient error as a function of the regularization") +plt.axis("tight") plt.show() diff --git 
a/examples/linear_model/plot_ridge_path.py b/examples/linear_model/plot_ridge_path.py index b16212cbd3718..14fac0b5bdc92 100644 --- a/examples/linear_model/plot_ridge_path.py +++ b/examples/linear_model/plot_ridge_path.py @@ -36,7 +36,7 @@ from sklearn import linear_model # X is the 10x10 Hilbert matrix -X = 1. / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis]) +X = 1.0 / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis]) y = np.ones(10) # ############################################################################# @@ -57,10 +57,10 @@ ax = plt.gca() ax.plot(alphas, coefs) -ax.set_xscale('log') +ax.set_xscale("log") ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis -plt.xlabel('alpha') -plt.ylabel('weights') -plt.title('Ridge coefficients as a function of the regularization') -plt.axis('tight') +plt.xlabel("alpha") +plt.ylabel("weights") +plt.title("Ridge coefficients as a function of the regularization") +plt.axis("tight") plt.show() diff --git a/examples/linear_model/plot_robust_fit.py b/examples/linear_model/plot_robust_fit.py index 88fc05a695839..c9fe49fc0d416 100644 --- a/examples/linear_model/plot_robust_fit.py +++ b/examples/linear_model/plot_robust_fit.py @@ -34,7 +34,11 @@ import numpy as np from sklearn.linear_model import ( - LinearRegression, TheilSenRegressor, RANSACRegressor, HuberRegressor) + LinearRegression, + TheilSenRegressor, + RANSACRegressor, + HuberRegressor, +) from sklearn.metrics import mean_squared_error from sklearn.preprocessing import PolynomialFeatures from sklearn.pipeline import make_pipeline @@ -62,36 +66,50 @@ X_errors_large = X.copy() X_errors_large[::3] = 10 -estimators = [('OLS', LinearRegression()), - ('Theil-Sen', TheilSenRegressor(random_state=42)), - ('RANSAC', RANSACRegressor(random_state=42)), - ('HuberRegressor', HuberRegressor())] -colors = {'OLS': 'turquoise', 'Theil-Sen': 'gold', - 'RANSAC': 'lightgreen', 'HuberRegressor': 'black'} -linestyle = {'OLS': '-', 'Theil-Sen': '-.', 'RANSAC': '--', 'HuberRegressor': 
'--'} +estimators = [ + ("OLS", LinearRegression()), + ("Theil-Sen", TheilSenRegressor(random_state=42)), + ("RANSAC", RANSACRegressor(random_state=42)), + ("HuberRegressor", HuberRegressor()), +] +colors = { + "OLS": "turquoise", + "Theil-Sen": "gold", + "RANSAC": "lightgreen", + "HuberRegressor": "black", +} +linestyle = {"OLS": "-", "Theil-Sen": "-.", "RANSAC": "--", "HuberRegressor": "--"} lw = 3 x_plot = np.linspace(X.min(), X.max()) for title, this_X, this_y in [ - ('Modeling Errors Only', X, y), - ('Corrupt X, Small Deviants', X_errors, y), - ('Corrupt y, Small Deviants', X, y_errors), - ('Corrupt X, Large Deviants', X_errors_large, y), - ('Corrupt y, Large Deviants', X, y_errors_large)]: + ("Modeling Errors Only", X, y), + ("Corrupt X, Small Deviants", X_errors, y), + ("Corrupt y, Small Deviants", X, y_errors), + ("Corrupt X, Large Deviants", X_errors_large, y), + ("Corrupt y, Large Deviants", X, y_errors_large), +]: plt.figure(figsize=(5, 4)) - plt.plot(this_X[:, 0], this_y, 'b+') + plt.plot(this_X[:, 0], this_y, "b+") for name, estimator in estimators: model = make_pipeline(PolynomialFeatures(3), estimator) model.fit(this_X, this_y) mse = mean_squared_error(model.predict(X_test), y_test) y_plot = model.predict(x_plot[:, np.newaxis]) - plt.plot(x_plot, y_plot, color=colors[name], linestyle=linestyle[name], - linewidth=lw, label='%s: error = %.3f' % (name, mse)) - - legend_title = 'Error of Mean\nAbsolute Deviation\nto Non-corrupt Data' - legend = plt.legend(loc='upper right', frameon=False, title=legend_title, - prop=dict(size='x-small')) + plt.plot( + x_plot, + y_plot, + color=colors[name], + linestyle=linestyle[name], + linewidth=lw, + label="%s: error = %.3f" % (name, mse), + ) + + legend_title = "Error of Mean\nAbsolute Deviation\nto Non-corrupt Data" + legend = plt.legend( + loc="upper right", frameon=False, title=legend_title, prop=dict(size="x-small") + ) plt.xlim(-4, 10.2) plt.ylim(-2, 10.2) plt.title(title) diff --git 
a/examples/linear_model/plot_sgd_comparison.py b/examples/linear_model/plot_sgd_comparison.py index 346627c933d85..c75775cbe84f0 100644 --- a/examples/linear_model/plot_sgd_comparison.py +++ b/examples/linear_model/plot_sgd_comparison.py @@ -27,14 +27,18 @@ ("SGD", SGDClassifier(max_iter=100)), ("ASGD", SGDClassifier(average=True)), ("Perceptron", Perceptron()), - ("Passive-Aggressive I", PassiveAggressiveClassifier(loss='hinge', - C=1.0, tol=1e-4)), - ("Passive-Aggressive II", PassiveAggressiveClassifier(loss='squared_hinge', - C=1.0, tol=1e-4)), - ("SAG", LogisticRegression(solver='sag', tol=1e-1, C=1.e4 / X.shape[0])) + ( + "Passive-Aggressive I", + PassiveAggressiveClassifier(loss="hinge", C=1.0, tol=1e-4), + ), + ( + "Passive-Aggressive II", + PassiveAggressiveClassifier(loss="squared_hinge", C=1.0, tol=1e-4), + ), + ("SAG", LogisticRegression(solver="sag", tol=1e-1, C=1.0e4 / X.shape[0])), ] -xx = 1. - np.array(heldout) +xx = 1.0 - np.array(heldout) for name, clf in classifiers: print("training %s" % name) @@ -43,8 +47,9 @@ for i in heldout: yy_ = [] for r in range(rounds): - X_train, X_test, y_train, y_test = \ - train_test_split(X, y, test_size=i, random_state=rng) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=i, random_state=rng + ) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) yy_.append(1 - np.mean(y_pred == y_test)) diff --git a/examples/linear_model/plot_sgd_early_stopping.py b/examples/linear_model/plot_sgd_early_stopping.py index bd7461dfb5f92..baa63f770bb6c 100644 --- a/examples/linear_model/plot_sgd_early_stopping.py +++ b/examples/linear_model/plot_sgd_early_stopping.py @@ -55,10 +55,10 @@ print(__doc__) -def load_mnist(n_samples=None, class_0='0', class_1='8'): +def load_mnist(n_samples=None, class_0="0", class_1="8"): """Load MNIST, select two classes, shuffle and return only n_samples.""" # Load data from http://openml.org/d/554 - mnist = fetch_openml('mnist_784', version=1) + mnist = 
fetch_openml("mnist_784", version=1) # take only two classes for binary classification mask = np.logical_or(mnist.target == class_0, mnist.target == class_1) @@ -88,55 +88,58 @@ def fit_and_score(estimator, max_iter, X_train, X_test, y_train, y_test): # Define the estimators to compare estimator_dict = { - 'No stopping criterion': - linear_model.SGDClassifier(n_iter_no_change=3), - 'Training loss': - linear_model.SGDClassifier(early_stopping=False, n_iter_no_change=3, - tol=0.1), - 'Validation score': - linear_model.SGDClassifier(early_stopping=True, n_iter_no_change=3, - tol=0.0001, validation_fraction=0.2) + "No stopping criterion": linear_model.SGDClassifier(n_iter_no_change=3), + "Training loss": linear_model.SGDClassifier( + early_stopping=False, n_iter_no_change=3, tol=0.1 + ), + "Validation score": linear_model.SGDClassifier( + early_stopping=True, n_iter_no_change=3, tol=0.0001, validation_fraction=0.2 + ), } # Load the dataset X, y = load_mnist(n_samples=10000) -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, - random_state=0) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0) results = [] for estimator_name, estimator in estimator_dict.items(): - print(estimator_name + ': ', end='') + print(estimator_name + ": ", end="") for max_iter in range(1, 50): - print('.', end='') + print(".", end="") sys.stdout.flush() fit_time, n_iter, train_score, test_score = fit_and_score( - estimator, max_iter, X_train, X_test, y_train, y_test) + estimator, max_iter, X_train, X_test, y_train, y_test + ) - results.append((estimator_name, max_iter, fit_time, n_iter, - train_score, test_score)) - print('') + results.append( + (estimator_name, max_iter, fit_time, n_iter, train_score, test_score) + ) + print("") # Transform the results in a pandas dataframe for easy plotting columns = [ - 'Stopping criterion', 'max_iter', 'Fit time (sec)', 'n_iter_', - 'Train score', 'Test score' + "Stopping criterion", + "max_iter", 
+ "Fit time (sec)", + "n_iter_", + "Train score", + "Test score", ] results_df = pd.DataFrame(results, columns=columns) # Define what to plot (x_axis, y_axis) -lines = 'Stopping criterion' +lines = "Stopping criterion" plot_list = [ - ('max_iter', 'Train score'), - ('max_iter', 'Test score'), - ('max_iter', 'n_iter_'), - ('max_iter', 'Fit time (sec)'), + ("max_iter", "Train score"), + ("max_iter", "Test score"), + ("max_iter", "n_iter_"), + ("max_iter", "Fit time (sec)"), ] nrows = 2 -ncols = int(np.ceil(len(plot_list) / 2.)) -fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(6 * ncols, - 4 * nrows)) +ncols = int(np.ceil(len(plot_list) / 2.0)) +fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(6 * ncols, 4 * nrows)) axes[0, 0].get_shared_y_axes().join(axes[0, 0], axes[0, 1]) for ax, (x_axis, y_axis) in zip(axes.ravel(), plot_list): diff --git a/examples/linear_model/plot_sgd_iris.py b/examples/linear_model/plot_sgd_iris.py index 0dddf7475728d..367ec6a1f69af 100644 --- a/examples/linear_model/plot_sgd_iris.py +++ b/examples/linear_model/plot_sgd_iris.py @@ -36,15 +36,14 @@ std = X.std(axis=0) X = (X - mean) / std -h = .02 # step size in the mesh +h = 0.02 # step size in the mesh clf = SGDClassifier(alpha=0.001, max_iter=100).fit(X, y) # create a mesh to plot in x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 -xx, yy = np.meshgrid(np.arange(x_min, x_max, h), - np.arange(y_min, y_max, h)) +xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) # Plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, x_max]x[y_min, y_max]. 
@@ -52,15 +51,22 @@ # Put the result into a color plot Z = Z.reshape(xx.shape) cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired) -plt.axis('tight') +plt.axis("tight") # Plot also the training points for i, color in zip(clf.classes_, colors): idx = np.where(y == i) - plt.scatter(X[idx, 0], X[idx, 1], c=color, label=iris.target_names[i], - cmap=plt.cm.Paired, edgecolor='black', s=20) + plt.scatter( + X[idx, 0], + X[idx, 1], + c=color, + label=iris.target_names[i], + cmap=plt.cm.Paired, + edgecolor="black", + s=20, + ) plt.title("Decision surface of multi-class SGD") -plt.axis('tight') +plt.axis("tight") # Plot the three one-against-all classifiers xmin, xmax = plt.xlim() @@ -73,8 +79,7 @@ def plot_hyperplane(c, color): def line(x0): return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1] - plt.plot([xmin, xmax], [line(xmin), line(xmax)], - ls="--", color=color) + plt.plot([xmin, xmax], [line(xmin), line(xmax)], ls="--", color=color) for i, color in zip(clf.classes_, colors): diff --git a/examples/linear_model/plot_sgd_loss_functions.py b/examples/linear_model/plot_sgd_loss_functions.py index 4a7ad9ce9f0be..0cc4378ac9286 100644 --- a/examples/linear_model/plot_sgd_loss_functions.py +++ b/examples/linear_model/plot_sgd_loss_functions.py @@ -16,25 +16,32 @@ def modified_huber_loss(y_true, y_pred): z = y_pred * y_true loss = -4 * z loss[z >= -1] = (1 - z[z >= -1]) ** 2 - loss[z >= 1.] 
= 0 + loss[z >= 1.0] = 0 return loss xmin, xmax = -4, 4 xx = np.linspace(xmin, xmax, 100) lw = 2 -plt.plot([xmin, 0, 0, xmax], [1, 1, 0, 0], color='gold', lw=lw, - label="Zero-one loss") -plt.plot(xx, np.where(xx < 1, 1 - xx, 0), color='teal', lw=lw, - label="Hinge loss") -plt.plot(xx, -np.minimum(xx, 0), color='yellowgreen', lw=lw, - label="Perceptron loss") -plt.plot(xx, np.log2(1 + np.exp(-xx)), color='cornflowerblue', lw=lw, - label="Log loss") -plt.plot(xx, np.where(xx < 1, 1 - xx, 0) ** 2, color='orange', lw=lw, - label="Squared hinge loss") -plt.plot(xx, modified_huber_loss(xx, 1), color='darkorchid', lw=lw, - linestyle='--', label="Modified Huber loss") +plt.plot([xmin, 0, 0, xmax], [1, 1, 0, 0], color="gold", lw=lw, label="Zero-one loss") +plt.plot(xx, np.where(xx < 1, 1 - xx, 0), color="teal", lw=lw, label="Hinge loss") +plt.plot(xx, -np.minimum(xx, 0), color="yellowgreen", lw=lw, label="Perceptron loss") +plt.plot(xx, np.log2(1 + np.exp(-xx)), color="cornflowerblue", lw=lw, label="Log loss") +plt.plot( + xx, + np.where(xx < 1, 1 - xx, 0) ** 2, + color="orange", + lw=lw, + label="Squared hinge loss", +) +plt.plot( + xx, + modified_huber_loss(xx, 1), + color="darkorchid", + lw=lw, + linestyle="--", + label="Modified Huber loss", +) plt.ylim((0, 8)) plt.legend(loc="upper right") plt.xlabel(r"Decision function $f(x)$") diff --git a/examples/linear_model/plot_sgd_penalties.py b/examples/linear_model/plot_sgd_penalties.py index 0307fb0e8ed94..795be3a15c4dc 100644 --- a/examples/linear_model/plot_sgd_penalties.py +++ b/examples/linear_model/plot_sgd_penalties.py @@ -30,22 +30,26 @@ plt.figure(figsize=(10, 10), dpi=100) ax = plt.gca() -elastic_net_contour = plt.contour(xx, yy, elastic_net, levels=[1], - colors=elastic_net_color) +elastic_net_contour = plt.contour( + xx, yy, elastic_net, levels=[1], colors=elastic_net_color +) l2_contour = plt.contour(xx, yy, l2, levels=[1], colors=l2_color) l1_contour = plt.contour(xx, yy, l1, levels=[1], colors=l1_color) 
ax.set_aspect("equal") -ax.spines['left'].set_position('center') -ax.spines['right'].set_color('none') -ax.spines['bottom'].set_position('center') -ax.spines['top'].set_color('none') - -plt.clabel(elastic_net_contour, inline=1, fontsize=18, - fmt={1.0: 'elastic-net'}, manual=[(-1, -1)]) -plt.clabel(l2_contour, inline=1, fontsize=18, - fmt={1.0: 'L2'}, manual=[(-1, -1)]) -plt.clabel(l1_contour, inline=1, fontsize=18, - fmt={1.0: 'L1'}, manual=[(-1, -1)]) +ax.spines["left"].set_position("center") +ax.spines["right"].set_color("none") +ax.spines["bottom"].set_position("center") +ax.spines["top"].set_color("none") + +plt.clabel( + elastic_net_contour, + inline=1, + fontsize=18, + fmt={1.0: "elastic-net"}, + manual=[(-1, -1)], +) +plt.clabel(l2_contour, inline=1, fontsize=18, fmt={1.0: "L2"}, manual=[(-1, -1)]) +plt.clabel(l1_contour, inline=1, fontsize=18, fmt={1.0: "L1"}, manual=[(-1, -1)]) plt.tight_layout() plt.show() diff --git a/examples/linear_model/plot_sgd_separating_hyperplane.py b/examples/linear_model/plot_sgd_separating_hyperplane.py index e7263e4ecd347..b485c32fbd4d5 100644 --- a/examples/linear_model/plot_sgd_separating_hyperplane.py +++ b/examples/linear_model/plot_sgd_separating_hyperplane.py @@ -34,11 +34,10 @@ p = clf.decision_function([[x1, x2]]) Z[i, j] = p[0] levels = [-1.0, 0.0, 1.0] -linestyles = ['dashed', 'solid', 'dashed'] -colors = 'k' +linestyles = ["dashed", "solid", "dashed"] +colors = "k" plt.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles) -plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, - edgecolor='black', s=20) +plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolor="black", s=20) -plt.axis('tight') +plt.axis("tight") plt.show() diff --git a/examples/linear_model/plot_sgd_weighted_samples.py b/examples/linear_model/plot_sgd_weighted_samples.py index 3617d81b0a063..64156fe7096c1 100644 --- a/examples/linear_model/plot_sgd_weighted_samples.py +++ b/examples/linear_model/plot_sgd_weighted_samples.py @@ 
-23,25 +23,35 @@ # plot the weighted data points xx, yy = np.meshgrid(np.linspace(-4, 5, 500), np.linspace(-4, 5, 500)) plt.figure() -plt.scatter(X[:, 0], X[:, 1], c=y, s=sample_weight, alpha=0.9, - cmap=plt.cm.bone, edgecolor='black') +plt.scatter( + X[:, 0], + X[:, 1], + c=y, + s=sample_weight, + alpha=0.9, + cmap=plt.cm.bone, + edgecolor="black", +) # fit the unweighted model clf = linear_model.SGDClassifier(alpha=0.01, max_iter=100) clf.fit(X, y) Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) -no_weights = plt.contour(xx, yy, Z, levels=[0], linestyles=['solid']) +no_weights = plt.contour(xx, yy, Z, levels=[0], linestyles=["solid"]) # fit the weighted model clf = linear_model.SGDClassifier(alpha=0.01, max_iter=100) clf.fit(X, y, sample_weight=sample_weight) Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) -samples_weights = plt.contour(xx, yy, Z, levels=[0], linestyles=['dashed']) +samples_weights = plt.contour(xx, yy, Z, levels=[0], linestyles=["dashed"]) -plt.legend([no_weights.collections[0], samples_weights.collections[0]], - ["no weights", "with weights"], loc="lower left") +plt.legend( + [no_weights.collections[0], samples_weights.collections[0]], + ["no weights", "with weights"], + loc="lower left", +) plt.xticks(()) plt.yticks(()) diff --git a/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py b/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py index e70694cdb1c1b..2252ad1fc98e9 100644 --- a/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py +++ b/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py @@ -27,10 +27,9 @@ from sklearn.kernel_approximation import Nystroem from sklearn.pipeline import make_pipeline -font = {'weight': 'normal', - 'size': 15} +font = {"weight": "normal", "size": 15} -matplotlib.rc('font', **font) +matplotlib.rc("font", **font) random_state = 42 rng = np.random.RandomState(random_state) @@ -48,10 +47,10 @@ # OCSVM hyperparameters nu = 0.05 -gamma = 2. 
+gamma = 2.0 # Fit the One-Class SVM -clf = OneClassSVM(gamma=gamma, kernel='rbf', nu=nu) +clf = OneClassSVM(gamma=gamma, kernel="rbf", nu=nu) clf.fit(X_train) y_pred_train = clf.predict(X_train) y_pred_test = clf.predict(X_test) @@ -66,8 +65,9 @@ # Fit the One-Class SVM using a kernel approximation and SGD transform = Nystroem(gamma=gamma, random_state=random_state) -clf_sgd = SGDOneClassSVM(nu=nu, shuffle=True, fit_intercept=True, - random_state=random_state, tol=1e-4) +clf_sgd = SGDOneClassSVM( + nu=nu, shuffle=True, fit_intercept=True, random_state=random_state, tol=1e-4 +) pipe_sgd = make_pipeline(transform, clf_sgd) pipe_sgd.fit(X_train) y_pred_train_sgd = pipe_sgd.predict(X_train) @@ -82,54 +82,73 @@ # plot the level sets of the decision function plt.figure(figsize=(9, 6)) -plt.title('One Class SVM') +plt.title("One Class SVM") plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu) -a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred') -plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred') +a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors="darkred") +plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors="palevioletred") s = 20 -b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k') -b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s, - edgecolors='k') -c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s, - edgecolors='k') -plt.axis('tight') +b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c="white", s=s, edgecolors="k") +b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c="blueviolet", s=s, edgecolors="k") +c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c="gold", s=s, edgecolors="k") +plt.axis("tight") plt.xlim((-4.5, 4.5)) plt.ylim((-4.5, 4.5)) -plt.legend([a.collections[0], b1, b2, c], - ["learned frontier", "training observations", - "new regular observations", "new abnormal observations"], - loc="upper left") +plt.legend( + [a.collections[0], 
b1, b2, c], + [ + "learned frontier", + "training observations", + "new regular observations", + "new abnormal observations", + ], + loc="upper left", +) plt.xlabel( - "error train: %d/%d; errors novel regular: %d/%d; " - "errors novel abnormal: %d/%d" - % (n_error_train, X_train.shape[0], n_error_test, X_test.shape[0], - n_error_outliers, X_outliers.shape[0])) + "error train: %d/%d; errors novel regular: %d/%d; errors novel abnormal: %d/%d" + % ( + n_error_train, + X_train.shape[0], + n_error_test, + X_test.shape[0], + n_error_outliers, + X_outliers.shape[0], + ) +) plt.show() plt.figure(figsize=(9, 6)) -plt.title('Online One-Class SVM') -plt.contourf(xx, yy, Z_sgd, levels=np.linspace(Z_sgd.min(), 0, 7), - cmap=plt.cm.PuBu) -a = plt.contour(xx, yy, Z_sgd, levels=[0], linewidths=2, colors='darkred') -plt.contourf(xx, yy, Z_sgd, levels=[0, Z_sgd.max()], colors='palevioletred') +plt.title("Online One-Class SVM") +plt.contourf(xx, yy, Z_sgd, levels=np.linspace(Z_sgd.min(), 0, 7), cmap=plt.cm.PuBu) +a = plt.contour(xx, yy, Z_sgd, levels=[0], linewidths=2, colors="darkred") +plt.contourf(xx, yy, Z_sgd, levels=[0, Z_sgd.max()], colors="palevioletred") s = 20 -b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k') -b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s, - edgecolors='k') -c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s, - edgecolors='k') -plt.axis('tight') +b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c="white", s=s, edgecolors="k") +b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c="blueviolet", s=s, edgecolors="k") +c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c="gold", s=s, edgecolors="k") +plt.axis("tight") plt.xlim((-4.5, 4.5)) plt.ylim((-4.5, 4.5)) -plt.legend([a.collections[0], b1, b2, c], - ["learned frontier", "training observations", - "new regular observations", "new abnormal observations"], - loc="upper left") +plt.legend( + [a.collections[0], b1, b2, c], + [ + "learned frontier", + 
"training observations", + "new regular observations", + "new abnormal observations", + ], + loc="upper left", +) plt.xlabel( - "error train: %d/%d; errors novel regular: %d/%d; " - "errors novel abnormal: %d/%d" - % (n_error_train_sgd, X_train.shape[0], n_error_test_sgd, X_test.shape[0], - n_error_outliers_sgd, X_outliers.shape[0])) + "error train: %d/%d; errors novel regular: %d/%d; errors novel abnormal: %d/%d" + % ( + n_error_train_sgd, + X_train.shape[0], + n_error_test_sgd, + X_test.shape[0], + n_error_outliers_sgd, + X_outliers.shape[0], + ) +) plt.show() diff --git a/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py b/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py index bab97be5acd8b..71de01bbf34a1 100644 --- a/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py +++ b/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py @@ -32,32 +32,34 @@ print(__doc__) # Author: Arthur Mensch -warnings.filterwarnings("ignore", category=ConvergenceWarning, - module="sklearn") +warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn") t0 = timeit.default_timer() # We use SAGA solver -solver = 'saga' +solver = "saga" # Turn down for faster run time n_samples = 10000 -X, y = fetch_20newsgroups_vectorized(subset='all', return_X_y=True) +X, y = fetch_20newsgroups_vectorized(subset="all", return_X_y=True) X = X[:n_samples] y = y[:n_samples] -X_train, X_test, y_train, y_test = train_test_split(X, y, - random_state=42, - stratify=y, - test_size=0.1) +X_train, X_test, y_train, y_test = train_test_split( + X, y, random_state=42, stratify=y, test_size=0.1 +) train_samples, n_features = X_train.shape n_classes = np.unique(y).shape[0] -print('Dataset 20newsgroup, train_samples=%i, n_features=%i, n_classes=%i' - % (train_samples, n_features, n_classes)) +print( + "Dataset 20newsgroup, train_samples=%i, n_features=%i, n_classes=%i" + % (train_samples, n_features, n_classes) +) -models = 
{'ovr': {'name': 'One versus Rest', 'iters': [1, 2, 4]}, - 'multinomial': {'name': 'Multinomial', 'iters': [1, 3, 7]}} +models = { + "ovr": {"name": "One versus Rest", "iters": [1, 2, 4]}, + "multinomial": {"name": "Multinomial", "iters": [1, 3, 7]}, +} for model in models: # Add initial chance-level values for plotting purpose @@ -68,15 +70,18 @@ model_params = models[model] # Small number of epochs for fast runtime - for this_max_iter in model_params['iters']: - print('[model=%s, solver=%s] Number of epochs: %s' % - (model_params['name'], solver, this_max_iter)) - lr = LogisticRegression(solver=solver, - multi_class=model, - penalty='l1', - max_iter=this_max_iter, - random_state=42, - ) + for this_max_iter in model_params["iters"]: + print( + "[model=%s, solver=%s] Number of epochs: %s" + % (model_params["name"], solver, this_max_iter) + ) + lr = LogisticRegression( + solver=solver, + multi_class=model, + penalty="l1", + max_iter=this_max_iter, + random_state=42, + ) t1 = timeit.default_timer() lr.fit(X_train, y_train) train_time = timeit.default_timer() - t1 @@ -87,31 +92,33 @@ accuracies.append(accuracy) densities.append(density) times.append(train_time) - models[model]['times'] = times - models[model]['densities'] = densities - models[model]['accuracies'] = accuracies - print('Test accuracy for model %s: %.4f' % (model, accuracies[-1])) - print('%% non-zero coefficients for model %s, ' - 'per class:\n %s' % (model, densities[-1])) - print('Run time (%i epochs) for model %s:' - '%.2f' % (model_params['iters'][-1], model, times[-1])) + models[model]["times"] = times + models[model]["densities"] = densities + models[model]["accuracies"] = accuracies + print("Test accuracy for model %s: %.4f" % (model, accuracies[-1])) + print( + "%% non-zero coefficients for model %s, per class:\n %s" + % (model, densities[-1]) + ) + print( + "Run time (%i epochs) for model %s:%.2f" + % (model_params["iters"][-1], model, times[-1]) + ) fig = plt.figure() ax = fig.add_subplot(111) 
for model in models: - name = models[model]['name'] - times = models[model]['times'] - accuracies = models[model]['accuracies'] - ax.plot(times, accuracies, marker='o', - label='Model: %s' % name) - ax.set_xlabel('Train time (s)') - ax.set_ylabel('Test accuracy') + name = models[model]["name"] + times = models[model]["times"] + accuracies = models[model]["accuracies"] + ax.plot(times, accuracies, marker="o", label="Model: %s" % name) + ax.set_xlabel("Train time (s)") + ax.set_ylabel("Test accuracy") ax.legend() -fig.suptitle('Multinomial vs One-vs-Rest Logistic L1\n' - 'Dataset %s' % '20newsgroups') +fig.suptitle("Multinomial vs One-vs-Rest Logistic L1\nDataset %s" % "20newsgroups") fig.tight_layout() fig.subplots_adjust(top=0.85) run_time = timeit.default_timer() - t0 -print('Example run in %.3f s' % run_time) +print("Example run in %.3f s" % run_time) plt.show() diff --git a/examples/linear_model/plot_sparse_logistic_regression_mnist.py b/examples/linear_model/plot_sparse_logistic_regression_mnist.py index 968d597a5cac7..27d428881216f 100644 --- a/examples/linear_model/plot_sparse_logistic_regression_mnist.py +++ b/examples/linear_model/plot_sparse_logistic_regression_mnist.py @@ -36,7 +36,7 @@ train_samples = 5000 # Load data from https://www.openml.org/d/554 -X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False) +X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False) random_state = check_random_state(0) permutation = random_state.permutation(X.shape[0]) @@ -45,16 +45,15 @@ X = X.reshape((X.shape[0], -1)) X_train, X_test, y_train, y_test = train_test_split( - X, y, train_size=train_samples, test_size=10000) + X, y, train_size=train_samples, test_size=10000 +) scaler = StandardScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test) # Turn up tolerance for faster convergence -clf = LogisticRegression( - C=50. 
/ train_samples, penalty='l1', solver='saga', tol=0.1 -) +clf = LogisticRegression(C=50.0 / train_samples, penalty="l1", solver="saga", tol=0.1) clf.fit(X_train, y_train) sparsity = np.mean(clf.coef_ == 0) * 100 score = clf.score(X_test, y_test) @@ -67,13 +66,18 @@ scale = np.abs(coef).max() for i in range(10): l1_plot = plt.subplot(2, 5, i + 1) - l1_plot.imshow(coef[i].reshape(28, 28), interpolation='nearest', - cmap=plt.cm.RdBu, vmin=-scale, vmax=scale) + l1_plot.imshow( + coef[i].reshape(28, 28), + interpolation="nearest", + cmap=plt.cm.RdBu, + vmin=-scale, + vmax=scale, + ) l1_plot.set_xticks(()) l1_plot.set_yticks(()) - l1_plot.set_xlabel('Class %i' % i) -plt.suptitle('Classification vector for...') + l1_plot.set_xlabel("Class %i" % i) +plt.suptitle("Classification vector for...") run_time = time.time() - t0 -print('Example run in %.3f s' % run_time) +print("Example run in %.3f s" % run_time) plt.show() diff --git a/examples/linear_model/plot_theilsen.py b/examples/linear_model/plot_theilsen.py index c80b4a409937b..54196188d864a 100644 --- a/examples/linear_model/plot_theilsen.py +++ b/examples/linear_model/plot_theilsen.py @@ -45,10 +45,12 @@ print(__doc__) -estimators = [('OLS', LinearRegression()), - ('Theil-Sen', TheilSenRegressor(random_state=42)), - ('RANSAC', RANSACRegressor(random_state=42)), ] -colors = {'OLS': 'turquoise', 'Theil-Sen': 'gold', 'RANSAC': 'lightgreen'} +estimators = [ + ("OLS", LinearRegression()), + ("Theil-Sen", TheilSenRegressor(random_state=42)), + ("RANSAC", RANSACRegressor(random_state=42)), +] +colors = {"OLS": "turquoise", "Theil-Sen": "gold", "RANSAC": "lightgreen"} lw = 2 # ############################################################################# @@ -58,26 +60,31 @@ n_samples = 200 # Linear model y = 3*x + N(2, 0.1**2) x = np.random.randn(n_samples) -w = 3. -c = 2. 
+w = 3.0 +c = 2.0 noise = 0.1 * np.random.randn(n_samples) y = w * x + c + noise # 10% outliers y[-20:] += -20 * x[-20:] X = x[:, np.newaxis] -plt.scatter(x, y, color='indigo', marker='x', s=40) +plt.scatter(x, y, color="indigo", marker="x", s=40) line_x = np.array([-3, 3]) for name, estimator in estimators: t0 = time.time() estimator.fit(X, y) elapsed_time = time.time() - t0 y_pred = estimator.predict(line_x.reshape(2, 1)) - plt.plot(line_x, y_pred, color=colors[name], linewidth=lw, - label='%s (fit time: %.2fs)' % (name, elapsed_time)) - -plt.axis('tight') -plt.legend(loc='upper left') + plt.plot( + line_x, + y_pred, + color=colors[name], + linewidth=lw, + label="%s (fit time: %.2fs)" % (name, elapsed_time), + ) + +plt.axis("tight") +plt.legend(loc="upper left") plt.title("Corrupt y") # ############################################################################# @@ -94,7 +101,7 @@ X = x[:, np.newaxis] plt.figure() -plt.scatter(x, y, color='indigo', marker='x', s=40) +plt.scatter(x, y, color="indigo", marker="x", s=40) line_x = np.array([-3, 10]) for name, estimator in estimators: @@ -102,10 +109,15 @@ estimator.fit(X, y) elapsed_time = time.time() - t0 y_pred = estimator.predict(line_x.reshape(2, 1)) - plt.plot(line_x, y_pred, color=colors[name], linewidth=lw, - label='%s (fit time: %.2fs)' % (name, elapsed_time)) - -plt.axis('tight') -plt.legend(loc='upper left') + plt.plot( + line_x, + y_pred, + color=colors[name], + linewidth=lw, + label="%s (fit time: %.2fs)" % (name, elapsed_time), + ) + +plt.axis("tight") +plt.legend(loc="upper left") plt.title("Corrupt x") plt.show() diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 8edf97d0738a9..1bcf26742d2f1 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -72,15 +72,15 @@ def load_mtpl2(n_samples=100000): 678013 
samples. """ # freMTPL2freq dataset from https://www.openml.org/d/41214 - df_freq = fetch_openml(data_id=41214, as_frame=True)['data'] - df_freq['IDpol'] = df_freq['IDpol'].astype(int) - df_freq.set_index('IDpol', inplace=True) + df_freq = fetch_openml(data_id=41214, as_frame=True)["data"] + df_freq["IDpol"] = df_freq["IDpol"].astype(int) + df_freq.set_index("IDpol", inplace=True) # freMTPL2sev dataset from https://www.openml.org/d/41215 - df_sev = fetch_openml(data_id=41215, as_frame=True)['data'] + df_sev = fetch_openml(data_id=41215, as_frame=True)["data"] # sum ClaimAmount over identical IDs - df_sev = df_sev.groupby('IDpol').sum() + df_sev = df_sev.groupby("IDpol").sum() df = df_freq.join(df_sev, how="left") df["ClaimAmount"].fillna(0, inplace=True) @@ -91,8 +91,17 @@ def load_mtpl2(n_samples=100000): return df.iloc[:n_samples] -def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, - title=None, ax=None, fill_legend=False): +def plot_obs_pred( + df, + feature, + weight, + observed, + predicted, + y_label=None, + title=None, + ax=None, + fill_legend=False, +): """Plot observed and predicted - aggregated per feature level. Parameters @@ -139,21 +148,30 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, def score_estimator( - estimator, X_train, X_test, df_train, df_test, target, weights, + estimator, + X_train, + X_test, + df_train, + df_test, + target, + weights, tweedie_powers=None, ): """Evaluate an estimator on train and test sets with different metrics""" metrics = [ - ("D² explained", None), # Use default scorer if it exists + ("D² explained", None), # Use default scorer if it exists ("mean abs. 
error", mean_absolute_error), ("mean squared error", mean_squared_error), ] if tweedie_powers: - metrics += [( - "mean Tweedie dev p={:.4f}".format(power), - partial(mean_tweedie_deviance, power=power) - ) for power in tweedie_powers] + metrics += [ + ( + "mean Tweedie dev p={:.4f}".format(power), + partial(mean_tweedie_deviance, power=power), + ) + for power in tweedie_powers + ] res = [] for subset_label, X, df in [ @@ -177,16 +195,14 @@ def score_estimator( else: score = metric(y, y_pred, sample_weight=_weights) - res.append( - {"subset": subset_label, "metric": score_label, "score": score} - ) + res.append({"subset": subset_label, "metric": score_label, "score": score}) res = ( pd.DataFrame(res) .set_index(["metric", "subset"]) .score.unstack(-1) .round(4) - .loc[:, ['train', 'test']] + .loc[:, ["train", "test"]] ) return res @@ -213,20 +229,19 @@ def score_estimator( df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000) log_scale_transformer = make_pipeline( - FunctionTransformer(func=np.log), - StandardScaler() + FunctionTransformer(func=np.log), StandardScaler() ) column_trans = ColumnTransformer( [ - ("binned_numeric", KBinsDiscretizer(n_bins=10), - ["VehAge", "DrivAge"]), - ("onehot_categorical", OneHotEncoder(), - ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), - ("passthrough_numeric", "passthrough", - ["BonusMalus"]), - ("log_scaled_numeric", log_scale_transformer, - ["Density"]), + ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ( + "onehot_categorical", + OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"], + ), + ("passthrough_numeric", "passthrough", ["BonusMalus"]), + ("log_scaled_numeric", log_scale_transformer, ["Density"]), ], remainder="drop", ) @@ -263,8 +278,7 @@ def score_estimator( # on the training set via a quasi-Newton solver: l-BFGS. Some of the features # are collinear, we use a weak penalization to avoid numerical issues. 
glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400) -glm_freq.fit(X_train, df_train["Frequency"], - sample_weight=df_train["Exposure"]) +glm_freq.fit(X_train, df_train["Frequency"], sample_weight=df_train["Exposure"]) scores = score_estimator( glm_freq, @@ -306,7 +320,7 @@ def score_estimator( y_label="Claim Frequency", title="test data", ax=ax[0, 1], - fill_legend=True + fill_legend=True, ) plot_obs_pred( @@ -318,7 +332,7 @@ def score_estimator( y_label="Claim Frequency", title="test data", ax=ax[1, 0], - fill_legend=True + fill_legend=True, ) plot_obs_pred( @@ -330,7 +344,7 @@ def score_estimator( y_label="Claim Frequency", title="test data", ax=ax[1, 1], - fill_legend=True + fill_legend=True, ) @@ -356,7 +370,7 @@ def score_estimator( mask_train = df_train["ClaimAmount"] > 0 mask_test = df_test["ClaimAmount"] > 0 -glm_sev = GammaRegressor(alpha=10., max_iter=10000) +glm_sev = GammaRegressor(alpha=10.0, max_iter=10000) glm_sev.fit( X_train[mask_train.values], @@ -385,12 +399,18 @@ def score_estimator( # such, it is conditional on having at least one claim, and cannot be used to # predict the average claim amount per policy in general. 
-print("Mean AvgClaim Amount per policy: %.2f " - % df_train["AvgClaimAmount"].mean()) -print("Mean AvgClaim Amount | NbClaim > 0: %.2f" - % df_train["AvgClaimAmount"][df_train["AvgClaimAmount"] > 0].mean()) -print("Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f" - % glm_sev.predict(X_train).mean()) +print( + "Mean AvgClaim Amount per policy: %.2f " + % df_train["AvgClaimAmount"].mean() +) +print( + "Mean AvgClaim Amount | NbClaim > 0: %.2f" + % df_train["AvgClaimAmount"][df_train["AvgClaimAmount"] > 0].mean() +) +print( + "Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f" + % glm_sev.predict(X_train).mean() +) # %% @@ -419,7 +439,7 @@ def score_estimator( y_label="Average Claim Severity", title="test data", ax=ax[1], - fill_legend=True + fill_legend=True, ) plt.tight_layout() @@ -455,9 +475,10 @@ def score_estimator( # Ideally, we hope that one model will be consistently better than the other, # regardless of `power`. -glm_pure_premium = TweedieRegressor(power=1.9, alpha=.1, max_iter=10000) -glm_pure_premium.fit(X_train, df_train["PurePremium"], - sample_weight=df_train["Exposure"]) +glm_pure_premium = TweedieRegressor(power=1.9, alpha=0.1, max_iter=10000) +glm_pure_premium.fit( + X_train, df_train["PurePremium"], sample_weight=df_train["Exposure"] +) tweedie_powers = [1.5, 1.7, 1.8, 1.9, 1.99, 1.999, 1.9999] @@ -480,15 +501,17 @@ def score_estimator( df_test, target="PurePremium", weights="Exposure", - tweedie_powers=tweedie_powers + tweedie_powers=tweedie_powers, ) -scores = pd.concat([scores_product_model, scores_glm_pure_premium], - axis=1, sort=True, - keys=('Product Model', 'TweedieRegressor')) -print("Evaluation of the Product Model and the Tweedie Regressor " - "on target PurePremium") -with pd.option_context('display.expand_frame_repr', False): +scores = pd.concat( + [scores_product_model, scores_glm_pure_premium], + axis=1, + sort=True, + keys=("Product Model", "TweedieRegressor"), +) +print("Evaluation of the Product Model and the Tweedie 
Regressor on target PurePremium") +with pd.option_context("display.expand_frame_repr", False): print(scores) # %% @@ -515,8 +538,7 @@ def score_estimator( exposure * glm_freq.predict(X) * glm_sev.predict(X) ), "predicted, tweedie, power=%.2f" - % glm_pure_premium.power: np.sum( - exposure * glm_pure_premium.predict(X)), + % glm_pure_premium.power: np.sum(exposure * glm_pure_premium.predict(X)), } ) @@ -567,30 +589,31 @@ def lorenz_curve(y_true, y_pred, exposure): y_pred_product = glm_freq.predict(X_test) * glm_sev.predict(X_test) y_pred_total = glm_pure_premium.predict(X_test) -for label, y_pred in [("Frequency * Severity model", y_pred_product), - ("Compound Poisson Gamma", y_pred_total)]: +for label, y_pred in [ + ("Frequency * Severity model", y_pred_product), + ("Compound Poisson Gamma", y_pred_total), +]: ordered_samples, cum_claims = lorenz_curve( - df_test["PurePremium"], y_pred, df_test["Exposure"]) + df_test["PurePremium"], y_pred, df_test["Exposure"] + ) gini = 1 - 2 * auc(ordered_samples, cum_claims) label += " (Gini index: {:.3f})".format(gini) ax.plot(ordered_samples, cum_claims, linestyle="-", label=label) # Oracle model: y_pred == y_test ordered_samples, cum_claims = lorenz_curve( - df_test["PurePremium"], df_test["PurePremium"], df_test["Exposure"]) + df_test["PurePremium"], df_test["PurePremium"], df_test["Exposure"] +) gini = 1 - 2 * auc(ordered_samples, cum_claims) label = "Oracle (Gini index: {:.3f})".format(gini) -ax.plot(ordered_samples, cum_claims, linestyle="-.", color="gray", - label=label) +ax.plot(ordered_samples, cum_claims, linestyle="-.", color="gray", label=label) # Random baseline -ax.plot([0, 1], [0, 1], linestyle="--", color="black", - label="Random baseline") +ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline") ax.set( title="Lorenz Curves", - xlabel=('Fraction of policyholders\n' - '(ordered by model from safest to riskiest)'), - ylabel='Fraction of total claim amount' + xlabel="Fraction of 
policyholders\n(ordered by model from safest to riskiest)", + ylabel="Fraction of total claim amount", ) ax.legend(loc="upper left") plt.plot() diff --git a/examples/manifold/plot_compare_methods.py b/examples/manifold/plot_compare_methods.py index c78ecc234186a..a8485f07bf150 100644 --- a/examples/manifold/plot_compare_methods.py +++ b/examples/manifold/plot_compare_methods.py @@ -43,31 +43,34 @@ # Create figure fig = plt.figure(figsize=(15, 8)) -fig.suptitle("Manifold Learning with %i points, %i neighbors" - % (1000, n_neighbors), fontsize=14) +fig.suptitle( + "Manifold Learning with %i points, %i neighbors" % (1000, n_neighbors), fontsize=14 +) # Add 3d scatter plot -ax = fig.add_subplot(251, projection='3d') +ax = fig.add_subplot(251, projection="3d") ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral) ax.view_init(4, -72) # Set-up manifold methods -LLE = partial(manifold.LocallyLinearEmbedding, - n_neighbors=n_neighbors, n_components=n_components, - eigen_solver='auto') +LLE = partial( + manifold.LocallyLinearEmbedding, + n_neighbors=n_neighbors, + n_components=n_components, + eigen_solver="auto", +) methods = OrderedDict() -methods['LLE'] = LLE(method='standard') -methods['LTSA'] = LLE(method='ltsa') -methods['Hessian LLE'] = LLE(method='hessian') -methods['Modified LLE'] = LLE(method='modified') -methods['Isomap'] = manifold.Isomap(n_neighbors=n_neighbors, - n_components=n_components) -methods['MDS'] = manifold.MDS(n_components, max_iter=100, n_init=1) -methods['SE'] = manifold.SpectralEmbedding(n_components=n_components, - n_neighbors=n_neighbors) -methods['t-SNE'] = manifold.TSNE(n_components=n_components, init='pca', - random_state=0) +methods["LLE"] = LLE(method="standard") +methods["LTSA"] = LLE(method="ltsa") +methods["Hessian LLE"] = LLE(method="hessian") +methods["Modified LLE"] = LLE(method="modified") +methods["Isomap"] = manifold.Isomap(n_neighbors=n_neighbors, n_components=n_components) +methods["MDS"] = 
manifold.MDS(n_components, max_iter=100, n_init=1) +methods["SE"] = manifold.SpectralEmbedding( + n_components=n_components, n_neighbors=n_neighbors +) +methods["t-SNE"] = manifold.TSNE(n_components=n_components, init="pca", random_state=0) # Plot results for i, (label, method) in enumerate(methods.items()): @@ -80,6 +83,6 @@ ax.set_title("%s (%.2g sec)" % (label, t1 - t0)) ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) - ax.axis('tight') + ax.axis("tight") plt.show() diff --git a/examples/manifold/plot_manifold_sphere.py b/examples/manifold/plot_manifold_sphere.py index fbc125fb8773f..41b0df181b344 100644 --- a/examples/manifold/plot_manifold_sphere.py +++ b/examples/manifold/plot_manifold_sphere.py @@ -55,32 +55,39 @@ t = random_state.rand(n_samples) * np.pi # Sever the poles from the sphere. -indices = ((t < (np.pi - (np.pi / 8))) & (t > ((np.pi / 8)))) +indices = (t < (np.pi - (np.pi / 8))) & (t > ((np.pi / 8))) colors = p[indices] -x, y, z = np.sin(t[indices]) * np.cos(p[indices]), \ - np.sin(t[indices]) * np.sin(p[indices]), \ - np.cos(t[indices]) +x, y, z = ( + np.sin(t[indices]) * np.cos(p[indices]), + np.sin(t[indices]) * np.sin(p[indices]), + np.cos(t[indices]), +) # Plot our dataset. 
fig = plt.figure(figsize=(15, 8)) -plt.suptitle("Manifold Learning with %i points, %i neighbors" - % (1000, n_neighbors), fontsize=14) +plt.suptitle( + "Manifold Learning with %i points, %i neighbors" % (1000, n_neighbors), fontsize=14 +) -ax = fig.add_subplot(251, projection='3d') +ax = fig.add_subplot(251, projection="3d") ax.scatter(x, y, z, c=p[indices], cmap=plt.cm.rainbow) ax.view_init(40, -10) sphere_data = np.array([x, y, z]).T # Perform Locally Linear Embedding Manifold learning -methods = ['standard', 'ltsa', 'hessian', 'modified'] -labels = ['LLE', 'LTSA', 'Hessian LLE', 'Modified LLE'] +methods = ["standard", "ltsa", "hessian", "modified"] +labels = ["LLE", "LTSA", "Hessian LLE", "Modified LLE"] for i, method in enumerate(methods): t0 = time() - trans_data = manifold.LocallyLinearEmbedding( - n_neighbors=n_neighbors, n_components=2, - method=method).fit_transform(sphere_data).T + trans_data = ( + manifold.LocallyLinearEmbedding( + n_neighbors=n_neighbors, n_components=2, method=method + ) + .fit_transform(sphere_data) + .T + ) t1 = time() print("%s: %.2g sec" % (methods[i], t1 - t0)) @@ -89,21 +96,24 @@ plt.title("%s (%.2g sec)" % (labels[i], t1 - t0)) ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) - plt.axis('tight') + plt.axis("tight") # Perform Isomap Manifold learning. 
t0 = time() -trans_data = manifold.Isomap(n_neighbors=n_neighbors, - n_components=2).fit_transform(sphere_data).T +trans_data = ( + manifold.Isomap(n_neighbors=n_neighbors, n_components=2) + .fit_transform(sphere_data) + .T +) t1 = time() -print("%s: %.2g sec" % ('ISO', t1 - t0)) +print("%s: %.2g sec" % ("ISO", t1 - t0)) ax = fig.add_subplot(257) plt.scatter(trans_data[0], trans_data[1], c=colors, cmap=plt.cm.rainbow) -plt.title("%s (%.2g sec)" % ('Isomap', t1 - t0)) +plt.title("%s (%.2g sec)" % ("Isomap", t1 - t0)) ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) -plt.axis('tight') +plt.axis("tight") # Perform Multi-dimensional scaling. t0 = time() @@ -117,12 +127,11 @@ plt.title("MDS (%.2g sec)" % (t1 - t0)) ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) -plt.axis('tight') +plt.axis("tight") # Perform Spectral Embedding. t0 = time() -se = manifold.SpectralEmbedding(n_components=2, - n_neighbors=n_neighbors) +se = manifold.SpectralEmbedding(n_components=2, n_neighbors=n_neighbors) trans_data = se.fit_transform(sphere_data).T t1 = time() print("Spectral Embedding: %.2g sec" % (t1 - t0)) @@ -132,11 +141,11 @@ plt.title("Spectral Embedding (%.2g sec)" % (t1 - t0)) ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) -plt.axis('tight') +plt.axis("tight") # Perform t-distributed stochastic neighbor embedding. 
t0 = time() -tsne = manifold.TSNE(n_components=2, init='pca', random_state=0) +tsne = manifold.TSNE(n_components=2, init="pca", random_state=0) trans_data = tsne.fit_transform(sphere_data).T t1 = time() print("t-SNE: %.2g sec" % (t1 - t0)) @@ -146,6 +155,6 @@ plt.title("t-SNE (%.2g sec)" % (t1 - t0)) ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) -plt.axis('tight') +plt.axis("tight") plt.show() diff --git a/examples/manifold/plot_mds.py b/examples/manifold/plot_mds.py index aa2218a399a34..1605ae5b20164 100644 --- a/examples/manifold/plot_mds.py +++ b/examples/manifold/plot_mds.py @@ -38,13 +38,26 @@ noise[np.arange(noise.shape[0]), np.arange(noise.shape[0])] = 0 similarities += noise -mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=seed, - dissimilarity="precomputed", n_jobs=1) +mds = manifold.MDS( + n_components=2, + max_iter=3000, + eps=1e-9, + random_state=seed, + dissimilarity="precomputed", + n_jobs=1, +) pos = mds.fit(similarities).embedding_ -nmds = manifold.MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12, - dissimilarity="precomputed", random_state=seed, n_jobs=1, - n_init=1) +nmds = manifold.MDS( + n_components=2, + metric=False, + max_iter=3000, + eps=1e-12, + dissimilarity="precomputed", + random_state=seed, + n_jobs=1, + n_init=1, +) npos = nmds.fit_transform(similarities, init=pos) # Rescale the data @@ -60,14 +73,13 @@ npos = clf.fit_transform(npos) fig = plt.figure(1) -ax = plt.axes([0., 0., 1., 1.]) +ax = plt.axes([0.0, 0.0, 1.0, 1.0]) s = 100 -plt.scatter(X_true[:, 0], X_true[:, 1], color='navy', s=s, lw=0, - label='True Position') -plt.scatter(pos[:, 0], pos[:, 1], color='turquoise', s=s, lw=0, label='MDS') -plt.scatter(npos[:, 0], npos[:, 1], color='darkorange', s=s, lw=0, label='NMDS') -plt.legend(scatterpoints=1, loc='best', shadow=False) +plt.scatter(X_true[:, 0], X_true[:, 1], color="navy", s=s, lw=0, label="True Position") +plt.scatter(pos[:, 0], pos[:, 1], 
color="turquoise", s=s, lw=0, label="MDS") +plt.scatter(npos[:, 0], npos[:, 1], color="darkorange", s=s, lw=0, label="NMDS") +plt.legend(scatterpoints=1, loc="best", shadow=False) similarities = similarities.max() / (similarities + EPSILON) * 100 np.fill_diagonal(similarities, 0) @@ -75,12 +87,13 @@ start_idx, end_idx = np.where(pos) # a sequence of (*line0*, *line1*, *line2*), where:: # linen = (x0, y0), (x1, y1), ... (xm, ym) -segments = [[X_true[i, :], X_true[j, :]] - for i in range(len(pos)) for j in range(len(pos))] +segments = [ + [X_true[i, :], X_true[j, :]] for i in range(len(pos)) for j in range(len(pos)) +] values = np.abs(similarities) -lc = LineCollection(segments, - zorder=0, cmap=plt.cm.Blues, - norm=plt.Normalize(0, values.max())) +lc = LineCollection( + segments, zorder=0, cmap=plt.cm.Blues, norm=plt.Normalize(0, values.max()) +) lc.set_array(similarities.flatten()) lc.set_linewidths(np.full(len(segments), 0.5)) ax.add_collection(lc) diff --git a/examples/manifold/plot_swissroll.py b/examples/manifold/plot_swissroll.py index d049658f8f775..3aa2088c22687 100644 --- a/examples/manifold/plot_swissroll.py +++ b/examples/manifold/plot_swissroll.py @@ -16,17 +16,18 @@ # This import is needed to modify the way figure behaves from mpl_toolkits.mplot3d import Axes3D + Axes3D # ---------------------------------------------------------------------- # Locally linear embedding of the swiss roll from sklearn import manifold, datasets + X, color = datasets.make_swiss_roll(n_samples=1500) print("Computing LLE embedding") -X_r, err = manifold.locally_linear_embedding(X, n_neighbors=12, - n_components=2) +X_r, err = manifold.locally_linear_embedding(X, n_neighbors=12, n_components=2) print("Done. 
Reconstruction error: %g" % err) # ---------------------------------------------------------------------- @@ -34,13 +35,13 @@ fig = plt.figure() -ax = fig.add_subplot(211, projection='3d') +ax = fig.add_subplot(211, projection="3d") ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral) ax.set_title("Original data") ax = fig.add_subplot(212) ax.scatter(X_r[:, 0], X_r[:, 1], c=color, cmap=plt.cm.Spectral) -plt.axis('tight') +plt.axis("tight") plt.xticks([]), plt.yticks([]) -plt.title('Projected data') +plt.title("Projected data") plt.show() diff --git a/examples/manifold/plot_t_sne_perplexity.py b/examples/manifold/plot_t_sne_perplexity.py index dd7b4d1f21a09..04da2eb51acb1 100644 --- a/examples/manifold/plot_t_sne_perplexity.py +++ b/examples/manifold/plot_t_sne_perplexity.py @@ -40,7 +40,7 @@ (fig, subplots) = plt.subplots(3, 5, figsize=(15, 8)) perplexities = [5, 30, 50, 100] -X, y = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05) +X, y = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05) red = y == 0 green = y == 1 @@ -50,14 +50,15 @@ ax.scatter(X[green, 0], X[green, 1], c="g") ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) -plt.axis('tight') +plt.axis("tight") for i, perplexity in enumerate(perplexities): ax = subplots[0][i + 1] t0 = time() - tsne = manifold.TSNE(n_components=n_components, init='random', - random_state=0, perplexity=perplexity) + tsne = manifold.TSNE( + n_components=n_components, init="random", random_state=0, perplexity=perplexity + ) Y = tsne.fit_transform(X) t1 = time() print("circles, perplexity=%d in %.2g sec" % (perplexity, t1 - t0)) @@ -66,7 +67,7 @@ ax.scatter(Y[green, 0], Y[green, 1], c="g") ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) - ax.axis('tight') + ax.axis("tight") # Another example using s-curve X, color = datasets.make_s_curve(n_samples, random_state=0) @@ -80,8 +81,9 @@ ax = subplots[1][i 
+ 1] t0 = time() - tsne = manifold.TSNE(n_components=n_components, init='random', - random_state=0, perplexity=perplexity) + tsne = manifold.TSNE( + n_components=n_components, init="random", random_state=0, perplexity=perplexity + ) Y = tsne.fit_transform(X) t1 = time() print("S-curve, perplexity=%d in %.2g sec" % (perplexity, t1 - t0)) @@ -90,16 +92,18 @@ ax.scatter(Y[:, 0], Y[:, 1], c=color) ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) - ax.axis('tight') + ax.axis("tight") # Another example using a 2D uniform grid x = np.linspace(0, 1, int(np.sqrt(n_samples))) xx, yy = np.meshgrid(x, x) -X = np.hstack([ - xx.ravel().reshape(-1, 1), - yy.ravel().reshape(-1, 1), -]) +X = np.hstack( + [ + xx.ravel().reshape(-1, 1), + yy.ravel().reshape(-1, 1), + ] +) color = xx.ravel() ax = subplots[2][0] ax.scatter(X[:, 0], X[:, 1], c=color) @@ -110,8 +114,9 @@ ax = subplots[2][i + 1] t0 = time() - tsne = manifold.TSNE(n_components=n_components, init='random', - random_state=0, perplexity=perplexity) + tsne = manifold.TSNE( + n_components=n_components, init="random", random_state=0, perplexity=perplexity + ) Y = tsne.fit_transform(X) t1 = time() print("uniform grid, perplexity=%d in %.2g sec" % (perplexity, t1 - t0)) @@ -120,7 +125,7 @@ ax.scatter(Y[:, 0], Y[:, 1], c=color) ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) - ax.axis('tight') + ax.axis("tight") plt.show() diff --git a/examples/miscellaneous/plot_anomaly_comparison.py b/examples/miscellaneous/plot_anomaly_comparison.py index c0c3a4f890923..924ab47fa81ca 100644 --- a/examples/miscellaneous/plot_anomaly_comparison.py +++ b/examples/miscellaneous/plot_anomaly_comparison.py @@ -82,7 +82,7 @@ print(__doc__) -matplotlib.rcParams['contour.negative_linestyle'] = 'solid' +matplotlib.rcParams["contour.negative_linestyle"] = "solid" # Example settings n_samples = 300 @@ -95,46 +95,58 @@ # to give similar results to the OneClassSVM 
anomaly_algorithms = [ ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)), - ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", - gamma=0.1)), - ("One-Class SVM (SGD)", make_pipeline( - Nystroem(gamma=0.1, random_state=42, n_components=150), - SGDOneClassSVM(nu=outliers_fraction, shuffle=True, - fit_intercept=True, random_state=42, tol=1e-6) - )), - ("Isolation Forest", IsolationForest(contamination=outliers_fraction, - random_state=42)), - ("Local Outlier Factor", LocalOutlierFactor( - n_neighbors=35, contamination=outliers_fraction))] + ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)), + ( + "One-Class SVM (SGD)", + make_pipeline( + Nystroem(gamma=0.1, random_state=42, n_components=150), + SGDOneClassSVM( + nu=outliers_fraction, + shuffle=True, + fit_intercept=True, + random_state=42, + tol=1e-6, + ), + ), + ), + ( + "Isolation Forest", + IsolationForest(contamination=outliers_fraction, random_state=42), + ), + ( + "Local Outlier Factor", + LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction), + ), +] # Define datasets blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=2) datasets = [ - make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5, - **blobs_params)[0], - make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5], - **blobs_params)[0], - make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, .3], - **blobs_params)[0], - 4. * (make_moons(n_samples=n_samples, noise=.05, random_state=0)[0] - - np.array([0.5, 0.25])), - 14. 
* (np.random.RandomState(42).rand(n_samples, 2) - 0.5)] + make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5, **blobs_params)[0], + make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5], **blobs_params)[0], + make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, 0.3], **blobs_params)[0], + 4.0 + * ( + make_moons(n_samples=n_samples, noise=0.05, random_state=0)[0] + - np.array([0.5, 0.25]) + ), + 14.0 * (np.random.RandomState(42).rand(n_samples, 2) - 0.5), +] # Compare given classifiers under given settings -xx, yy = np.meshgrid(np.linspace(-7, 7, 150), - np.linspace(-7, 7, 150)) +xx, yy = np.meshgrid(np.linspace(-7, 7, 150), np.linspace(-7, 7, 150)) plt.figure(figsize=(len(anomaly_algorithms) * 2 + 4, 12.5)) -plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, - hspace=.01) +plt.subplots_adjust( + left=0.02, right=0.98, bottom=0.001, top=0.96, wspace=0.05, hspace=0.01 +) plot_num = 1 rng = np.random.RandomState(42) for i_dataset, X in enumerate(datasets): # Add outliers - X = np.concatenate([X, rng.uniform(low=-6, high=6, size=(n_outliers, 2))], - axis=0) + X = np.concatenate([X, rng.uniform(low=-6, high=6, size=(n_outliers, 2))], axis=0) for name, algorithm in anomaly_algorithms: t0 = time.time() @@ -154,18 +166,23 @@ if name != "Local Outlier Factor": # LOF does not implement predict Z = algorithm.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) - plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black') + plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors="black") - colors = np.array(['#377eb8', '#ff7f00']) + colors = np.array(["#377eb8", "#ff7f00"]) plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[(y_pred + 1) // 2]) plt.xlim(-7, 7) plt.ylim(-7, 7) plt.xticks(()) plt.yticks(()) - plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'), - transform=plt.gca().transAxes, size=15, - horizontalalignment='right') + plt.text( + 0.99, + 0.01, + ("%.2fs" % (t1 - t0)).lstrip("0"), + transform=plt.gca().transAxes, 
+ size=15, + horizontalalignment="right", + ) plot_num += 1 plt.show() diff --git a/examples/miscellaneous/plot_changed_only_pprint_parameter.py b/examples/miscellaneous/plot_changed_only_pprint_parameter.py index a35471105b6c1..d27b17f3cc82d 100644 --- a/examples/miscellaneous/plot_changed_only_pprint_parameter.py +++ b/examples/miscellaneous/plot_changed_only_pprint_parameter.py @@ -15,8 +15,8 @@ from sklearn import set_config -lr = LogisticRegression(penalty='l1') -print('Default representation:') +lr = LogisticRegression(penalty="l1") +print("Default representation:") print(lr) # LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, # intercept_scaling=1, l1_ratio=None, max_iter=100, @@ -25,6 +25,6 @@ # warm_start=False) set_config(print_changed_only=True) -print('\nWith changed_only option:') +print("\nWith changed_only option:") print(lr) # LogisticRegression(penalty='l1') diff --git a/examples/miscellaneous/plot_display_object_visualization.py b/examples/miscellaneous/plot_display_object_visualization.py index a05f17fc9aba5..bb9d252fe830c 100644 --- a/examples/miscellaneous/plot_display_object_visualization.py +++ b/examples/miscellaneous/plot_display_object_visualization.py @@ -58,6 +58,7 @@ # a decision function, we will use it to plot the roc curve: from sklearn.metrics import roc_curve from sklearn.metrics import RocCurveDisplay + y_score = clf.decision_function(X_test) fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=clf.classes_[1]) @@ -71,8 +72,7 @@ from sklearn.metrics import precision_recall_curve from sklearn.metrics import PrecisionRecallDisplay -prec, recall, _ = precision_recall_curve(y_test, y_score, - pos_label=clf.classes_[1]) +prec, recall, _ = precision_recall_curve(y_test, y_score, pos_label=clf.classes_[1]) pr_display = PrecisionRecallDisplay(precision=prec, recall=recall).plot() # %% @@ -85,6 +85,7 @@ # sphinx_gallery_thumbnail_number = 4 import matplotlib.pyplot as plt + fig, (ax1, ax2) = plt.subplots(1, 2, 
figsize=(12, 8)) roc_display.plot(ax=ax1) diff --git a/examples/miscellaneous/plot_isotonic_regression.py b/examples/miscellaneous/plot_isotonic_regression.py index 8848b1151e1fa..f75bc1144e881 100644 --- a/examples/miscellaneous/plot_isotonic_regression.py +++ b/examples/miscellaneous/plot_isotonic_regression.py @@ -35,7 +35,7 @@ n = 100 x = np.arange(n) rs = check_random_state(0) -y = rs.randint(-50, 50, size=(n,)) + 50. * np.log1p(np.arange(n)) +y = rs.randint(-50, 50, size=(n,)) + 50.0 * np.log1p(np.arange(n)) # %% # Fit IsotonicRegression and LinearRegression models: @@ -56,16 +56,16 @@ fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(12, 6)) -ax0.plot(x, y, 'C0.', markersize=12) -ax0.plot(x, y_, 'C1.-', markersize=12) -ax0.plot(x, lr.predict(x[:, np.newaxis]), 'C2-') +ax0.plot(x, y, "C0.", markersize=12) +ax0.plot(x, y_, "C1.-", markersize=12) +ax0.plot(x, lr.predict(x[:, np.newaxis]), "C2-") ax0.add_collection(lc) -ax0.legend(('Training data', 'Isotonic fit', 'Linear fit'), loc='lower right') -ax0.set_title('Isotonic regression fit on noisy data (n=%d)' % n) +ax0.legend(("Training data", "Isotonic fit", "Linear fit"), loc="lower right") +ax0.set_title("Isotonic regression fit on noisy data (n=%d)" % n) x_test = np.linspace(-10, 110, 1000) -ax1.plot(x_test, ir.predict(x_test), 'C1-') -ax1.plot(ir.X_thresholds_, ir.y_thresholds_, 'C1.', markersize=12) +ax1.plot(x_test, ir.predict(x_test), "C1-") +ax1.plot(ir.X_thresholds_, ir.y_thresholds_, "C1.", markersize=12) ax1.set_title("Prediction function (%d thresholds)" % len(ir.X_thresholds_)) plt.show() diff --git a/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py b/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py index 433f5a7f05d37..64815751efa36 100644 --- a/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py +++ b/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py @@ -27,10 +27,10 @@ from sklearn.utils.fixes import parse_version # `normed` is being deprecated in favor of 
`density` in histograms -if parse_version(matplotlib.__version__) >= parse_version('2.1'): - density_param = {'density': True} +if parse_version(matplotlib.__version__) >= parse_version("2.1"): + density_param = {"density": True} else: - density_param = {'normed': True} + density_param = {"normed": True} # %% # Theoretical bounds @@ -119,7 +119,7 @@ # digits dataset, pass the ``--use-digits-dataset`` command line argument to # this script. -if '--use-digits-dataset' in sys.argv: +if "--use-digits-dataset" in sys.argv: data = load_digits().data[:500] else: data = fetch_20newsgroups_vectorized().data[:500] @@ -133,8 +133,10 @@ # - 1D histogram of the ratio of those distances (projected / original). n_samples, n_features = data.shape -print("Embedding %d samples with dim %d using various random projections" - % (n_samples, n_features)) +print( + "Embedding %d samples with dim %d using various random projections" + % (n_samples, n_features) +) n_components_range = np.array([300, 1000, 10000]) dists = euclidean_distances(data, squared=True).ravel() @@ -147,38 +149,41 @@ t0 = time() rp = SparseRandomProjection(n_components=n_components) projected_data = rp.fit_transform(data) - print("Projected %d samples from %d to %d in %0.3fs" - % (n_samples, n_features, n_components, time() - t0)) - if hasattr(rp, 'components_'): + print( + "Projected %d samples from %d to %d in %0.3fs" + % (n_samples, n_features, n_components, time() - t0) + ) + if hasattr(rp, "components_"): n_bytes = rp.components_.data.nbytes n_bytes += rp.components_.indices.nbytes print("Random matrix with size: %0.3fMB" % (n_bytes / 1e6)) - projected_dists = euclidean_distances( - projected_data, squared=True).ravel()[nonzero] + projected_dists = euclidean_distances(projected_data, squared=True).ravel()[nonzero] plt.figure() min_dist = min(projected_dists.min(), dists.min()) max_dist = max(projected_dists.max(), dists.max()) - plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu, - 
extent=[min_dist, max_dist, min_dist, max_dist]) + plt.hexbin( + dists, + projected_dists, + gridsize=100, + cmap=plt.cm.PuBu, + extent=[min_dist, max_dist, min_dist, max_dist], + ) plt.xlabel("Pairwise squared distances in original space") plt.ylabel("Pairwise squared distances in projected space") - plt.title("Pairwise distances distribution for n_components=%d" % - n_components) + plt.title("Pairwise distances distribution for n_components=%d" % n_components) cb = plt.colorbar() - cb.set_label('Sample pairs counts') + cb.set_label("Sample pairs counts") rates = projected_dists / dists - print("Mean distances rate: %0.2f (%0.2f)" - % (np.mean(rates), np.std(rates))) + print("Mean distances rate: %0.2f (%0.2f)" % (np.mean(rates), np.std(rates))) plt.figure() - plt.hist(rates, bins=50, range=(0., 2.), edgecolor='k', **density_param) + plt.hist(rates, bins=50, range=(0.0, 2.0), edgecolor="k", **density_param) plt.xlabel("Squared distances rate: projected / original") plt.ylabel("Distribution of samples pairs") - plt.title("Histogram of pairwise distance rates for n_components=%d" % - n_components) + plt.title("Histogram of pairwise distance rates for n_components=%d" % n_components) # TODO: compute the expected value of eps and add them to the previous plot # as vertical lines / region diff --git a/examples/miscellaneous/plot_kernel_approximation.py b/examples/miscellaneous/plot_kernel_approximation.py index 80b4582a45670..ffd50e9dca06e 100644 --- a/examples/miscellaneous/plot_kernel_approximation.py +++ b/examples/miscellaneous/plot_kernel_approximation.py @@ -47,8 +47,7 @@ # Import datasets, classifiers and performance metrics from sklearn import datasets, svm, pipeline -from sklearn.kernel_approximation import (RBFSampler, - Nystroem) +from sklearn.kernel_approximation import RBFSampler, Nystroem from sklearn.decomposition import PCA # The digits dataset @@ -61,32 +60,32 @@ # To apply an classifier on this data, we need to flatten the image, to # turn the data in 
a (samples, feature) matrix: n_samples = len(digits.data) -data = digits.data / 16. +data = digits.data / 16.0 data -= data.mean(axis=0) # We learn the digits on the first half of the digits -data_train, targets_train = (data[:n_samples // 2], - digits.target[:n_samples // 2]) +data_train, targets_train = (data[: n_samples // 2], digits.target[: n_samples // 2]) # Now predict the value of the digit on the second half: -data_test, targets_test = (data[n_samples // 2:], - digits.target[n_samples // 2:]) +data_test, targets_test = (data[n_samples // 2 :], digits.target[n_samples // 2 :]) # data_test = scaler.transform(data_test) # Create a classifier: a support vector classifier -kernel_svm = svm.SVC(gamma=.2) +kernel_svm = svm.SVC(gamma=0.2) linear_svm = svm.LinearSVC() # create pipeline from kernel approximation # and linear svm -feature_map_fourier = RBFSampler(gamma=.2, random_state=1) -feature_map_nystroem = Nystroem(gamma=.2, random_state=1) -fourier_approx_svm = pipeline.Pipeline([("feature_map", feature_map_fourier), - ("svm", svm.LinearSVC())]) +feature_map_fourier = RBFSampler(gamma=0.2, random_state=1) +feature_map_nystroem = Nystroem(gamma=0.2, random_state=1) +fourier_approx_svm = pipeline.Pipeline( + [("feature_map", feature_map_fourier), ("svm", svm.LinearSVC())] +) -nystroem_approx_svm = pipeline.Pipeline([("feature_map", feature_map_nystroem), - ("svm", svm.LinearSVC())]) +nystroem_approx_svm = pipeline.Pipeline( + [("feature_map", feature_map_nystroem), ("svm", svm.LinearSVC())] +) # fit and predict using linear and kernel svm: @@ -129,23 +128,35 @@ timescale = plt.subplot(122) accuracy.plot(sample_sizes, nystroem_scores, label="Nystroem approx. kernel") -timescale.plot(sample_sizes, nystroem_times, '--', - label='Nystroem approx. kernel') +timescale.plot(sample_sizes, nystroem_times, "--", label="Nystroem approx. kernel") accuracy.plot(sample_sizes, fourier_scores, label="Fourier approx. 
kernel") -timescale.plot(sample_sizes, fourier_times, '--', - label='Fourier approx. kernel') +timescale.plot(sample_sizes, fourier_times, "--", label="Fourier approx. kernel") # horizontal lines for exact rbf and linear kernels: -accuracy.plot([sample_sizes[0], sample_sizes[-1]], - [linear_svm_score, linear_svm_score], label="linear svm") -timescale.plot([sample_sizes[0], sample_sizes[-1]], - [linear_svm_time, linear_svm_time], '--', label='linear svm') - -accuracy.plot([sample_sizes[0], sample_sizes[-1]], - [kernel_svm_score, kernel_svm_score], label="rbf svm") -timescale.plot([sample_sizes[0], sample_sizes[-1]], - [kernel_svm_time, kernel_svm_time], '--', label='rbf svm') +accuracy.plot( + [sample_sizes[0], sample_sizes[-1]], + [linear_svm_score, linear_svm_score], + label="linear svm", +) +timescale.plot( + [sample_sizes[0], sample_sizes[-1]], + [linear_svm_time, linear_svm_time], + "--", + label="linear svm", +) + +accuracy.plot( + [sample_sizes[0], sample_sizes[-1]], + [kernel_svm_score, kernel_svm_score], + label="rbf svm", +) +timescale.plot( + [sample_sizes[0], sample_sizes[-1]], + [kernel_svm_time, kernel_svm_time], + "--", + label="rbf svm", +) # vertical line for dataset dimensionality = 64 accuracy.plot([64, 64], [0.7, 1], label="n_features") @@ -159,8 +170,8 @@ timescale.set_xlabel("Sampling steps = transformed feature dimension") accuracy.set_ylabel("Classification accuracy") timescale.set_ylabel("Training time in seconds") -accuracy.legend(loc='best') -timescale.legend(loc='best') +accuracy.legend(loc="best") +timescale.legend(loc="best") plt.tight_layout() plt.show() @@ -197,17 +208,16 @@ flat_grid = grid.reshape(-1, data.shape[1]) # title for the plots -titles = ['SVC with rbf kernel', - 'SVC (linear kernel)\n with Fourier rbf feature map\n' - 'n_components=100', - 'SVC (linear kernel)\n with Nystroem rbf feature map\n' - 'n_components=100'] +titles = [ + "SVC with rbf kernel", + "SVC (linear kernel)\n with Fourier rbf feature 
map\nn_components=100", + "SVC (linear kernel)\n with Nystroem rbf feature map\nn_components=100", +] plt.figure(figsize=(18, 7.5)) -plt.rcParams.update({'font.size': 14}) +plt.rcParams.update({"font.size": 14}) # predict and plot -for i, clf in enumerate((kernel_svm, nystroem_approx_svm, - fourier_approx_svm)): +for i, clf in enumerate((kernel_svm, nystroem_approx_svm, fourier_approx_svm)): # Plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, x_max]x[y_min, y_max]. plt.subplot(1, 3, i + 1) @@ -216,11 +226,12 @@ # Put the result into a color plot Z = Z.reshape(grid.shape[:-1]) plt.contourf(multiples, multiples, Z, cmap=plt.cm.Paired) - plt.axis('off') + plt.axis("off") # Plot also the training points - plt.scatter(X[:, 0], X[:, 1], c=targets_train, cmap=plt.cm.Paired, - edgecolors=(0, 0, 0)) + plt.scatter( + X[:, 0], X[:, 1], c=targets_train, cmap=plt.cm.Paired, edgecolors=(0, 0, 0) + ) plt.title(titles[i]) plt.tight_layout() diff --git a/examples/miscellaneous/plot_kernel_ridge_regression.py b/examples/miscellaneous/plot_kernel_ridge_regression.py index 1eb84d8fdac81..eaff3d91fd82d 100644 --- a/examples/miscellaneous/plot_kernel_ridge_regression.py +++ b/examples/miscellaneous/plot_kernel_ridge_regression.py @@ -60,25 +60,25 @@ # ############################################################################# # Fit regression model train_size = 100 -svr = GridSearchCV(SVR(kernel='rbf', gamma=0.1), - param_grid={"C": [1e0, 1e1, 1e2, 1e3], - "gamma": np.logspace(-2, 2, 5)}) +svr = GridSearchCV( + SVR(kernel="rbf", gamma=0.1), + param_grid={"C": [1e0, 1e1, 1e2, 1e3], "gamma": np.logspace(-2, 2, 5)}, +) -kr = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), - param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3], - "gamma": np.logspace(-2, 2, 5)}) +kr = GridSearchCV( + KernelRidge(kernel="rbf", gamma=0.1), + param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3], "gamma": np.logspace(-2, 2, 5)}, +) t0 = time.time() svr.fit(X[:train_size], 
y[:train_size]) svr_fit = time.time() - t0 -print("SVR complexity and bandwidth selected and model fitted in %.3f s" - % svr_fit) +print("SVR complexity and bandwidth selected and model fitted in %.3f s" % svr_fit) t0 = time.time() kr.fit(X[:train_size], y[:train_size]) kr_fit = time.time() - t0 -print("KRR complexity and bandwidth selected and model fitted in %.3f s" - % kr_fit) +print("KRR complexity and bandwidth selected and model fitted in %.3f s" % kr_fit) sv_ratio = svr.best_estimator_.support_.shape[0] / train_size print("Support vector ratio: %.3f" % sv_ratio) @@ -86,30 +86,39 @@ t0 = time.time() y_svr = svr.predict(X_plot) svr_predict = time.time() - t0 -print("SVR prediction for %d inputs in %.3f s" - % (X_plot.shape[0], svr_predict)) +print("SVR prediction for %d inputs in %.3f s" % (X_plot.shape[0], svr_predict)) t0 = time.time() y_kr = kr.predict(X_plot) kr_predict = time.time() - t0 -print("KRR prediction for %d inputs in %.3f s" - % (X_plot.shape[0], kr_predict)) +print("KRR prediction for %d inputs in %.3f s" % (X_plot.shape[0], kr_predict)) # ############################################################################# # Look at the results sv_ind = svr.best_estimator_.support_ -plt.scatter(X[sv_ind], y[sv_ind], c='r', s=50, label='SVR support vectors', - zorder=2, edgecolors=(0, 0, 0)) -plt.scatter(X[:100], y[:100], c='k', label='data', zorder=1, - edgecolors=(0, 0, 0)) -plt.plot(X_plot, y_svr, c='r', - label='SVR (fit: %.3fs, predict: %.3fs)' % (svr_fit, svr_predict)) -plt.plot(X_plot, y_kr, c='g', - label='KRR (fit: %.3fs, predict: %.3fs)' % (kr_fit, kr_predict)) -plt.xlabel('data') -plt.ylabel('target') -plt.title('SVR versus Kernel Ridge') +plt.scatter( + X[sv_ind], + y[sv_ind], + c="r", + s=50, + label="SVR support vectors", + zorder=2, + edgecolors=(0, 0, 0), +) +plt.scatter(X[:100], y[:100], c="k", label="data", zorder=1, edgecolors=(0, 0, 0)) +plt.plot( + X_plot, + y_svr, + c="r", + label="SVR (fit: %.3fs, predict: %.3fs)" % (svr_fit, 
svr_predict), +) +plt.plot( + X_plot, y_kr, c="g", label="KRR (fit: %.3fs, predict: %.3fs)" % (kr_fit, kr_predict) +) +plt.xlabel("data") +plt.ylabel("target") +plt.title("SVR versus Kernel Ridge") plt.legend() # Visualize training and prediction time @@ -120,9 +129,10 @@ y = np.sin(X).ravel() y[::5] += 3 * (0.5 - rng.rand(X.shape[0] // 5)) sizes = np.logspace(1, 4, 7).astype(int) -for name, estimator in {"KRR": KernelRidge(kernel='rbf', alpha=0.1, - gamma=10), - "SVR": SVR(kernel='rbf', C=1e1, gamma=10)}.items(): +for name, estimator in { + "KRR": KernelRidge(kernel="rbf", alpha=0.1, gamma=10), + "SVR": SVR(kernel="rbf", C=1e1, gamma=10), +}.items(): train_time = [] test_time = [] for train_test_size in sizes: @@ -134,37 +144,55 @@ estimator.predict(X_plot[:1000]) test_time.append(time.time() - t0) - plt.plot(sizes, train_time, 'o-', color="r" if name == "SVR" else "g", - label="%s (train)" % name) - plt.plot(sizes, test_time, 'o--', color="r" if name == "SVR" else "g", - label="%s (test)" % name) + plt.plot( + sizes, + train_time, + "o-", + color="r" if name == "SVR" else "g", + label="%s (train)" % name, + ) + plt.plot( + sizes, + test_time, + "o--", + color="r" if name == "SVR" else "g", + label="%s (test)" % name, + ) plt.xscale("log") plt.yscale("log") plt.xlabel("Train size") plt.ylabel("Time (seconds)") -plt.title('Execution Time') +plt.title("Execution Time") plt.legend(loc="best") # Visualize learning curves plt.figure() -svr = SVR(kernel='rbf', C=1e1, gamma=0.1) -kr = KernelRidge(kernel='rbf', alpha=0.1, gamma=0.1) -train_sizes, train_scores_svr, test_scores_svr = \ - learning_curve(svr, X[:100], y[:100], train_sizes=np.linspace(0.1, 1, 10), - scoring="neg_mean_squared_error", cv=10) -train_sizes_abs, train_scores_kr, test_scores_kr = \ - learning_curve(kr, X[:100], y[:100], train_sizes=np.linspace(0.1, 1, 10), - scoring="neg_mean_squared_error", cv=10) - -plt.plot(train_sizes, -test_scores_svr.mean(1), 'o-', color="r", - label="SVR") 
-plt.plot(train_sizes, -test_scores_kr.mean(1), 'o-', color="g", - label="KRR") +svr = SVR(kernel="rbf", C=1e1, gamma=0.1) +kr = KernelRidge(kernel="rbf", alpha=0.1, gamma=0.1) +train_sizes, train_scores_svr, test_scores_svr = learning_curve( + svr, + X[:100], + y[:100], + train_sizes=np.linspace(0.1, 1, 10), + scoring="neg_mean_squared_error", + cv=10, +) +train_sizes_abs, train_scores_kr, test_scores_kr = learning_curve( + kr, + X[:100], + y[:100], + train_sizes=np.linspace(0.1, 1, 10), + scoring="neg_mean_squared_error", + cv=10, +) + +plt.plot(train_sizes, -test_scores_svr.mean(1), "o-", color="r", label="SVR") +plt.plot(train_sizes, -test_scores_kr.mean(1), "o-", color="g", label="KRR") plt.xlabel("Train size") plt.ylabel("Mean Squared Error") -plt.title('Learning curves') +plt.title("Learning curves") plt.legend(loc="best") plt.show() diff --git a/examples/miscellaneous/plot_multilabel.py b/examples/miscellaneous/plot_multilabel.py index 828ca17fafa23..4958084b9dbd7 100644 --- a/examples/miscellaneous/plot_multilabel.py +++ b/examples/miscellaneous/plot_multilabel.py @@ -64,7 +64,7 @@ def plot_subfigure(X, Y, subplot, title, transform): min_y = np.min(X[:, 1]) max_y = np.max(X[:, 1]) - classif = OneVsRestClassifier(SVC(kernel='linear')) + classif = OneVsRestClassifier(SVC(kernel="linear")) classif.fit(X, Y) plt.subplot(2, 2, subplot) @@ -72,42 +72,58 @@ def plot_subfigure(X, Y, subplot, title, transform): zero_class = np.where(Y[:, 0]) one_class = np.where(Y[:, 1]) - plt.scatter(X[:, 0], X[:, 1], s=40, c='gray', edgecolors=(0, 0, 0)) - plt.scatter(X[zero_class, 0], X[zero_class, 1], s=160, edgecolors='b', - facecolors='none', linewidths=2, label='Class 1') - plt.scatter(X[one_class, 0], X[one_class, 1], s=80, edgecolors='orange', - facecolors='none', linewidths=2, label='Class 2') - - plot_hyperplane(classif.estimators_[0], min_x, max_x, 'k--', - 'Boundary\nfor class 1') - plot_hyperplane(classif.estimators_[1], min_x, max_x, 'k-.', - 'Boundary\nfor class 2') 
+ plt.scatter(X[:, 0], X[:, 1], s=40, c="gray", edgecolors=(0, 0, 0)) + plt.scatter( + X[zero_class, 0], + X[zero_class, 1], + s=160, + edgecolors="b", + facecolors="none", + linewidths=2, + label="Class 1", + ) + plt.scatter( + X[one_class, 0], + X[one_class, 1], + s=80, + edgecolors="orange", + facecolors="none", + linewidths=2, + label="Class 2", + ) + + plot_hyperplane( + classif.estimators_[0], min_x, max_x, "k--", "Boundary\nfor class 1" + ) + plot_hyperplane( + classif.estimators_[1], min_x, max_x, "k-.", "Boundary\nfor class 2" + ) plt.xticks(()) plt.yticks(()) - plt.xlim(min_x - .5 * max_x, max_x + .5 * max_x) - plt.ylim(min_y - .5 * max_y, max_y + .5 * max_y) + plt.xlim(min_x - 0.5 * max_x, max_x + 0.5 * max_x) + plt.ylim(min_y - 0.5 * max_y, max_y + 0.5 * max_y) if subplot == 2: - plt.xlabel('First principal component') - plt.ylabel('Second principal component') + plt.xlabel("First principal component") + plt.ylabel("Second principal component") plt.legend(loc="upper left") plt.figure(figsize=(8, 6)) -X, Y = make_multilabel_classification(n_classes=2, n_labels=1, - allow_unlabeled=True, - random_state=1) +X, Y = make_multilabel_classification( + n_classes=2, n_labels=1, allow_unlabeled=True, random_state=1 +) plot_subfigure(X, Y, 1, "With unlabeled samples + CCA", "cca") plot_subfigure(X, Y, 2, "With unlabeled samples + PCA", "pca") -X, Y = make_multilabel_classification(n_classes=2, n_labels=1, - allow_unlabeled=False, - random_state=1) +X, Y = make_multilabel_classification( + n_classes=2, n_labels=1, allow_unlabeled=False, random_state=1 +) plot_subfigure(X, Y, 3, "Without unlabeled samples + CCA", "cca") plot_subfigure(X, Y, 4, "Without unlabeled samples + PCA", "pca") -plt.subplots_adjust(.04, .02, .97, .94, .09, .2) +plt.subplots_adjust(0.04, 0.02, 0.97, 0.94, 0.09, 0.2) plt.show() diff --git a/examples/miscellaneous/plot_multioutput_face_completion.py b/examples/miscellaneous/plot_multioutput_face_completion.py index 62fd20d24645f..63b3bea4175ba 
100644 --- a/examples/miscellaneous/plot_multioutput_face_completion.py +++ b/examples/miscellaneous/plot_multioutput_face_completion.py @@ -33,21 +33,22 @@ # Test on a subset of people n_faces = 5 rng = check_random_state(4) -face_ids = rng.randint(test.shape[0], size=(n_faces, )) +face_ids = rng.randint(test.shape[0], size=(n_faces,)) test = test[face_ids, :] n_pixels = data.shape[1] # Upper half of the faces -X_train = train[:, :(n_pixels + 1) // 2] +X_train = train[:, : (n_pixels + 1) // 2] # Lower half of the faces -y_train = train[:, n_pixels // 2:] -X_test = test[:, :(n_pixels + 1) // 2] -y_test = test[:, n_pixels // 2:] +y_train = train[:, n_pixels // 2 :] +X_test = test[:, : (n_pixels + 1) // 2] +y_test = test[:, n_pixels // 2 :] # Fit estimators ESTIMATORS = { - "Extra trees": ExtraTreesRegressor(n_estimators=10, max_features=32, - random_state=0), + "Extra trees": ExtraTreesRegressor( + n_estimators=10, max_features=32, random_state=0 + ), "K-nn": KNeighborsRegressor(), "Linear regression": LinearRegression(), "Ridge": RidgeCV(), @@ -62,7 +63,7 @@ image_shape = (64, 64) n_cols = 1 + len(ESTIMATORS) -plt.figure(figsize=(2. 
* n_cols, 2.26 * n_faces)) +plt.figure(figsize=(2.0 * n_cols, 2.26 * n_faces)) plt.suptitle("Face completion with multi-output estimators", size=16) for i in range(n_faces): @@ -71,13 +72,12 @@ if i: sub = plt.subplot(n_faces, n_cols, i * n_cols + 1) else: - sub = plt.subplot(n_faces, n_cols, i * n_cols + 1, - title="true faces") + sub = plt.subplot(n_faces, n_cols, i * n_cols + 1, title="true faces") sub.axis("off") - sub.imshow(true_face.reshape(image_shape), - cmap=plt.cm.gray, - interpolation="nearest") + sub.imshow( + true_face.reshape(image_shape), cmap=plt.cm.gray, interpolation="nearest" + ) for j, est in enumerate(sorted(ESTIMATORS)): completed_face = np.hstack((X_test[i], y_test_predict[est][i])) @@ -86,12 +86,13 @@ sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j) else: - sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j, - title=est) + sub = plt.subplot(n_faces, n_cols, i * n_cols + 2 + j, title=est) sub.axis("off") - sub.imshow(completed_face.reshape(image_shape), - cmap=plt.cm.gray, - interpolation="nearest") + sub.imshow( + completed_face.reshape(image_shape), + cmap=plt.cm.gray, + interpolation="nearest", + ) plt.show() diff --git a/examples/miscellaneous/plot_partial_dependence_visualization_api.py b/examples/miscellaneous/plot_partial_dependence_visualization_api.py index 342ba14a338b1..5f6e61a89c4fe 100644 --- a/examples/miscellaneous/plot_partial_dependence_visualization_api.py +++ b/examples/miscellaneous/plot_partial_dependence_visualization_api.py @@ -37,9 +37,10 @@ y = diabetes.target tree = DecisionTreeRegressor() -mlp = make_pipeline(StandardScaler(), - MLPRegressor(hidden_layer_sizes=(100, 100), - tol=1e-2, max_iter=500, random_state=0)) +mlp = make_pipeline( + StandardScaler(), + MLPRegressor(hidden_layer_sizes=(100, 100), tol=1e-2, max_iter=500, random_state=0), +) tree.fit(X, y) mlp.fit(X, y) @@ -63,8 +64,9 @@ # color of the curve. 
fig, ax = plt.subplots(figsize=(12, 6)) ax.set_title("Multi-layer Perceptron") -mlp_disp = PartialDependenceDisplay.from_estimator(mlp, X, ["age", "bmi"], ax=ax, - line_kw={"color": "red"}) +mlp_disp = PartialDependenceDisplay.from_estimator( + mlp, X, ["age", "bmi"], ax=ax, line_kw={"color": "red"} +) # %% # Plotting partial dependence of the two models together @@ -101,8 +103,9 @@ # sphinx_gallery_thumbnail_number = 4 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6)) tree_disp.plot(ax=[ax1, ax2], line_kw={"label": "Decision Tree"}) -mlp_disp.plot(ax=[ax1, ax2], line_kw={"label": "Multi-layer Perceptron", - "color": "red"}) +mlp_disp.plot( + ax=[ax1, ax2], line_kw={"label": "Multi-layer Perceptron", "color": "red"} +) ax1.legend() ax2.legend() @@ -115,8 +118,9 @@ # `plot` will only show the y label and y ticks on the left most plot. tree_disp.plot(line_kw={"label": "Decision Tree"}) -mlp_disp.plot(line_kw={"label": "Multi-layer Perceptron", "color": "red"}, - ax=tree_disp.axes_) +mlp_disp.plot( + line_kw={"label": "Multi-layer Perceptron", "color": "red"}, ax=tree_disp.axes_ +) tree_disp.figure_.set_size_inches(10, 6) tree_disp.axes_[0, 0].legend() tree_disp.axes_[0, 1].legend() @@ -131,4 +135,5 @@ # plot function. 
tree_disp = PartialDependenceDisplay.from_estimator(tree, X, ["age"]) mlp_disp = PartialDependenceDisplay.from_estimator( - mlp, X, ["age"], ax=tree_disp.axes_, line_kw={"color": "red"}) + mlp, X, ["age"], ax=tree_disp.axes_, line_kw={"color": "red"} +) diff --git a/examples/mixture/plot_concentration_prior.py b/examples/mixture/plot_concentration_prior.py index 6da379d118436..65830b6dc1182 100644 --- a/examples/mixture/plot_concentration_prior.py +++ b/examples/mixture/plot_concentration_prior.py @@ -49,89 +49,116 @@ def plot_ellipses(ax, weights, means, covars): angle = 180 * angle / np.pi # eigenvector normalization eig_vals = 2 * np.sqrt(2) * np.sqrt(eig_vals) - ell = mpl.patches.Ellipse(means[n], eig_vals[0], eig_vals[1], - 180 + angle, edgecolor='black') + ell = mpl.patches.Ellipse( + means[n], eig_vals[0], eig_vals[1], 180 + angle, edgecolor="black" + ) ell.set_clip_box(ax.bbox) ell.set_alpha(weights[n]) - ell.set_facecolor('#56B4E9') + ell.set_facecolor("#56B4E9") ax.add_artist(ell) def plot_results(ax1, ax2, estimator, X, y, title, plot_title=False): ax1.set_title(title) - ax1.scatter(X[:, 0], X[:, 1], s=5, marker='o', color=colors[y], alpha=0.8) - ax1.set_xlim(-2., 2.) - ax1.set_ylim(-3., 3.) 
+ ax1.scatter(X[:, 0], X[:, 1], s=5, marker="o", color=colors[y], alpha=0.8) + ax1.set_xlim(-2.0, 2.0) + ax1.set_ylim(-3.0, 3.0) ax1.set_xticks(()) ax1.set_yticks(()) - plot_ellipses(ax1, estimator.weights_, estimator.means_, - estimator.covariances_) + plot_ellipses(ax1, estimator.weights_, estimator.means_, estimator.covariances_) - ax2.get_xaxis().set_tick_params(direction='out') + ax2.get_xaxis().set_tick_params(direction="out") ax2.yaxis.grid(True, alpha=0.7) for k, w in enumerate(estimator.weights_): - ax2.bar(k, w, width=0.9, color='#56B4E9', zorder=3, - align='center', edgecolor='black') - ax2.text(k, w + 0.007, "%.1f%%" % (w * 100.), - horizontalalignment='center') - ax2.set_xlim(-.6, 2 * n_components - .4) - ax2.set_ylim(0., 1.1) - ax2.tick_params(axis='y', which='both', left=False, - right=False, labelleft=False) - ax2.tick_params(axis='x', which='both', top=False) + ax2.bar( + k, + w, + width=0.9, + color="#56B4E9", + zorder=3, + align="center", + edgecolor="black", + ) + ax2.text(k, w + 0.007, "%.1f%%" % (w * 100.0), horizontalalignment="center") + ax2.set_xlim(-0.6, 2 * n_components - 0.4) + ax2.set_ylim(0.0, 1.1) + ax2.tick_params(axis="y", which="both", left=False, right=False, labelleft=False) + ax2.tick_params(axis="x", which="both", top=False) if plot_title: - ax1.set_ylabel('Estimated Mixtures') - ax2.set_ylabel('Weight of each component') + ax1.set_ylabel("Estimated Mixtures") + ax2.set_ylabel("Weight of each component") # Parameters of the dataset random_state, n_components, n_features = 2, 3, 2 -colors = np.array(['#0072B2', '#F0E442', '#D55E00']) +colors = np.array(["#0072B2", "#F0E442", "#D55E00"]) -covars = np.array([[[.7, .0], [.0, .1]], - [[.5, .0], [.0, .1]], - [[.5, .0], [.0, .1]]]) +covars = np.array( + [[[0.7, 0.0], [0.0, 0.1]], [[0.5, 0.0], [0.0, 0.1]], [[0.5, 0.0], [0.0, 0.1]]] +) samples = np.array([200, 500, 200]) -means = np.array([[.0, -.70], - [.0, .0], - [.0, .70]]) +means = np.array([[0.0, -0.70], [0.0, 0.0], [0.0, 0.70]]) # 
mean_precision_prior= 0.8 to minimize the influence of the prior estimators = [ - ("Finite mixture with a Dirichlet distribution\nprior and " - r"$\gamma_0=$", BayesianGaussianMixture( - weight_concentration_prior_type="dirichlet_distribution", - n_components=2 * n_components, reg_covar=0, init_params='random', - max_iter=1500, mean_precision_prior=.8, - random_state=random_state), [0.001, 1, 1000]), - ("Infinite mixture with a Dirichlet process\n prior and" r"$\gamma_0=$", - BayesianGaussianMixture( - weight_concentration_prior_type="dirichlet_process", - n_components=2 * n_components, reg_covar=0, init_params='random', - max_iter=1500, mean_precision_prior=.8, - random_state=random_state), [1, 1000, 100000])] + ( + "Finite mixture with a Dirichlet distribution\nprior and " r"$\gamma_0=$", + BayesianGaussianMixture( + weight_concentration_prior_type="dirichlet_distribution", + n_components=2 * n_components, + reg_covar=0, + init_params="random", + max_iter=1500, + mean_precision_prior=0.8, + random_state=random_state, + ), + [0.001, 1, 1000], + ), + ( + "Infinite mixture with a Dirichlet process\n prior and" r"$\gamma_0=$", + BayesianGaussianMixture( + weight_concentration_prior_type="dirichlet_process", + n_components=2 * n_components, + reg_covar=0, + init_params="random", + max_iter=1500, + mean_precision_prior=0.8, + random_state=random_state, + ), + [1, 1000, 100000], + ), +] # Generate data rng = np.random.RandomState(random_state) -X = np.vstack([ - rng.multivariate_normal(means[j], covars[j], samples[j]) - for j in range(n_components)]) -y = np.concatenate([np.full(samples[j], j, dtype=int) - for j in range(n_components)]) +X = np.vstack( + [ + rng.multivariate_normal(means[j], covars[j], samples[j]) + for j in range(n_components) + ] +) +y = np.concatenate([np.full(samples[j], j, dtype=int) for j in range(n_components)]) # Plot results in two different figures for (title, estimator, concentrations_prior) in estimators: plt.figure(figsize=(4.7 * 3, 8)) - 
plt.subplots_adjust(bottom=.04, top=0.90, hspace=.05, wspace=.05, - left=.03, right=.99) + plt.subplots_adjust( + bottom=0.04, top=0.90, hspace=0.05, wspace=0.05, left=0.03, right=0.99 + ) gs = gridspec.GridSpec(3, len(concentrations_prior)) for k, concentration in enumerate(concentrations_prior): estimator.weight_concentration_prior = concentration estimator.fit(X) - plot_results(plt.subplot(gs[0:2, k]), plt.subplot(gs[2, k]), estimator, - X, y, r"%s$%.1e$" % (title, concentration), - plot_title=k == 0) + plot_results( + plt.subplot(gs[0:2, k]), + plt.subplot(gs[2, k]), + estimator, + X, + y, + r"%s$%.1e$" % (title, concentration), + plot_title=k == 0, + ) plt.show() diff --git a/examples/mixture/plot_gmm.py b/examples/mixture/plot_gmm.py index 5f2f8596d4bbe..d3548eb6bed83 100644 --- a/examples/mixture/plot_gmm.py +++ b/examples/mixture/plot_gmm.py @@ -32,34 +32,32 @@ from sklearn import mixture -color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold', - 'darkorange']) +color_iter = itertools.cycle(["navy", "c", "cornflowerblue", "gold", "darkorange"]) def plot_results(X, Y_, means, covariances, index, title): splot = plt.subplot(2, 1, 1 + index) - for i, (mean, covar, color) in enumerate(zip( - means, covariances, color_iter)): + for i, (mean, covar, color) in enumerate(zip(means, covariances, color_iter)): v, w = linalg.eigh(covar) - v = 2. * np.sqrt(2.) * np.sqrt(v) + v = 2.0 * np.sqrt(2.0) * np.sqrt(v) u = w[0] / linalg.norm(w[0]) # as the DP will not use every component it has access to # unless it needs it, we shouldn't plot the redundant # components. if not np.any(Y_ == i): continue - plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color) + plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], 0.8, color=color) # Plot an ellipse to show the Gaussian component angle = np.arctan(u[1] / u[0]) - angle = 180. * angle / np.pi # convert to degrees - ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. 
+ angle, color=color) + angle = 180.0 * angle / np.pi # convert to degrees + ell = mpl.patches.Ellipse(mean, v[0], v[1], 180.0 + angle, color=color) ell.set_clip_box(splot.bbox) ell.set_alpha(0.5) splot.add_artist(ell) - plt.xlim(-9., 5.) - plt.ylim(-3., 6.) + plt.xlim(-9.0, 5.0) + plt.ylim(-3.0, 6.0) plt.xticks(()) plt.yticks(()) plt.title(title) @@ -70,19 +68,25 @@ def plot_results(X, Y_, means, covariances, index, title): # Generate random sample, two components np.random.seed(0) -C = np.array([[0., -0.1], [1.7, .4]]) -X = np.r_[np.dot(np.random.randn(n_samples, 2), C), - .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])] +C = np.array([[0.0, -0.1], [1.7, 0.4]]) +X = np.r_[ + np.dot(np.random.randn(n_samples, 2), C), + 0.7 * np.random.randn(n_samples, 2) + np.array([-6, 3]), +] # Fit a Gaussian mixture with EM using five components -gmm = mixture.GaussianMixture(n_components=5, covariance_type='full').fit(X) -plot_results(X, gmm.predict(X), gmm.means_, gmm.covariances_, 0, - 'Gaussian Mixture') +gmm = mixture.GaussianMixture(n_components=5, covariance_type="full").fit(X) +plot_results(X, gmm.predict(X), gmm.means_, gmm.covariances_, 0, "Gaussian Mixture") # Fit a Dirichlet process Gaussian mixture using five components -dpgmm = mixture.BayesianGaussianMixture(n_components=5, - covariance_type='full').fit(X) -plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 1, - 'Bayesian Gaussian Mixture with a Dirichlet process prior') +dpgmm = mixture.BayesianGaussianMixture(n_components=5, covariance_type="full").fit(X) +plot_results( + X, + dpgmm.predict(X), + dpgmm.means_, + dpgmm.covariances_, + 1, + "Bayesian Gaussian Mixture with a Dirichlet process prior", +) plt.show() diff --git a/examples/mixture/plot_gmm_covariances.py b/examples/mixture/plot_gmm_covariances.py index f59807a971a08..123c9846156a1 100644 --- a/examples/mixture/plot_gmm_covariances.py +++ b/examples/mixture/plot_gmm_covariances.py @@ -41,30 +41,31 @@ print(__doc__) -colors = 
['navy', 'turquoise', 'darkorange'] +colors = ["navy", "turquoise", "darkorange"] def make_ellipses(gmm, ax): for n, color in enumerate(colors): - if gmm.covariance_type == 'full': + if gmm.covariance_type == "full": covariances = gmm.covariances_[n][:2, :2] - elif gmm.covariance_type == 'tied': + elif gmm.covariance_type == "tied": covariances = gmm.covariances_[:2, :2] - elif gmm.covariance_type == 'diag': + elif gmm.covariance_type == "diag": covariances = np.diag(gmm.covariances_[n][:2]) - elif gmm.covariance_type == 'spherical': + elif gmm.covariance_type == "spherical": covariances = np.eye(gmm.means_.shape[1]) * gmm.covariances_[n] v, w = np.linalg.eigh(covariances) u = w[0] / np.linalg.norm(w[0]) angle = np.arctan2(u[1], u[0]) angle = 180 * angle / np.pi # convert to degrees - v = 2. * np.sqrt(2.) * np.sqrt(v) - ell = mpl.patches.Ellipse(gmm.means_[n, :2], v[0], v[1], - 180 + angle, color=color) + v = 2.0 * np.sqrt(2.0) * np.sqrt(v) + ell = mpl.patches.Ellipse( + gmm.means_[n, :2], v[0], v[1], 180 + angle, color=color + ) ell.set_clip_box(ax.bbox) ell.set_alpha(0.5) ax.add_artist(ell) - ax.set_aspect('equal', 'datalim') + ax.set_aspect("equal", "datalim") iris = datasets.load_iris() @@ -84,22 +85,27 @@ def make_ellipses(gmm, ax): n_classes = len(np.unique(y_train)) # Try GMMs using different types of covariances. 
-estimators = {cov_type: GaussianMixture(n_components=n_classes, - covariance_type=cov_type, max_iter=20, random_state=0) - for cov_type in ['spherical', 'diag', 'tied', 'full']} +estimators = { + cov_type: GaussianMixture( + n_components=n_classes, covariance_type=cov_type, max_iter=20, random_state=0 + ) + for cov_type in ["spherical", "diag", "tied", "full"] +} n_estimators = len(estimators) plt.figure(figsize=(3 * n_estimators // 2, 6)) -plt.subplots_adjust(bottom=.01, top=0.95, hspace=.15, wspace=.05, - left=.01, right=.99) +plt.subplots_adjust( + bottom=0.01, top=0.95, hspace=0.15, wspace=0.05, left=0.01, right=0.99 +) for index, (name, estimator) in enumerate(estimators.items()): # Since we have class labels for the training data, we can # initialize the GMM parameters in a supervised manner. - estimator.means_init = np.array([X_train[y_train == i].mean(axis=0) - for i in range(n_classes)]) + estimator.means_init = np.array( + [X_train[y_train == i].mean(axis=0) for i in range(n_classes)] + ) # Train the other parameters using the EM algorithm. 
estimator.fit(X_train) @@ -109,28 +115,27 @@ def make_ellipses(gmm, ax): for n, color in enumerate(colors): data = iris.data[iris.target == n] - plt.scatter(data[:, 0], data[:, 1], s=0.8, color=color, - label=iris.target_names[n]) + plt.scatter( + data[:, 0], data[:, 1], s=0.8, color=color, label=iris.target_names[n] + ) # Plot the test data with crosses for n, color in enumerate(colors): data = X_test[y_test == n] - plt.scatter(data[:, 0], data[:, 1], marker='x', color=color) + plt.scatter(data[:, 0], data[:, 1], marker="x", color=color) y_train_pred = estimator.predict(X_train) train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100 - plt.text(0.05, 0.9, 'Train accuracy: %.1f' % train_accuracy, - transform=h.transAxes) + plt.text(0.05, 0.9, "Train accuracy: %.1f" % train_accuracy, transform=h.transAxes) y_test_pred = estimator.predict(X_test) test_accuracy = np.mean(y_test_pred.ravel() == y_test.ravel()) * 100 - plt.text(0.05, 0.8, 'Test accuracy: %.1f' % test_accuracy, - transform=h.transAxes) + plt.text(0.05, 0.8, "Test accuracy: %.1f" % test_accuracy, transform=h.transAxes) plt.xticks(()) plt.yticks(()) plt.title(name) -plt.legend(scatterpoints=1, loc='lower right', prop=dict(size=12)) +plt.legend(scatterpoints=1, loc="lower right", prop=dict(size=12)) plt.show() diff --git a/examples/mixture/plot_gmm_pdf.py b/examples/mixture/plot_gmm_pdf.py index 4469c36a89625..73c46cd5e7fb1 100644 --- a/examples/mixture/plot_gmm_pdf.py +++ b/examples/mixture/plot_gmm_pdf.py @@ -22,29 +22,30 @@ shifted_gaussian = np.random.randn(n_samples, 2) + np.array([20, 20]) # generate zero centered stretched Gaussian data -C = np.array([[0., -0.7], [3.5, .7]]) +C = np.array([[0.0, -0.7], [3.5, 0.7]]) stretched_gaussian = np.dot(np.random.randn(n_samples, 2), C) # concatenate the two datasets into the final training set X_train = np.vstack([shifted_gaussian, stretched_gaussian]) # fit a Gaussian Mixture Model with two components -clf = 
mixture.GaussianMixture(n_components=2, covariance_type='full') +clf = mixture.GaussianMixture(n_components=2, covariance_type="full") clf.fit(X_train) # display predicted scores by the model as a contour plot -x = np.linspace(-20., 30.) -y = np.linspace(-20., 40.) +x = np.linspace(-20.0, 30.0) +y = np.linspace(-20.0, 40.0) X, Y = np.meshgrid(x, y) XX = np.array([X.ravel(), Y.ravel()]).T Z = -clf.score_samples(XX) Z = Z.reshape(X.shape) -CS = plt.contour(X, Y, Z, norm=LogNorm(vmin=1.0, vmax=1000.0), - levels=np.logspace(0, 3, 10)) -CB = plt.colorbar(CS, shrink=0.8, extend='both') -plt.scatter(X_train[:, 0], X_train[:, 1], .8) +CS = plt.contour( + X, Y, Z, norm=LogNorm(vmin=1.0, vmax=1000.0), levels=np.logspace(0, 3, 10) +) +CB = plt.colorbar(CS, shrink=0.8, extend="both") +plt.scatter(X_train[:, 0], X_train[:, 1], 0.8) -plt.title('Negative log-likelihood predicted by a GMM') -plt.axis('tight') +plt.title("Negative log-likelihood predicted by a GMM") +plt.axis("tight") plt.show() diff --git a/examples/mixture/plot_gmm_selection.py b/examples/mixture/plot_gmm_selection.py index 3340ea93ea965..1d2aebe7b077a 100644 --- a/examples/mixture/plot_gmm_selection.py +++ b/examples/mixture/plot_gmm_selection.py @@ -31,19 +31,22 @@ # Generate random sample, two components np.random.seed(0) -C = np.array([[0., -0.1], [1.7, .4]]) -X = np.r_[np.dot(np.random.randn(n_samples, 2), C), - .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])] +C = np.array([[0.0, -0.1], [1.7, 0.4]]) +X = np.r_[ + np.dot(np.random.randn(n_samples, 2), C), + 0.7 * np.random.randn(n_samples, 2) + np.array([-6, 3]), +] lowest_bic = np.infty bic = [] n_components_range = range(1, 7) -cv_types = ['spherical', 'tied', 'diag', 'full'] +cv_types = ["spherical", "tied", "diag", "full"] for cv_type in cv_types: for n_components in n_components_range: # Fit a Gaussian mixture with EM - gmm = mixture.GaussianMixture(n_components=n_components, - covariance_type=cv_type) + gmm = mixture.GaussianMixture( + 
n_components=n_components, covariance_type=cv_type + ) gmm.fit(X) bic.append(gmm.bic(X)) if bic[-1] < lowest_bic: @@ -51,8 +54,7 @@ best_gmm = gmm bic = np.array(bic) -color_iter = itertools.cycle(['navy', 'turquoise', 'cornflowerblue', - 'darkorange']) +color_iter = itertools.cycle(["navy", "turquoise", "cornflowerblue", "darkorange"]) clf = best_gmm bars = [] @@ -60,41 +62,50 @@ plt.figure(figsize=(8, 6)) spl = plt.subplot(2, 1, 1) for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)): - xpos = np.array(n_components_range) + .2 * (i - 2) - bars.append(plt.bar(xpos, bic[i * len(n_components_range): - (i + 1) * len(n_components_range)], - width=.2, color=color)) + xpos = np.array(n_components_range) + 0.2 * (i - 2) + bars.append( + plt.bar( + xpos, + bic[i * len(n_components_range) : (i + 1) * len(n_components_range)], + width=0.2, + color=color, + ) + ) plt.xticks(n_components_range) -plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()]) -plt.title('BIC score per model') -xpos = np.mod(bic.argmin(), len(n_components_range)) + .65 +\ - .2 * np.floor(bic.argmin() / len(n_components_range)) -plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14) -spl.set_xlabel('Number of components') +plt.ylim([bic.min() * 1.01 - 0.01 * bic.max(), bic.max()]) +plt.title("BIC score per model") +xpos = ( + np.mod(bic.argmin(), len(n_components_range)) + + 0.65 + + 0.2 * np.floor(bic.argmin() / len(n_components_range)) +) +plt.text(xpos, bic.min() * 0.97 + 0.03 * bic.max(), "*", fontsize=14) +spl.set_xlabel("Number of components") spl.legend([b[0] for b in bars], cv_types) # Plot the winner splot = plt.subplot(2, 1, 2) Y_ = clf.predict(X) -for i, (mean, cov, color) in enumerate(zip(clf.means_, clf.covariances_, - color_iter)): +for i, (mean, cov, color) in enumerate(zip(clf.means_, clf.covariances_, color_iter)): v, w = linalg.eigh(cov) if not np.any(Y_ == i): continue - plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color) + plt.scatter(X[Y_ == i, 0], 
X[Y_ == i, 1], 0.8, color=color) # Plot an ellipse to show the Gaussian component angle = np.arctan2(w[0][1], w[0][0]) - angle = 180. * angle / np.pi # convert to degrees - v = 2. * np.sqrt(2.) * np.sqrt(v) - ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color) + angle = 180.0 * angle / np.pi # convert to degrees + v = 2.0 * np.sqrt(2.0) * np.sqrt(v) + ell = mpl.patches.Ellipse(mean, v[0], v[1], 180.0 + angle, color=color) ell.set_clip_box(splot.bbox) - ell.set_alpha(.5) + ell.set_alpha(0.5) splot.add_artist(ell) plt.xticks(()) plt.yticks(()) -plt.title(f'Selected GMM: {best_gmm.covariance_type} model, ' - f'{best_gmm.n_components} components') -plt.subplots_adjust(hspace=.35, bottom=.02) +plt.title( + f"Selected GMM: {best_gmm.covariance_type} model, " + f"{best_gmm.n_components} components" +) +plt.subplots_adjust(hspace=0.35, bottom=0.02) plt.show() diff --git a/examples/mixture/plot_gmm_sin.py b/examples/mixture/plot_gmm_sin.py index 1d436b93d15cc..1b6ac48d436a7 100644 --- a/examples/mixture/plot_gmm_sin.py +++ b/examples/mixture/plot_gmm_sin.py @@ -50,34 +50,32 @@ print(__doc__) -color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold', - 'darkorange']) +color_iter = itertools.cycle(["navy", "c", "cornflowerblue", "gold", "darkorange"]) def plot_results(X, Y, means, covariances, index, title): splot = plt.subplot(5, 1, 1 + index) - for i, (mean, covar, color) in enumerate(zip( - means, covariances, color_iter)): + for i, (mean, covar, color) in enumerate(zip(means, covariances, color_iter)): v, w = linalg.eigh(covar) - v = 2. * np.sqrt(2.) * np.sqrt(v) + v = 2.0 * np.sqrt(2.0) * np.sqrt(v) u = w[0] / linalg.norm(w[0]) # as the DP will not use every component it has access to # unless it needs it, we shouldn't plot the redundant # components. 
if not np.any(Y == i): continue - plt.scatter(X[Y == i, 0], X[Y == i, 1], .8, color=color) + plt.scatter(X[Y == i, 0], X[Y == i, 1], 0.8, color=color) # Plot an ellipse to show the Gaussian component angle = np.arctan(u[1] / u[0]) - angle = 180. * angle / np.pi # convert to degrees - ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color) + angle = 180.0 * angle / np.pi # convert to degrees + ell = mpl.patches.Ellipse(mean, v[0], v[1], 180.0 + angle, color=color) ell.set_clip_box(splot.bbox) ell.set_alpha(0.5) splot.add_artist(ell) - plt.xlim(-6., 4. * np.pi - 6.) - plt.ylim(-5., 5.) + plt.xlim(-6.0, 4.0 * np.pi - 6.0) + plt.ylim(-5.0, 5.0) plt.title(title) plt.xticks(()) plt.yticks(()) @@ -91,10 +89,10 @@ def plot_samples(X, Y, n_components, index, title): # components. if not np.any(Y == i): continue - plt.scatter(X[Y == i, 0], X[Y == i, 1], .8, color=color) + plt.scatter(X[Y == i, 0], X[Y == i, 1], 0.8, color=color) - plt.xlim(-6., 4. * np.pi - 6.) - plt.ylim(-5., 5.) + plt.xlim(-6.0, 4.0 * np.pi - 6.0) + plt.ylim(-5.0, 5.0) plt.title(title) plt.xticks(()) plt.yticks(()) @@ -106,49 +104,86 @@ def plot_samples(X, Y, n_components, index, title): # Generate random sample following a sine curve np.random.seed(0) X = np.zeros((n_samples, 2)) -step = 4. * np.pi / n_samples +step = 4.0 * np.pi / n_samples for i in range(X.shape[0]): - x = i * step - 6. + x = i * step - 6.0 X[i, 0] = x + np.random.normal(0, 0.1) - X[i, 1] = 3. 
* (np.sin(x) + np.random.normal(0, .2)) + X[i, 1] = 3.0 * (np.sin(x) + np.random.normal(0, 0.2)) plt.figure(figsize=(10, 10)) -plt.subplots_adjust(bottom=.04, top=0.95, hspace=.2, wspace=.05, - left=.03, right=.97) +plt.subplots_adjust( + bottom=0.04, top=0.95, hspace=0.2, wspace=0.05, left=0.03, right=0.97 +) # Fit a Gaussian mixture with EM using ten components -gmm = mixture.GaussianMixture(n_components=10, covariance_type='full', - max_iter=100).fit(X) -plot_results(X, gmm.predict(X), gmm.means_, gmm.covariances_, 0, - 'Expectation-maximization') +gmm = mixture.GaussianMixture( + n_components=10, covariance_type="full", max_iter=100 +).fit(X) +plot_results( + X, gmm.predict(X), gmm.means_, gmm.covariances_, 0, "Expectation-maximization" +) dpgmm = mixture.BayesianGaussianMixture( - n_components=10, covariance_type='full', weight_concentration_prior=1e-2, - weight_concentration_prior_type='dirichlet_process', - mean_precision_prior=1e-2, covariance_prior=1e0 * np.eye(2), - init_params="random", max_iter=100, random_state=2).fit(X) -plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 1, - "Bayesian Gaussian mixture models with a Dirichlet process prior " - r"for $\gamma_0=0.01$.") + n_components=10, + covariance_type="full", + weight_concentration_prior=1e-2, + weight_concentration_prior_type="dirichlet_process", + mean_precision_prior=1e-2, + covariance_prior=1e0 * np.eye(2), + init_params="random", + max_iter=100, + random_state=2, +).fit(X) +plot_results( + X, + dpgmm.predict(X), + dpgmm.means_, + dpgmm.covariances_, + 1, + "Bayesian Gaussian mixture models with a Dirichlet process prior " + r"for $\gamma_0=0.01$.", +) X_s, y_s = dpgmm.sample(n_samples=2000) -plot_samples(X_s, y_s, dpgmm.n_components, 0, - "Gaussian mixture with a Dirichlet process prior " - r"for $\gamma_0=0.01$ sampled with $2000$ samples.") +plot_samples( + X_s, + y_s, + dpgmm.n_components, + 0, + "Gaussian mixture with a Dirichlet process prior " + r"for $\gamma_0=0.01$ 
sampled with $2000$ samples.", +) dpgmm = mixture.BayesianGaussianMixture( - n_components=10, covariance_type='full', weight_concentration_prior=1e+2, - weight_concentration_prior_type='dirichlet_process', - mean_precision_prior=1e-2, covariance_prior=1e0 * np.eye(2), - init_params="kmeans", max_iter=100, random_state=2).fit(X) -plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 2, - "Bayesian Gaussian mixture models with a Dirichlet process prior " - r"for $\gamma_0=100$") + n_components=10, + covariance_type="full", + weight_concentration_prior=1e2, + weight_concentration_prior_type="dirichlet_process", + mean_precision_prior=1e-2, + covariance_prior=1e0 * np.eye(2), + init_params="kmeans", + max_iter=100, + random_state=2, +).fit(X) +plot_results( + X, + dpgmm.predict(X), + dpgmm.means_, + dpgmm.covariances_, + 2, + "Bayesian Gaussian mixture models with a Dirichlet process prior " + r"for $\gamma_0=100$", +) X_s, y_s = dpgmm.sample(n_samples=2000) -plot_samples(X_s, y_s, dpgmm.n_components, 1, - "Gaussian mixture with a Dirichlet process prior " - r"for $\gamma_0=100$ sampled with $2000$ samples.") +plot_samples( + X_s, + y_s, + dpgmm.n_components, + 1, + "Gaussian mixture with a Dirichlet process prior " + r"for $\gamma_0=100$ sampled with $2000$ samples.", +) plt.show() diff --git a/examples/model_selection/grid_search_text_feature_extraction.py b/examples/model_selection/grid_search_text_feature_extraction.py index a1bd12581768c..a28593eb90866 100644 --- a/examples/model_selection/grid_search_text_feature_extraction.py +++ b/examples/model_selection/grid_search_text_feature_extraction.py @@ -1,4 +1,3 @@ - """ ========================================================== Sample pipeline for text feature extraction and evaluation @@ -60,15 +59,14 @@ print(__doc__) # Display progress logs on stdout -logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') +logging.basicConfig(level=logging.INFO, format="%(asctime)s 
%(levelname)s %(message)s") # ############################################################################# # Load some categories from the training set categories = [ - 'alt.atheism', - 'talk.religion.misc', + "alt.atheism", + "talk.religion.misc", ] # Uncomment the following to do the analysis on all the categories # categories = None @@ -76,7 +74,7 @@ print("Loading 20 newsgroups dataset for categories:") print(categories) -data = fetch_20newsgroups(subset='train', categories=categories) +data = fetch_20newsgroups(subset="train", categories=categories) print("%d documents" % len(data.filenames)) print("%d categories" % len(data.target_names)) print() @@ -84,23 +82,25 @@ # ############################################################################# # Define a pipeline combining a text feature extractor with a simple # classifier -pipeline = Pipeline([ - ('vect', CountVectorizer()), - ('tfidf', TfidfTransformer()), - ('clf', SGDClassifier()), -]) +pipeline = Pipeline( + [ + ("vect", CountVectorizer()), + ("tfidf", TfidfTransformer()), + ("clf", SGDClassifier()), + ] +) # uncommenting more parameters will give better exploring power but will # increase processing time in a combinatorial way parameters = { - 'vect__max_df': (0.5, 0.75, 1.0), + "vect__max_df": (0.5, 0.75, 1.0), # 'vect__max_features': (None, 5000, 10000, 50000), - 'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams + "vect__ngram_range": ((1, 1), (1, 2)), # unigrams or bigrams # 'tfidf__use_idf': (True, False), # 'tfidf__norm': ('l1', 'l2'), - 'clf__max_iter': (20,), - 'clf__alpha': (0.00001, 0.000001), - 'clf__penalty': ('l2', 'elasticnet'), + "clf__max_iter": (20,), + "clf__alpha": (0.00001, 0.000001), + "clf__penalty": ("l2", "elasticnet"), # 'clf__max_iter': (10, 50, 80), } diff --git a/examples/model_selection/plot_confusion_matrix.py b/examples/model_selection/plot_confusion_matrix.py index d54d9747a8cf3..251a4e175eb90 100644 --- a/examples/model_selection/plot_confusion_matrix.py +++ 
b/examples/model_selection/plot_confusion_matrix.py @@ -44,17 +44,23 @@ # Run classifier, using a model that is too regularized (C too low) to see # the impact on the results -classifier = svm.SVC(kernel='linear', C=0.01).fit(X_train, y_train) +classifier = svm.SVC(kernel="linear", C=0.01).fit(X_train, y_train) np.set_printoptions(precision=2) # Plot non-normalized confusion matrix -titles_options = [("Confusion matrix, without normalization", None), - ("Normalized confusion matrix", 'true')] +titles_options = [ + ("Confusion matrix, without normalization", None), + ("Normalized confusion matrix", "true"), +] for title, normalize in titles_options: disp = ConfusionMatrixDisplay.from_estimator( - classifier, X_test, y_test, display_labels=class_names, - cmap=plt.cm.Blues, normalize=normalize + classifier, + X_test, + y_test, + display_labels=class_names, + cmap=plt.cm.Blues, + normalize=normalize, ) disp.ax_.set_title(title) diff --git a/examples/model_selection/plot_cv_indices.py b/examples/model_selection/plot_cv_indices.py index f07fa1595e860..24deba92f11e5 100644 --- a/examples/model_selection/plot_cv_indices.py +++ b/examples/model_selection/plot_cv_indices.py @@ -11,13 +11,20 @@ for comparison. 
""" -from sklearn.model_selection import (TimeSeriesSplit, KFold, ShuffleSplit, - StratifiedKFold, GroupShuffleSplit, - GroupKFold, StratifiedShuffleSplit, - StratifiedGroupKFold) +from sklearn.model_selection import ( + TimeSeriesSplit, + KFold, + ShuffleSplit, + StratifiedKFold, + GroupShuffleSplit, + GroupKFold, + StratifiedShuffleSplit, + StratifiedGroupKFold, +) import numpy as np import matplotlib.pyplot as plt from matplotlib.patches import Patch + np.random.seed(1338) cmap_data = plt.cm.Paired cmap_cv = plt.cm.coolwarm @@ -41,9 +48,8 @@ n_points = 100 X = np.random.randn(100, 10) -percentiles_classes = [.1, .3, .6] -y = np.hstack([[ii] * int(100 * perc) - for ii, perc in enumerate(percentiles_classes)]) +percentiles_classes = [0.1, 0.3, 0.6] +y = np.hstack([[ii] * int(100 * perc) for ii, perc in enumerate(percentiles_classes)]) # Evenly spaced groups repeated once groups = np.hstack([[ii] * 10 for ii in range(10)]) @@ -52,15 +58,31 @@ def visualize_groups(classes, groups, name): # Visualize dataset groups fig, ax = plt.subplots() - ax.scatter(range(len(groups)), [.5] * len(groups), c=groups, marker='_', - lw=50, cmap=cmap_data) - ax.scatter(range(len(groups)), [3.5] * len(groups), c=classes, marker='_', - lw=50, cmap=cmap_data) - ax.set(ylim=[-1, 5], yticks=[.5, 3.5], - yticklabels=['Data\ngroup', 'Data\nclass'], xlabel="Sample index") - - -visualize_groups(y, groups, 'no groups') + ax.scatter( + range(len(groups)), + [0.5] * len(groups), + c=groups, + marker="_", + lw=50, + cmap=cmap_data, + ) + ax.scatter( + range(len(groups)), + [3.5] * len(groups), + c=classes, + marker="_", + lw=50, + cmap=cmap_data, + ) + ax.set( + ylim=[-1, 5], + yticks=[0.5, 3.5], + yticklabels=["Data\ngroup", "Data\nclass"], + xlabel="Sample index", + ) + + +visualize_groups(y, groups, "no groups") # %% # Define a function to visualize cross-validation behavior @@ -83,23 +105,37 @@ def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10): indices[tr] = 0 # Visualize the results - 
ax.scatter(range(len(indices)), [ii + .5] * len(indices), - c=indices, marker='_', lw=lw, cmap=cmap_cv, - vmin=-.2, vmax=1.2) + ax.scatter( + range(len(indices)), + [ii + 0.5] * len(indices), + c=indices, + marker="_", + lw=lw, + cmap=cmap_cv, + vmin=-0.2, + vmax=1.2, + ) # Plot the data classes and groups at the end - ax.scatter(range(len(X)), [ii + 1.5] * len(X), - c=y, marker='_', lw=lw, cmap=cmap_data) + ax.scatter( + range(len(X)), [ii + 1.5] * len(X), c=y, marker="_", lw=lw, cmap=cmap_data + ) - ax.scatter(range(len(X)), [ii + 2.5] * len(X), - c=group, marker='_', lw=lw, cmap=cmap_data) + ax.scatter( + range(len(X)), [ii + 2.5] * len(X), c=group, marker="_", lw=lw, cmap=cmap_data + ) # Formatting - yticklabels = list(range(n_splits)) + ['class', 'group'] - ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels, - xlabel='Sample index', ylabel="CV iteration", - ylim=[n_splits+2.2, -.2], xlim=[0, 100]) - ax.set_title('{}'.format(type(cv).__name__), fontsize=15) + yticklabels = list(range(n_splits)) + ["class", "group"] + ax.set( + yticks=np.arange(n_splits + 2) + 0.5, + yticklabels=yticklabels, + xlabel="Sample index", + ylabel="CV iteration", + ylim=[n_splits + 2.2, -0.2], + xlim=[0, 100], + ) + ax.set_title("{}".format(type(cv).__name__), fontsize=15) return ax @@ -132,11 +168,14 @@ def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10): for cv in cvs: fig, ax = plt.subplots(figsize=(6, 3)) plot_cv_indices(cv(n_splits), X, y, uneven_groups, ax, n_splits) - ax.legend([Patch(color=cmap_cv(.8)), Patch(color=cmap_cv(.02))], - ['Testing set', 'Training set'], loc=(1.02, .8)) + ax.legend( + [Patch(color=cmap_cv(0.8)), Patch(color=cmap_cv(0.02))], + ["Testing set", "Training set"], + loc=(1.02, 0.8), + ) # Make the legend fit plt.tight_layout() - fig.subplots_adjust(right=.7) + fig.subplots_adjust(right=0.7) # %% # Next we'll visualize this behavior for a number of CV iterators. 
@@ -150,8 +189,16 @@ def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10): # # Note how some use the group/class information while others do not. -cvs = [KFold, GroupKFold, ShuffleSplit, StratifiedKFold, StratifiedGroupKFold, - GroupShuffleSplit, StratifiedShuffleSplit, TimeSeriesSplit] +cvs = [ + KFold, + GroupKFold, + ShuffleSplit, + StratifiedKFold, + StratifiedGroupKFold, + GroupShuffleSplit, + StratifiedShuffleSplit, + TimeSeriesSplit, +] for cv in cvs: @@ -159,9 +206,12 @@ def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10): fig, ax = plt.subplots(figsize=(6, 3)) plot_cv_indices(this_cv, X, y, groups, ax, n_splits) - ax.legend([Patch(color=cmap_cv(.8)), Patch(color=cmap_cv(.02))], - ['Testing set', 'Training set'], loc=(1.02, .8)) + ax.legend( + [Patch(color=cmap_cv(0.8)), Patch(color=cmap_cv(0.02))], + ["Testing set", "Training set"], + loc=(1.02, 0.8), + ) # Make the legend fit plt.tight_layout() - fig.subplots_adjust(right=.7) + fig.subplots_adjust(right=0.7) plt.show() diff --git a/examples/model_selection/plot_cv_predict.py b/examples/model_selection/plot_cv_predict.py index ee3e82f42cba1..8d31da4ff2fec 100644 --- a/examples/model_selection/plot_cv_predict.py +++ b/examples/model_selection/plot_cv_predict.py @@ -22,7 +22,7 @@ fig, ax = plt.subplots() ax.scatter(y, predicted, edgecolors=(0, 0, 0)) -ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4) -ax.set_xlabel('Measured') -ax.set_ylabel('Predicted') +ax.plot([y.min(), y.max()], [y.min(), y.max()], "k--", lw=4) +ax.set_xlabel("Measured") +ax.set_ylabel("Predicted") plt.show() diff --git a/examples/model_selection/plot_grid_search_digits.py b/examples/model_selection/plot_grid_search_digits.py index 498b00082b7c1..b6100489d4a53 100644 --- a/examples/model_selection/plot_grid_search_digits.py +++ b/examples/model_selection/plot_grid_search_digits.py @@ -33,23 +33,21 @@ y = digits.target # Split the dataset in two equal parts -X_train, X_test, y_train, y_test = train_test_split( - X, 
y, test_size=0.5, random_state=0) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0) # Set the parameters by cross-validation -tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], - 'C': [1, 10, 100, 1000]}, - {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}] +tuned_parameters = [ + {"kernel": ["rbf"], "gamma": [1e-3, 1e-4], "C": [1, 10, 100, 1000]}, + {"kernel": ["linear"], "C": [1, 10, 100, 1000]}, +] -scores = ['precision', 'recall'] +scores = ["precision", "recall"] for score in scores: print("# Tuning hyper-parameters for %s" % score) print() - clf = GridSearchCV( - SVC(), tuned_parameters, scoring='%s_macro' % score - ) + clf = GridSearchCV(SVC(), tuned_parameters, scoring="%s_macro" % score) clf.fit(X_train, y_train) print("Best parameters set found on development set:") @@ -58,11 +56,10 @@ print() print("Grid scores on development set:") print() - means = clf.cv_results_['mean_test_score'] - stds = clf.cv_results_['std_test_score'] - for mean, std, params in zip(means, stds, clf.cv_results_['params']): - print("%0.3f (+/-%0.03f) for %r" - % (mean, std * 2, params)) + means = clf.cv_results_["mean_test_score"] + stds = clf.cv_results_["std_test_score"] + for mean, std, params in zip(means, stds, clf.cv_results_["params"]): + print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)) print() print("Detailed classification report:") diff --git a/examples/model_selection/plot_grid_search_refit_callable.py b/examples/model_selection/plot_grid_search_refit_callable.py index b2e0cf9d8dcc7..3d363d6bb7384 100644 --- a/examples/model_selection/plot_grid_search_refit_callable.py +++ b/examples/model_selection/plot_grid_search_refit_callable.py @@ -46,10 +46,12 @@ def lower_bound(cv_results): Lower bound within 1 standard deviation of the best `mean_test_score`. 
""" - best_score_idx = np.argmax(cv_results['mean_test_score']) + best_score_idx = np.argmax(cv_results["mean_test_score"]) - return (cv_results['mean_test_score'][best_score_idx] - - cv_results['std_test_score'][best_score_idx]) + return ( + cv_results["mean_test_score"][best_score_idx] + - cv_results["std_test_score"][best_score_idx] + ) def best_low_complexity(cv_results): @@ -69,48 +71,56 @@ def best_low_complexity(cv_results): `mean_test_score`. """ threshold = lower_bound(cv_results) - candidate_idx = np.flatnonzero(cv_results['mean_test_score'] >= threshold) - best_idx = candidate_idx[cv_results['param_reduce_dim__n_components'] - [candidate_idx].argmin()] + candidate_idx = np.flatnonzero(cv_results["mean_test_score"] >= threshold) + best_idx = candidate_idx[ + cv_results["param_reduce_dim__n_components"][candidate_idx].argmin() + ] return best_idx -pipe = Pipeline([ - ('reduce_dim', PCA(random_state=42)), - ('classify', LinearSVC(random_state=42, C=0.01)), -]) +pipe = Pipeline( + [ + ("reduce_dim", PCA(random_state=42)), + ("classify", LinearSVC(random_state=42, C=0.01)), + ] +) -param_grid = { - 'reduce_dim__n_components': [6, 8, 10, 12, 14] -} +param_grid = {"reduce_dim__n_components": [6, 8, 10, 12, 14]} -grid = GridSearchCV(pipe, cv=10, n_jobs=1, param_grid=param_grid, - scoring='accuracy', refit=best_low_complexity) +grid = GridSearchCV( + pipe, + cv=10, + n_jobs=1, + param_grid=param_grid, + scoring="accuracy", + refit=best_low_complexity, +) X, y = load_digits(return_X_y=True) grid.fit(X, y) -n_components = grid.cv_results_['param_reduce_dim__n_components'] -test_scores = grid.cv_results_['mean_test_score'] +n_components = grid.cv_results_["param_reduce_dim__n_components"] +test_scores = grid.cv_results_["mean_test_score"] plt.figure() -plt.bar(n_components, test_scores, width=1.3, color='b') +plt.bar(n_components, test_scores, width=1.3, color="b") lower = lower_bound(grid.cv_results_) -plt.axhline(np.max(test_scores), linestyle='--', color='y', - 
label='Best score') -plt.axhline(lower, linestyle='--', color='.5', label='Best score - 1 std') +plt.axhline(np.max(test_scores), linestyle="--", color="y", label="Best score") +plt.axhline(lower, linestyle="--", color=".5", label="Best score - 1 std") plt.title("Balance model complexity and cross-validated score") -plt.xlabel('Number of PCA components used') -plt.ylabel('Digit classification accuracy') +plt.xlabel("Number of PCA components used") +plt.ylabel("Digit classification accuracy") plt.xticks(n_components.tolist()) plt.ylim((0, 1.0)) -plt.legend(loc='upper left') +plt.legend(loc="upper left") best_index_ = grid.best_index_ print("The best_index_ is %d" % best_index_) print("The n_components selected is %d" % n_components[best_index_]) -print("The corresponding accuracy score is %.2f" - % grid.cv_results_['mean_test_score'][best_index_]) +print( + "The corresponding accuracy score is %.2f" + % grid.cv_results_["mean_test_score"][best_index_] +) plt.show() diff --git a/examples/model_selection/plot_grid_search_stats.py b/examples/model_selection/plot_grid_search_stats.py index 1b434ec0e8b6e..70dd204390491 100644 --- a/examples/model_selection/plot_grid_search_stats.py +++ b/examples/model_selection/plot_grid_search_stats.py @@ -21,8 +21,7 @@ X, y = make_moons(noise=0.352, random_state=1, n_samples=100) sns.scatterplot( - x=X[:, 0], y=X[:, 1], hue=y, - marker='o', s=25, edgecolor='k', legend=False + x=X[:, 0], y=X[:, 1], hue=y, marker="o", s=25, edgecolor="k", legend=False ).set_title("Data") plt.show() @@ -40,21 +39,16 @@ from sklearn.svm import SVC param_grid = [ - {'kernel': ['linear']}, - {'kernel': ['poly'], 'degree': [2, 3]}, - {'kernel': ['rbf']} + {"kernel": ["linear"]}, + {"kernel": ["poly"], "degree": [2, 3]}, + {"kernel": ["rbf"]}, ] svc = SVC(random_state=0) -cv = RepeatedStratifiedKFold( - n_splits=10, n_repeats=10, random_state=0 -) +cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0) -search = GridSearchCV( - estimator=svc, 
param_grid=param_grid, - scoring='roc_auc', cv=cv -) +search = GridSearchCV(estimator=svc, param_grid=param_grid, scoring="roc_auc", cv=cv) search.fit(X, y) # %% @@ -64,17 +58,11 @@ import pandas as pd results_df = pd.DataFrame(search.cv_results_) -results_df = results_df.sort_values(by=['rank_test_score']) -results_df = ( - results_df - .set_index(results_df["params"].apply( - lambda x: "_".join(str(val) for val in x.values())) - ) - .rename_axis('kernel') -) -results_df[ - ['params', 'rank_test_score', 'mean_test_score', 'std_test_score'] -] +results_df = results_df.sort_values(by=["rank_test_score"]) +results_df = results_df.set_index( + results_df["params"].apply(lambda x: "_".join(str(val) for val in x.values())) +).rename_axis("kernel") +results_df[["params", "rank_test_score", "mean_test_score", "std_test_score"]] # %% # We can see that the estimator using the `'rbf'` kernel performed best, @@ -102,13 +90,17 @@ # in each fold, and calculating the correlation between models across folds: # create df of model scores ordered by performance -model_scores = results_df.filter(regex=r'split\d*_test_score') +model_scores = results_df.filter(regex=r"split\d*_test_score") # plot 30 examples of dependency between cv fold and AUC scores fig, ax = plt.subplots() sns.lineplot( data=model_scores.transpose().iloc[:30], - dashes=False, palette='Set1', marker='o', alpha=.5, ax=ax + dashes=False, + palette="Set1", + marker="o", + alpha=0.5, + ax=ax, ) ax.set_xlabel("CV test fold", size=12, labelpad=10) ax.set_ylabel("Model AUC", size=12) @@ -193,9 +185,7 @@ def corrected_std(differences, n_train, n_test): # kr = k times r, r times repeated k-fold crossvalidation, # kr equals the number of times the model was evaluated kr = len(differences) - corrected_var = ( - np.var(differences, ddof=1) * (1 / kr + n_test / n_train) - ) + corrected_var = np.var(differences, ddof=1) * (1 / kr + n_test / n_train) corrected_std = np.sqrt(corrected_var) return corrected_std @@ -240,19 +230,18 @@ 
def compute_corrected_ttest(differences, df, n_train, n_test): n_test = len(list(cv.split(X, y))[0][1]) t_stat, p_val = compute_corrected_ttest(differences, df, n_train, n_test) -print(f"Corrected t-value: {t_stat:.3f}\n" - f"Corrected p-value: {p_val:.3f}") +print(f"Corrected t-value: {t_stat:.3f}\nCorrected p-value: {p_val:.3f}") # %% # We can compare the corrected t- and p-values with the uncorrected ones: -t_stat_uncorrected = ( - np.mean(differences) / np.sqrt(np.var(differences, ddof=1) / n) -) +t_stat_uncorrected = np.mean(differences) / np.sqrt(np.var(differences, ddof=1) / n) p_val_uncorrected = t.sf(np.abs(t_stat_uncorrected), df) -print(f"Uncorrected t-value: {t_stat_uncorrected:.3f}\n" - f"Uncorrected p-value: {p_val_uncorrected:.3f}") +print( + f"Uncorrected t-value: {t_stat_uncorrected:.3f}\n" + f"Uncorrected p-value: {p_val_uncorrected:.3f}" +) # %% # Using the conventional significance alpha level at `p=0.05`, we observe that @@ -310,8 +299,7 @@ def compute_corrected_ttest(differences, df, n_train, n_test): # initialize random variable t_post = t( - df, loc=np.mean(differences), - scale=corrected_std(differences, n_train, n_test) + df, loc=np.mean(differences), scale=corrected_std(differences, n_train, n_test) ) # %% @@ -321,7 +309,7 @@ def compute_corrected_ttest(differences, df, n_train, n_test): plt.plot(x, t_post.pdf(x)) plt.xticks(np.arange(-0.04, 0.06, 0.01)) -plt.fill_between(x, t_post.pdf(x), 0, facecolor='blue', alpha=.2) +plt.fill_between(x, t_post.pdf(x), 0, facecolor="blue", alpha=0.2) plt.ylabel("Probability density") plt.xlabel(r"Mean difference ($\mu$)") plt.title("Posterior distribution") @@ -336,10 +324,14 @@ def compute_corrected_ttest(differences, df, n_train, n_test): better_prob = 1 - t_post.cdf(0) -print(f"Probability of {model_scores.index[0]} being more accurate than " - f"{model_scores.index[1]}: {better_prob:.3f}") -print(f"Probability of {model_scores.index[1]} being more accurate than " - f"{model_scores.index[0]}: {1 - 
better_prob:.3f}") +print( + f"Probability of {model_scores.index[0]} being more accurate than " + f"{model_scores.index[1]}: {better_prob:.3f}" +) +print( + f"Probability of {model_scores.index[1]} being more accurate than " + f"{model_scores.index[0]}: {1 - better_prob:.3f}" +) # %% # In contrast with the frequentist approach, we can compute the probability @@ -373,8 +365,10 @@ def compute_corrected_ttest(differences, df, n_train, n_test): rope_interval = [-0.01, 0.01] rope_prob = t_post.cdf(rope_interval[1]) - t_post.cdf(rope_interval[0]) -print(f"Probability of {model_scores.index[0]} and {model_scores.index[1]} " - f"being practically equivalent: {rope_prob:.3f}") +print( + f"Probability of {model_scores.index[0]} and {model_scores.index[1]} " + f"being practically equivalent: {rope_prob:.3f}" +) # %% # We can plot how the posterior is distributed over the ROPE interval: @@ -384,7 +378,7 @@ def compute_corrected_ttest(differences, df, n_train, n_test): plt.plot(x, t_post.pdf(x)) plt.xticks(np.arange(-0.04, 0.06, 0.01)) plt.vlines([-0.01, 0.01], ymin=0, ymax=(np.max(t_post.pdf(x)) + 1)) -plt.fill_between(x_rope, t_post.pdf(x_rope), 0, facecolor='blue', alpha=.2) +plt.fill_between(x_rope, t_post.pdf(x_rope), 0, facecolor="blue", alpha=0.2) plt.ylabel("Probability density") plt.xlabel(r"Mean difference ($\mu$)") plt.title("Posterior distribution under the ROPE") @@ -416,9 +410,8 @@ def compute_corrected_ttest(differences, df, n_train, n_test): cred_intervals.append([interval, cred_interval[0], cred_interval[1]]) cred_int_df = pd.DataFrame( - cred_intervals, - columns=['interval', 'lower value', 'upper value'] -).set_index('interval') + cred_intervals, columns=["interval", "lower value", "upper value"] +).set_index("interval") cred_int_df # %% @@ -448,9 +441,8 @@ def compute_corrected_ttest(differences, df, n_train, n_test): from itertools import combinations from math import factorial -n_comparisons = ( - factorial(len(model_scores)) - / (factorial(2) * 
factorial(len(model_scores) - 2)) +n_comparisons = factorial(len(model_scores)) / ( + factorial(2) * factorial(len(model_scores) - 2) ) pairwise_t_test = [] @@ -458,20 +450,16 @@ def compute_corrected_ttest(differences, df, n_train, n_test): model_i_scores = model_scores.iloc[model_i].values model_k_scores = model_scores.iloc[model_k].values differences = model_i_scores - model_k_scores - t_stat, p_val = compute_corrected_ttest( - differences, df, n_train, n_test - ) + t_stat, p_val = compute_corrected_ttest(differences, df, n_train, n_test) p_val *= n_comparisons # implement Bonferroni correction # Bonferroni can output p-values higher than 1 p_val = 1 if p_val > 1 else p_val pairwise_t_test.append( - [model_scores.index[model_i], model_scores.index[model_k], - t_stat, p_val] + [model_scores.index[model_i], model_scores.index[model_k], t_stat, p_val] ) pairwise_comp_df = pd.DataFrame( - pairwise_t_test, - columns=['model_1', 'model_2', 't_stat', 'p_val'] + pairwise_t_test, columns=["model_1", "model_2", "t_stat", "p_val"] ).round(3) pairwise_comp_df @@ -499,8 +487,7 @@ def compute_corrected_ttest(differences, df, n_train, n_test): model_k_scores = model_scores.iloc[model_k].values differences = model_i_scores - model_k_scores t_post = t( - df, loc=np.mean(differences), - scale=corrected_std(differences, n_train, n_test) + df, loc=np.mean(differences), scale=corrected_std(differences, n_train, n_test) ) worse_prob = t_post.cdf(rope_interval[0]) better_prob = 1 - t_post.cdf(rope_interval[1]) @@ -508,10 +495,9 @@ def compute_corrected_ttest(differences, df, n_train, n_test): pairwise_bayesian.append([worse_prob, better_prob, rope_prob]) -pairwise_bayesian_df = (pd.DataFrame( - pairwise_bayesian, - columns=['worse_prob', 'better_prob', 'rope_prob'] -).round(3)) +pairwise_bayesian_df = pd.DataFrame( + pairwise_bayesian, columns=["worse_prob", "better_prob", "rope_prob"] +).round(3) pairwise_comp_df = pairwise_comp_df.join(pairwise_bayesian_df) pairwise_comp_df diff 
--git a/examples/model_selection/plot_learning_curve.py b/examples/model_selection/plot_learning_curve.py index 71cc565c3528c..46b11cad2e7e4 100644 --- a/examples/model_selection/plot_learning_curve.py +++ b/examples/model_selection/plot_learning_curve.py @@ -26,8 +26,17 @@ from sklearn.model_selection import ShuffleSplit -def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None, - n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)): +def plot_learning_curve( + estimator, + title, + X, + y, + axes=None, + ylim=None, + cv=None, + n_jobs=None, + train_sizes=np.linspace(0.1, 1.0, 5), +): """ Generate 3 plots: the test and training learning curve, the training samples vs fit times curve, the fit times vs score curve. @@ -96,10 +105,15 @@ def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None, axes[0].set_xlabel("Training examples") axes[0].set_ylabel("Score") - train_sizes, train_scores, test_scores, fit_times, _ = \ - learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, - train_sizes=train_sizes, - return_times=True) + train_sizes, train_scores, test_scores, fit_times, _ = learning_curve( + estimator, + X, + y, + cv=cv, + n_jobs=n_jobs, + train_sizes=train_sizes, + return_times=True, + ) train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) @@ -109,32 +123,50 @@ def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None, # Plot learning curve axes[0].grid() - axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std, - train_scores_mean + train_scores_std, alpha=0.1, - color="r") - axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std, - test_scores_mean + test_scores_std, alpha=0.1, - color="g") - axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r", - label="Training score") - axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g", - label="Cross-validation score") + 
axes[0].fill_between( + train_sizes, + train_scores_mean - train_scores_std, + train_scores_mean + train_scores_std, + alpha=0.1, + color="r", + ) + axes[0].fill_between( + train_sizes, + test_scores_mean - test_scores_std, + test_scores_mean + test_scores_std, + alpha=0.1, + color="g", + ) + axes[0].plot( + train_sizes, train_scores_mean, "o-", color="r", label="Training score" + ) + axes[0].plot( + train_sizes, test_scores_mean, "o-", color="g", label="Cross-validation score" + ) axes[0].legend(loc="best") # Plot n_samples vs fit_times axes[1].grid() - axes[1].plot(train_sizes, fit_times_mean, 'o-') - axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std, - fit_times_mean + fit_times_std, alpha=0.1) + axes[1].plot(train_sizes, fit_times_mean, "o-") + axes[1].fill_between( + train_sizes, + fit_times_mean - fit_times_std, + fit_times_mean + fit_times_std, + alpha=0.1, + ) axes[1].set_xlabel("Training examples") axes[1].set_ylabel("fit_times") axes[1].set_title("Scalability of the model") # Plot fit_time vs score axes[2].grid() - axes[2].plot(fit_times_mean, test_scores_mean, 'o-') - axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std, - test_scores_mean + test_scores_std, alpha=0.1) + axes[2].plot(fit_times_mean, test_scores_mean, "o-") + axes[2].fill_between( + fit_times_mean, + test_scores_mean - test_scores_std, + test_scores_mean + test_scores_std, + alpha=0.1, + ) axes[2].set_xlabel("fit_times") axes[2].set_ylabel("Score") axes[2].set_title("Performance of the model") @@ -152,14 +184,16 @@ def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None, cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0) estimator = GaussianNB() -plot_learning_curve(estimator, title, X, y, axes=axes[:, 0], ylim=(0.7, 1.01), - cv=cv, n_jobs=4) +plot_learning_curve( + estimator, title, X, y, axes=axes[:, 0], ylim=(0.7, 1.01), cv=cv, n_jobs=4 +) title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)" # SVC is more 
expensive so we do a lower number of CV iterations: cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0) estimator = SVC(gamma=0.001) -plot_learning_curve(estimator, title, X, y, axes=axes[:, 1], ylim=(0.7, 1.01), - cv=cv, n_jobs=4) +plot_learning_curve( + estimator, title, X, y, axes=axes[:, 1], ylim=(0.7, 1.01), cv=cv, n_jobs=4 +) plt.show() diff --git a/examples/model_selection/plot_multi_metric_evaluation.py b/examples/model_selection/plot_multi_metric_evaluation.py index 4f03f1b19462d..7d6ce84eb5f26 100644 --- a/examples/model_selection/plot_multi_metric_evaluation.py +++ b/examples/model_selection/plot_multi_metric_evaluation.py @@ -38,16 +38,20 @@ # The scorers can be either one of the predefined metric strings or a scorer # callable, like the one returned by make_scorer -scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)} +scoring = {"AUC": "roc_auc", "Accuracy": make_scorer(accuracy_score)} # Setting refit='AUC', refits an estimator on the whole dataset with the # parameter setting that has the best cross-validated AUC score. 
# That estimator is made available at ``gs.best_estimator_`` along with # parameters like ``gs.best_score_``, ``gs.best_params_`` and # ``gs.best_index_`` -gs = GridSearchCV(DecisionTreeClassifier(random_state=42), - param_grid={'min_samples_split': range(2, 403, 10)}, - scoring=scoring, refit='AUC', return_train_score=True) +gs = GridSearchCV( + DecisionTreeClassifier(random_state=42), + param_grid={"min_samples_split": range(2, 403, 10)}, + scoring=scoring, + refit="AUC", + return_train_score=True, +) gs.fit(X, y) results = gs.cv_results_ @@ -56,8 +60,7 @@ # ------------------- plt.figure(figsize=(13, 13)) -plt.title("GridSearchCV evaluating using multiple scorers simultaneously", - fontsize=16) +plt.title("GridSearchCV evaluating using multiple scorers simultaneously", fontsize=16) plt.xlabel("min_samples_split") plt.ylabel("Score") @@ -67,29 +70,47 @@ ax.set_ylim(0.73, 1) # Get the regular numpy array from the MaskedArray -X_axis = np.array(results['param_min_samples_split'].data, dtype=float) - -for scorer, color in zip(sorted(scoring), ['g', 'k']): - for sample, style in (('train', '--'), ('test', '-')): - sample_score_mean = results['mean_%s_%s' % (sample, scorer)] - sample_score_std = results['std_%s_%s' % (sample, scorer)] - ax.fill_between(X_axis, sample_score_mean - sample_score_std, - sample_score_mean + sample_score_std, - alpha=0.1 if sample == 'test' else 0, color=color) - ax.plot(X_axis, sample_score_mean, style, color=color, - alpha=1 if sample == 'test' else 0.7, - label="%s (%s)" % (scorer, sample)) - - best_index = np.nonzero(results['rank_test_%s' % scorer] == 1)[0][0] - best_score = results['mean_test_%s' % scorer][best_index] +X_axis = np.array(results["param_min_samples_split"].data, dtype=float) + +for scorer, color in zip(sorted(scoring), ["g", "k"]): + for sample, style in (("train", "--"), ("test", "-")): + sample_score_mean = results["mean_%s_%s" % (sample, scorer)] + sample_score_std = results["std_%s_%s" % (sample, scorer)] + 
ax.fill_between( + X_axis, + sample_score_mean - sample_score_std, + sample_score_mean + sample_score_std, + alpha=0.1 if sample == "test" else 0, + color=color, + ) + ax.plot( + X_axis, + sample_score_mean, + style, + color=color, + alpha=1 if sample == "test" else 0.7, + label="%s (%s)" % (scorer, sample), + ) + + best_index = np.nonzero(results["rank_test_%s" % scorer] == 1)[0][0] + best_score = results["mean_test_%s" % scorer][best_index] # Plot a dotted vertical line at the best score for that scorer marked by x - ax.plot([X_axis[best_index], ] * 2, [0, best_score], - linestyle='-.', color=color, marker='x', markeredgewidth=3, ms=8) + ax.plot( + [ + X_axis[best_index], + ] + * 2, + [0, best_score], + linestyle="-.", + color=color, + marker="x", + markeredgewidth=3, + ms=8, + ) # Annotate the best score for that scorer - ax.annotate("%0.2f" % best_score, - (X_axis[best_index], best_score + 0.005)) + ax.annotate("%0.2f" % best_score, (X_axis[best_index], best_score + 0.005)) plt.legend(loc="best") plt.grid(False) diff --git a/examples/model_selection/plot_nested_cross_validation_iris.py b/examples/model_selection/plot_nested_cross_validation_iris.py index a2c53841bc4da..030b46a0c748d 100644 --- a/examples/model_selection/plot_nested_cross_validation_iris.py +++ b/examples/model_selection/plot_nested_cross_validation_iris.py @@ -60,8 +60,7 @@ y_iris = iris.target # Set up possible values of parameters to optimize over -p_grid = {"C": [1, 10, 100], - "gamma": [.01, .1]} +p_grid = {"C": [1, 10, 100], "gamma": [0.01, 0.1]} # We will use a Support Vector Classifier with "rbf" kernel svm = SVC(kernel="rbf") @@ -91,28 +90,39 @@ score_difference = non_nested_scores - nested_scores -print("Average difference of {:6f} with std. dev. of {:6f}." - .format(score_difference.mean(), score_difference.std())) +print( + "Average difference of {:6f} with std. dev. 
of {:6f}.".format( + score_difference.mean(), score_difference.std() + ) +) # Plot scores on each trial for nested and non-nested CV plt.figure() plt.subplot(211) -non_nested_scores_line, = plt.plot(non_nested_scores, color='r') -nested_line, = plt.plot(nested_scores, color='b') +(non_nested_scores_line,) = plt.plot(non_nested_scores, color="r") +(nested_line,) = plt.plot(nested_scores, color="b") plt.ylabel("score", fontsize="14") -plt.legend([non_nested_scores_line, nested_line], - ["Non-Nested CV", "Nested CV"], - bbox_to_anchor=(0, .4, .5, 0)) -plt.title("Non-Nested and Nested Cross Validation on Iris Dataset", - x=.5, y=1.1, fontsize="15") +plt.legend( + [non_nested_scores_line, nested_line], + ["Non-Nested CV", "Nested CV"], + bbox_to_anchor=(0, 0.4, 0.5, 0), +) +plt.title( + "Non-Nested and Nested Cross Validation on Iris Dataset", + x=0.5, + y=1.1, + fontsize="15", +) # Plot bar chart of the difference. plt.subplot(212) difference_plot = plt.bar(range(NUM_TRIALS), score_difference) plt.xlabel("Individual Trial #") -plt.legend([difference_plot], - ["Non-Nested CV - Nested CV Score"], - bbox_to_anchor=(0, 1, .8, 0)) +plt.legend( + [difference_plot], + ["Non-Nested CV - Nested CV Score"], + bbox_to_anchor=(0, 1, 0.8, 0), +) plt.ylabel("score difference", fontsize="14") plt.show() diff --git a/examples/model_selection/plot_randomized_search.py b/examples/model_selection/plot_randomized_search.py index ff5b51837ed2a..7893f704404ac 100644 --- a/examples/model_selection/plot_randomized_search.py +++ b/examples/model_selection/plot_randomized_search.py @@ -34,49 +34,60 @@ X, y = load_digits(return_X_y=True) # build a classifier -clf = SGDClassifier(loss='hinge', penalty='elasticnet', - fit_intercept=True) +clf = SGDClassifier(loss="hinge", penalty="elasticnet", fit_intercept=True) # Utility function to report best scores def report(results, n_top=3): for i in range(1, n_top + 1): - candidates = np.flatnonzero(results['rank_test_score'] == i) + candidates = 
np.flatnonzero(results["rank_test_score"] == i) for candidate in candidates: print("Model with rank: {0}".format(i)) - print("Mean validation score: {0:.3f} (std: {1:.3f})" - .format(results['mean_test_score'][candidate], - results['std_test_score'][candidate])) - print("Parameters: {0}".format(results['params'][candidate])) + print( + "Mean validation score: {0:.3f} (std: {1:.3f})".format( + results["mean_test_score"][candidate], + results["std_test_score"][candidate], + ) + ) + print("Parameters: {0}".format(results["params"][candidate])) print("") # specify parameters and distributions to sample from -param_dist = {'average': [True, False], - 'l1_ratio': stats.uniform(0, 1), - 'alpha': loguniform(1e-4, 1e0)} +param_dist = { + "average": [True, False], + "l1_ratio": stats.uniform(0, 1), + "alpha": loguniform(1e-4, 1e0), +} # run randomized search n_iter_search = 20 -random_search = RandomizedSearchCV(clf, param_distributions=param_dist, - n_iter=n_iter_search) +random_search = RandomizedSearchCV( + clf, param_distributions=param_dist, n_iter=n_iter_search +) start = time() random_search.fit(X, y) -print("RandomizedSearchCV took %.2f seconds for %d candidates" - " parameter settings." % ((time() - start), n_iter_search)) +print( + "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings." + % ((time() - start), n_iter_search) +) report(random_search.cv_results_) # use a full grid over all parameters -param_grid = {'average': [True, False], - 'l1_ratio': np.linspace(0, 1, num=10), - 'alpha': np.power(10, np.arange(-4, 1, dtype=float))} +param_grid = { + "average": [True, False], + "l1_ratio": np.linspace(0, 1, num=10), + "alpha": np.power(10, np.arange(-4, 1, dtype=float)), +} # run grid search grid_search = GridSearchCV(clf, param_grid=param_grid) start = time() grid_search.fit(X, y) -print("GridSearchCV took %.2f seconds for %d candidate parameter settings." 
- % (time() - start, len(grid_search.cv_results_['params']))) +print( + "GridSearchCV took %.2f seconds for %d candidate parameter settings." + % (time() - start, len(grid_search.cv_results_["params"])) +) report(grid_search.cv_results_) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 86ca06eb00445..5cd76faf829d8 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -61,12 +61,12 @@ X = np.c_[X, random_state.randn(n_samples, 200 * n_features)] # shuffle and split training and test sets -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, - random_state=0) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0) # Learn to predict each class against the other -classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, - random_state=random_state)) +classifier = OneVsRestClassifier( + svm.SVC(kernel="linear", probability=True, random_state=random_state) +) y_score = classifier.fit(X_train, y_train).decision_function(X_test) # Compute ROC curve and ROC area for each class @@ -86,14 +86,19 @@ # Plot of a ROC curve for a specific class plt.figure() lw = 2 -plt.plot(fpr[2], tpr[2], color='darkorange', - lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2]) -plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') +plt.plot( + fpr[2], + tpr[2], + color="darkorange", + lw=lw, + label="ROC curve (area = %0.2f)" % roc_auc[2], +) +plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--") plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) -plt.xlabel('False Positive Rate') -plt.ylabel('True Positive Rate') -plt.title('Receiver operating characteristic example') +plt.xlabel("False Positive Rate") +plt.ylabel("True Positive Rate") +plt.title("Receiver operating characteristic example") plt.legend(loc="lower right") plt.show() @@ -120,28 +125,40 @@ # Plot all ROC curves plt.figure() -plt.plot(fpr["micro"], tpr["micro"], - 
label='micro-average ROC curve (area = {0:0.2f})' - ''.format(roc_auc["micro"]), - color='deeppink', linestyle=':', linewidth=4) - -plt.plot(fpr["macro"], tpr["macro"], - label='macro-average ROC curve (area = {0:0.2f})' - ''.format(roc_auc["macro"]), - color='navy', linestyle=':', linewidth=4) - -colors = cycle(['aqua', 'darkorange', 'cornflowerblue']) +plt.plot( + fpr["micro"], + tpr["micro"], + label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]), + color="deeppink", + linestyle=":", + linewidth=4, +) + +plt.plot( + fpr["macro"], + tpr["macro"], + label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]), + color="navy", + linestyle=":", + linewidth=4, +) + +colors = cycle(["aqua", "darkorange", "cornflowerblue"]) for i, color in zip(range(n_classes), colors): - plt.plot(fpr[i], tpr[i], color=color, lw=lw, - label='ROC curve of class {0} (area = {1:0.2f})' - ''.format(i, roc_auc[i])) - -plt.plot([0, 1], [0, 1], 'k--', lw=lw) + plt.plot( + fpr[i], + tpr[i], + color=color, + lw=lw, + label="ROC curve of class {0} (area = {1:0.2f})".format(i, roc_auc[i]), + ) + +plt.plot([0, 1], [0, 1], "k--", lw=lw) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) -plt.xlabel('False Positive Rate') -plt.ylabel('True Positive Rate') -plt.title('Some extension of Receiver operating characteristic to multi-class') +plt.xlabel("False Positive Rate") +plt.ylabel("True Positive Rate") +plt.title("Some extension of Receiver operating characteristic to multi-class") plt.legend(loc="lower right") plt.show() @@ -156,17 +173,19 @@ # prevalence-weighted average. 
y_prob = classifier.predict_proba(X_test) -macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo", - average="macro") -weighted_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo", - average="weighted") -macro_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr", - average="macro") -weighted_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr", - average="weighted") -print("One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} " - "(weighted by prevalence)" - .format(macro_roc_auc_ovo, weighted_roc_auc_ovo)) -print("One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} " - "(weighted by prevalence)" - .format(macro_roc_auc_ovr, weighted_roc_auc_ovr)) +macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo", average="macro") +weighted_roc_auc_ovo = roc_auc_score( + y_test, y_prob, multi_class="ovo", average="weighted" +) +macro_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr", average="macro") +weighted_roc_auc_ovr = roc_auc_score( + y_test, y_prob, multi_class="ovr", average="weighted" +) +print( + "One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} " + "(weighted by prevalence)".format(macro_roc_auc_ovo, weighted_roc_auc_ovo) +) +print( + "One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} " + "(weighted by prevalence)".format(macro_roc_auc_ovr, weighted_roc_auc_ovr) +) diff --git a/examples/model_selection/plot_successive_halving_heatmap.py b/examples/model_selection/plot_successive_halving_heatmap.py index 04a2d19df89f7..c78de42ab9359 100644 --- a/examples/model_selection/plot_successive_halving_heatmap.py +++ b/examples/model_selection/plot_successive_halving_heatmap.py @@ -33,13 +33,14 @@ gammas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7] Cs = [1, 10, 100, 1e3, 1e4, 1e5] -param_grid = {'gamma': gammas, 'C': Cs} +param_grid = {"gamma": gammas, "C": Cs} clf = SVC(random_state=rng) tic = time() -gsh = HalvingGridSearchCV(estimator=clf, param_grid=param_grid, factor=2, - random_state=rng) +gsh = 
HalvingGridSearchCV( + estimator=clf, param_grid=param_grid, factor=2, random_state=rng +) gsh.fit(X, y) gsh_time = time() - tic @@ -55,46 +56,54 @@ def make_heatmap(ax, gs, is_sh=False, make_cbar=False): """Helper to make a heatmap.""" results = pd.DataFrame.from_dict(gs.cv_results_) - results['params_str'] = results.params.apply(str) + results["params_str"] = results.params.apply(str) if is_sh: # SH dataframe: get mean_test_score values for the highest iter - scores_matrix = results.sort_values('iter').pivot_table( - index='param_gamma', columns='param_C', - values='mean_test_score', aggfunc='last' + scores_matrix = results.sort_values("iter").pivot_table( + index="param_gamma", + columns="param_C", + values="mean_test_score", + aggfunc="last", ) else: - scores_matrix = results.pivot(index='param_gamma', columns='param_C', - values='mean_test_score') + scores_matrix = results.pivot( + index="param_gamma", columns="param_C", values="mean_test_score" + ) im = ax.imshow(scores_matrix) ax.set_xticks(np.arange(len(Cs))) - ax.set_xticklabels(['{:.0E}'.format(x) for x in Cs]) - ax.set_xlabel('C', fontsize=15) + ax.set_xticklabels(["{:.0E}".format(x) for x in Cs]) + ax.set_xlabel("C", fontsize=15) ax.set_yticks(np.arange(len(gammas))) - ax.set_yticklabels(['{:.0E}'.format(x) for x in gammas]) - ax.set_ylabel('gamma', fontsize=15) + ax.set_yticklabels(["{:.0E}".format(x) for x in gammas]) + ax.set_ylabel("gamma", fontsize=15) # Rotate the tick labels and set their alignment. 
- plt.setp(ax.get_xticklabels(), rotation=45, ha="right", - rotation_mode="anchor") + plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") if is_sh: - iterations = results.pivot_table(index='param_gamma', - columns='param_C', values='iter', - aggfunc='max').values + iterations = results.pivot_table( + index="param_gamma", columns="param_C", values="iter", aggfunc="max" + ).values for i in range(len(gammas)): for j in range(len(Cs)): - ax.text(j, i, iterations[i, j], - ha="center", va="center", color="w", fontsize=20) + ax.text( + j, + i, + iterations[i, j], + ha="center", + va="center", + color="w", + fontsize=20, + ) if make_cbar: fig.subplots_adjust(right=0.8) cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7]) fig.colorbar(im, cax=cbar_ax) - cbar_ax.set_ylabel('mean_test_score', rotation=-90, va="bottom", - fontsize=15) + cbar_ax.set_ylabel("mean_test_score", rotation=-90, va="bottom", fontsize=15) fig, axes = plt.subplots(ncols=2, sharey=True) @@ -103,9 +112,8 @@ def make_heatmap(ax, gs, is_sh=False, make_cbar=False): make_heatmap(ax1, gsh, is_sh=True) make_heatmap(ax2, gs, make_cbar=True) -ax1.set_title('Successive Halving\ntime = {:.3f}s'.format(gsh_time), - fontsize=15) -ax2.set_title('GridSearch\ntime = {:.3f}s'.format(gs_time), fontsize=15) +ax1.set_title("Successive Halving\ntime = {:.3f}s".format(gsh_time), fontsize=15) +ax2.set_title("GridSearch\ntime = {:.3f}s".format(gs_time), fontsize=15) plt.show() diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py index 53d33849e9801..11c204ef939d5 100644 --- a/examples/model_selection/plot_successive_halving_iterations.py +++ b/examples/model_selection/plot_successive_halving_iterations.py @@ -32,17 +32,17 @@ clf = RandomForestClassifier(n_estimators=20, random_state=rng) -param_dist = {"max_depth": [3, None], - "max_features": randint(1, 11), - "min_samples_split": randint(2, 11), - "bootstrap": [True, 
False], - "criterion": ["gini", "entropy"]} +param_dist = { + "max_depth": [3, None], + "max_features": randint(1, 11), + "min_samples_split": randint(2, 11), + "bootstrap": [True, False], + "criterion": ["gini", "entropy"], +} rsh = HalvingRandomSearchCV( - estimator=clf, - param_distributions=param_dist, - factor=2, - random_state=rng) + estimator=clf, param_distributions=param_dist, factor=2, random_state=rng +) rsh.fit(X, y) # %% @@ -50,23 +50,23 @@ # and plot the evolution of the search. results = pd.DataFrame(rsh.cv_results_) -results['params_str'] = results.params.apply(str) -results.drop_duplicates(subset=('params_str', 'iter'), inplace=True) -mean_scores = results.pivot(index='iter', columns='params_str', - values='mean_test_score') -ax = mean_scores.plot(legend=False, alpha=.6) +results["params_str"] = results.params.apply(str) +results.drop_duplicates(subset=("params_str", "iter"), inplace=True) +mean_scores = results.pivot( + index="iter", columns="params_str", values="mean_test_score" +) +ax = mean_scores.plot(legend=False, alpha=0.6) labels = [ - f'iter={i}\nn_samples={rsh.n_resources_[i]}\n' - f'n_candidates={rsh.n_candidates_[i]}' + f"iter={i}\nn_samples={rsh.n_resources_[i]}\nn_candidates={rsh.n_candidates_[i]}" for i in range(rsh.n_iterations_) ] ax.set_xticks(range(rsh.n_iterations_)) -ax.set_xticklabels(labels, rotation=45, multialignment='left') -ax.set_title('Scores of candidates over iterations') -ax.set_ylabel('mean test score', fontsize=15) -ax.set_xlabel('iterations', fontsize=15) +ax.set_xticklabels(labels, rotation=45, multialignment="left") +ax.set_title("Scores of candidates over iterations") +ax.set_ylabel("mean test score", fontsize=15) +ax.set_xlabel("iterations", fontsize=15) plt.tight_layout() plt.show() diff --git a/examples/model_selection/plot_train_error_vs_test_error.py b/examples/model_selection/plot_train_error_vs_test_error.py index f32d6e46f9933..d9c00805aa390 100644 --- 
a/examples/model_selection/plot_train_error_vs_test_error.py +++ b/examples/model_selection/plot_train_error_vs_test_error.py @@ -56,20 +56,27 @@ # Plot results functions import matplotlib.pyplot as plt + plt.subplot(2, 1, 1) -plt.semilogx(alphas, train_errors, label='Train') -plt.semilogx(alphas, test_errors, label='Test') -plt.vlines(alpha_optim, plt.ylim()[0], np.max(test_errors), color='k', - linewidth=3, label='Optimum on test') -plt.legend(loc='lower left') +plt.semilogx(alphas, train_errors, label="Train") +plt.semilogx(alphas, test_errors, label="Test") +plt.vlines( + alpha_optim, + plt.ylim()[0], + np.max(test_errors), + color="k", + linewidth=3, + label="Optimum on test", +) +plt.legend(loc="lower left") plt.ylim([0, 1.2]) -plt.xlabel('Regularization parameter') -plt.ylabel('Performance') +plt.xlabel("Regularization parameter") +plt.ylabel("Performance") # Show estimated coef_ vs true coef plt.subplot(2, 1, 2) -plt.plot(coef, label='True coef') -plt.plot(coef_, label='Estimated coef') +plt.plot(coef, label="True coef") +plt.plot(coef_, label="Estimated coef") plt.legend() plt.subplots_adjust(0.09, 0.04, 0.94, 0.94, 0.26, 0.26) plt.show() diff --git a/examples/model_selection/plot_underfitting_overfitting.py b/examples/model_selection/plot_underfitting_overfitting.py index fcd799923f625..c93da3f2dc5b5 100644 --- a/examples/model_selection/plot_underfitting_overfitting.py +++ b/examples/model_selection/plot_underfitting_overfitting.py @@ -47,26 +47,33 @@ def true_fun(X): ax = plt.subplot(1, len(degrees), i + 1) plt.setp(ax, xticks=(), yticks=()) - polynomial_features = PolynomialFeatures(degree=degrees[i], - include_bias=False) + polynomial_features = PolynomialFeatures(degree=degrees[i], include_bias=False) linear_regression = LinearRegression() - pipeline = Pipeline([("polynomial_features", polynomial_features), - ("linear_regression", linear_regression)]) + pipeline = Pipeline( + [ + ("polynomial_features", polynomial_features), + ("linear_regression", 
linear_regression), + ] + ) pipeline.fit(X[:, np.newaxis], y) # Evaluate the models using crossvalidation - scores = cross_val_score(pipeline, X[:, np.newaxis], y, - scoring="neg_mean_squared_error", cv=10) + scores = cross_val_score( + pipeline, X[:, np.newaxis], y, scoring="neg_mean_squared_error", cv=10 + ) X_test = np.linspace(0, 1, 100) plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model") plt.plot(X_test, true_fun(X_test), label="True function") - plt.scatter(X, y, edgecolor='b', s=20, label="Samples") + plt.scatter(X, y, edgecolor="b", s=20, label="Samples") plt.xlabel("x") plt.ylabel("y") plt.xlim((0, 1)) plt.ylim((-2, 2)) plt.legend(loc="best") - plt.title("Degree {}\nMSE = {:.2e}(+/- {:.2e})".format( - degrees[i], -scores.mean(), scores.std())) + plt.title( + "Degree {}\nMSE = {:.2e}(+/- {:.2e})".format( + degrees[i], -scores.mean(), scores.std() + ) + ) plt.show() diff --git a/examples/model_selection/plot_validation_curve.py b/examples/model_selection/plot_validation_curve.py index 0eb3850787c53..524dd71eda40f 100644 --- a/examples/model_selection/plot_validation_curve.py +++ b/examples/model_selection/plot_validation_curve.py @@ -24,8 +24,14 @@ param_range = np.logspace(-6, -1, 5) train_scores, test_scores = validation_curve( - SVC(), X, y, param_name="gamma", param_range=param_range, - scoring="accuracy", n_jobs=1) + SVC(), + X, + y, + param_name="gamma", + param_range=param_range, + scoring="accuracy", + n_jobs=1, +) train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) @@ -36,15 +42,27 @@ plt.ylabel("Score") plt.ylim(0.0, 1.1) lw = 2 -plt.semilogx(param_range, train_scores_mean, label="Training score", - color="darkorange", lw=lw) -plt.fill_between(param_range, train_scores_mean - train_scores_std, - train_scores_mean + train_scores_std, alpha=0.2, - color="darkorange", lw=lw) -plt.semilogx(param_range, test_scores_mean, 
label="Cross-validation score", - color="navy", lw=lw) -plt.fill_between(param_range, test_scores_mean - test_scores_std, - test_scores_mean + test_scores_std, alpha=0.2, - color="navy", lw=lw) +plt.semilogx( + param_range, train_scores_mean, label="Training score", color="darkorange", lw=lw +) +plt.fill_between( + param_range, + train_scores_mean - train_scores_std, + train_scores_mean + train_scores_std, + alpha=0.2, + color="darkorange", + lw=lw, +) +plt.semilogx( + param_range, test_scores_mean, label="Cross-validation score", color="navy", lw=lw +) +plt.fill_between( + param_range, + test_scores_mean - test_scores_std, + test_scores_mean + test_scores_std, + alpha=0.2, + color="navy", + lw=lw, +) plt.legend(loc="best") plt.show() diff --git a/examples/multioutput/plot_classifier_chain_yeast.py b/examples/multioutput/plot_classifier_chain_yeast.py index 7ae80af3fdab3..a92249b97e4ad 100644 --- a/examples/multioutput/plot_classifier_chain_yeast.py +++ b/examples/multioutput/plot_classifier_chain_yeast.py @@ -47,10 +47,9 @@ print(__doc__) # Load a multi-label dataset from https://www.openml.org/d/40597 -X, Y = fetch_openml('yeast', version=4, return_X_y=True) -Y = Y == 'TRUE' -X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, - random_state=0) +X, Y = fetch_openml("yeast", version=4, return_X_y=True) +Y = Y == "TRUE" +X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0) # Fit an independent logistic regression model for each class using the # OneVsRestClassifier wrapper. @@ -58,41 +57,42 @@ ovr = OneVsRestClassifier(base_lr) ovr.fit(X_train, Y_train) Y_pred_ovr = ovr.predict(X_test) -ovr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average='samples') +ovr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average="samples") # Fit an ensemble of logistic regression classifier chains and take the # take the average prediction of all the chains. 
-chains = [ClassifierChain(base_lr, order='random', random_state=i) - for i in range(10)] +chains = [ClassifierChain(base_lr, order="random", random_state=i) for i in range(10)] for chain in chains: chain.fit(X_train, Y_train) -Y_pred_chains = np.array([chain.predict(X_test) for chain in - chains]) -chain_jaccard_scores = [jaccard_score(Y_test, Y_pred_chain >= .5, - average='samples') - for Y_pred_chain in Y_pred_chains] +Y_pred_chains = np.array([chain.predict(X_test) for chain in chains]) +chain_jaccard_scores = [ + jaccard_score(Y_test, Y_pred_chain >= 0.5, average="samples") + for Y_pred_chain in Y_pred_chains +] Y_pred_ensemble = Y_pred_chains.mean(axis=0) -ensemble_jaccard_score = jaccard_score(Y_test, - Y_pred_ensemble >= .5, - average='samples') +ensemble_jaccard_score = jaccard_score( + Y_test, Y_pred_ensemble >= 0.5, average="samples" +) model_scores = [ovr_jaccard_score] + chain_jaccard_scores model_scores.append(ensemble_jaccard_score) -model_names = ('Independent', - 'Chain 1', - 'Chain 2', - 'Chain 3', - 'Chain 4', - 'Chain 5', - 'Chain 6', - 'Chain 7', - 'Chain 8', - 'Chain 9', - 'Chain 10', - 'Ensemble') +model_names = ( + "Independent", + "Chain 1", + "Chain 2", + "Chain 3", + "Chain 4", + "Chain 5", + "Chain 6", + "Chain 7", + "Chain 8", + "Chain 9", + "Chain 10", + "Ensemble", +) x_pos = np.arange(len(model_names)) @@ -102,12 +102,12 @@ fig, ax = plt.subplots(figsize=(7, 4)) ax.grid(True) -ax.set_title('Classifier Chain Ensemble Performance Comparison') +ax.set_title("Classifier Chain Ensemble Performance Comparison") ax.set_xticks(x_pos) -ax.set_xticklabels(model_names, rotation='vertical') -ax.set_ylabel('Jaccard Similarity Score') -ax.set_ylim([min(model_scores) * .9, max(model_scores) * 1.1]) -colors = ['r'] + ['b'] * len(chain_jaccard_scores) + ['g'] +ax.set_xticklabels(model_names, rotation="vertical") +ax.set_ylabel("Jaccard Similarity Score") +ax.set_ylim([min(model_scores) * 0.9, max(model_scores) * 1.1]) +colors = ["r"] + ["b"] * 
len(chain_jaccard_scores) + ["g"] ax.bar(x_pos, model_scores, alpha=0.5, color=colors) plt.tight_layout() plt.show() diff --git a/examples/neighbors/approximate_nearest_neighbors.py b/examples/neighbors/approximate_nearest_neighbors.py index 78f5f184a0da7..055796600fd4e 100644 --- a/examples/neighbors/approximate_nearest_neighbors.py +++ b/examples/neighbors/approximate_nearest_neighbors.py @@ -73,8 +73,7 @@ class NMSlibTransformer(TransformerMixin, BaseEstimator): """Wrapper for using nmslib as sklearn's KNeighborsTransformer""" - def __init__(self, n_neighbors=5, metric='euclidean', method='sw-graph', - n_jobs=1): + def __init__(self, n_neighbors=5, metric="euclidean", method="sw-graph", n_jobs=1): self.n_neighbors = n_neighbors self.method = method self.metric = metric @@ -86,10 +85,10 @@ def fit(self, X): # see more metric in the manual # https://github.com/nmslib/nmslib/tree/master/manual space = { - 'euclidean': 'l2', - 'cosine': 'cosinesimil', - 'l1': 'l1', - 'l2': 'l2', + "euclidean": "l2", + "cosine": "cosinesimil", + "l1": "l1", + "l2": "l2", }[self.metric] self.nmslib_ = nmslib.init(method=self.method, space=space) @@ -104,16 +103,15 @@ def transform(self, X): # neighbor, one extra neighbor will be computed. 
n_neighbors = self.n_neighbors + 1 - results = self.nmslib_.knnQueryBatch(X, k=n_neighbors, - num_threads=self.n_jobs) + results = self.nmslib_.knnQueryBatch(X, k=n_neighbors, num_threads=self.n_jobs) indices, distances = zip(*results) indices, distances = np.vstack(indices), np.vstack(distances) - indptr = np.arange(0, n_samples_transform * n_neighbors + 1, - n_neighbors) - kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(), - indptr), shape=(n_samples_transform, - self.n_samples_fit_)) + indptr = np.arange(0, n_samples_transform * n_neighbors + 1, n_neighbors) + kneighbors_graph = csr_matrix( + (distances.ravel(), indices.ravel(), indptr), + shape=(n_samples_transform, self.n_samples_fit_), + ) return kneighbors_graph @@ -121,8 +119,7 @@ def transform(self, X): class AnnoyTransformer(TransformerMixin, BaseEstimator): """Wrapper for using annoy.AnnoyIndex as sklearn's KNeighborsTransformer""" - def __init__(self, n_neighbors=5, metric='euclidean', n_trees=10, - search_k=-1): + def __init__(self, n_neighbors=5, metric="euclidean", n_trees=10, search_k=-1): self.n_neighbors = n_neighbors self.n_trees = n_trees self.search_k = search_k @@ -151,34 +148,33 @@ def _transform(self, X): # neighbor, one extra neighbor will be computed. 
n_neighbors = self.n_neighbors + 1 - indices = np.empty((n_samples_transform, n_neighbors), - dtype=int) + indices = np.empty((n_samples_transform, n_neighbors), dtype=int) distances = np.empty((n_samples_transform, n_neighbors)) if X is None: for i in range(self.annoy_.get_n_items()): ind, dist = self.annoy_.get_nns_by_item( - i, n_neighbors, self.search_k, include_distances=True) + i, n_neighbors, self.search_k, include_distances=True + ) indices[i], distances[i] = ind, dist else: for i, x in enumerate(X): indices[i], distances[i] = self.annoy_.get_nns_by_vector( - x.tolist(), n_neighbors, self.search_k, - include_distances=True) + x.tolist(), n_neighbors, self.search_k, include_distances=True + ) - indptr = np.arange(0, n_samples_transform * n_neighbors + 1, - n_neighbors) - kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(), - indptr), shape=(n_samples_transform, - self.n_samples_fit_)) + indptr = np.arange(0, n_samples_transform * n_neighbors + 1, n_neighbors) + kneighbors_graph = csr_matrix( + (distances.ravel(), indices.ravel(), indptr), + shape=(n_samples_transform, self.n_samples_fit_), + ) return kneighbors_graph def test_transformers(): - """Test that AnnoyTransformer and KNeighborsTransformer give same results - """ + """Test that AnnoyTransformer and KNeighborsTransformer give same results""" X = np.random.RandomState(42).randn(10, 2) knn = KNeighborsTransformer() @@ -203,8 +199,8 @@ def load_mnist(n_samples): def run_benchmark(): datasets = [ - ('MNIST_2000', load_mnist(n_samples=2000)), - ('MNIST_10000', load_mnist(n_samples=10000)), + ("MNIST_2000", load_mnist(n_samples=2000)), + ("MNIST_10000", load_mnist(n_samples=10000)), ] n_iter = 500 @@ -213,49 +209,67 @@ def run_benchmark(): # TSNE requires a certain number of neighbors which depends on the # perplexity parameter. # Add one since we include each sample as its own neighbor. - n_neighbors = int(3. 
* perplexity + 1) + 1 + n_neighbors = int(3.0 * perplexity + 1) + 1 - tsne_params = dict(perplexity=perplexity, method="barnes_hut", - random_state=42, n_iter=n_iter, - square_distances=True) + tsne_params = dict( + perplexity=perplexity, + method="barnes_hut", + random_state=42, + n_iter=n_iter, + square_distances=True, + ) transformers = [ - ('AnnoyTransformer', - AnnoyTransformer(n_neighbors=n_neighbors, metric=metric)), - ('NMSlibTransformer', - NMSlibTransformer(n_neighbors=n_neighbors, metric=metric)), - ('KNeighborsTransformer', - KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance', - metric=metric)), - ('TSNE with AnnoyTransformer', - make_pipeline( - AnnoyTransformer(n_neighbors=n_neighbors, metric=metric), - TSNE(metric='precomputed', **tsne_params))), - ('TSNE with NMSlibTransformer', - make_pipeline( - NMSlibTransformer(n_neighbors=n_neighbors, metric=metric), - TSNE(metric='precomputed', **tsne_params))), - ('TSNE with KNeighborsTransformer', - make_pipeline( - KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance', - metric=metric), - TSNE(metric='precomputed', **tsne_params))), - ('TSNE with internal NearestNeighbors', - TSNE(metric=metric, **tsne_params)), + ("AnnoyTransformer", AnnoyTransformer(n_neighbors=n_neighbors, metric=metric)), + ( + "NMSlibTransformer", + NMSlibTransformer(n_neighbors=n_neighbors, metric=metric), + ), + ( + "KNeighborsTransformer", + KNeighborsTransformer( + n_neighbors=n_neighbors, mode="distance", metric=metric + ), + ), + ( + "TSNE with AnnoyTransformer", + make_pipeline( + AnnoyTransformer(n_neighbors=n_neighbors, metric=metric), + TSNE(metric="precomputed", **tsne_params), + ), + ), + ( + "TSNE with NMSlibTransformer", + make_pipeline( + NMSlibTransformer(n_neighbors=n_neighbors, metric=metric), + TSNE(metric="precomputed", **tsne_params), + ), + ), + ( + "TSNE with KNeighborsTransformer", + make_pipeline( + KNeighborsTransformer( + n_neighbors=n_neighbors, mode="distance", metric=metric + ), + 
TSNE(metric="precomputed", **tsne_params), + ), + ), + ("TSNE with internal NearestNeighbors", TSNE(metric=metric, **tsne_params)), ] # init the plot nrows = len(datasets) - ncols = np.sum([1 for name, model in transformers if 'TSNE' in name]) - fig, axes = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False, - figsize=(5 * ncols, 4 * nrows)) + ncols = np.sum([1 for name, model in transformers if "TSNE" in name]) + fig, axes = plt.subplots( + nrows=nrows, ncols=ncols, squeeze=False, figsize=(5 * ncols, 4 * nrows) + ) axes = axes.ravel() i_ax = 0 for dataset_name, (X, y) in datasets: - msg = 'Benchmarking on %s:' % dataset_name - print('\n%s\n%s' % (msg, '-' * len(msg))) + msg = "Benchmarking on %s:" % dataset_name + print("\n%s\n%s" % (msg, "-" * len(msg))) for transformer_name, transformer in transformers: start = time.time() @@ -264,23 +278,28 @@ def run_benchmark(): # print the duration report longest = np.max([len(name) for name, model in transformers]) - whitespaces = ' ' * (longest - len(transformer_name)) - print('%s: %s%.3f sec' % (transformer_name, whitespaces, duration)) + whitespaces = " " * (longest - len(transformer_name)) + print("%s: %s%.3f sec" % (transformer_name, whitespaces, duration)) # plot TSNE embedding which should be very similar across methods - if 'TSNE' in transformer_name: - axes[i_ax].set_title(transformer_name + '\non ' + dataset_name) - axes[i_ax].scatter(Xt[:, 0], Xt[:, 1], c=y.astype(np.int32), - alpha=0.2, cmap=plt.cm.viridis) + if "TSNE" in transformer_name: + axes[i_ax].set_title(transformer_name + "\non " + dataset_name) + axes[i_ax].scatter( + Xt[:, 0], + Xt[:, 1], + c=y.astype(np.int32), + alpha=0.2, + cmap=plt.cm.viridis, + ) axes[i_ax].xaxis.set_major_formatter(NullFormatter()) axes[i_ax].yaxis.set_major_formatter(NullFormatter()) - axes[i_ax].axis('tight') + axes[i_ax].axis("tight") i_ax += 1 fig.tight_layout() plt.show() -if __name__ == '__main__': +if __name__ == "__main__": test_transformers() run_benchmark() diff 
--git a/examples/neighbors/plot_caching_nearest_neighbors.py b/examples/neighbors/plot_caching_nearest_neighbors.py index a97ed3f2983a4..0ffe6b92d3053 100644 --- a/examples/neighbors/plot_caching_nearest_neighbors.py +++ b/examples/neighbors/plot_caching_nearest_neighbors.py @@ -35,30 +35,35 @@ # The transformer computes the nearest neighbors graph using the maximum number # of neighbors necessary in the grid search. The classifier model filters the # nearest neighbors graph as required by its own n_neighbors parameter. -graph_model = KNeighborsTransformer(n_neighbors=max(n_neighbors_list), - mode='distance') -classifier_model = KNeighborsClassifier(metric='precomputed') +graph_model = KNeighborsTransformer(n_neighbors=max(n_neighbors_list), mode="distance") +classifier_model = KNeighborsClassifier(metric="precomputed") # Note that we give `memory` a directory to cache the graph computation # that will be used several times when tuning the hyperparameters of the # classifier. with TemporaryDirectory(prefix="sklearn_graph_cache_") as tmpdir: full_model = Pipeline( - steps=[('graph', graph_model), ('classifier', classifier_model)], - memory=tmpdir) + steps=[("graph", graph_model), ("classifier", classifier_model)], memory=tmpdir + ) - param_grid = {'classifier__n_neighbors': n_neighbors_list} + param_grid = {"classifier__n_neighbors": n_neighbors_list} grid_model = GridSearchCV(full_model, param_grid) grid_model.fit(X, y) # Plot the results of the grid search. 
fig, axes = plt.subplots(1, 2, figsize=(8, 4)) -axes[0].errorbar(x=n_neighbors_list, - y=grid_model.cv_results_['mean_test_score'], - yerr=grid_model.cv_results_['std_test_score']) -axes[0].set(xlabel='n_neighbors', title='Classification accuracy') -axes[1].errorbar(x=n_neighbors_list, y=grid_model.cv_results_['mean_fit_time'], - yerr=grid_model.cv_results_['std_fit_time'], color='r') -axes[1].set(xlabel='n_neighbors', title='Fit time (with caching)') +axes[0].errorbar( + x=n_neighbors_list, + y=grid_model.cv_results_["mean_test_score"], + yerr=grid_model.cv_results_["std_test_score"], +) +axes[0].set(xlabel="n_neighbors", title="Classification accuracy") +axes[1].errorbar( + x=n_neighbors_list, + y=grid_model.cv_results_["mean_fit_time"], + yerr=grid_model.cv_results_["std_fit_time"], + color="r", +) +axes[1].set(xlabel="n_neighbors", title="Fit time (with caching)") fig.tight_layout() plt.show() diff --git a/examples/neighbors/plot_classification.py b/examples/neighbors/plot_classification.py index 0b8d828225f39..08e22a2874a16 100644 --- a/examples/neighbors/plot_classification.py +++ b/examples/neighbors/plot_classification.py @@ -24,13 +24,13 @@ X = iris.data[:, :2] y = iris.target -h = .02 # step size in the mesh +h = 0.02 # step size in the mesh # Create color maps -cmap_light = ListedColormap(['orange', 'cyan', 'cornflowerblue']) -cmap_bold = ['darkorange', 'c', 'darkblue'] +cmap_light = ListedColormap(["orange", "cyan", "cornflowerblue"]) +cmap_bold = ["darkorange", "c", "darkblue"] -for weights in ['uniform', 'distance']: +for weights in ["uniform", "distance"]: # we create an instance of Neighbours Classifier and fit the data. clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights) clf.fit(X, y) @@ -39,8 +39,7 @@ # point in the mesh [x_min, x_max]x[y_min, y_max]. 
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 - xx, yy = np.meshgrid(np.arange(x_min, x_max, h), - np.arange(y_min, y_max, h)) + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) # Put the result into a color plot @@ -49,12 +48,19 @@ plt.contourf(xx, yy, Z, cmap=cmap_light) # Plot also the training points - sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=iris.target_names[y], - palette=cmap_bold, alpha=1.0, edgecolor="black") + sns.scatterplot( + x=X[:, 0], + y=X[:, 1], + hue=iris.target_names[y], + palette=cmap_bold, + alpha=1.0, + edgecolor="black", + ) plt.xlim(xx.min(), xx.max()) plt.ylim(yy.min(), yy.max()) - plt.title("3-Class classification (k = %i, weights = '%s')" - % (n_neighbors, weights)) + plt.title( + "3-Class classification (k = %i, weights = '%s')" % (n_neighbors, weights) + ) plt.xlabel(iris.feature_names[0]) plt.ylabel(iris.feature_names[1]) diff --git a/examples/neighbors/plot_digits_kde_sampling.py b/examples/neighbors/plot_digits_kde_sampling.py index 8367d16b955fe..44e4b1f65c38b 100644 --- a/examples/neighbors/plot_digits_kde_sampling.py +++ b/examples/neighbors/plot_digits_kde_sampling.py @@ -26,7 +26,7 @@ data = pca.fit_transform(digits.data) # use grid search cross-validation to optimize the bandwidth -params = {'bandwidth': np.logspace(-1, 1, 20)} +params = {"bandwidth": np.logspace(-1, 1, 20)} grid = GridSearchCV(KernelDensity(), params) grid.fit(data) @@ -48,14 +48,16 @@ for j in range(11): ax[4, j].set_visible(False) for i in range(4): - im = ax[i, j].imshow(real_data[i, j].reshape((8, 8)), - cmap=plt.cm.binary, interpolation='nearest') + im = ax[i, j].imshow( + real_data[i, j].reshape((8, 8)), cmap=plt.cm.binary, interpolation="nearest" + ) im.set_clim(0, 16) - im = ax[i + 5, j].imshow(new_data[i, j].reshape((8, 8)), - cmap=plt.cm.binary, interpolation='nearest') + im = ax[i + 5, j].imshow( + new_data[i, 
j].reshape((8, 8)), cmap=plt.cm.binary, interpolation="nearest" + ) im.set_clim(0, 16) -ax[0, 5].set_title('Selection from the input data') +ax[0, 5].set_title("Selection from the input data") ax[5, 5].set_title('"New" digits drawn from the kernel density model') plt.show() diff --git a/examples/neighbors/plot_kde_1d.py b/examples/neighbors/plot_kde_1d.py index fb2699e548ade..857be4feaa367 100644 --- a/examples/neighbors/plot_kde_1d.py +++ b/examples/neighbors/plot_kde_1d.py @@ -36,17 +36,18 @@ from sklearn.utils.fixes import parse_version # `normed` is being deprecated in favor of `density` in histograms -if parse_version(matplotlib.__version__) >= parse_version('2.1'): - density_param = {'density': True} +if parse_version(matplotlib.__version__) >= parse_version("2.1"): + density_param = {"density": True} else: - density_param = {'normed': True} + density_param = {"normed": True} # ---------------------------------------------------------------------- # Plot the progression of histograms to kernels np.random.seed(1) N = 20 -X = np.concatenate((np.random.normal(0, 1, int(0.3 * N)), - np.random.normal(5, 1, int(0.7 * N))))[:, np.newaxis] +X = np.concatenate( + (np.random.normal(0, 1, int(0.3 * N)), np.random.normal(5, 1, int(0.7 * N))) +)[:, np.newaxis] X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis] bins = np.linspace(-5, 10, 10) @@ -54,35 +55,35 @@ fig.subplots_adjust(hspace=0.05, wspace=0.05) # histogram 1 -ax[0, 0].hist(X[:, 0], bins=bins, fc='#AAAAFF', **density_param) +ax[0, 0].hist(X[:, 0], bins=bins, fc="#AAAAFF", **density_param) ax[0, 0].text(-3.5, 0.31, "Histogram") # histogram 2 -ax[0, 1].hist(X[:, 0], bins=bins + 0.75, fc='#AAAAFF', **density_param) +ax[0, 1].hist(X[:, 0], bins=bins + 0.75, fc="#AAAAFF", **density_param) ax[0, 1].text(-3.5, 0.31, "Histogram, bins shifted") # tophat KDE -kde = KernelDensity(kernel='tophat', bandwidth=0.75).fit(X) +kde = KernelDensity(kernel="tophat", bandwidth=0.75).fit(X) log_dens = kde.score_samples(X_plot) -ax[1, 
0].fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF') +ax[1, 0].fill(X_plot[:, 0], np.exp(log_dens), fc="#AAAAFF") ax[1, 0].text(-3.5, 0.31, "Tophat Kernel Density") # Gaussian KDE -kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(X) +kde = KernelDensity(kernel="gaussian", bandwidth=0.75).fit(X) log_dens = kde.score_samples(X_plot) -ax[1, 1].fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF') +ax[1, 1].fill(X_plot[:, 0], np.exp(log_dens), fc="#AAAAFF") ax[1, 1].text(-3.5, 0.31, "Gaussian Kernel Density") for axi in ax.ravel(): - axi.plot(X[:, 0], np.full(X.shape[0], -0.01), '+k') + axi.plot(X[:, 0], np.full(X.shape[0], -0.01), "+k") axi.set_xlim(-4, 9) axi.set_ylim(-0.02, 0.34) for axi in ax[:, 0]: - axi.set_ylabel('Normalized Density') + axi.set_ylabel("Normalized Density") for axi in ax[1, :]: - axi.set_xlabel('x') + axi.set_xlabel("x") # ---------------------------------------------------------------------- # Plot all available kernels @@ -95,20 +96,21 @@ def format_func(x, loc): if x == 0: - return '0' + return "0" elif x == 1: - return 'h' + return "h" elif x == -1: - return '-h' + return "-h" else: - return '%ih' % x + return "%ih" % x -for i, kernel in enumerate(['gaussian', 'tophat', 'epanechnikov', - 'exponential', 'linear', 'cosine']): +for i, kernel in enumerate( + ["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"] +): axi = ax.ravel()[i] log_dens = KernelDensity(kernel=kernel).fit(X_src).score_samples(X_plot) - axi.fill(X_plot[:, 0], np.exp(log_dens), '-k', fc='#AAAAFF') + axi.fill(X_plot[:, 0], np.exp(log_dens), "-k", fc="#AAAAFF") axi.text(-2.6, 0.95, kernel) axi.xaxis.set_major_formatter(plt.FuncFormatter(format_func)) @@ -118,37 +120,42 @@ def format_func(x, loc): axi.set_ylim(0, 1.05) axi.set_xlim(-2.9, 2.9) -ax[0, 1].set_title('Available Kernels') +ax[0, 1].set_title("Available Kernels") # ---------------------------------------------------------------------- # Plot a 1D density example N = 100 np.random.seed(1) -X 
= np.concatenate((np.random.normal(0, 1, int(0.3 * N)), - np.random.normal(5, 1, int(0.7 * N))))[:, np.newaxis] +X = np.concatenate( + (np.random.normal(0, 1, int(0.3 * N)), np.random.normal(5, 1, int(0.7 * N))) +)[:, np.newaxis] X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis] -true_dens = (0.3 * norm(0, 1).pdf(X_plot[:, 0]) - + 0.7 * norm(5, 1).pdf(X_plot[:, 0])) +true_dens = 0.3 * norm(0, 1).pdf(X_plot[:, 0]) + 0.7 * norm(5, 1).pdf(X_plot[:, 0]) fig, ax = plt.subplots() -ax.fill(X_plot[:, 0], true_dens, fc='black', alpha=0.2, - label='input distribution') -colors = ['navy', 'cornflowerblue', 'darkorange'] -kernels = ['gaussian', 'tophat', 'epanechnikov'] +ax.fill(X_plot[:, 0], true_dens, fc="black", alpha=0.2, label="input distribution") +colors = ["navy", "cornflowerblue", "darkorange"] +kernels = ["gaussian", "tophat", "epanechnikov"] lw = 2 for color, kernel in zip(colors, kernels): kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X) log_dens = kde.score_samples(X_plot) - ax.plot(X_plot[:, 0], np.exp(log_dens), color=color, lw=lw, - linestyle='-', label="kernel = '{0}'".format(kernel)) + ax.plot( + X_plot[:, 0], + np.exp(log_dens), + color=color, + lw=lw, + linestyle="-", + label="kernel = '{0}'".format(kernel), + ) ax.text(6, 0.38, "N={0} points".format(N)) -ax.legend(loc='upper left') -ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k') +ax.legend(loc="upper left") +ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), "+k") ax.set_xlim(-4, 9) ax.set_ylim(-0.02, 0.4) diff --git a/examples/neighbors/plot_lof_novelty_detection.py b/examples/neighbors/plot_lof_novelty_detection.py index f1129d0bd64e6..a37d28bb6d4bb 100644 --- a/examples/neighbors/plot_lof_novelty_detection.py +++ b/examples/neighbors/plot_lof_novelty_detection.py @@ -60,24 +60,29 @@ plt.title("Novelty Detection with LOF") plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu) -a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, 
colors='darkred') -plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred') +a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors="darkred") +plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors="palevioletred") s = 40 -b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k') -b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s, - edgecolors='k') -c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s, - edgecolors='k') -plt.axis('tight') +b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c="white", s=s, edgecolors="k") +b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c="blueviolet", s=s, edgecolors="k") +c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c="gold", s=s, edgecolors="k") +plt.axis("tight") plt.xlim((-5, 5)) plt.ylim((-5, 5)) -plt.legend([a.collections[0], b1, b2, c], - ["learned frontier", "training observations", - "new regular observations", "new abnormal observations"], - loc="upper left", - prop=matplotlib.font_manager.FontProperties(size=11)) +plt.legend( + [a.collections[0], b1, b2, c], + [ + "learned frontier", + "training observations", + "new regular observations", + "new abnormal observations", + ], + loc="upper left", + prop=matplotlib.font_manager.FontProperties(size=11), +) plt.xlabel( "errors novel regular: %d/40 ; errors novel abnormal: %d/40" - % (n_error_test, n_error_outliers)) + % (n_error_test, n_error_outliers) +) plt.show() diff --git a/examples/neighbors/plot_lof_outlier_detection.py b/examples/neighbors/plot_lof_outlier_detection.py index 4bb2949bcdcd7..0c7706acc8b93 100644 --- a/examples/neighbors/plot_lof_outlier_detection.py +++ b/examples/neighbors/plot_lof_outlier_detection.py @@ -53,16 +53,22 @@ X_scores = clf.negative_outlier_factor_ plt.title("Local Outlier Factor (LOF)") -plt.scatter(X[:, 0], X[:, 1], color='k', s=3., label='Data points') +plt.scatter(X[:, 0], X[:, 1], color="k", s=3.0, label="Data points") # plot circles with radius proportional to the 
outlier scores radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min()) -plt.scatter(X[:, 0], X[:, 1], s=1000 * radius, edgecolors='r', - facecolors='none', label='Outlier scores') -plt.axis('tight') +plt.scatter( + X[:, 0], + X[:, 1], + s=1000 * radius, + edgecolors="r", + facecolors="none", + label="Outlier scores", +) +plt.axis("tight") plt.xlim((-5, 5)) plt.ylim((-5, 5)) plt.xlabel("prediction errors: %d" % (n_errors)) -legend = plt.legend(loc='upper left') +legend = plt.legend(loc="upper left") legend.legendHandles[0]._sizes = [10] legend.legendHandles[1]._sizes = [20] plt.show() diff --git a/examples/neighbors/plot_nca_classification.py b/examples/neighbors/plot_nca_classification.py index 5536e8eb69e89..79b4a7a370557 100644 --- a/examples/neighbors/plot_nca_classification.py +++ b/examples/neighbors/plot_nca_classification.py @@ -22,8 +22,7 @@ from sklearn import datasets from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler -from sklearn.neighbors import (KNeighborsClassifier, - NeighborhoodComponentsAnalysis) +from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis from sklearn.pipeline import Pipeline @@ -38,30 +37,37 @@ # slicing by using a two-dim dataset X = X[:, [0, 2]] -X_train, X_test, y_train, y_test = \ - train_test_split(X, y, stratify=y, test_size=0.7, random_state=42) +X_train, X_test, y_train, y_test = train_test_split( + X, y, stratify=y, test_size=0.7, random_state=42 +) -h = .01 # step size in the mesh +h = 0.01 # step size in the mesh # Create color maps -cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF']) -cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF']) - -names = ['KNN', 'NCA, KNN'] - -classifiers = [Pipeline([('scaler', StandardScaler()), - ('knn', KNeighborsClassifier(n_neighbors=n_neighbors)) - ]), - Pipeline([('scaler', StandardScaler()), - ('nca', NeighborhoodComponentsAnalysis()), - ('knn', 
KNeighborsClassifier(n_neighbors=n_neighbors)) - ]) - ] +cmap_light = ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"]) +cmap_bold = ListedColormap(["#FF0000", "#00FF00", "#0000FF"]) + +names = ["KNN", "NCA, KNN"] + +classifiers = [ + Pipeline( + [ + ("scaler", StandardScaler()), + ("knn", KNeighborsClassifier(n_neighbors=n_neighbors)), + ] + ), + Pipeline( + [ + ("scaler", StandardScaler()), + ("nca", NeighborhoodComponentsAnalysis()), + ("knn", KNeighborsClassifier(n_neighbors=n_neighbors)), + ] + ), +] x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 -xx, yy = np.meshgrid(np.arange(x_min, x_max, h), - np.arange(y_min, y_max, h)) +xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) for name, clf in zip(names, classifiers): @@ -75,14 +81,21 @@ # Put the result into a color plot Z = Z.reshape(xx.shape) plt.figure() - plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=.8) + plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=0.8) # Plot also the training and testing points - plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20) + plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor="k", s=20) plt.xlim(xx.min(), xx.max()) plt.ylim(yy.min(), yy.max()) plt.title("{} (k = {})".format(name, n_neighbors)) - plt.text(0.9, 0.1, '{:.2f}'.format(score), size=15, - ha='center', va='center', transform=plt.gca().transAxes) + plt.text( + 0.9, + 0.1, + "{:.2f}".format(score), + size=15, + ha="center", + va="center", + transform=plt.gca().transAxes, + ) plt.show() diff --git a/examples/neighbors/plot_nca_dim_reduction.py b/examples/neighbors/plot_nca_dim_reduction.py index 64135f76ee58e..95be0e7ec327d 100644 --- a/examples/neighbors/plot_nca_dim_reduction.py +++ b/examples/neighbors/plot_nca_dim_reduction.py @@ -35,8 +35,7 @@ from sklearn.model_selection import train_test_split from sklearn.decomposition import PCA from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
-from sklearn.neighbors import (KNeighborsClassifier, - NeighborhoodComponentsAnalysis) +from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler @@ -49,31 +48,30 @@ X, y = datasets.load_digits(return_X_y=True) # Split into train/test -X_train, X_test, y_train, y_test = \ - train_test_split(X, y, test_size=0.5, stratify=y, - random_state=random_state) +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.5, stratify=y, random_state=random_state +) dim = len(X[0]) n_classes = len(np.unique(y)) # Reduce dimension to 2 with PCA -pca = make_pipeline(StandardScaler(), - PCA(n_components=2, random_state=random_state)) +pca = make_pipeline(StandardScaler(), PCA(n_components=2, random_state=random_state)) # Reduce dimension to 2 with LinearDiscriminantAnalysis -lda = make_pipeline(StandardScaler(), - LinearDiscriminantAnalysis(n_components=2)) +lda = make_pipeline(StandardScaler(), LinearDiscriminantAnalysis(n_components=2)) # Reduce dimension to 2 with NeighborhoodComponentAnalysis -nca = make_pipeline(StandardScaler(), - NeighborhoodComponentsAnalysis(n_components=2, - random_state=random_state)) +nca = make_pipeline( + StandardScaler(), + NeighborhoodComponentsAnalysis(n_components=2, random_state=random_state), +) # Use a nearest neighbor classifier to evaluate the methods knn = KNeighborsClassifier(n_neighbors=n_neighbors) # Make a list of the methods to be compared -dim_reduction_methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)] +dim_reduction_methods = [("PCA", pca), ("LDA", lda), ("NCA", nca)] # plt.figure() for i, (name, model) in enumerate(dim_reduction_methods): @@ -93,8 +91,8 @@ X_embedded = model.transform(X) # Plot the projected points and show the evaluation score - plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=30, cmap='Set1') - plt.title("{}, KNN (k={})\nTest accuracy = {:.2f}".format(name, - n_neighbors, - 
acc_knn)) + plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=30, cmap="Set1") + plt.title( + "{}, KNN (k={})\nTest accuracy = {:.2f}".format(name, n_neighbors, acc_knn) + ) plt.show() diff --git a/examples/neighbors/plot_nca_illustration.py b/examples/neighbors/plot_nca_illustration.py index 1e1f435e7e57a..ec158e5ad9824 100644 --- a/examples/neighbors/plot_nca_illustration.py +++ b/examples/neighbors/plot_nca_illustration.py @@ -28,32 +28,37 @@ # point no. 3. The thickness of a link between point no. 3 and another point # is proportional to their distance. -X, y = make_classification(n_samples=9, n_features=2, n_informative=2, - n_redundant=0, n_classes=3, n_clusters_per_class=1, - class_sep=1.0, random_state=0) +X, y = make_classification( + n_samples=9, + n_features=2, + n_informative=2, + n_redundant=0, + n_classes=3, + n_clusters_per_class=1, + class_sep=1.0, + random_state=0, +) plt.figure(1) ax = plt.gca() for i in range(X.shape[0]): - ax.text(X[i, 0], X[i, 1], str(i), va='center', ha='center') + ax.text(X[i, 0], X[i, 1], str(i), va="center", ha="center") ax.scatter(X[i, 0], X[i, 1], s=300, c=cm.Set1(y[[i]]), alpha=0.4) ax.set_title("Original points") ax.axes.get_xaxis().set_visible(False) ax.axes.get_yaxis().set_visible(False) -ax.axis('equal') # so that boundaries are displayed correctly as circles +ax.axis("equal") # so that boundaries are displayed correctly as circles def link_thickness_i(X, i): diff_embedded = X[i] - X - dist_embedded = np.einsum('ij,ij->i', diff_embedded, - diff_embedded) + dist_embedded = np.einsum("ij,ij->i", diff_embedded, diff_embedded) dist_embedded[i] = np.inf # compute exponentiated distances (use the log-sum-exp trick to # avoid numerical instabilities - exp_dist_embedded = np.exp(-dist_embedded - - logsumexp(-dist_embedded)) + exp_dist_embedded = np.exp(-dist_embedded - logsumexp(-dist_embedded)) return exp_dist_embedded @@ -63,8 +68,7 @@ def relate_point(X, i, ax): thickness = link_thickness_i(X, i) if i != j: line = 
([pt_i[0], pt_j[0]], [pt_i[1], pt_j[1]]) - ax.plot(*line, c=cm.Set1(y[j]), - linewidth=5*thickness[j]) + ax.plot(*line, c=cm.Set1(y[j]), linewidth=5 * thickness[j]) i = 3 @@ -87,13 +91,11 @@ def relate_point(X, i, ax): relate_point(X_embedded, i, ax2) for i in range(len(X)): - ax2.text(X_embedded[i, 0], X_embedded[i, 1], str(i), - va='center', ha='center') - ax2.scatter(X_embedded[i, 0], X_embedded[i, 1], s=300, c=cm.Set1(y[[i]]), - alpha=0.4) + ax2.text(X_embedded[i, 0], X_embedded[i, 1], str(i), va="center", ha="center") + ax2.scatter(X_embedded[i, 0], X_embedded[i, 1], s=300, c=cm.Set1(y[[i]]), alpha=0.4) ax2.set_title("NCA embedding") ax2.axes.get_xaxis().set_visible(False) ax2.axes.get_yaxis().set_visible(False) -ax2.axis('equal') +ax2.axis("equal") plt.show() diff --git a/examples/neighbors/plot_nearest_centroid.py b/examples/neighbors/plot_nearest_centroid.py index 04a105c0e07fd..9e5d21777e718 100644 --- a/examples/neighbors/plot_nearest_centroid.py +++ b/examples/neighbors/plot_nearest_centroid.py @@ -23,13 +23,13 @@ X = iris.data[:, :2] y = iris.target -h = .02 # step size in the mesh +h = 0.02 # step size in the mesh # Create color maps -cmap_light = ListedColormap(['orange', 'cyan', 'cornflowerblue']) -cmap_bold = ListedColormap(['darkorange', 'c', 'darkblue']) +cmap_light = ListedColormap(["orange", "cyan", "cornflowerblue"]) +cmap_bold = ListedColormap(["darkorange", "c", "darkblue"]) -for shrinkage in [None, .2]: +for shrinkage in [None, 0.2]: # we create an instance of Neighbours Classifier and fit the data. clf = NearestCentroid(shrink_threshold=shrinkage) clf.fit(X, y) @@ -39,8 +39,7 @@ # point in the mesh [x_min, x_max]x[y_min, y_max]. 
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 - xx, yy = np.meshgrid(np.arange(x_min, x_max, h), - np.arange(y_min, y_max, h)) + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) # Put the result into a color plot @@ -49,10 +48,8 @@ plt.pcolormesh(xx, yy, Z, cmap=cmap_light) # Plot also the training points - plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, - edgecolor='k', s=20) - plt.title("3-Class classification (shrink_threshold=%r)" - % shrinkage) - plt.axis('tight') + plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor="k", s=20) + plt.title("3-Class classification (shrink_threshold=%r)" % shrinkage) + plt.axis("tight") plt.show() diff --git a/examples/neighbors/plot_regression.py b/examples/neighbors/plot_regression.py index 9625e205009aa..ba91a1a65d2a8 100644 --- a/examples/neighbors/plot_regression.py +++ b/examples/neighbors/plot_regression.py @@ -34,17 +34,16 @@ # Fit regression model n_neighbors = 5 -for i, weights in enumerate(['uniform', 'distance']): +for i, weights in enumerate(["uniform", "distance"]): knn = neighbors.KNeighborsRegressor(n_neighbors, weights=weights) y_ = knn.fit(X, y).predict(T) plt.subplot(2, 1, i + 1) - plt.scatter(X, y, color='darkorange', label='data') - plt.plot(T, y_, color='navy', label='prediction') - plt.axis('tight') + plt.scatter(X, y, color="darkorange", label="data") + plt.plot(T, y_, color="navy", label="prediction") + plt.axis("tight") plt.legend() - plt.title("KNeighborsRegressor (k = %i, weights = '%s')" % (n_neighbors, - weights)) + plt.title("KNeighborsRegressor (k = %i, weights = '%s')" % (n_neighbors, weights)) plt.tight_layout() plt.show() diff --git a/examples/neighbors/plot_species_kde.py b/examples/neighbors/plot_species_kde.py index 344c36ed452f1..9607d1a20aae4 100644 --- a/examples/neighbors/plot_species_kde.py +++ b/examples/neighbors/plot_species_kde.py @@ -48,6 
+48,7 @@ # otherwise, we'll improvise later... try: from mpl_toolkits.basemap import Basemap + basemap = True except ImportError: basemap = False @@ -82,13 +83,14 @@ def construct_grids(batch): # Get matrices/arrays of species IDs and locations data = fetch_species_distributions() -species_names = ['Bradypus Variegatus', 'Microryzomys Minutus'] +species_names = ["Bradypus Variegatus", "Microryzomys Minutus"] -Xtrain = np.vstack([data['train']['dd lat'], - data['train']['dd long']]).T -ytrain = np.array([d.decode('ascii').startswith('micro') - for d in data['train']['species']], dtype='int') -Xtrain *= np.pi / 180. # Convert lat/long to radians +Xtrain = np.vstack([data["train"]["dd lat"], data["train"]["dd long"]]).T +ytrain = np.array( + [d.decode("ascii").startswith("micro") for d in data["train"]["species"]], + dtype="int", +) +Xtrain *= np.pi / 180.0 # Convert lat/long to radians # Set up the data grid for the contour plot xgrid, ygrid = construct_grids(data) @@ -98,7 +100,7 @@ def construct_grids(batch): xy = np.vstack([Y.ravel(), X.ravel()]).T xy = xy[land_mask] -xy *= np.pi / 180. 
+xy *= np.pi / 180.0 # Plot map of South America with distributions of each species fig = plt.figure() @@ -109,12 +111,13 @@ def construct_grids(batch): # construct a kernel density estimate of the distribution print(" - computing KDE in spherical coordinates") - kde = KernelDensity(bandwidth=0.04, metric='haversine', - kernel='gaussian', algorithm='ball_tree') + kde = KernelDensity( + bandwidth=0.04, metric="haversine", kernel="gaussian", algorithm="ball_tree" + ) kde.fit(Xtrain[ytrain == i]) # evaluate only on the land: -9999 indicates ocean - Z = np.full(land_mask.shape[0], -9999, dtype='int') + Z = np.full(land_mask.shape[0], -9999, dtype="int") Z[land_mask] = np.exp(kde.score_samples(xy)) Z = Z.reshape(X.shape) @@ -124,16 +127,21 @@ def construct_grids(batch): if basemap: print(" - plot coastlines using basemap") - m = Basemap(projection='cyl', llcrnrlat=Y.min(), - urcrnrlat=Y.max(), llcrnrlon=X.min(), - urcrnrlon=X.max(), resolution='c') + m = Basemap( + projection="cyl", + llcrnrlat=Y.min(), + urcrnrlat=Y.max(), + llcrnrlon=X.min(), + urcrnrlon=X.max(), + resolution="c", + ) m.drawcoastlines() m.drawcountries() else: print(" - plot coastlines from coverage") - plt.contour(X, Y, land_reference, - levels=[-9998], colors="k", - linestyles="solid") + plt.contour( + X, Y, land_reference, levels=[-9998], colors="k", linestyles="solid" + ) plt.xticks([]) plt.yticks([]) diff --git a/examples/neural_networks/plot_mlp_alpha.py b/examples/neural_networks/plot_mlp_alpha.py index 15cd8e490efca..8e35f61fe2494 100644 --- a/examples/neural_networks/plot_mlp_alpha.py +++ b/examples/neural_networks/plot_mlp_alpha.py @@ -30,47 +30,55 @@ from sklearn.neural_network import MLPClassifier from sklearn.pipeline import make_pipeline -h = .02 # step size in the mesh +h = 0.02 # step size in the mesh alphas = np.logspace(-1, 1, 5) classifiers = [] names = [] for alpha in alphas: - classifiers.append(make_pipeline( - StandardScaler(), - MLPClassifier( - solver='lbfgs', alpha=alpha, 
random_state=1, max_iter=2000, - early_stopping=True, hidden_layer_sizes=[100, 100], + classifiers.append( + make_pipeline( + StandardScaler(), + MLPClassifier( + solver="lbfgs", + alpha=alpha, + random_state=1, + max_iter=2000, + early_stopping=True, + hidden_layer_sizes=[100, 100], + ), ) - )) + ) names.append(f"alpha {alpha:.2f}") -X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, - random_state=0, n_clusters_per_class=1) +X, y = make_classification( + n_features=2, n_redundant=0, n_informative=2, random_state=0, n_clusters_per_class=1 +) rng = np.random.RandomState(2) X += 2 * rng.uniform(size=X.shape) linearly_separable = (X, y) -datasets = [make_moons(noise=0.3, random_state=0), - make_circles(noise=0.2, factor=0.5, random_state=1), - linearly_separable] +datasets = [ + make_moons(noise=0.3, random_state=0), + make_circles(noise=0.2, factor=0.5, random_state=1), + linearly_separable, +] figure = plt.figure(figsize=(17, 9)) i = 1 # iterate over datasets for X, y in datasets: # split into training and test part - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) - x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 - y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 - xx, yy = np.meshgrid(np.arange(x_min, x_max, h), - np.arange(y_min, y_max, h)) + x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5 + y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5 + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) # just plot the dataset first cm = plt.cm.RdBu - cm_bright = ListedColormap(['#FF0000', '#0000FF']) + cm_bright = ListedColormap(["#FF0000", "#0000FF"]) ax = plt.subplot(len(datasets), len(classifiers) + 1, i) # Plot the training points ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright) @@ -97,23 +105,41 @@ # Put the result into a color plot Z = Z.reshape(xx.shape) - ax.contourf(xx, yy, 
Z, cmap=cm, alpha=.8) + ax.contourf(xx, yy, Z, cmap=cm, alpha=0.8) # Plot also the training points - ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, - edgecolors='black', s=25) + ax.scatter( + X_train[:, 0], + X_train[:, 1], + c=y_train, + cmap=cm_bright, + edgecolors="black", + s=25, + ) # and testing points - ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, - alpha=0.6, edgecolors='black', s=25) + ax.scatter( + X_test[:, 0], + X_test[:, 1], + c=y_test, + cmap=cm_bright, + alpha=0.6, + edgecolors="black", + s=25, + ) ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max()) ax.set_xticks(()) ax.set_yticks(()) ax.set_title(name) - ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'), - size=15, horizontalalignment='right') + ax.text( + xx.max() - 0.3, + yy.min() + 0.3, + ("%.2f" % score).lstrip("0"), + size=15, + horizontalalignment="right", + ) i += 1 -figure.subplots_adjust(left=.02, right=.98) +figure.subplots_adjust(left=0.02, right=0.98) plt.show() diff --git a/examples/neural_networks/plot_mlp_training_curves.py b/examples/neural_networks/plot_mlp_training_curves.py index 608db0f75ad98..45c3d3529da80 100644 --- a/examples/neural_networks/plot_mlp_training_curves.py +++ b/examples/neural_networks/plot_mlp_training_curves.py @@ -25,32 +25,69 @@ from sklearn.exceptions import ConvergenceWarning # different learning rate schedules and momentum parameters -params = [{'solver': 'sgd', 'learning_rate': 'constant', 'momentum': 0, - 'learning_rate_init': 0.2}, - {'solver': 'sgd', 'learning_rate': 'constant', 'momentum': .9, - 'nesterovs_momentum': False, 'learning_rate_init': 0.2}, - {'solver': 'sgd', 'learning_rate': 'constant', 'momentum': .9, - 'nesterovs_momentum': True, 'learning_rate_init': 0.2}, - {'solver': 'sgd', 'learning_rate': 'invscaling', 'momentum': 0, - 'learning_rate_init': 0.2}, - {'solver': 'sgd', 'learning_rate': 'invscaling', 'momentum': .9, - 'nesterovs_momentum': True, 'learning_rate_init': 
0.2}, - {'solver': 'sgd', 'learning_rate': 'invscaling', 'momentum': .9, - 'nesterovs_momentum': False, 'learning_rate_init': 0.2}, - {'solver': 'adam', 'learning_rate_init': 0.01}] - -labels = ["constant learning-rate", "constant with momentum", - "constant with Nesterov's momentum", - "inv-scaling learning-rate", "inv-scaling with momentum", - "inv-scaling with Nesterov's momentum", "adam"] - -plot_args = [{'c': 'red', 'linestyle': '-'}, - {'c': 'green', 'linestyle': '-'}, - {'c': 'blue', 'linestyle': '-'}, - {'c': 'red', 'linestyle': '--'}, - {'c': 'green', 'linestyle': '--'}, - {'c': 'blue', 'linestyle': '--'}, - {'c': 'black', 'linestyle': '-'}] +params = [ + { + "solver": "sgd", + "learning_rate": "constant", + "momentum": 0, + "learning_rate_init": 0.2, + }, + { + "solver": "sgd", + "learning_rate": "constant", + "momentum": 0.9, + "nesterovs_momentum": False, + "learning_rate_init": 0.2, + }, + { + "solver": "sgd", + "learning_rate": "constant", + "momentum": 0.9, + "nesterovs_momentum": True, + "learning_rate_init": 0.2, + }, + { + "solver": "sgd", + "learning_rate": "invscaling", + "momentum": 0, + "learning_rate_init": 0.2, + }, + { + "solver": "sgd", + "learning_rate": "invscaling", + "momentum": 0.9, + "nesterovs_momentum": True, + "learning_rate_init": 0.2, + }, + { + "solver": "sgd", + "learning_rate": "invscaling", + "momentum": 0.9, + "nesterovs_momentum": False, + "learning_rate_init": 0.2, + }, + {"solver": "adam", "learning_rate_init": 0.01}, +] + +labels = [ + "constant learning-rate", + "constant with momentum", + "constant with Nesterov's momentum", + "inv-scaling learning-rate", + "inv-scaling with momentum", + "inv-scaling with Nesterov's momentum", + "adam", +] + +plot_args = [ + {"c": "red", "linestyle": "-"}, + {"c": "green", "linestyle": "-"}, + {"c": "blue", "linestyle": "-"}, + {"c": "red", "linestyle": "--"}, + {"c": "green", "linestyle": "--"}, + {"c": "blue", "linestyle": "--"}, + {"c": "black", "linestyle": "-"}, +] def 
plot_on_dataset(X, y, ax, name): @@ -68,14 +105,14 @@ def plot_on_dataset(X, y, ax, name): for label, param in zip(labels, params): print("training: %s" % label) - mlp = MLPClassifier(random_state=0, - max_iter=max_iter, **param) + mlp = MLPClassifier(random_state=0, max_iter=max_iter, **param) # some parameter combinations will not converge as can be seen on the # plots so they are ignored here with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=ConvergenceWarning, - module="sklearn") + warnings.filterwarnings( + "ignore", category=ConvergenceWarning, module="sklearn" + ) mlp.fit(X, y) mlps.append(mlp) @@ -89,13 +126,16 @@ def plot_on_dataset(X, y, ax, name): # load / generate some toy datasets iris = datasets.load_iris() X_digits, y_digits = datasets.load_digits(return_X_y=True) -data_sets = [(iris.data, iris.target), - (X_digits, y_digits), - datasets.make_circles(noise=0.2, factor=0.5, random_state=1), - datasets.make_moons(noise=0.3, random_state=0)] - -for ax, data, name in zip(axes.ravel(), data_sets, ['iris', 'digits', - 'circles', 'moons']): +data_sets = [ + (iris.data, iris.target), + (X_digits, y_digits), + datasets.make_circles(noise=0.2, factor=0.5, random_state=1), + datasets.make_moons(noise=0.3, random_state=0), +] + +for ax, data, name in zip( + axes.ravel(), data_sets, ["iris", "digits", "circles", "moons"] +): plot_on_dataset(*data, ax=ax, name=name) fig.legend(ax.get_lines(), labels, ncol=3, loc="upper center") diff --git a/examples/neural_networks/plot_mnist_filters.py b/examples/neural_networks/plot_mnist_filters.py index 33f421a226c33..0fba5412b96d0 100644 --- a/examples/neural_networks/plot_mnist_filters.py +++ b/examples/neural_networks/plot_mnist_filters.py @@ -33,22 +33,27 @@ print(__doc__) # Load data from https://www.openml.org/d/554 -X, y = fetch_openml('mnist_784', version=1, return_X_y=True) -X = X / 255. 
+X, y = fetch_openml("mnist_784", version=1, return_X_y=True) +X = X / 255.0 # rescale the data, use the traditional train/test split X_train, X_test = X[:60000], X[60000:] y_train, y_test = y[:60000], y[60000:] -mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4, - solver='sgd', verbose=10, random_state=1, - learning_rate_init=.1) +mlp = MLPClassifier( + hidden_layer_sizes=(50,), + max_iter=10, + alpha=1e-4, + solver="sgd", + verbose=10, + random_state=1, + learning_rate_init=0.1, +) # this example won't converge because of CI's time constraints, so we catch the # warning and are ignore it here with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=ConvergenceWarning, - module="sklearn") + warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn") mlp.fit(X_train, y_train) print("Training set score: %f" % mlp.score(X_train, y_train)) @@ -58,8 +63,7 @@ # use global min / max to ensure all weights are shown on the same scale vmin, vmax = mlp.coefs_[0].min(), mlp.coefs_[0].max() for coef, ax in zip(mlp.coefs_[0].T, axes.ravel()): - ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray, vmin=.5 * vmin, - vmax=.5 * vmax) + ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray, vmin=0.5 * vmin, vmax=0.5 * vmax) ax.set_xticks(()) ax.set_yticks(()) diff --git a/examples/neural_networks/plot_rbm_logistic_classification.py b/examples/neural_networks/plot_rbm_logistic_classification.py index f7ad3513499ca..c17bebfc38cff 100644 --- a/examples/neural_networks/plot_rbm_logistic_classification.py +++ b/examples/neural_networks/plot_rbm_logistic_classification.py @@ -44,53 +44,42 @@ # ############################################################################# # Setting up + def nudge_dataset(X, Y): """ This produces a dataset 5 times bigger than the original one, by moving the 8x8 images in X around by 1px to left, right, down, up """ direction_vectors = [ - [[0, 1, 0], - [0, 0, 0], - [0, 0, 0]], - - [[0, 0, 0], - [1, 0, 
0], - [0, 0, 0]], - - [[0, 0, 0], - [0, 0, 1], - [0, 0, 0]], - - [[0, 0, 0], - [0, 0, 0], - [0, 1, 0]]] + [[0, 1, 0], [0, 0, 0], [0, 0, 0]], + [[0, 0, 0], [1, 0, 0], [0, 0, 0]], + [[0, 0, 0], [0, 0, 1], [0, 0, 0]], + [[0, 0, 0], [0, 0, 0], [0, 1, 0]], + ] def shift(x, w): - return convolve(x.reshape((8, 8)), mode='constant', weights=w).ravel() + return convolve(x.reshape((8, 8)), mode="constant", weights=w).ravel() - X = np.concatenate([X] + - [np.apply_along_axis(shift, 1, X, vector) - for vector in direction_vectors]) + X = np.concatenate( + [X] + [np.apply_along_axis(shift, 1, X, vector) for vector in direction_vectors] + ) Y = np.concatenate([Y for _ in range(5)], axis=0) return X, Y # Load Data X, y = datasets.load_digits(return_X_y=True) -X = np.asarray(X, 'float32') +X = np.asarray(X, "float32") X, Y = nudge_dataset(X, y) X = minmax_scale(X, feature_range=(0, 1)) # 0-1 scaling -X_train, X_test, Y_train, Y_test = train_test_split( - X, Y, test_size=0.2, random_state=0) +X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0) # Models we will use -logistic = linear_model.LogisticRegression(solver='newton-cg', tol=1) +logistic = linear_model.LogisticRegression(solver="newton-cg", tol=1) rbm = BernoulliRBM(random_state=0, verbose=True) -rbm_features_classifier = Pipeline( - steps=[('rbm', rbm), ('logistic', logistic)]) +rbm_features_classifier = Pipeline(steps=[("rbm", rbm), ("logistic", logistic)]) # ############################################################################# # Training @@ -110,19 +99,23 @@ def shift(x, w): # Training the Logistic regression classifier directly on the pixel raw_pixel_classifier = clone(logistic) -raw_pixel_classifier.C = 100. 
+raw_pixel_classifier.C = 100.0 raw_pixel_classifier.fit(X_train, Y_train) # ############################################################################# # Evaluation Y_pred = rbm_features_classifier.predict(X_test) -print("Logistic regression using RBM features:\n%s\n" % ( - metrics.classification_report(Y_test, Y_pred))) +print( + "Logistic regression using RBM features:\n%s\n" + % (metrics.classification_report(Y_test, Y_pred)) +) Y_pred = raw_pixel_classifier.predict(X_test) -print("Logistic regression using raw pixel features:\n%s\n" % ( - metrics.classification_report(Y_test, Y_pred))) +print( + "Logistic regression using raw pixel features:\n%s\n" + % (metrics.classification_report(Y_test, Y_pred)) +) # ############################################################################# # Plotting @@ -130,11 +123,10 @@ def shift(x, w): plt.figure(figsize=(4.2, 4)) for i, comp in enumerate(rbm.components_): plt.subplot(10, 10, i + 1) - plt.imshow(comp.reshape((8, 8)), cmap=plt.cm.gray_r, - interpolation='nearest') + plt.imshow(comp.reshape((8, 8)), cmap=plt.cm.gray_r, interpolation="nearest") plt.xticks(()) plt.yticks(()) -plt.suptitle('100 components extracted by RBM', fontsize=16) +plt.suptitle("100 components extracted by RBM", fontsize=16) plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23) plt.show() diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py index b52f35986d0f6..4dc93993330b2 100755 --- a/examples/preprocessing/plot_all_scaling.py +++ b/examples/preprocessing/plot_all_scaling.py @@ -72,51 +72,55 @@ feature_names = dataset.feature_names feature_mapping = { - 'MedInc': 'Median income in block', - 'HousAge': 'Median house age in block', - 'AveRooms': 'Average number of rooms', - 'AveBedrms': 'Average number of bedrooms', - 'Population': 'Block population', - 'AveOccup': 'Average house occupancy', - 'Latitude': 'House block latitude', - 'Longitude': 'House block longitude' + "MedInc": "Median income in 
block", + "HousAge": "Median house age in block", + "AveRooms": "Average number of rooms", + "AveBedrms": "Average number of bedrooms", + "Population": "Block population", + "AveOccup": "Average house occupancy", + "Latitude": "House block latitude", + "Longitude": "House block longitude", } # Take only 2 features to make visualization easier # Feature MedInc has a long tail distribution. # Feature AveOccup has a few but very large outliers. -features = ['MedInc', 'AveOccup'] +features = ["MedInc", "AveOccup"] features_idx = [feature_names.index(feature) for feature in features] X = X_full[:, features_idx] distributions = [ - ('Unscaled data', X), - ('Data after standard scaling', - StandardScaler().fit_transform(X)), - ('Data after min-max scaling', - MinMaxScaler().fit_transform(X)), - ('Data after max-abs scaling', - MaxAbsScaler().fit_transform(X)), - ('Data after robust scaling', - RobustScaler(quantile_range=(25, 75)).fit_transform(X)), - ('Data after power transformation (Yeo-Johnson)', - PowerTransformer(method='yeo-johnson').fit_transform(X)), - ('Data after power transformation (Box-Cox)', - PowerTransformer(method='box-cox').fit_transform(X)), - ('Data after quantile transformation (uniform pdf)', - QuantileTransformer(output_distribution='uniform') - .fit_transform(X)), - ('Data after quantile transformation (gaussian pdf)', - QuantileTransformer(output_distribution='normal') - .fit_transform(X)), - ('Data after sample-wise L2 normalizing', - Normalizer().fit_transform(X)), + ("Unscaled data", X), + ("Data after standard scaling", StandardScaler().fit_transform(X)), + ("Data after min-max scaling", MinMaxScaler().fit_transform(X)), + ("Data after max-abs scaling", MaxAbsScaler().fit_transform(X)), + ( + "Data after robust scaling", + RobustScaler(quantile_range=(25, 75)).fit_transform(X), + ), + ( + "Data after power transformation (Yeo-Johnson)", + PowerTransformer(method="yeo-johnson").fit_transform(X), + ), + ( + "Data after power transformation 
(Box-Cox)", + PowerTransformer(method="box-cox").fit_transform(X), + ), + ( + "Data after quantile transformation (uniform pdf)", + QuantileTransformer(output_distribution="uniform").fit_transform(X), + ), + ( + "Data after quantile transformation (gaussian pdf)", + QuantileTransformer(output_distribution="normal").fit_transform(X), + ), + ("Data after sample-wise L2 normalizing", Normalizer().fit_transform(X)), ] # scale the output between 0 and 1 for the colorbar y = minmax_scale(y_full) # plasma does not exist in matplotlib < 1.5 -cmap = getattr(cm, 'plasma_r', cm.hot_r) +cmap = getattr(cm, "plasma_r", cm.hot_r) def create_axes(title, figsize=(16, 6)): @@ -155,13 +159,14 @@ def create_axes(title, figsize=(16, 6)): rect_colorbar = [left, bottom, width, height] ax_colorbar = plt.axes(rect_colorbar) - return ((ax_scatter, ax_histy, ax_histx), - (ax_scatter_zoom, ax_histy_zoom, ax_histx_zoom), - ax_colorbar) + return ( + (ax_scatter, ax_histy, ax_histx), + (ax_scatter_zoom, ax_histy_zoom, ax_histx_zoom), + ax_colorbar, + ) -def plot_distribution(axes, X, y, hist_nbins=50, title="", - x0_label="", x1_label=""): +def plot_distribution(axes, X, y, hist_nbins=50, title="", x0_label="", x1_label=""): ax, hist_X1, hist_X0 = axes ax.set_title(title) @@ -170,28 +175,31 @@ def plot_distribution(axes, X, y, hist_nbins=50, title="", # The scatter plot colors = cmap(y) - ax.scatter(X[:, 0], X[:, 1], alpha=0.5, marker='o', s=5, lw=0, c=colors) + ax.scatter(X[:, 0], X[:, 1], alpha=0.5, marker="o", s=5, lw=0, c=colors) # Removing the top and the right spine for aesthetics # make nice axis layout - ax.spines['top'].set_visible(False) - ax.spines['right'].set_visible(False) + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) ax.get_xaxis().tick_bottom() ax.get_yaxis().tick_left() - ax.spines['left'].set_position(('outward', 10)) - ax.spines['bottom'].set_position(('outward', 10)) + ax.spines["left"].set_position(("outward", 10)) + 
ax.spines["bottom"].set_position(("outward", 10)) # Histogram for axis X1 (feature 5) hist_X1.set_ylim(ax.get_ylim()) - hist_X1.hist(X[:, 1], bins=hist_nbins, orientation='horizontal', - color='grey', ec='grey') - hist_X1.axis('off') + hist_X1.hist( + X[:, 1], bins=hist_nbins, orientation="horizontal", color="grey", ec="grey" + ) + hist_X1.axis("off") # Histogram for axis X0 (feature 0) hist_X0.set_xlim(ax.get_xlim()) - hist_X0.hist(X[:, 0], bins=hist_nbins, orientation='vertical', - color='grey', ec='grey') - hist_X0.axis('off') + hist_X0.hist( + X[:, 0], bins=hist_nbins, orientation="vertical", color="grey", ec="grey" + ) + hist_X0.axis("off") + # %% # Two plots will be shown for each scaler/normalizer/transformer. The left @@ -205,29 +213,42 @@ def make_plot(item_idx): title, X = distributions[item_idx] ax_zoom_out, ax_zoom_in, ax_colorbar = create_axes(title) axarr = (ax_zoom_out, ax_zoom_in) - plot_distribution(axarr[0], X, y, hist_nbins=200, - x0_label=feature_mapping[features[0]], - x1_label=feature_mapping[features[1]], - title="Full data") + plot_distribution( + axarr[0], + X, + y, + hist_nbins=200, + x0_label=feature_mapping[features[0]], + x1_label=feature_mapping[features[1]], + title="Full data", + ) # zoom-in zoom_in_percentile_range = (0, 99) cutoffs_X0 = np.percentile(X[:, 0], zoom_in_percentile_range) cutoffs_X1 = np.percentile(X[:, 1], zoom_in_percentile_range) - non_outliers_mask = ( - np.all(X > [cutoffs_X0[0], cutoffs_X1[0]], axis=1) & - np.all(X < [cutoffs_X0[1], cutoffs_X1[1]], axis=1)) - plot_distribution(axarr[1], X[non_outliers_mask], y[non_outliers_mask], - hist_nbins=50, - x0_label=feature_mapping[features[0]], - x1_label=feature_mapping[features[1]], - title="Zoom-in") + non_outliers_mask = np.all(X > [cutoffs_X0[0], cutoffs_X1[0]], axis=1) & np.all( + X < [cutoffs_X0[1], cutoffs_X1[1]], axis=1 + ) + plot_distribution( + axarr[1], + X[non_outliers_mask], + y[non_outliers_mask], + hist_nbins=50, + x0_label=feature_mapping[features[0]], + 
x1_label=feature_mapping[features[1]], + title="Zoom-in", + ) norm = mpl.colors.Normalize(y_full.min(), y_full.max()) - mpl.colorbar.ColorbarBase(ax_colorbar, cmap=cmap, - norm=norm, orientation='vertical', - label='Color mapping for values of y') + mpl.colorbar.ColorbarBase( + ax_colorbar, + cmap=cmap, + norm=norm, + orientation="vertical", + label="Color mapping for values of y", + ) # %% diff --git a/examples/preprocessing/plot_discretization.py b/examples/preprocessing/plot_discretization.py index 9cfcb30e6fdd7..2e5be3f4640f4 100644 --- a/examples/preprocessing/plot_discretization.py +++ b/examples/preprocessing/plot_discretization.py @@ -49,19 +49,17 @@ X = X.reshape(-1, 1) # transform the dataset with KBinsDiscretizer -enc = KBinsDiscretizer(n_bins=10, encode='onehot') +enc = KBinsDiscretizer(n_bins=10, encode="onehot") X_binned = enc.fit_transform(X) # predict with original dataset fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(10, 4)) line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1) reg = LinearRegression().fit(X, y) -ax1.plot(line, reg.predict(line), linewidth=2, color='green', - label="linear regression") +ax1.plot(line, reg.predict(line), linewidth=2, color="green", label="linear regression") reg = DecisionTreeRegressor(min_samples_split=3, random_state=0).fit(X, y) -ax1.plot(line, reg.predict(line), linewidth=2, color='red', - label="decision tree") -ax1.plot(X[:, 0], y, 'o', c='k') +ax1.plot(line, reg.predict(line), linewidth=2, color="red", label="decision tree") +ax1.plot(X[:, 0], y, "o", c="k") ax1.legend(loc="best") ax1.set_ylabel("Regression output") ax1.set_xlabel("Input feature") @@ -70,14 +68,25 @@ # predict with transformed dataset line_binned = enc.transform(line) reg = LinearRegression().fit(X_binned, y) -ax2.plot(line, reg.predict(line_binned), linewidth=2, color='green', - linestyle='-', label='linear regression') -reg = DecisionTreeRegressor(min_samples_split=3, - random_state=0).fit(X_binned, y) 
-ax2.plot(line, reg.predict(line_binned), linewidth=2, color='red', - linestyle=':', label='decision tree') -ax2.plot(X[:, 0], y, 'o', c='k') -ax2.vlines(enc.bin_edges_[0], *plt.gca().get_ylim(), linewidth=1, alpha=.2) +ax2.plot( + line, + reg.predict(line_binned), + linewidth=2, + color="green", + linestyle="-", + label="linear regression", +) +reg = DecisionTreeRegressor(min_samples_split=3, random_state=0).fit(X_binned, y) +ax2.plot( + line, + reg.predict(line_binned), + linewidth=2, + color="red", + linestyle=":", + label="decision tree", +) +ax2.plot(X[:, 0], y, "o", c="k") +ax2.vlines(enc.bin_edges_[0], *plt.gca().get_ylim(), linewidth=1, alpha=0.2) ax2.legend(loc="best") ax2.set_xlabel("Input feature") ax2.set_title("Result after discretization") diff --git a/examples/preprocessing/plot_discretization_classification.py b/examples/preprocessing/plot_discretization_classification.py index e55e7cb500eb1..355bb9253d963 100644 --- a/examples/preprocessing/plot_discretization_classification.py +++ b/examples/preprocessing/plot_discretization_classification.py @@ -50,42 +50,42 @@ print(__doc__) -h = .02 # step size in the mesh +h = 0.02 # step size in the mesh def get_name(estimator): name = estimator.__class__.__name__ - if name == 'Pipeline': + if name == "Pipeline": name = [get_name(est[1]) for est in estimator.steps] - name = ' + '.join(name) + name = " + ".join(name) return name # list of (estimator, param_grid), where param_grid is used in GridSearchCV classifiers = [ - (LogisticRegression(random_state=0), { - 'C': np.logspace(-2, 7, 10) - }), - (LinearSVC(random_state=0), { - 'C': np.logspace(-2, 7, 10) - }), - (make_pipeline( - KBinsDiscretizer(encode='onehot'), - LogisticRegression(random_state=0)), { - 'kbinsdiscretizer__n_bins': np.arange(2, 10), - 'logisticregression__C': np.logspace(-2, 7, 10), - }), - (make_pipeline( - KBinsDiscretizer(encode='onehot'), LinearSVC(random_state=0)), { - 'kbinsdiscretizer__n_bins': np.arange(2, 10), - 'linearsvc__C': 
np.logspace(-2, 7, 10), - }), - (GradientBoostingClassifier(n_estimators=50, random_state=0), { - 'learning_rate': np.logspace(-4, 0, 10) - }), - (SVC(random_state=0), { - 'C': np.logspace(-2, 7, 10) - }), + (LogisticRegression(random_state=0), {"C": np.logspace(-2, 7, 10)}), + (LinearSVC(random_state=0), {"C": np.logspace(-2, 7, 10)}), + ( + make_pipeline( + KBinsDiscretizer(encode="onehot"), LogisticRegression(random_state=0) + ), + { + "kbinsdiscretizer__n_bins": np.arange(2, 10), + "logisticregression__C": np.logspace(-2, 7, 10), + }, + ), + ( + make_pipeline(KBinsDiscretizer(encode="onehot"), LinearSVC(random_state=0)), + { + "kbinsdiscretizer__n_bins": np.arange(2, 10), + "linearsvc__C": np.logspace(-2, 7, 10), + }, + ), + ( + GradientBoostingClassifier(n_estimators=50, random_state=0), + {"learning_rate": np.logspace(-4, 0, 10)}, + ), + (SVC(random_state=0), {"C": np.logspace(-2, 7, 10)}), ] names = [get_name(e) for e, g in classifiers] @@ -94,57 +94,62 @@ def get_name(estimator): datasets = [ make_moons(n_samples=n_samples, noise=0.2, random_state=0), make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1), - make_classification(n_samples=n_samples, n_features=2, n_redundant=0, - n_informative=2, random_state=2, - n_clusters_per_class=1) + make_classification( + n_samples=n_samples, + n_features=2, + n_redundant=0, + n_informative=2, + random_state=2, + n_clusters_per_class=1, + ), ] -fig, axes = plt.subplots(nrows=len(datasets), ncols=len(classifiers) + 1, - figsize=(21, 9)) +fig, axes = plt.subplots( + nrows=len(datasets), ncols=len(classifiers) + 1, figsize=(21, 9) +) cm = plt.cm.PiYG -cm_bright = ListedColormap(['#b30065', '#178000']) +cm_bright = ListedColormap(["#b30065", "#178000"]) # iterate over datasets for ds_cnt, (X, y) in enumerate(datasets): - print('\ndataset %d\n---------' % ds_cnt) + print("\ndataset %d\n---------" % ds_cnt) # preprocess dataset, split into training and test part X = StandardScaler().fit_transform(X) 
X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=.5, random_state=42) + X, y, test_size=0.5, random_state=42 + ) # create the grid for background colors - x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 - y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 - xx, yy = np.meshgrid( - np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) + x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5 + y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5 + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) # plot the dataset first ax = axes[ds_cnt, 0] if ds_cnt == 0: ax.set_title("Input data") # plot the training points - ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, - edgecolors='k') + ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k") # and testing points - ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, - edgecolors='k') + ax.scatter( + X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors="k" + ) ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max()) ax.set_xticks(()) ax.set_yticks(()) # iterate over classifiers - for est_idx, (name, (estimator, param_grid)) in \ - enumerate(zip(names, classifiers)): + for est_idx, (name, (estimator, param_grid)) in enumerate(zip(names, classifiers)): ax = axes[ds_cnt, est_idx + 1] clf = GridSearchCV(estimator=estimator, param_grid=param_grid) with ignore_warnings(category=ConvergenceWarning): clf.fit(X_train, y_train) score = clf.score(X_test, y_test) - print('%s: %.2f' % (name, score)) + print("%s: %.2f" % (name, score)) # plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, x_max]*[y_min, y_max]. 
@@ -155,24 +160,37 @@ def get_name(estimator): # put the result into a color plot Z = Z.reshape(xx.shape) - ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) + ax.contourf(xx, yy, Z, cmap=cm, alpha=0.8) # plot the training points - ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, - edgecolors='k') + ax.scatter( + X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k" + ) # and testing points - ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, - edgecolors='k', alpha=0.6) + ax.scatter( + X_test[:, 0], + X_test[:, 1], + c=y_test, + cmap=cm_bright, + edgecolors="k", + alpha=0.6, + ) ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max()) ax.set_xticks(()) ax.set_yticks(()) if ds_cnt == 0: - ax.set_title(name.replace(' + ', '\n')) - ax.text(0.95, 0.06, ('%.2f' % score).lstrip('0'), size=15, - bbox=dict(boxstyle='round', alpha=0.8, facecolor='white'), - transform=ax.transAxes, horizontalalignment='right') + ax.set_title(name.replace(" + ", "\n")) + ax.text( + 0.95, + 0.06, + ("%.2f" % score).lstrip("0"), + size=15, + bbox=dict(boxstyle="round", alpha=0.8, facecolor="white"), + transform=ax.transAxes, + horizontalalignment="right", + ) plt.tight_layout() @@ -180,12 +198,18 @@ def get_name(estimator): # Add suptitles above the figure plt.subplots_adjust(top=0.90) suptitles = [ - 'Linear classifiers', - 'Feature discretization and linear classifiers', - 'Non-linear classifiers', + "Linear classifiers", + "Feature discretization and linear classifiers", + "Non-linear classifiers", ] for i, suptitle in zip([1, 3, 5], suptitles): ax = axes[0, i] - ax.text(1.05, 1.25, suptitle, transform=ax.transAxes, - horizontalalignment='center', size='x-large') + ax.text( + 1.05, + 1.25, + suptitle, + transform=ax.transAxes, + horizontalalignment="center", + size="x-large", + ) plt.show() diff --git a/examples/preprocessing/plot_discretization_strategies.py b/examples/preprocessing/plot_discretization_strategies.py index 
9ef211a83ccf3..bee3a6314cd52 100644 --- a/examples/preprocessing/plot_discretization_strategies.py +++ b/examples/preprocessing/plot_discretization_strategies.py @@ -27,7 +27,7 @@ print(__doc__) -strategies = ['uniform', 'quantile', 'kmeans'] +strategies = ["uniform", "quantile", "kmeans"] n_samples = 200 centers_0 = np.array([[0, 0], [0, 5], [2, 4], [8, 8]]) @@ -37,13 +37,23 @@ random_state = 42 X_list = [ np.random.RandomState(random_state).uniform(-3, 3, size=(n_samples, 2)), - make_blobs(n_samples=[n_samples // 10, n_samples * 4 // 10, - n_samples // 10, n_samples * 4 // 10], - cluster_std=0.5, centers=centers_0, - random_state=random_state)[0], - make_blobs(n_samples=[n_samples // 5, n_samples * 4 // 5], - cluster_std=0.5, centers=centers_1, - random_state=random_state)[0], + make_blobs( + n_samples=[ + n_samples // 10, + n_samples * 4 // 10, + n_samples // 10, + n_samples * 4 // 10, + ], + cluster_std=0.5, + centers=centers_0, + random_state=random_state, + )[0], + make_blobs( + n_samples=[n_samples // 5, n_samples * 4 // 5], + cluster_std=0.5, + centers=centers_1, + random_state=random_state, + )[0], ] figure = plt.figure(figsize=(14, 9)) @@ -51,13 +61,14 @@ for ds_cnt, X in enumerate(X_list): ax = plt.subplot(len(X_list), len(strategies) + 1, i) - ax.scatter(X[:, 0], X[:, 1], edgecolors='k') + ax.scatter(X[:, 0], X[:, 1], edgecolors="k") if ds_cnt == 0: ax.set_title("Input data", size=14) xx, yy = np.meshgrid( np.linspace(X[:, 0].min(), X[:, 0].max(), 300), - np.linspace(X[:, 1].min(), X[:, 1].max(), 300)) + np.linspace(X[:, 1].min(), X[:, 1].max(), 300), + ) grid = np.c_[xx.ravel(), yy.ravel()] ax.set_xlim(xx.min(), xx.max()) @@ -68,7 +79,7 @@ i += 1 # transform the dataset with KBinsDiscretizer for strategy in strategies: - enc = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy=strategy) + enc = KBinsDiscretizer(n_bins=4, encode="ordinal", strategy=strategy) enc.fit(X) grid_encoded = enc.transform(grid) @@ -76,18 +87,18 @@ # horizontal stripes 
horizontal = grid_encoded[:, 0].reshape(xx.shape) - ax.contourf(xx, yy, horizontal, alpha=.5) + ax.contourf(xx, yy, horizontal, alpha=0.5) # vertical stripes vertical = grid_encoded[:, 1].reshape(xx.shape) - ax.contourf(xx, yy, vertical, alpha=.5) + ax.contourf(xx, yy, vertical, alpha=0.5) - ax.scatter(X[:, 0], X[:, 1], edgecolors='k') + ax.scatter(X[:, 0], X[:, 1], edgecolors="k") ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max()) ax.set_xticks(()) ax.set_yticks(()) if ds_cnt == 0: - ax.set_title("strategy='%s'" % (strategy, ), size=14) + ax.set_title("strategy='%s'" % (strategy,), size=14) i += 1 diff --git a/examples/preprocessing/plot_map_data_to_normal.py b/examples/preprocessing/plot_map_data_to_normal.py index 581ca20a83a42..ef605cedbe5f5 100644 --- a/examples/preprocessing/plot_map_data_to_normal.py +++ b/examples/preprocessing/plot_map_data_to_normal.py @@ -53,12 +53,13 @@ rng = np.random.RandomState(304) -bc = PowerTransformer(method='box-cox') -yj = PowerTransformer(method='yeo-johnson') +bc = PowerTransformer(method="box-cox") +yj = PowerTransformer(method="yeo-johnson") # n_quantiles is set to the training set size rather than the default value # to avoid a warning being raised by this example -qt = QuantileTransformer(n_quantiles=500, output_distribution='normal', - random_state=rng) +qt = QuantileTransformer( + n_quantiles=500, output_distribution="normal", random_state=rng +) size = (N_SAMPLES, 1) @@ -88,28 +89,32 @@ # create plots distributions = [ - ('Lognormal', X_lognormal), - ('Chi-squared', X_chisq), - ('Weibull', X_weibull), - ('Gaussian', X_gaussian), - ('Uniform', X_uniform), - ('Bimodal', X_bimodal) + ("Lognormal", X_lognormal), + ("Chi-squared", X_chisq), + ("Weibull", X_weibull), + ("Gaussian", X_gaussian), + ("Uniform", X_uniform), + ("Bimodal", X_bimodal), ] -colors = ['#D81B60', '#0188FF', '#FFC107', - '#B7A2FF', '#000000', '#2EC5AC'] +colors = ["#D81B60", "#0188FF", "#FFC107", "#B7A2FF", "#000000", "#2EC5AC"] fig, axes = 
plt.subplots(nrows=8, ncols=3, figsize=plt.figaspect(2)) axes = axes.flatten() -axes_idxs = [(0, 3, 6, 9), (1, 4, 7, 10), (2, 5, 8, 11), (12, 15, 18, 21), - (13, 16, 19, 22), (14, 17, 20, 23)] -axes_list = [(axes[i], axes[j], axes[k], axes[l]) - for (i, j, k, l) in axes_idxs] +axes_idxs = [ + (0, 3, 6, 9), + (1, 4, 7, 10), + (2, 5, 8, 11), + (12, 15, 18, 21), + (13, 16, 19, 22), + (14, 17, 20, 23), +] +axes_list = [(axes[i], axes[j], axes[k], axes[l]) for (i, j, k, l) in axes_idxs] for distribution, color, axes in zip(distributions, colors, axes_list): name, X = distribution - X_train, X_test = train_test_split(X, test_size=.5) + X_train, X_test = train_test_split(X, test_size=0.5) # perform power transforms and quantile transform X_trans_bc = bc.fit(X_train).transform(X_test) @@ -122,19 +127,20 @@ ax_original.hist(X_train, color=color, bins=BINS) ax_original.set_title(name, fontsize=FONT_SIZE) - ax_original.tick_params(axis='both', which='major', labelsize=FONT_SIZE) + ax_original.tick_params(axis="both", which="major", labelsize=FONT_SIZE) for ax, X_trans, meth_name, lmbda in zip( - (ax_bc, ax_yj, ax_qt), - (X_trans_bc, X_trans_yj, X_trans_qt), - ('Box-Cox', 'Yeo-Johnson', 'Quantile transform'), - (lmbda_bc, lmbda_yj, None)): + (ax_bc, ax_yj, ax_qt), + (X_trans_bc, X_trans_yj, X_trans_qt), + ("Box-Cox", "Yeo-Johnson", "Quantile transform"), + (lmbda_bc, lmbda_yj, None), + ): ax.hist(X_trans, color=color, bins=BINS) - title = 'After {}'.format(meth_name) + title = "After {}".format(meth_name) if lmbda is not None: - title += '\n$\\lambda$ = {}'.format(lmbda) + title += "\n$\\lambda$ = {}".format(lmbda) ax.set_title(title, fontsize=FONT_SIZE) - ax.tick_params(axis='both', which='major', labelsize=FONT_SIZE) + ax.tick_params(axis="both", which="major", labelsize=FONT_SIZE) ax.set_xlim([-3.5, 3.5]) diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py index b24786e1a018d..f80debf306867 100644 --- 
a/examples/preprocessing/plot_scaling_importance.py +++ b/examples/preprocessing/plot_scaling_importance.py @@ -49,6 +49,7 @@ import matplotlib.pyplot as plt from sklearn.datasets import load_wine from sklearn.pipeline import make_pipeline + print(__doc__) # Code source: Tyler Lanigan @@ -63,9 +64,9 @@ features, target = load_wine(return_X_y=True) # Make a train/test split using 30% test size -X_train, X_test, y_train, y_test = train_test_split(features, target, - test_size=0.30, - random_state=RANDOM_STATE) +X_train, X_test, y_train, y_test = train_test_split( + features, target, test_size=0.30, random_state=RANDOM_STATE +) # Fit to data and predict using pipelined GNB and PCA. unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB()) @@ -78,54 +79,56 @@ pred_test_std = std_clf.predict(X_test) # Show prediction accuracies in scaled and unscaled data. -print('\nPrediction accuracy for the normal test dataset with PCA') -print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test))) +print("\nPrediction accuracy for the normal test dataset with PCA") +print("{:.2%}\n".format(metrics.accuracy_score(y_test, pred_test))) -print('\nPrediction accuracy for the standardized test dataset with PCA') -print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_std))) +print("\nPrediction accuracy for the standardized test dataset with PCA") +print("{:.2%}\n".format(metrics.accuracy_score(y_test, pred_test_std))) # Extract PCA from pipeline -pca = unscaled_clf.named_steps['pca'] -pca_std = std_clf.named_steps['pca'] +pca = unscaled_clf.named_steps["pca"] +pca_std = std_clf.named_steps["pca"] # Show first principal components -print('\nPC 1 without scaling:\n', pca.components_[0]) -print('\nPC 1 with scaling:\n', pca_std.components_[0]) +print("\nPC 1 without scaling:\n", pca.components_[0]) +print("\nPC 1 with scaling:\n", pca_std.components_[0]) # Use PCA without and with scale on X_train data for visualization. 
X_train_transformed = pca.transform(X_train) -scaler = std_clf.named_steps['standardscaler'] +scaler = std_clf.named_steps["standardscaler"] X_train_std_transformed = pca_std.transform(scaler.transform(X_train)) # visualize standardized vs. untouched dataset with PCA performed fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=FIG_SIZE) -for l, c, m in zip(range(0, 3), ('blue', 'red', 'green'), ('^', 's', 'o')): - ax1.scatter(X_train_transformed[y_train == l, 0], - X_train_transformed[y_train == l, 1], - color=c, - label='class %s' % l, - alpha=0.5, - marker=m - ) - -for l, c, m in zip(range(0, 3), ('blue', 'red', 'green'), ('^', 's', 'o')): - ax2.scatter(X_train_std_transformed[y_train == l, 0], - X_train_std_transformed[y_train == l, 1], - color=c, - label='class %s' % l, - alpha=0.5, - marker=m - ) - -ax1.set_title('Training dataset after PCA') -ax2.set_title('Standardized training dataset after PCA') +for l, c, m in zip(range(0, 3), ("blue", "red", "green"), ("^", "s", "o")): + ax1.scatter( + X_train_transformed[y_train == l, 0], + X_train_transformed[y_train == l, 1], + color=c, + label="class %s" % l, + alpha=0.5, + marker=m, + ) + +for l, c, m in zip(range(0, 3), ("blue", "red", "green"), ("^", "s", "o")): + ax2.scatter( + X_train_std_transformed[y_train == l, 0], + X_train_std_transformed[y_train == l, 1], + color=c, + label="class %s" % l, + alpha=0.5, + marker=m, + ) + +ax1.set_title("Training dataset after PCA") +ax2.set_title("Standardized training dataset after PCA") for ax in (ax1, ax2): - ax.set_xlabel('1st principal component') - ax.set_ylabel('2nd principal component') - ax.legend(loc='upper right') + ax.set_xlabel("1st principal component") + ax.set_ylabel("2nd principal component") + ax.legend(loc="upper right") ax.grid() plt.tight_layout() diff --git a/examples/release_highlights/plot_release_highlights_0_22_0.py b/examples/release_highlights/plot_release_highlights_0_22_0.py index cc0cfe674c61d..adac61ed9688f 100644 --- 
a/examples/release_highlights/plot_release_highlights_0_22_0.py +++ b/examples/release_highlights/plot_release_highlights_0_22_0.py @@ -81,16 +81,11 @@ X, y = load_iris(return_X_y=True) estimators = [ - ('rf', RandomForestClassifier(n_estimators=10, random_state=42)), - ('svr', make_pipeline(StandardScaler(), - LinearSVC(random_state=42))) + ("rf", RandomForestClassifier(n_estimators=10, random_state=42)), + ("svr", make_pipeline(StandardScaler(), LinearSVC(random_state=42))), ] -clf = StackingClassifier( - estimators=estimators, final_estimator=LogisticRegression() -) -X_train, X_test, y_train, y_test = train_test_split( - X, y, stratify=y, random_state=42 -) +clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression()) +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) clf.fit(X_train, y_train).score(X_test, y_test) # %% @@ -107,16 +102,16 @@ from sklearn.inspection import permutation_importance X, y = make_classification(random_state=0, n_features=5, n_informative=3) -feature_names = np.array([f'x_{i}' for i in range(X.shape[1])]) +feature_names = np.array([f"x_{i}" for i in range(X.shape[1])]) rf = RandomForestClassifier(random_state=0).fit(X, y) -result = permutation_importance(rf, X, y, n_repeats=10, random_state=0, - n_jobs=-1) +result = permutation_importance(rf, X, y, n_repeats=10, random_state=0, n_jobs=-1) fig, ax = plt.subplots() sorted_idx = result.importances_mean.argsort() -ax.boxplot(result.importances[sorted_idx].T, - vert=False, labels=feature_names[sorted_idx]) +ax.boxplot( + result.importances[sorted_idx].T, vert=False, labels=feature_names[sorted_idx] +) ax.set_title("Permutation Importance of each feature") ax.set_ylabel("Features") fig.tight_layout() @@ -161,9 +156,10 @@ with TemporaryDirectory(prefix="sklearn_cache_") as tmpdir: estimator = make_pipeline( - KNeighborsTransformer(n_neighbors=10, mode='distance'), - Isomap(n_neighbors=10, metric='precomputed'), - memory=tmpdir) + 
KNeighborsTransformer(n_neighbors=10, mode="distance"), + Isomap(n_neighbors=10, metric="precomputed"), + memory=tmpdir, + ) estimator.fit(X) # We can decrease the number of neighbors and the graph will not be @@ -204,12 +200,18 @@ X, y = make_classification(random_state=0) rf = RandomForestClassifier(random_state=0, ccp_alpha=0).fit(X, y) -print("Average number of nodes without pruning {:.1f}".format( - np.mean([e.tree_.node_count for e in rf.estimators_]))) +print( + "Average number of nodes without pruning {:.1f}".format( + np.mean([e.tree_.node_count for e in rf.estimators_]) + ) +) rf = RandomForestClassifier(random_state=0, ccp_alpha=0.05).fit(X, y) -print("Average number of nodes with pruning {:.1f}".format( - np.mean([e.tree_.node_count for e in rf.estimators_]))) +print( + "Average number of nodes with pruning {:.1f}".format( + np.mean([e.tree_.node_count for e in rf.estimators_]) + ) +) # %% # Retrieve dataframes from OpenML @@ -219,8 +221,8 @@ from sklearn.datasets import fetch_openml -titanic = fetch_openml('titanic', version=1, as_frame=True) -print(titanic.data.head()[['pclass', 'embarked']]) +titanic = fetch_openml("titanic", version=1, as_frame=True) +print(titanic.data.head()[["pclass", "embarked"]]) # %% # Checking scikit-learn compatibility of an estimator @@ -245,6 +247,7 @@ def test_sklearn_compatible_estimator(estimator, check): check(estimator) + # %% # ROC AUC now supports multiclass classification # ---------------------------------------------- @@ -266,5 +269,5 @@ def test_sklearn_compatible_estimator(estimator, check): from sklearn.metrics import roc_auc_score X, y = make_classification(n_classes=4, n_informative=16) -clf = SVC(decision_function_shape='ovo', probability=True).fit(X, y) -print(roc_auc_score(y, clf.predict_proba(X), multi_class='ovo')) +clf = SVC(decision_function_shape="ovo", probability=True).fit(X, y) +print(roc_auc_score(y, clf.predict_proba(X), multi_class="ovo")) diff --git 
a/examples/release_highlights/plot_release_highlights_0_23_0.py b/examples/release_highlights/plot_release_highlights_0_23_0.py index 409c41a035540..d81f5886e6c63 100644 --- a/examples/release_highlights/plot_release_highlights_0_23_0.py +++ b/examples/release_highlights/plot_release_highlights_0_23_0.py @@ -45,7 +45,7 @@ y = rng.poisson(lam=np.exp(X[:, 5]) / 2) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) glm = PoissonRegressor() -gbdt = HistGradientBoostingRegressor(loss='poisson', learning_rate=.01) +gbdt = HistGradientBoostingRegressor(loss="poisson", learning_rate=0.01) glm.fit(X_train, y_train) gbdt.fit(X_train, y_train) print(glm.score(X_test, y_test)) @@ -67,16 +67,19 @@ from sklearn.impute import SimpleImputer from sklearn.compose import make_column_transformer from sklearn.linear_model import LogisticRegression -set_config(display='diagram') -num_proc = make_pipeline(SimpleImputer(strategy='median'), StandardScaler()) +set_config(display="diagram") + +num_proc = make_pipeline(SimpleImputer(strategy="median"), StandardScaler()) cat_proc = make_pipeline( - SimpleImputer(strategy='constant', fill_value='missing'), - OneHotEncoder(handle_unknown='ignore')) + SimpleImputer(strategy="constant", fill_value="missing"), + OneHotEncoder(handle_unknown="ignore"), +) -preprocessor = make_column_transformer((num_proc, ('feat1', 'feat3')), - (cat_proc, ('feat0', 'feat2'))) +preprocessor = make_column_transformer( + (num_proc, ("feat1", "feat3")), (cat_proc, ("feat0", "feat2")) +) clf = make_pipeline(preprocessor, LogisticRegression()) clf @@ -101,7 +104,7 @@ X, y = make_blobs(random_state=rng) X = scipy.sparse.csr_matrix(X) X_train, X_test, _, y_test = train_test_split(X, y, random_state=rng) -kmeans = KMeans(algorithm='elkan').fit(X_train) +kmeans = KMeans(algorithm="elkan").fit(X_train) print(completeness_score(kmeans.predict(X_test), y_test)) ############################################################################## @@ -129,21 
+132,30 @@ rng = np.random.RandomState(0) X = rng.randn(n_samples, 2) noise = rng.normal(loc=0.0, scale=0.01, size=n_samples) -y = (5 * X[:, 0] + np.sin(10 * np.pi * X[:, 0]) - noise) +y = 5 * X[:, 0] + np.sin(10 * np.pi * X[:, 0]) - noise gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y) gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=[1, 0]).fit(X, y) disp = plot_partial_dependence( - gbdt_no_cst, X, features=[0], feature_names=['feature 0'], - line_kw={'linewidth': 4, 'label': 'unconstrained', "color": "tab:blue"}) -plot_partial_dependence(gbdt_cst, X, features=[0], - line_kw={'linewidth': 4, 'label': 'constrained', "color": "tab:orange"}, - ax=disp.axes_) + gbdt_no_cst, + X, + features=[0], + feature_names=["feature 0"], + line_kw={"linewidth": 4, "label": "unconstrained", "color": "tab:blue"}, +) +plot_partial_dependence( + gbdt_cst, + X, + features=[0], + line_kw={"linewidth": 4, "label": "constrained", "color": "tab:orange"}, + ax=disp.axes_, +) disp.axes_[0, 0].plot( - X[:, 0], y, 'o', alpha=.5, zorder=-1, label='samples', color="tab:green" + X[:, 0], y, "o", alpha=0.5, zorder=-1, label="samples", color="tab:green" ) -disp.axes_[0, 0].set_ylim(-3, 3); disp.axes_[0, 0].set_xlim(-1, 1) +disp.axes_[0, 0].set_ylim(-3, 3) +disp.axes_[0, 0].set_xlim(-1, 1) plt.legend() plt.show() @@ -163,7 +175,8 @@ X, y = make_regression(n_samples, n_features, random_state=rng) sample_weight = rng.rand(n_samples) X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split( - X, y, sample_weight, random_state=rng) + X, y, sample_weight, random_state=rng +) reg = Lasso() reg.fit(X_train, y_train, sample_weight=sw_train) print(reg.score(X_test, y_test, sw_test)) diff --git a/examples/release_highlights/plot_release_highlights_0_24_0.py b/examples/release_highlights/plot_release_highlights_0_24_0.py index f5b10dfb21acc..e54e91fe5aafb 100644 --- a/examples/release_highlights/plot_release_highlights_0_24_0.py +++ 
b/examples/release_highlights/plot_release_highlights_0_24_0.py @@ -43,7 +43,7 @@ # Read more in the :ref:`User Guide ` (note: # the Successive Halving estimators are still :term:`experimental # `). -# +# # .. figure:: ../model_selection/images/sphx_glr_plot_successive_halving_iterations_001.png # :target: ../model_selection/plot_successive_halving_iterations.html # :align: center @@ -61,14 +61,17 @@ clf = RandomForestClassifier(n_estimators=10, random_state=rng) -param_dist = {"max_depth": [3, None], - "max_features": randint(1, 11), - "min_samples_split": randint(2, 11), - "bootstrap": [True, False], - "criterion": ["gini", "entropy"]} +param_dist = { + "max_depth": [3, None], + "max_features": randint(1, 11), + "min_samples_split": randint(2, 11), + "bootstrap": [True, False], + "criterion": ["gini", "entropy"], +} -rsh = HalvingRandomSearchCV(estimator=clf, param_distributions=param_dist, - factor=2, random_state=rng) +rsh = HalvingRandomSearchCV( + estimator=clf, param_distributions=param_dist, factor=2, random_state=rng +) rsh.fit(X, y) rsh.best_params_ @@ -145,8 +148,10 @@ knn = KNeighborsClassifier(n_neighbors=3) sfs = SequentialFeatureSelector(knn, n_features_to_select=2) sfs.fit(X, y) -print("Features selected by forward sequential selection: " - f"{feature_names[sfs.get_support()].tolist()}") +print( + "Features selected by forward sequential selection: " + f"{feature_names[sfs.get_support()].tolist()}" +) ############################################################################## # New PolynomialCountSketch kernel approximation function @@ -164,19 +169,20 @@ from sklearn.linear_model import LogisticRegression X, y = fetch_covtype(return_X_y=True) -pipe = make_pipeline(MinMaxScaler(), - PolynomialCountSketch(degree=2, n_components=300), - LogisticRegression(max_iter=1000)) -X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000, - test_size=10000, - random_state=42) +pipe = make_pipeline( + MinMaxScaler(), + 
PolynomialCountSketch(degree=2, n_components=300), + LogisticRegression(max_iter=1000), +) +X_train, X_test, y_train, y_test = train_test_split( + X, y, train_size=5000, test_size=10000, random_state=42 +) pipe.fit(X_train, y_train).score(X_test, y_test) ############################################################################## # For comparison, here is the score of a linear baseline for the same data: -linear_baseline = make_pipeline(MinMaxScaler(), - LogisticRegression(max_iter=1000)) +linear_baseline = make_pipeline(MinMaxScaler(), LogisticRegression(max_iter=1000)) linear_baseline.fit(X_train, y_train).score(X_test, y_test) ############################################################################## @@ -192,16 +198,22 @@ from sklearn.inspection import plot_partial_dependence X, y = fetch_california_housing(return_X_y=True, as_frame=True) -features = ['MedInc', 'AveOccup', 'HouseAge', 'AveRooms'] +features = ["MedInc", "AveOccup", "HouseAge", "AveRooms"] est = RandomForestRegressor(n_estimators=10) est.fit(X, y) display = plot_partial_dependence( - est, X, features, kind="individual", subsample=50, - n_jobs=3, grid_resolution=20, random_state=0 + est, + X, + features, + kind="individual", + subsample=50, + n_jobs=3, + grid_resolution=20, + random_state=0, ) display.figure_.suptitle( - 'Partial dependence of house value on non-location features\n' - 'for the California housing dataset, with BayesianRidge' + "Partial dependence of house value on non-location features\n" + "for the California housing dataset, with BayesianRidge" ) display.figure_.subplots_adjust(hspace=0.3) @@ -223,7 +235,7 @@ # positive integer target correlated with X[:, 5] with many zeros: y = rng.poisson(lam=np.exp(X[:, 5]) / 2) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) -regressor = DecisionTreeRegressor(criterion='poisson', random_state=0) +regressor = DecisionTreeRegressor(criterion="poisson", random_state=0) regressor.fit(X_train, y_train) 
############################################################################## diff --git a/examples/semi_supervised/plot_label_propagation_digits.py b/examples/semi_supervised/plot_label_propagation_digits.py index 715546f78ab25..d75edca605b99 100644 --- a/examples/semi_supervised/plot_label_propagation_digits.py +++ b/examples/semi_supervised/plot_label_propagation_digits.py @@ -52,15 +52,17 @@ class will be very good. # ############################################################################# # Learn with LabelSpreading -lp_model = LabelSpreading(gamma=.25, max_iter=20) +lp_model = LabelSpreading(gamma=0.25, max_iter=20) lp_model.fit(X, y_train) predicted_labels = lp_model.transduction_[unlabeled_set] true_labels = y[unlabeled_set] cm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_) -print("Label Spreading model: %d labeled & %d unlabeled points (%d total)" % - (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples)) +print( + "Label Spreading model: %d labeled & %d unlabeled points (%d total)" + % (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples) +) print(classification_report(true_labels, predicted_labels)) @@ -85,8 +87,9 @@ class will be very good. 
sub.imshow(image, cmap=plt.cm.gray_r) plt.xticks([]) plt.yticks([]) - sub.set_title('predict: %i\ntrue: %i' % ( - lp_model.transduction_[image_index], y[image_index])) + sub.set_title( + "predict: %i\ntrue: %i" % (lp_model.transduction_[image_index], y[image_index]) + ) -f.suptitle('Learning with small amount of labeled data') +f.suptitle("Learning with small amount of labeled data") plt.show() diff --git a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py index a05fa07d755d7..e6c19403aa728 100644 --- a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py +++ b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py @@ -60,13 +60,13 @@ predicted_labels = lp_model.transduction_[unlabeled_indices] true_labels = y[unlabeled_indices] - cm = confusion_matrix(true_labels, predicted_labels, - labels=lp_model.classes_) + cm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_) print("Iteration %i %s" % (i, 70 * "_")) - print("Label Spreading model: %d labeled & %d unlabeled (%d total)" - % (n_labeled_points, n_total_samples - n_labeled_points, - n_total_samples)) + print( + "Label Spreading model: %d labeled & %d unlabeled (%d total)" + % (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples) + ) print(classification_report(true_labels, predicted_labels)) @@ -74,42 +74,50 @@ print(cm) # compute the entropies of transduced label distributions - pred_entropies = stats.distributions.entropy( - lp_model.label_distributions_.T) + pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T) # select up to 5 digit examples that the classifier is most uncertain about uncertainty_index = np.argsort(pred_entropies)[::-1] uncertainty_index = uncertainty_index[ - np.in1d(uncertainty_index, unlabeled_indices)][:5] + np.in1d(uncertainty_index, unlabeled_indices) + ][:5] # keep track of indices 
that we get labels for delete_indices = np.array([], dtype=int) # for more than 5 iterations, visualize the gain only on the first 5 if i < 5: - f.text(.05, (1 - (i + 1) * .183), - "model %d\n\nfit with\n%d labels" % - ((i + 1), i * 5 + 10), size=10) + f.text( + 0.05, + (1 - (i + 1) * 0.183), + "model %d\n\nfit with\n%d labels" % ((i + 1), i * 5 + 10), + size=10, + ) for index, image_index in enumerate(uncertainty_index): image = images[image_index] # for more than 5 iterations, visualize the gain only on the first 5 if i < 5: sub = f.add_subplot(5, 5, index + 1 + (5 * i)) - sub.imshow(image, cmap=plt.cm.gray_r, interpolation='none') - sub.set_title("predict: %i\ntrue: %i" % ( - lp_model.transduction_[image_index], y[image_index]), size=10) - sub.axis('off') + sub.imshow(image, cmap=plt.cm.gray_r, interpolation="none") + sub.set_title( + "predict: %i\ntrue: %i" + % (lp_model.transduction_[image_index], y[image_index]), + size=10, + ) + sub.axis("off") # labeling 5 points, remote from labeled set - delete_index, = np.where(unlabeled_indices == image_index) + (delete_index,) = np.where(unlabeled_indices == image_index) delete_indices = np.concatenate((delete_indices, delete_index)) unlabeled_indices = np.delete(unlabeled_indices, delete_indices) n_labeled_points += len(uncertainty_index) -f.suptitle("Active learning with Label Propagation.\nRows show 5 most " - "uncertain labels to learn with the next model.", y=1.15) -plt.subplots_adjust(left=0.2, bottom=0.03, right=0.9, top=0.9, wspace=0.2, - hspace=0.85) +f.suptitle( + "Active learning with Label Propagation.\nRows show 5 most " + "uncertain labels to learn with the next model.", + y=1.15, +) +plt.subplots_adjust(left=0.2, bottom=0.03, right=0.9, top=0.9, wspace=0.2, hspace=0.85) plt.show() diff --git a/examples/semi_supervised/plot_label_propagation_structure.py b/examples/semi_supervised/plot_label_propagation_structure.py index f0145bcd53ccb..e94eab6e16dc1 100644 --- 
a/examples/semi_supervised/plot_label_propagation_structure.py +++ b/examples/semi_supervised/plot_label_propagation_structure.py @@ -24,13 +24,13 @@ n_samples = 200 X, y = make_circles(n_samples=n_samples, shuffle=False) outer, inner = 0, 1 -labels = np.full(n_samples, -1.) +labels = np.full(n_samples, -1.0) labels[0] = outer labels[-1] = inner # ############################################################################# # Learn with LabelSpreading -label_spread = LabelSpreading(kernel='knn', alpha=0.8) +label_spread = LabelSpreading(kernel="knn", alpha=0.8) label_spread.fit(X, labels) # ############################################################################# @@ -38,24 +38,57 @@ output_labels = label_spread.transduction_ plt.figure(figsize=(8.5, 4)) plt.subplot(1, 2, 1) -plt.scatter(X[labels == outer, 0], X[labels == outer, 1], color='navy', - marker='s', lw=0, label="outer labeled", s=10) -plt.scatter(X[labels == inner, 0], X[labels == inner, 1], color='c', - marker='s', lw=0, label='inner labeled', s=10) -plt.scatter(X[labels == -1, 0], X[labels == -1, 1], color='darkorange', - marker='.', label='unlabeled') -plt.legend(scatterpoints=1, shadow=False, loc='upper right') +plt.scatter( + X[labels == outer, 0], + X[labels == outer, 1], + color="navy", + marker="s", + lw=0, + label="outer labeled", + s=10, +) +plt.scatter( + X[labels == inner, 0], + X[labels == inner, 1], + color="c", + marker="s", + lw=0, + label="inner labeled", + s=10, +) +plt.scatter( + X[labels == -1, 0], + X[labels == -1, 1], + color="darkorange", + marker=".", + label="unlabeled", +) +plt.legend(scatterpoints=1, shadow=False, loc="upper right") plt.title("Raw data (2 classes=outer and inner)") plt.subplot(1, 2, 2) output_label_array = np.asarray(output_labels) outer_numbers = np.where(output_label_array == outer)[0] inner_numbers = np.where(output_label_array == inner)[0] -plt.scatter(X[outer_numbers, 0], X[outer_numbers, 1], color='navy', - marker='s', lw=0, s=10, label="outer 
learned") -plt.scatter(X[inner_numbers, 0], X[inner_numbers, 1], color='c', - marker='s', lw=0, s=10, label="inner learned") -plt.legend(scatterpoints=1, shadow=False, loc='upper right') +plt.scatter( + X[outer_numbers, 0], + X[outer_numbers, 1], + color="navy", + marker="s", + lw=0, + s=10, + label="outer learned", +) +plt.scatter( + X[inner_numbers, 0], + X[inner_numbers, 1], + color="c", + marker="s", + lw=0, + s=10, + label="inner learned", +) +plt.legend(scatterpoints=1, shadow=False, loc="upper right") plt.title("Labels learned with Label Spreading (KNN)") plt.subplots_adjust(left=0.07, bottom=0.07, right=0.93, top=0.92) diff --git a/examples/semi_supervised/plot_self_training_varying_threshold.py b/examples/semi_supervised/plot_self_training_varying_threshold.py index c2c89a36b5e8c..fa2ac289086d3 100644 --- a/examples/semi_supervised/plot_self_training_varying_threshold.py +++ b/examples/semi_supervised/plot_self_training_varying_threshold.py @@ -58,8 +58,7 @@ amount_iterations = np.empty((x_values.shape[0], n_splits)) for (i, threshold) in enumerate(x_values): - self_training_clf = SelfTrainingClassifier(base_classifier, - threshold=threshold) + self_training_clf = SelfTrainingClassifier(base_classifier, threshold=threshold) # We need manual cross validation so that we don't treat -1 as a separate # class when computing accuracy @@ -74,8 +73,10 @@ self_training_clf.fit(X_train, y_train) # The amount of labeled samples that at the end of fitting - amount_labeled[i, fold] = total_samples - np.unique( - self_training_clf.labeled_iter_, return_counts=True)[1][0] + amount_labeled[i, fold] = ( + total_samples + - np.unique(self_training_clf.labeled_iter_, return_counts=True)[1][0] + ) # The last iteration the classifier labeled a sample in amount_iterations[i, fold] = np.max(self_training_clf.labeled_iter_) @@ -84,26 +85,34 @@ ax1 = plt.subplot(211) -ax1.errorbar(x_values, scores.mean(axis=1), - yerr=scores.std(axis=1), - capsize=2, color='b') 
-ax1.set_ylabel('Accuracy', color='b') -ax1.tick_params('y', colors='b') +ax1.errorbar( + x_values, scores.mean(axis=1), yerr=scores.std(axis=1), capsize=2, color="b" +) +ax1.set_ylabel("Accuracy", color="b") +ax1.tick_params("y", colors="b") ax2 = ax1.twinx() -ax2.errorbar(x_values, amount_labeled.mean(axis=1), - yerr=amount_labeled.std(axis=1), - capsize=2, color='g') +ax2.errorbar( + x_values, + amount_labeled.mean(axis=1), + yerr=amount_labeled.std(axis=1), + capsize=2, + color="g", +) ax2.set_ylim(bottom=0) -ax2.set_ylabel('Amount of labeled samples', color='g') -ax2.tick_params('y', colors='g') +ax2.set_ylabel("Amount of labeled samples", color="g") +ax2.tick_params("y", colors="g") ax3 = plt.subplot(212, sharex=ax1) -ax3.errorbar(x_values, amount_iterations.mean(axis=1), - yerr=amount_iterations.std(axis=1), - capsize=2, color='b') +ax3.errorbar( + x_values, + amount_iterations.mean(axis=1), + yerr=amount_iterations.std(axis=1), + capsize=2, + color="b", +) ax3.set_ylim(bottom=0) -ax3.set_ylabel('Amount of iterations') -ax3.set_xlabel('Threshold') +ax3.set_ylabel("Amount of iterations") +ax3.set_xlabel("Threshold") plt.show() diff --git a/examples/semi_supervised/plot_semi_supervised_newsgroups.py b/examples/semi_supervised/plot_semi_supervised_newsgroups.py index 72815471f54b9..7316417e86120 100644 --- a/examples/semi_supervised/plot_semi_supervised_newsgroups.py +++ b/examples/semi_supervised/plot_semi_supervised_newsgroups.py @@ -24,45 +24,52 @@ from sklearn.semi_supervised import LabelSpreading from sklearn.metrics import f1_score -data = fetch_20newsgroups(subset='train', categories=None) +data = fetch_20newsgroups(subset="train", categories=None) print("%d documents" % len(data.filenames)) print("%d categories" % len(data.target_names)) print() # Parameters -sdg_params = dict(alpha=1e-5, penalty='l2', loss='log') +sdg_params = dict(alpha=1e-5, penalty="l2", loss="log") vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8) # Supervised 
Pipeline -pipeline = Pipeline([ - ('vect', CountVectorizer(**vectorizer_params)), - ('tfidf', TfidfTransformer()), - ('clf', SGDClassifier(**sdg_params)), -]) +pipeline = Pipeline( + [ + ("vect", CountVectorizer(**vectorizer_params)), + ("tfidf", TfidfTransformer()), + ("clf", SGDClassifier(**sdg_params)), + ] +) # SelfTraining Pipeline -st_pipeline = Pipeline([ - ('vect', CountVectorizer(**vectorizer_params)), - ('tfidf', TfidfTransformer()), - ('clf', SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)), -]) +st_pipeline = Pipeline( + [ + ("vect", CountVectorizer(**vectorizer_params)), + ("tfidf", TfidfTransformer()), + ("clf", SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)), + ] +) # LabelSpreading Pipeline -ls_pipeline = Pipeline([ - ('vect', CountVectorizer(**vectorizer_params)), - ('tfidf', TfidfTransformer()), - # LabelSpreading does not support dense matrices - ('todense', FunctionTransformer(lambda x: x.todense())), - ('clf', LabelSpreading()), -]) +ls_pipeline = Pipeline( + [ + ("vect", CountVectorizer(**vectorizer_params)), + ("tfidf", TfidfTransformer()), + # LabelSpreading does not support dense matrices + ("todense", FunctionTransformer(lambda x: x.todense())), + ("clf", LabelSpreading()), + ] +) def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test): print("Number of training samples:", len(X_train)) - print("Unlabeled samples in training set:", - sum(1 for x in y_train if x == -1)) + print("Unlabeled samples in training set:", sum(1 for x in y_train if x == -1)) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) - print("Micro-averaged F1 score on test set: " - "%0.3f" % f1_score(y_test, y_pred, average='micro')) + print( + "Micro-averaged F1 score on test set: %0.3f" + % f1_score(y_test, y_pred, average="micro") + ) print("-" * 10) print() @@ -78,18 +85,18 @@ def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test): y_mask = np.random.rand(len(y_train)) < 0.2 # X_20 and y_20 are the subset of 
the train dataset indicated by the mask - X_20, y_20 = map(list, zip(*((x, y) - for x, y, m in zip(X_train, y_train, y_mask) if m))) + X_20, y_20 = map( + list, zip(*((x, y) for x, y, m in zip(X_train, y_train, y_mask) if m)) + ) print("Supervised SGDClassifier on 20% of the training data:") eval_and_print_metrics(pipeline, X_20, y_20, X_test, y_test) # set the non-masked subset to be unlabeled y_train[~y_mask] = -1 - print("SelfTrainingClassifier on 20% of the training data (rest " - "is unlabeled):") + print("SelfTrainingClassifier on 20% of the training data (rest is unlabeled):") eval_and_print_metrics(st_pipeline, X_train, y_train, X_test, y_test) - if 'CI' not in os.environ: + if "CI" not in os.environ: # LabelSpreading takes too long to run in the online documentation print("LabelSpreading on 20% of the data (rest is unlabeled):") eval_and_print_metrics(ls_pipeline, X_train, y_train, X_test, y_test) diff --git a/examples/semi_supervised/plot_semi_supervised_versus_svm_iris.py b/examples/semi_supervised/plot_semi_supervised_versus_svm_iris.py index f93dc2f28370e..f9703fd44a902 100644 --- a/examples/semi_supervised/plot_semi_supervised_versus_svm_iris.py +++ b/examples/semi_supervised/plot_semi_supervised_versus_svm_iris.py @@ -33,7 +33,7 @@ y = iris.target # step size in the mesh -h = .02 +h = 0.02 rng = np.random.RandomState(0) y_rand = rng.rand(y.shape[0]) @@ -43,26 +43,31 @@ y_50[y_rand < 0.5] = -1 # we create an instance of SVM and fit out data. 
We do not scale our # data since we want to plot the support vectors -ls30 = (LabelSpreading().fit(X, y_30), y_30, 'Label Spreading 30% data') -ls50 = (LabelSpreading().fit(X, y_50), y_50, 'Label Spreading 50% data') -ls100 = (LabelSpreading().fit(X, y), y, 'Label Spreading 100% data') +ls30 = (LabelSpreading().fit(X, y_30), y_30, "Label Spreading 30% data") +ls50 = (LabelSpreading().fit(X, y_50), y_50, "Label Spreading 50% data") +ls100 = (LabelSpreading().fit(X, y), y, "Label Spreading 100% data") # the base classifier for self-training is identical to the SVC -base_classifier = SVC(kernel='rbf', gamma=.5, probability=True) -st30 = (SelfTrainingClassifier(base_classifier).fit(X, y_30), - y_30, 'Self-training 30% data') -st50 = (SelfTrainingClassifier(base_classifier).fit(X, y_50), - y_50, 'Self-training 50% data') - -rbf_svc = (SVC(kernel='rbf', gamma=.5).fit(X, y), y, 'SVC with rbf kernel') +base_classifier = SVC(kernel="rbf", gamma=0.5, probability=True) +st30 = ( + SelfTrainingClassifier(base_classifier).fit(X, y_30), + y_30, + "Self-training 30% data", +) +st50 = ( + SelfTrainingClassifier(base_classifier).fit(X, y_50), + y_50, + "Self-training 50% data", +) + +rbf_svc = (SVC(kernel="rbf", gamma=0.5).fit(X, y), y, "SVC with rbf kernel") # create a mesh to plot in x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 -xx, yy = np.meshgrid(np.arange(x_min, x_max, h), - np.arange(y_min, y_max, h)) +xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) -color_map = {-1: (1, 1, 1), 0: (0, 0, .9), 1: (1, 0, 0), 2: (.8, .6, 0)} +color_map = {-1: (1, 1, 1), 0: (0, 0, 0.9), 1: (1, 0, 0), 2: (0.8, 0.6, 0)} classifiers = (ls30, st30, ls50, st50, ls100, rbf_svc) for i, (clf, y_train, title) in enumerate(classifiers): @@ -74,11 +79,11 @@ # Put the result into a color plot Z = Z.reshape(xx.shape) plt.contourf(xx, yy, Z, cmap=plt.cm.Paired) - plt.axis('off') + plt.axis("off") # Plot also the training 
points colors = [color_map[y] for y in y_train] - plt.scatter(X[:, 0], X[:, 1], c=colors, edgecolors='black') + plt.scatter(X[:, 0], X[:, 1], c=colors, edgecolors="black") plt.title(title) diff --git a/examples/svm/plot_custom_kernel.py b/examples/svm/plot_custom_kernel.py index 5ee70f8cc7801..86fe0aa8e585e 100644 --- a/examples/svm/plot_custom_kernel.py +++ b/examples/svm/plot_custom_kernel.py @@ -32,7 +32,7 @@ def my_kernel(X, Y): return np.dot(np.dot(X, M), Y.T) -h = .02 # step size in the mesh +h = 0.02 # step size in the mesh # we create an instance of SVM and fit out data. clf = svm.SVC(kernel=my_kernel) @@ -50,8 +50,7 @@ def my_kernel(X, Y): plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired) # Plot also the training points -plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolors='k') -plt.title('3-Class classification using Support Vector Machine with custom' - ' kernel') -plt.axis('tight') +plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolors="k") +plt.title("3-Class classification using Support Vector Machine with custom kernel") +plt.axis("tight") plt.show() diff --git a/examples/svm/plot_iris_svc.py b/examples/svm/plot_iris_svc.py index ab7860296985c..10b64f1c62e09 100644 --- a/examples/svm/plot_iris_svc.py +++ b/examples/svm/plot_iris_svc.py @@ -40,7 +40,7 @@ from sklearn import svm, datasets -def make_meshgrid(x, y, h=.02): +def make_meshgrid(x, y, h=0.02): """Create a mesh of points to plot in Parameters @@ -55,8 +55,7 @@ def make_meshgrid(x, y, h=.02): """ x_min, x_max = x.min() - 1, x.max() + 1 y_min, y_max = y.min() - 1, y.max() + 1 - xx, yy = np.meshgrid(np.arange(x_min, x_max, h), - np.arange(y_min, y_max, h)) + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) return xx, yy @@ -86,17 +85,21 @@ def plot_contours(ax, clf, xx, yy, **params): # we create an instance of SVM and fit out data. 
We do not scale our # data since we want to plot the support vectors C = 1.0 # SVM regularization parameter -models = (svm.SVC(kernel='linear', C=C), - svm.LinearSVC(C=C, max_iter=10000), - svm.SVC(kernel='rbf', gamma=0.7, C=C), - svm.SVC(kernel='poly', degree=3, gamma='auto', C=C)) +models = ( + svm.SVC(kernel="linear", C=C), + svm.LinearSVC(C=C, max_iter=10000), + svm.SVC(kernel="rbf", gamma=0.7, C=C), + svm.SVC(kernel="poly", degree=3, gamma="auto", C=C), +) models = (clf.fit(X, y) for clf in models) # title for the plots -titles = ('SVC with linear kernel', - 'LinearSVC (linear kernel)', - 'SVC with RBF kernel', - 'SVC with polynomial (degree 3) kernel') +titles = ( + "SVC with linear kernel", + "LinearSVC (linear kernel)", + "SVC with RBF kernel", + "SVC with polynomial (degree 3) kernel", +) # Set-up 2x2 grid for plotting. fig, sub = plt.subplots(2, 2) @@ -106,13 +109,12 @@ def plot_contours(ax, clf, xx, yy, **params): xx, yy = make_meshgrid(X0, X1) for clf, title, ax in zip(models, titles, sub.flatten()): - plot_contours(ax, clf, xx, yy, - cmap=plt.cm.coolwarm, alpha=0.8) - ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k') + plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8) + ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors="k") ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max()) - ax.set_xlabel('Sepal length') - ax.set_ylabel('Sepal width') + ax.set_xlabel("Sepal length") + ax.set_ylabel("Sepal width") ax.set_xticks(()) ax.set_yticks(()) ax.set_title(title) diff --git a/examples/svm/plot_linearsvc_support_vectors.py b/examples/svm/plot_linearsvc_support_vectors.py index cc7e9caa5cda8..298ec5e2419fb 100644 --- a/examples/svm/plot_linearsvc_support_vectors.py +++ b/examples/svm/plot_linearsvc_support_vectors.py @@ -26,8 +26,7 @@ # decision_function = np.dot(X, clf.coef_[0]) + clf.intercept_[0] # The support vectors are the samples that lie within the margin # boundaries, whose size is conventionally 
constrained to 1 - support_vector_indices = np.where( - np.abs(decision_function) <= 1 + 1e-15)[0] + support_vector_indices = np.where(np.abs(decision_function) <= 1 + 1e-15)[0] support_vectors = X[support_vector_indices] plt.subplot(1, 2, i + 1) @@ -35,14 +34,28 @@ ax = plt.gca() xlim = ax.get_xlim() ylim = ax.get_ylim() - xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 50), - np.linspace(ylim[0], ylim[1], 50)) + xx, yy = np.meshgrid( + np.linspace(xlim[0], xlim[1], 50), np.linspace(ylim[0], ylim[1], 50) + ) Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) - plt.contour(xx, yy, Z, colors='k', levels=[-1, 0, 1], alpha=0.5, - linestyles=['--', '-', '--']) - plt.scatter(support_vectors[:, 0], support_vectors[:, 1], s=100, - linewidth=1, facecolors='none', edgecolors='k') + plt.contour( + xx, + yy, + Z, + colors="k", + levels=[-1, 0, 1], + alpha=0.5, + linestyles=["--", "-", "--"], + ) + plt.scatter( + support_vectors[:, 0], + support_vectors[:, 1], + s=100, + linewidth=1, + facecolors="none", + edgecolors="k", + ) plt.title("C=" + str(C)) plt.tight_layout() plt.show() diff --git a/examples/svm/plot_oneclass.py b/examples/svm/plot_oneclass.py index 3f04537ca1f00..a3b060a0e67c0 100644 --- a/examples/svm/plot_oneclass.py +++ b/examples/svm/plot_oneclass.py @@ -42,25 +42,29 @@ plt.title("Novelty Detection") plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu) -a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred') -plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred') +a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors="darkred") +plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors="palevioletred") s = 40 -b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k') -b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s, - edgecolors='k') -c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s, - edgecolors='k') -plt.axis('tight') +b1 
= plt.scatter(X_train[:, 0], X_train[:, 1], c="white", s=s, edgecolors="k") +b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c="blueviolet", s=s, edgecolors="k") +c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c="gold", s=s, edgecolors="k") +plt.axis("tight") plt.xlim((-5, 5)) plt.ylim((-5, 5)) -plt.legend([a.collections[0], b1, b2, c], - ["learned frontier", "training observations", - "new regular observations", "new abnormal observations"], - loc="upper left", - prop=matplotlib.font_manager.FontProperties(size=11)) +plt.legend( + [a.collections[0], b1, b2, c], + [ + "learned frontier", + "training observations", + "new regular observations", + "new abnormal observations", + ], + loc="upper left", + prop=matplotlib.font_manager.FontProperties(size=11), +) plt.xlabel( - "error train: %d/200 ; errors novel regular: %d/40 ; " - "errors novel abnormal: %d/40" - % (n_error_train, n_error_test, n_error_outliers)) + "error train: %d/200 ; errors novel regular: %d/40 ; errors novel abnormal: %d/40" + % (n_error_train, n_error_test, n_error_outliers) +) plt.show() diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py index eda5a7248c24d..19cae930d93a8 100644 --- a/examples/svm/plot_rbf_parameters.py +++ b/examples/svm/plot_rbf_parameters.py @@ -1,4 +1,4 @@ -''' +""" ================== RBF SVM parameters ================== @@ -73,7 +73,7 @@ ``gamma_range`` steps will increase the resolution of the hyper-parameter heat map. -''' +""" print(__doc__) import numpy as np @@ -90,8 +90,8 @@ # Utility function to move the midpoint of a colormap to be around # the values of interest. 
-class MidpointNormalize(Normalize): +class MidpointNormalize(Normalize): def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False): self.midpoint = midpoint Normalize.__init__(self, vmin, vmax, clip) @@ -100,6 +100,7 @@ def __call__(self, value, clip=None): x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1] return np.ma.masked_array(np.interp(value, x, y)) + # ############################################################################# # Load and prepare data set # @@ -142,8 +143,10 @@ def __call__(self, value, clip=None): grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv) grid.fit(X, y) -print("The best parameters are %s with a score of %0.2f" - % (grid.best_params_, grid.best_score_)) +print( + "The best parameters are %s with a score of %0.2f" + % (grid.best_params_, grid.best_score_) +) # Now we need to fit a classifier for all parameters in the 2d version # (we use a smaller set of parameters here because it takes a while to train) @@ -171,19 +174,16 @@ def __call__(self, value, clip=None): # visualize decision function for these parameters plt.subplot(len(C_2d_range), len(gamma_2d_range), k + 1) - plt.title("gamma=10^%d, C=10^%d" % (np.log10(gamma), np.log10(C)), - size='medium') + plt.title("gamma=10^%d, C=10^%d" % (np.log10(gamma), np.log10(C)), size="medium") # visualize parameter's effect on decision function plt.pcolormesh(xx, yy, -Z, cmap=plt.cm.RdBu) - plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y_2d, cmap=plt.cm.RdBu_r, - edgecolors='k') + plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y_2d, cmap=plt.cm.RdBu_r, edgecolors="k") plt.xticks(()) plt.yticks(()) - plt.axis('tight') + plt.axis("tight") -scores = grid.cv_results_['mean_test_score'].reshape(len(C_range), - len(gamma_range)) +scores = grid.cv_results_["mean_test_score"].reshape(len(C_range), len(gamma_range)) # Draw heatmap of the validation accuracy as a function of gamma and C # @@ -195,13 +195,17 @@ def __call__(self, value, clip=None): # the same color. 
plt.figure(figsize=(8, 6)) -plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95) -plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot, - norm=MidpointNormalize(vmin=0.2, midpoint=0.92)) -plt.xlabel('gamma') -plt.ylabel('C') +plt.subplots_adjust(left=0.2, right=0.95, bottom=0.15, top=0.95) +plt.imshow( + scores, + interpolation="nearest", + cmap=plt.cm.hot, + norm=MidpointNormalize(vmin=0.2, midpoint=0.92), +) +plt.xlabel("gamma") +plt.ylabel("C") plt.colorbar() plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45) plt.yticks(np.arange(len(C_range)), C_range) -plt.title('Validation accuracy') +plt.title("Validation accuracy") plt.show() diff --git a/examples/svm/plot_separating_hyperplane.py b/examples/svm/plot_separating_hyperplane.py index cbd61abad53e6..cfb4a195e8a12 100644 --- a/examples/svm/plot_separating_hyperplane.py +++ b/examples/svm/plot_separating_hyperplane.py @@ -19,7 +19,7 @@ X, y = make_blobs(n_samples=40, centers=2, random_state=6) # fit the model, don't regularize for illustration purposes -clf = svm.SVC(kernel='linear', C=1000) +clf = svm.SVC(kernel="linear", C=1000) clf.fit(X, y) plt.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=plt.cm.Paired) @@ -37,9 +37,16 @@ Z = clf.decision_function(xy).reshape(XX.shape) # plot decision boundary and margins -ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5, - linestyles=['--', '-', '--']) +ax.contour( + XX, YY, Z, colors="k", levels=[-1, 0, 1], alpha=0.5, linestyles=["--", "-", "--"] +) # plot support vectors -ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100, - linewidth=1, facecolors='none', edgecolors='k') +ax.scatter( + clf.support_vectors_[:, 0], + clf.support_vectors_[:, 1], + s=100, + linewidth=1, + facecolors="none", + edgecolors="k", +) plt.show() diff --git a/examples/svm/plot_separating_hyperplane_unbalanced.py b/examples/svm/plot_separating_hyperplane_unbalanced.py index 2a0540fead310..a1db23f5f5ca8 100644 --- 
a/examples/svm/plot_separating_hyperplane_unbalanced.py +++ b/examples/svm/plot_separating_hyperplane_unbalanced.py @@ -36,21 +36,24 @@ n_samples_2 = 100 centers = [[0.0, 0.0], [2.0, 2.0]] clusters_std = [1.5, 0.5] -X, y = make_blobs(n_samples=[n_samples_1, n_samples_2], - centers=centers, - cluster_std=clusters_std, - random_state=0, shuffle=False) +X, y = make_blobs( + n_samples=[n_samples_1, n_samples_2], + centers=centers, + cluster_std=clusters_std, + random_state=0, + shuffle=False, +) # fit the model and get the separating hyperplane -clf = svm.SVC(kernel='linear', C=1.0) +clf = svm.SVC(kernel="linear", C=1.0) clf.fit(X, y) # fit the model and get the separating hyperplane using weighted classes -wclf = svm.SVC(kernel='linear', class_weight={1: 10}) +wclf = svm.SVC(kernel="linear", class_weight={1: 10}) wclf.fit(X, y) # plot the samples -plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k') +plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors="k") # plot the decision functions for both classifiers ax = plt.gca() @@ -67,14 +70,17 @@ Z = clf.decision_function(xy).reshape(XX.shape) # plot decision boundary and margins -a = ax.contour(XX, YY, Z, colors='k', levels=[0], alpha=0.5, linestyles=['-']) +a = ax.contour(XX, YY, Z, colors="k", levels=[0], alpha=0.5, linestyles=["-"]) # get the separating hyperplane for weighted classes Z = wclf.decision_function(xy).reshape(XX.shape) # plot decision boundary and margins for weighted classes -b = ax.contour(XX, YY, Z, colors='r', levels=[0], alpha=0.5, linestyles=['-']) +b = ax.contour(XX, YY, Z, colors="r", levels=[0], alpha=0.5, linestyles=["-"]) -plt.legend([a.collections[0], b.collections[0]], ["non weighted", "weighted"], - loc="upper right") +plt.legend( + [a.collections[0], b.collections[0]], + ["non weighted", "weighted"], + loc="upper right", +) plt.show() diff --git a/examples/svm/plot_svm_anova.py b/examples/svm/plot_svm_anova.py index b0392b1c00361..3fa7d05240df0 100644 --- 
a/examples/svm/plot_svm_anova.py +++ b/examples/svm/plot_svm_anova.py @@ -30,9 +30,13 @@ # ############################################################################# # Create a feature-selection transform, a scaler and an instance of SVM that we # combine together to have a full-blown estimator -clf = Pipeline([('anova', SelectPercentile(chi2)), - ('scaler', StandardScaler()), - ('svc', SVC(gamma="auto"))]) +clf = Pipeline( + [ + ("anova", SelectPercentile(chi2)), + ("scaler", StandardScaler()), + ("svc", SVC(gamma="auto")), + ] +) # ############################################################################# # Plot the cross-validation score as a function of percentile of features @@ -47,10 +51,9 @@ score_stds.append(this_scores.std()) plt.errorbar(percentiles, score_means, np.array(score_stds)) -plt.title( - 'Performance of the SVM-Anova varying the percentile of features selected') +plt.title("Performance of the SVM-Anova varying the percentile of features selected") plt.xticks(np.linspace(0, 100, 11, endpoint=True)) -plt.xlabel('Percentile') -plt.ylabel('Accuracy Score') -plt.axis('tight') +plt.xlabel("Percentile") +plt.ylabel("Accuracy Score") +plt.axis("tight") plt.show() diff --git a/examples/svm/plot_svm_kernels.py b/examples/svm/plot_svm_kernels.py index dbad4e0b725e2..5d8ef3403797f 100644 --- a/examples/svm/plot_svm_kernels.py +++ b/examples/svm/plot_svm_kernels.py @@ -24,30 +24,32 @@ # Our dataset and targets -X = np.c_[(.4, -.7), - (-1.5, -1), - (-1.4, -.9), - (-1.3, -1.2), - (-1.1, -.2), - (-1.2, -.4), - (-.5, 1.2), - (-1.5, 2.1), - (1, 1), - # -- - (1.3, .8), - (1.2, .5), - (.2, -2), - (.5, -2.4), - (.2, -2.3), - (0, -2.7), - (1.3, 2.1)].T +X = np.c_[ + (0.4, -0.7), + (-1.5, -1), + (-1.4, -0.9), + (-1.3, -1.2), + (-1.1, -0.2), + (-1.2, -0.4), + (-0.5, 1.2), + (-1.5, 2.1), + (1, 1), + # -- + (1.3, 0.8), + (1.2, 0.5), + (0.2, -2), + (0.5, -2.4), + (0.2, -2.3), + (0, -2.7), + (1.3, 2.1), +].T Y = [0] * 8 + [1] * 8 # figure number fignum = 1 # fit the 
model -for kernel in ('linear', 'poly', 'rbf'): +for kernel in ("linear", "poly", "rbf"): clf = svm.SVC(kernel=kernel, gamma=2) clf.fit(X, Y) @@ -55,12 +57,17 @@ plt.figure(fignum, figsize=(4, 3)) plt.clf() - plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=80, - facecolors='none', zorder=10, edgecolors='k') - plt.scatter(X[:, 0], X[:, 1], c=Y, zorder=10, cmap=plt.cm.Paired, - edgecolors='k') - - plt.axis('tight') + plt.scatter( + clf.support_vectors_[:, 0], + clf.support_vectors_[:, 1], + s=80, + facecolors="none", + zorder=10, + edgecolors="k", + ) + plt.scatter(X[:, 0], X[:, 1], c=Y, zorder=10, cmap=plt.cm.Paired, edgecolors="k") + + plt.axis("tight") x_min = -3 x_max = 3 y_min = -3 @@ -73,8 +80,14 @@ Z = Z.reshape(XX.shape) plt.figure(fignum, figsize=(4, 3)) plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired) - plt.contour(XX, YY, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'], - levels=[-.5, 0, .5]) + plt.contour( + XX, + YY, + Z, + colors=["k", "k", "k"], + linestyles=["--", "-", "--"], + levels=[-0.5, 0, 0.5], + ) plt.xlim(x_min, x_max) plt.ylim(y_min, y_max) diff --git a/examples/svm/plot_svm_margin.py b/examples/svm/plot_svm_margin.py index 5b267957677f8..123cfafff68e1 100644 --- a/examples/svm/plot_svm_margin.py +++ b/examples/svm/plot_svm_margin.py @@ -36,9 +36,9 @@ fignum = 1 # fit the model -for name, penalty in (('unreg', 1), ('reg', 0.05)): +for name, penalty in (("unreg", 1), ("reg", 0.05)): - clf = svm.SVC(kernel='linear', C=penalty) + clf = svm.SVC(kernel="linear", C=penalty) clf.fit(X, Y) # get the separating hyperplane @@ -58,17 +58,24 @@ # plot the line, the points, and the nearest vectors to the plane plt.figure(fignum, figsize=(4, 3)) plt.clf() - plt.plot(xx, yy, 'k-') - plt.plot(xx, yy_down, 'k--') - plt.plot(xx, yy_up, 'k--') - - plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=80, - facecolors='none', zorder=10, edgecolors='k', - cmap=cm.get_cmap('RdBu')) - plt.scatter(X[:, 0], X[:, 1], c=Y, 
zorder=10, cmap=cm.get_cmap('RdBu'), - edgecolors='k') - - plt.axis('tight') + plt.plot(xx, yy, "k-") + plt.plot(xx, yy_down, "k--") + plt.plot(xx, yy_up, "k--") + + plt.scatter( + clf.support_vectors_[:, 0], + clf.support_vectors_[:, 1], + s=80, + facecolors="none", + zorder=10, + edgecolors="k", + cmap=cm.get_cmap("RdBu"), + ) + plt.scatter( + X[:, 0], X[:, 1], c=Y, zorder=10, cmap=cm.get_cmap("RdBu"), edgecolors="k" + ) + + plt.axis("tight") x_min = -4.8 x_max = 4.2 y_min = -6 @@ -79,8 +86,7 @@ Z = clf.decision_function(xy).reshape(XX.shape) # Put the result into a contour plot - plt.contourf(XX, YY, Z, cmap=cm.get_cmap('RdBu'), - alpha=0.5, linestyles=['-']) + plt.contourf(XX, YY, Z, cmap=cm.get_cmap("RdBu"), alpha=0.5, linestyles=["-"]) plt.xlim(x_min, x_max) plt.ylim(y_min, y_max) diff --git a/examples/svm/plot_svm_nonlinear.py b/examples/svm/plot_svm_nonlinear.py index 47575d992a63b..09ae1febc8873 100644 --- a/examples/svm/plot_svm_nonlinear.py +++ b/examples/svm/plot_svm_nonlinear.py @@ -15,27 +15,29 @@ import matplotlib.pyplot as plt from sklearn import svm -xx, yy = np.meshgrid(np.linspace(-3, 3, 500), - np.linspace(-3, 3, 500)) +xx, yy = np.meshgrid(np.linspace(-3, 3, 500), np.linspace(-3, 3, 500)) np.random.seed(0) X = np.random.randn(300, 2) Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0) # fit the model -clf = svm.NuSVC(gamma='auto') +clf = svm.NuSVC(gamma="auto") clf.fit(X, Y) # plot the decision function for each datapoint on the grid Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) -plt.imshow(Z, interpolation='nearest', - extent=(xx.min(), xx.max(), yy.min(), yy.max()), aspect='auto', - origin='lower', cmap=plt.cm.PuOr_r) -contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2, - linestyles='dashed') -plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired, - edgecolors='k') +plt.imshow( + Z, + interpolation="nearest", + extent=(xx.min(), xx.max(), yy.min(), yy.max()), + aspect="auto", + origin="lower", + 
cmap=plt.cm.PuOr_r, +) +contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2, linestyles="dashed") +plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired, edgecolors="k") plt.xticks(()) plt.yticks(()) plt.axis([-3, 3, -3, 3]) diff --git a/examples/svm/plot_svm_regression.py b/examples/svm/plot_svm_regression.py index a91b588a15f63..ca45951a9f224 100644 --- a/examples/svm/plot_svm_regression.py +++ b/examples/svm/plot_svm_regression.py @@ -23,34 +23,52 @@ # ############################################################################# # Fit regression model -svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1) -svr_lin = SVR(kernel='linear', C=100, gamma='auto') -svr_poly = SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=.1, - coef0=1) +svr_rbf = SVR(kernel="rbf", C=100, gamma=0.1, epsilon=0.1) +svr_lin = SVR(kernel="linear", C=100, gamma="auto") +svr_poly = SVR(kernel="poly", C=100, gamma="auto", degree=3, epsilon=0.1, coef0=1) # ############################################################################# # Look at the results lw = 2 svrs = [svr_rbf, svr_lin, svr_poly] -kernel_label = ['RBF', 'Linear', 'Polynomial'] -model_color = ['m', 'c', 'g'] +kernel_label = ["RBF", "Linear", "Polynomial"] +model_color = ["m", "c", "g"] fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 10), sharey=True) for ix, svr in enumerate(svrs): - axes[ix].plot(X, svr.fit(X, y).predict(X), color=model_color[ix], lw=lw, - label='{} model'.format(kernel_label[ix])) - axes[ix].scatter(X[svr.support_], y[svr.support_], facecolor="none", - edgecolor=model_color[ix], s=50, - label='{} support vectors'.format(kernel_label[ix])) - axes[ix].scatter(X[np.setdiff1d(np.arange(len(X)), svr.support_)], - y[np.setdiff1d(np.arange(len(X)), svr.support_)], - facecolor="none", edgecolor="k", s=50, - label='other training data') - axes[ix].legend(loc='upper center', bbox_to_anchor=(0.5, 1.1), - ncol=1, fancybox=True, shadow=True) - -fig.text(0.5, 0.04, 'data', ha='center', 
va='center') -fig.text(0.06, 0.5, 'target', ha='center', va='center', rotation='vertical') + axes[ix].plot( + X, + svr.fit(X, y).predict(X), + color=model_color[ix], + lw=lw, + label="{} model".format(kernel_label[ix]), + ) + axes[ix].scatter( + X[svr.support_], + y[svr.support_], + facecolor="none", + edgecolor=model_color[ix], + s=50, + label="{} support vectors".format(kernel_label[ix]), + ) + axes[ix].scatter( + X[np.setdiff1d(np.arange(len(X)), svr.support_)], + y[np.setdiff1d(np.arange(len(X)), svr.support_)], + facecolor="none", + edgecolor="k", + s=50, + label="other training data", + ) + axes[ix].legend( + loc="upper center", + bbox_to_anchor=(0.5, 1.1), + ncol=1, + fancybox=True, + shadow=True, + ) + +fig.text(0.5, 0.04, "data", ha="center", va="center") +fig.text(0.06, 0.5, "target", ha="center", va="center", rotation="vertical") fig.suptitle("Support Vector Regression", fontsize=14) plt.show() diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py index 5208519172824..cc3793fefc7d3 100644 --- a/examples/svm/plot_svm_scale_c.py +++ b/examples/svm/plot_svm_scale_c.py @@ -100,22 +100,31 @@ n_features = 300 # l1 data (only 5 informative features) -X_1, y_1 = datasets.make_classification(n_samples=n_samples, - n_features=n_features, n_informative=5, - random_state=1) +X_1, y_1 = datasets.make_classification( + n_samples=n_samples, n_features=n_features, n_informative=5, random_state=1 +) # l2 data: non sparse, but less features -y_2 = np.sign(.5 - rnd.rand(n_samples)) +y_2 = np.sign(0.5 - rnd.rand(n_samples)) X_2 = rnd.randn(n_samples, n_features // 5) + y_2[:, np.newaxis] X_2 += 5 * rnd.randn(n_samples, n_features // 5) -clf_sets = [(LinearSVC(penalty='l1', loss='squared_hinge', dual=False, - tol=1e-3), - np.logspace(-2.3, -1.3, 10), X_1, y_1), - (LinearSVC(penalty='l2', loss='squared_hinge', dual=True), - np.logspace(-4.5, -2, 10), X_2, y_2)] - -colors = ['navy', 'cyan', 'darkorange'] +clf_sets = [ + ( + LinearSVC(penalty="l1", 
loss="squared_hinge", dual=False, tol=1e-3), + np.logspace(-2.3, -1.3, 10), + X_1, + y_1, + ), + ( + LinearSVC(penalty="l2", loss="squared_hinge", dual=True), + np.logspace(-4.5, -2, 10), + X_2, + y_2, + ), +] + +colors = ["navy", "cyan", "darkorange"] lw = 2 for clf, cs, X, y in clf_sets: @@ -126,25 +135,36 @@ param_grid = dict(C=cs) # To get nice curve, we need a large number of iterations to # reduce the variance - grid = GridSearchCV(clf, refit=False, param_grid=param_grid, - cv=ShuffleSplit(train_size=train_size, - test_size=.3, - n_splits=250, random_state=1)) + grid = GridSearchCV( + clf, + refit=False, + param_grid=param_grid, + cv=ShuffleSplit( + train_size=train_size, test_size=0.3, n_splits=250, random_state=1 + ), + ) grid.fit(X, y) - scores = grid.cv_results_['mean_test_score'] + scores = grid.cv_results_["mean_test_score"] - scales = [(1, 'No scaling'), - ((n_samples * train_size), '1/n_samples'), - ] + scales = [ + (1, "No scaling"), + ((n_samples * train_size), "1/n_samples"), + ] for ax, (scaler, name) in zip(axes, scales): - ax.set_xlabel('C') - ax.set_ylabel('CV Score') + ax.set_xlabel("C") + ax.set_ylabel("CV Score") grid_cs = cs * float(scaler) # scale the C's - ax.semilogx(grid_cs, scores, label="fraction %.2f" % - train_size, color=colors[k], lw=lw) - ax.set_title('scaling=%s, penalty=%s, loss=%s' % - (name, clf.penalty, clf.loss)) + ax.semilogx( + grid_cs, + scores, + label="fraction %.2f" % train_size, + color=colors[k], + lw=lw, + ) + ax.set_title( + "scaling=%s, penalty=%s, loss=%s" % (name, clf.penalty, clf.loss) + ) plt.legend(loc="best") plt.show() diff --git a/examples/svm/plot_svm_tie_breaking.py b/examples/svm/plot_svm_tie_breaking.py index 76eabfa1e35be..aea34ac97fd8a 100644 --- a/examples/svm/plot_svm_tie_breaking.py +++ b/examples/svm/plot_svm_tie_breaking.py @@ -27,13 +27,13 @@ X, y = make_blobs(random_state=27) fig, sub = plt.subplots(2, 1, figsize=(5, 8)) -titles = ("break_ties = False", - "break_ties = True") +titles = 
("break_ties = False", "break_ties = True") for break_ties, title, ax in zip((False, True), titles, sub.flatten()): - svm = SVC(kernel="linear", C=1, break_ties=break_ties, - decision_function_shape='ovr').fit(X, y) + svm = SVC( + kernel="linear", C=1, break_ties=break_ties, decision_function_shape="ovr" + ).fit(X, y) xlim = [X[:, 0].min(), X[:, 0].max()] ylim = [X[:, 1].min(), X[:, 1].max()] @@ -49,8 +49,12 @@ points = ax.scatter(X[:, 0], X[:, 1], c=y, cmap="Accent") classes = [(0, 1), (0, 2), (1, 2)] line = np.linspace(X[:, 1].min() - 5, X[:, 1].max() + 5) - ax.imshow(-pred.reshape(xx.shape), cmap="Accent", alpha=.2, - extent=(xlim[0], xlim[1], ylim[1], ylim[0])) + ax.imshow( + -pred.reshape(xx.shape), + cmap="Accent", + alpha=0.2, + extent=(xlim[0], xlim[1], ylim[1], ylim[0]), + ) for coef, intercept, col in zip(svm.coef_, svm.intercept_, classes): line2 = -(line * coef[1] + intercept) / coef[0] diff --git a/examples/svm/plot_weighted_samples.py b/examples/svm/plot_weighted_samples.py index 0549da7a38084..f25390446fc87 100644 --- a/examples/svm/plot_weighted_samples.py +++ b/examples/svm/plot_weighted_samples.py @@ -28,10 +28,17 @@ def plot_decision_function(classifier, sample_weight, axis, title): # plot the line, the points, and the nearest vectors to the plane axis.contourf(xx, yy, Z, alpha=0.75, cmap=plt.cm.bone) - axis.scatter(X[:, 0], X[:, 1], c=y, s=100 * sample_weight, alpha=0.9, - cmap=plt.cm.bone, edgecolors='black') + axis.scatter( + X[:, 0], + X[:, 1], + c=y, + s=100 * sample_weight, + alpha=0.9, + cmap=plt.cm.bone, + edgecolors="black", + ) - axis.axis('off') + axis.axis("off") axis.set_title(title) @@ -55,9 +62,9 @@ def plot_decision_function(classifier, sample_weight, axis, title): clf_no_weights.fit(X, y) fig, axes = plt.subplots(1, 2, figsize=(14, 6)) -plot_decision_function(clf_no_weights, sample_weight_constant, axes[0], - "Constant weights") -plot_decision_function(clf_weights, sample_weight_last_ten, axes[1], - "Modified weights") 
+plot_decision_function( + clf_no_weights, sample_weight_constant, axes[0], "Constant weights" +) +plot_decision_function(clf_weights, sample_weight_last_ten, axes[1], "Modified weights") plt.show() diff --git a/examples/text/plot_document_classification_20newsgroups.py b/examples/text/plot_document_classification_20newsgroups.py index 7f7bc422808dc..5351bb5bef3e3 100644 --- a/examples/text/plot_document_classification_20newsgroups.py +++ b/examples/text/plot_document_classification_20newsgroups.py @@ -45,40 +45,60 @@ # Display progress logs on stdout -logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") op = OptionParser() -op.add_option("--report", - action="store_true", dest="print_report", - help="Print a detailed classification report.") -op.add_option("--chi2_select", - action="store", type="int", dest="select_chi2", - help="Select some number of features using a chi-squared test") -op.add_option("--confusion_matrix", - action="store_true", dest="print_cm", - help="Print the confusion matrix.") -op.add_option("--top10", - action="store_true", dest="print_top10", - help="Print ten most discriminative terms per class" - " for every classifier.") -op.add_option("--all_categories", - action="store_true", dest="all_categories", - help="Whether to use all categories or not.") -op.add_option("--use_hashing", - action="store_true", - help="Use a hashing vectorizer.") -op.add_option("--n_features", - action="store", type=int, default=2 ** 16, - help="n_features when using the hashing vectorizer.") -op.add_option("--filtered", - action="store_true", - help="Remove newsgroup information that is easily overfit: " - "headers, signatures, and quoting.") +op.add_option( + "--report", + action="store_true", + dest="print_report", + help="Print a detailed classification report.", +) +op.add_option( + "--chi2_select", + action="store", + type="int", + 
dest="select_chi2", + help="Select some number of features using a chi-squared test", +) +op.add_option( + "--confusion_matrix", + action="store_true", + dest="print_cm", + help="Print the confusion matrix.", +) +op.add_option( + "--top10", + action="store_true", + dest="print_top10", + help="Print ten most discriminative terms per class for every classifier.", +) +op.add_option( + "--all_categories", + action="store_true", + dest="all_categories", + help="Whether to use all categories or not.", +) +op.add_option("--use_hashing", action="store_true", help="Use a hashing vectorizer.") +op.add_option( + "--n_features", + action="store", + type=int, + default=2 ** 16, + help="n_features when using the hashing vectorizer.", +) +op.add_option( + "--filtered", + action="store_true", + help=( + "Remove newsgroup information that is easily overfit: " + "headers, signatures, and quoting." + ), +) def is_interactive(): - return not hasattr(sys.modules['__main__'], '__file__') + return not hasattr(sys.modules["__main__"], "__file__") # work-around for Jupyter notebook and IPython console @@ -103,44 +123,44 @@ def is_interactive(): categories = None else: categories = [ - 'alt.atheism', - 'talk.religion.misc', - 'comp.graphics', - 'sci.space', + "alt.atheism", + "talk.religion.misc", + "comp.graphics", + "sci.space", ] if opts.filtered: - remove = ('headers', 'footers', 'quotes') + remove = ("headers", "footers", "quotes") else: remove = () print("Loading 20 newsgroups dataset for categories:") print(categories if categories else "all") -data_train = fetch_20newsgroups(subset='train', categories=categories, - shuffle=True, random_state=42, - remove=remove) +data_train = fetch_20newsgroups( + subset="train", categories=categories, shuffle=True, random_state=42, remove=remove +) -data_test = fetch_20newsgroups(subset='test', categories=categories, - shuffle=True, random_state=42, - remove=remove) -print('data loaded') +data_test = fetch_20newsgroups( + subset="test", 
categories=categories, shuffle=True, random_state=42, remove=remove +) +print("data loaded") # order of labels in `target_names` can be different from `categories` target_names = data_train.target_names def size_mb(docs): - return sum(len(s.encode('utf-8')) for s in docs) / 1e6 + return sum(len(s.encode("utf-8")) for s in docs) / 1e6 data_train_size_mb = size_mb(data_train.data) data_test_size_mb = size_mb(data_test.data) -print("%d documents - %0.3fMB (training set)" % ( - len(data_train.data), data_train_size_mb)) -print("%d documents - %0.3fMB (test set)" % ( - len(data_test.data), data_test_size_mb)) +print( + "%d documents - %0.3fMB (training set)" % (len(data_train.data), data_train_size_mb) +) +print("%d documents - %0.3fMB (test set)" % (len(data_test.data), data_test_size_mb)) print("%d categories" % len(target_names)) print() @@ -150,12 +170,12 @@ def size_mb(docs): print("Extracting features from the training data using a sparse vectorizer") t0 = time() if opts.use_hashing: - vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False, - n_features=opts.n_features) + vectorizer = HashingVectorizer( + stop_words="english", alternate_sign=False, n_features=opts.n_features + ) X_train = vectorizer.transform(data_train.data) else: - vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, - stop_words='english') + vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english") X_train = vectorizer.fit_transform(data_train.data) duration = time() - t0 print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration)) @@ -177,8 +197,7 @@ def size_mb(docs): feature_names = vectorizer.get_feature_names_out() if opts.select_chi2: - print("Extracting %d best features by a chi-squared test" % - opts.select_chi2) + print("Extracting %d best features by a chi-squared test" % opts.select_chi2) t0 = time() ch2 = SelectKBest(chi2, k=opts.select_chi2) X_train = ch2.fit_transform(X_train, y_train) @@ -201,7 +220,7 @@ def 
trim(s): # We train and test the datasets with 15 different classification models # and get performance results for each model. def benchmark(clf): - print('_' * 80) + print("_" * 80) print("Training: ") print(clf) t0 = time() @@ -217,7 +236,7 @@ def benchmark(clf): score = metrics.accuracy_score(y_test, pred) print("accuracy: %0.3f" % score) - if hasattr(clf, 'coef_'): + if hasattr(clf, "coef_"): print("dimensionality: %d" % clf.coef_.shape[1]) print("density: %f" % density(clf.coef_)) @@ -230,67 +249,74 @@ def benchmark(clf): if opts.print_report: print("classification report:") - print(metrics.classification_report(y_test, pred, - target_names=target_names)) + print(metrics.classification_report(y_test, pred, target_names=target_names)) if opts.print_cm: print("confusion matrix:") print(metrics.confusion_matrix(y_test, pred)) print() - clf_descr = str(clf).split('(')[0] + clf_descr = str(clf).split("(")[0] return clf_descr, score, train_time, test_time results = [] for clf, name in ( - (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"), - (Perceptron(max_iter=50), "Perceptron"), - (PassiveAggressiveClassifier(max_iter=50), - "Passive-Aggressive"), - (KNeighborsClassifier(n_neighbors=10), "kNN"), - (RandomForestClassifier(), "Random forest")): - print('=' * 80) + (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"), + (Perceptron(max_iter=50), "Perceptron"), + (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"), + (KNeighborsClassifier(n_neighbors=10), "kNN"), + (RandomForestClassifier(), "Random forest"), +): + print("=" * 80) print(name) results.append(benchmark(clf)) for penalty in ["l2", "l1"]: - print('=' * 80) + print("=" * 80) print("%s penalty" % penalty.upper()) # Train Liblinear model - results.append(benchmark(LinearSVC(penalty=penalty, dual=False, - tol=1e-3))) + results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3))) # Train SGD model - results.append(benchmark(SGDClassifier(alpha=.0001, 
max_iter=50, - penalty=penalty))) + results.append(benchmark(SGDClassifier(alpha=0.0001, max_iter=50, penalty=penalty))) # Train SGD with Elastic Net penalty -print('=' * 80) +print("=" * 80) print("Elastic-Net penalty") -results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50, - penalty="elasticnet"))) +results.append( + benchmark(SGDClassifier(alpha=0.0001, max_iter=50, penalty="elasticnet")) +) # Train NearestCentroid without threshold -print('=' * 80) +print("=" * 80) print("NearestCentroid (aka Rocchio classifier)") results.append(benchmark(NearestCentroid())) # Train sparse Naive Bayes classifiers -print('=' * 80) +print("=" * 80) print("Naive Bayes") -results.append(benchmark(MultinomialNB(alpha=.01))) -results.append(benchmark(BernoulliNB(alpha=.01))) -results.append(benchmark(ComplementNB(alpha=.1))) +results.append(benchmark(MultinomialNB(alpha=0.01))) +results.append(benchmark(BernoulliNB(alpha=0.01))) +results.append(benchmark(ComplementNB(alpha=0.1))) -print('=' * 80) +print("=" * 80) print("LinearSVC with L1-based feature selection") # The smaller C, the stronger the regularization. # The more regularization, the more sparsity. 
-results.append(benchmark(Pipeline([ - ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, - tol=1e-3))), - ('classification', LinearSVC(penalty="l2"))]))) +results.append( + benchmark( + Pipeline( + [ + ( + "feature_selection", + SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3)), + ), + ("classification", LinearSVC(penalty="l2")), + ] + ) + ) +) # %% @@ -308,17 +334,16 @@ def benchmark(clf): plt.figure(figsize=(12, 8)) plt.title("Score") -plt.barh(indices, score, .2, label="score", color='navy') -plt.barh(indices + .3, training_time, .2, label="training time", - color='c') -plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange') +plt.barh(indices, score, 0.2, label="score", color="navy") +plt.barh(indices + 0.3, training_time, 0.2, label="training time", color="c") +plt.barh(indices + 0.6, test_time, 0.2, label="test time", color="darkorange") plt.yticks(()) -plt.legend(loc='best') -plt.subplots_adjust(left=.25) -plt.subplots_adjust(top=.95) -plt.subplots_adjust(bottom=.05) +plt.legend(loc="best") +plt.subplots_adjust(left=0.25) +plt.subplots_adjust(top=0.95) +plt.subplots_adjust(bottom=0.05) for i, c in zip(indices, clf_names): - plt.text(-.3, i, c) + plt.text(-0.3, i, c) plt.show() diff --git a/examples/text/plot_document_clustering.py b/examples/text/plot_document_clustering.py index 128138681bc72..832dabb6ba067 100644 --- a/examples/text/plot_document_clustering.py +++ b/examples/text/plot_document_clustering.py @@ -72,36 +72,56 @@ # Display progress logs on stdout -logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s') +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") # parse commandline arguments op = OptionParser() -op.add_option("--lsa", - dest="n_components", type="int", - help="Preprocess documents with latent semantic analysis.") -op.add_option("--no-minibatch", - action="store_false", dest="minibatch", default=True, - 
help="Use ordinary k-means algorithm (in batch mode).") -op.add_option("--no-idf", - action="store_false", dest="use_idf", default=True, - help="Disable Inverse Document Frequency feature weighting.") -op.add_option("--use-hashing", - action="store_true", default=False, - help="Use a hashing feature vectorizer") -op.add_option("--n-features", type=int, default=10000, - help="Maximum number of features (dimensions)" - " to extract from text.") -op.add_option("--verbose", - action="store_true", dest="verbose", default=False, - help="Print progress reports inside k-means algorithm.") +op.add_option( + "--lsa", + dest="n_components", + type="int", + help="Preprocess documents with latent semantic analysis.", +) +op.add_option( + "--no-minibatch", + action="store_false", + dest="minibatch", + default=True, + help="Use ordinary k-means algorithm (in batch mode).", +) +op.add_option( + "--no-idf", + action="store_false", + dest="use_idf", + default=True, + help="Disable Inverse Document Frequency feature weighting.", +) +op.add_option( + "--use-hashing", + action="store_true", + default=False, + help="Use a hashing feature vectorizer", +) +op.add_option( + "--n-features", + type=int, + default=10000, + help="Maximum number of features (dimensions) to extract from text.", +) +op.add_option( + "--verbose", + action="store_true", + dest="verbose", + default=False, + help="Print progress reports inside k-means algorithm.", +) print(__doc__) op.print_help() def is_interactive(): - return not hasattr(sys.modules['__main__'], '__file__') + return not hasattr(sys.modules["__main__"], "__file__") # work-around for Jupyter notebook and IPython console @@ -115,10 +135,10 @@ def is_interactive(): # ############################################################################# # Load some categories from the training set categories = [ - 'alt.atheism', - 'talk.religion.misc', - 'comp.graphics', - 'sci.space', + "alt.atheism", + "talk.religion.misc", + "comp.graphics", + "sci.space", ] 
# Uncomment the following to do the analysis on all the categories # categories = None @@ -126,8 +146,9 @@ def is_interactive(): print("Loading 20 newsgroups dataset for categories:") print(categories) -dataset = fetch_20newsgroups(subset='all', categories=categories, - shuffle=True, random_state=42) +dataset = fetch_20newsgroups( + subset="all", categories=categories, shuffle=True, random_state=42 +) print("%d documents" % len(dataset.data)) print("%d categories" % len(dataset.target_names)) @@ -136,24 +157,33 @@ def is_interactive(): labels = dataset.target true_k = np.unique(labels).shape[0] -print("Extracting features from the training dataset " - "using a sparse vectorizer") +print("Extracting features from the training dataset using a sparse vectorizer") t0 = time() if opts.use_hashing: if opts.use_idf: # Perform an IDF normalization on the output of HashingVectorizer - hasher = HashingVectorizer(n_features=opts.n_features, - stop_words='english', alternate_sign=False, - norm=None) + hasher = HashingVectorizer( + n_features=opts.n_features, + stop_words="english", + alternate_sign=False, + norm=None, + ) vectorizer = make_pipeline(hasher, TfidfTransformer()) else: - vectorizer = HashingVectorizer(n_features=opts.n_features, - stop_words='english', - alternate_sign=False, norm='l2') + vectorizer = HashingVectorizer( + n_features=opts.n_features, + stop_words="english", + alternate_sign=False, + norm="l2", + ) else: - vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features, - min_df=2, stop_words='english', - use_idf=opts.use_idf) + vectorizer = TfidfVectorizer( + max_df=0.5, + max_features=opts.n_features, + min_df=2, + stop_words="english", + use_idf=opts.use_idf, + ) X = vectorizer.fit_transform(dataset.data) print("done in %fs" % (time() - t0)) @@ -175,8 +205,9 @@ def is_interactive(): print("done in %fs" % (time() - t0)) explained_variance = svd.explained_variance_ratio_.sum() - print("Explained variance of the SVD step: {}%".format( - 
int(explained_variance * 100))) + print( + "Explained variance of the SVD step: {}%".format(int(explained_variance * 100)) + ) print() @@ -185,11 +216,22 @@ def is_interactive(): # Do the actual clustering if opts.minibatch: - km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1, - init_size=1000, batch_size=1000, verbose=opts.verbose) + km = MiniBatchKMeans( + n_clusters=true_k, + init="k-means++", + n_init=1, + init_size=1000, + batch_size=1000, + verbose=opts.verbose, + ) else: - km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, - verbose=opts.verbose) + km = KMeans( + n_clusters=true_k, + init="k-means++", + max_iter=100, + n_init=1, + verbose=opts.verbose, + ) print("Clustering sparse data with %s" % km) t0 = time() @@ -200,10 +242,11 @@ def is_interactive(): print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_)) print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_)) print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)) -print("Adjusted Rand-Index: %.3f" - % metrics.adjusted_rand_score(labels, km.labels_)) -print("Silhouette Coefficient: %0.3f" - % metrics.silhouette_score(X, km.labels_, sample_size=1000)) +print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_)) +print( + "Silhouette Coefficient: %0.3f" + % metrics.silhouette_score(X, km.labels_, sample_size=1000) +) print() @@ -219,7 +262,7 @@ def is_interactive(): terms = vectorizer.get_feature_names_out() for i in range(true_k): - print("Cluster %d:" % i, end='') + print("Cluster %d:" % i, end="") for ind in order_centroids[i, :10]: - print(' %s' % terms[ind], end='') + print(" %s" % terms[ind], end="") print() diff --git a/examples/text/plot_hashing_vs_dict_vectorizer.py b/examples/text/plot_hashing_vs_dict_vectorizer.py index 1cf5c0aa6a0ce..1284fb7f164cd 100644 --- a/examples/text/plot_hashing_vs_dict_vectorizer.py +++ b/examples/text/plot_hashing_vs_dict_vectorizer.py @@ -51,13 
+51,13 @@ def token_freqs(doc): categories = [ - 'alt.atheism', - 'comp.graphics', - 'comp.sys.ibm.pc.hardware', - 'misc.forsale', - 'rec.autos', - 'sci.space', - 'talk.religion.misc', + "alt.atheism", + "comp.graphics", + "comp.sys.ibm.pc.hardware", + "misc.forsale", + "rec.autos", + "sci.space", + "talk.religion.misc", ] # Uncomment the following line to use a larger set (11k+ documents) # categories = None @@ -77,9 +77,8 @@ def token_freqs(doc): print("Loading 20 newsgroups training data") -raw_data, _ = fetch_20newsgroups(subset='train', categories=categories, - return_X_y=True) -data_size_mb = sum(len(s.encode('utf-8')) for s in raw_data) / 1e6 +raw_data, _ = fetch_20newsgroups(subset="train", categories=categories, return_X_y=True) +data_size_mb = sum(len(s.encode("utf-8")) for s in raw_data) / 1e6 print("%d documents - %0.3fMB" % (len(raw_data), data_size_mb)) print() diff --git a/examples/tree/plot_cost_complexity_pruning.py b/examples/tree/plot_cost_complexity_pruning.py index 822d7a206f842..9f317cc96b4fa 100644 --- a/examples/tree/plot_cost_complexity_pruning.py +++ b/examples/tree/plot_cost_complexity_pruning.py @@ -45,7 +45,7 @@ # In the following plot, the maximum effective alpha value is removed, because # it is the trivial tree with only one node. 
fig, ax = plt.subplots() -ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post") +ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post") ax.set_xlabel("effective alpha") ax.set_ylabel("total impurity of leaves") ax.set_title("Total Impurity vs effective alpha for training set") @@ -59,8 +59,11 @@ clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha) clf.fit(X_train, y_train) clfs.append(clf) -print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format( - clfs[-1].tree_.node_count, ccp_alphas[-1])) +print( + "Number of nodes in the last tree is: {} with ccp_alpha: {}".format( + clfs[-1].tree_.node_count, ccp_alphas[-1] + ) +) # %% # For the remainder of this example, we remove the last element in @@ -73,11 +76,11 @@ node_counts = [clf.tree_.node_count for clf in clfs] depth = [clf.tree_.max_depth for clf in clfs] fig, ax = plt.subplots(2, 1) -ax[0].plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post") +ax[0].plot(ccp_alphas, node_counts, marker="o", drawstyle="steps-post") ax[0].set_xlabel("alpha") ax[0].set_ylabel("number of nodes") ax[0].set_title("Number of nodes vs alpha") -ax[1].plot(ccp_alphas, depth, marker='o', drawstyle="steps-post") +ax[1].plot(ccp_alphas, depth, marker="o", drawstyle="steps-post") ax[1].set_xlabel("alpha") ax[1].set_ylabel("depth of tree") ax[1].set_title("Depth vs alpha") @@ -98,9 +101,7 @@ ax.set_xlabel("alpha") ax.set_ylabel("accuracy") ax.set_title("Accuracy vs alpha for training and testing sets") -ax.plot(ccp_alphas, train_scores, marker='o', label="train", - drawstyle="steps-post") -ax.plot(ccp_alphas, test_scores, marker='o', label="test", - drawstyle="steps-post") +ax.plot(ccp_alphas, train_scores, marker="o", label="train", drawstyle="steps-post") +ax.plot(ccp_alphas, test_scores, marker="o", label="test", drawstyle="steps-post") ax.legend() plt.show() diff --git a/examples/tree/plot_iris_dtc.py b/examples/tree/plot_iris_dtc.py index 
60328c4f90d4f..7dbe203163de2 100644 --- a/examples/tree/plot_iris_dtc.py +++ b/examples/tree/plot_iris_dtc.py @@ -30,8 +30,7 @@ # Load data iris = load_iris() -for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], - [1, 2], [1, 3], [2, 3]]): +for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]): # We only take the two corresponding features X = iris.data[:, pair] y = iris.target @@ -44,8 +43,9 @@ x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 - xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step), - np.arange(y_min, y_max, plot_step)) + xx, yy = np.meshgrid( + np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step) + ) plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) @@ -58,11 +58,18 @@ # Plot the training points for i, color in zip(range(n_classes), plot_colors): idx = np.where(y == i) - plt.scatter(X[idx, 0], X[idx, 1], c=color, label=iris.target_names[i], - cmap=plt.cm.RdYlBu, edgecolor='black', s=15) + plt.scatter( + X[idx, 0], + X[idx, 1], + c=color, + label=iris.target_names[i], + cmap=plt.cm.RdYlBu, + edgecolor="black", + s=15, + ) plt.suptitle("Decision surface of a decision tree using paired features") -plt.legend(loc='lower right', borderpad=0, handletextpad=0) +plt.legend(loc="lower right", borderpad=0, handletextpad=0) plt.axis("tight") plt.figure() diff --git a/examples/tree/plot_tree_regression.py b/examples/tree/plot_tree_regression.py index 717de4ab72e4e..7a71b0450f2b7 100644 --- a/examples/tree/plot_tree_regression.py +++ b/examples/tree/plot_tree_regression.py @@ -39,10 +39,8 @@ # Plot the results plt.figure() -plt.scatter(X, y, s=20, edgecolor="black", - c="darkorange", label="data") -plt.plot(X_test, y_1, color="cornflowerblue", - label="max_depth=2", linewidth=2) +plt.scatter(X, y, s=20, edgecolor="black", c="darkorange", label="data") +plt.plot(X_test, y_1, color="cornflowerblue", 
label="max_depth=2", linewidth=2) plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5", linewidth=2) plt.xlabel("data") plt.ylabel("target") diff --git a/examples/tree/plot_tree_regression_multioutput.py b/examples/tree/plot_tree_regression_multioutput.py index b47bfcd80e49a..ab9a530b0faee 100644 --- a/examples/tree/plot_tree_regression_multioutput.py +++ b/examples/tree/plot_tree_regression_multioutput.py @@ -24,7 +24,7 @@ rng = np.random.RandomState(1) X = np.sort(200 * rng.rand(100, 1) - 100, axis=0) y = np.array([np.pi * np.sin(X).ravel(), np.pi * np.cos(X).ravel()]).T -y[::5, :] += (0.5 - rng.rand(20, 2)) +y[::5, :] += 0.5 - rng.rand(20, 2) # Fit regression model regr_1 = DecisionTreeRegressor(max_depth=2) @@ -43,14 +43,19 @@ # Plot the results plt.figure() s = 25 -plt.scatter(y[:, 0], y[:, 1], c="navy", s=s, - edgecolor="black", label="data") -plt.scatter(y_1[:, 0], y_1[:, 1], c="cornflowerblue", s=s, - edgecolor="black", label="max_depth=2") -plt.scatter(y_2[:, 0], y_2[:, 1], c="red", s=s, - edgecolor="black", label="max_depth=5") -plt.scatter(y_3[:, 0], y_3[:, 1], c="orange", s=s, - edgecolor="black", label="max_depth=8") +plt.scatter(y[:, 0], y[:, 1], c="navy", s=s, edgecolor="black", label="data") +plt.scatter( + y_1[:, 0], + y_1[:, 1], + c="cornflowerblue", + s=s, + edgecolor="black", + label="max_depth=2", +) +plt.scatter(y_2[:, 0], y_2[:, 1], c="red", s=s, edgecolor="black", label="max_depth=5") +plt.scatter( + y_3[:, 0], y_3[:, 1], c="orange", s=s, edgecolor="black", label="max_depth=8" +) plt.xlim([-6, 6]) plt.ylim([-6, 6]) plt.xlabel("target 1") diff --git a/examples/tree/plot_unveil_tree_structure.py b/examples/tree/plot_unveil_tree_structure.py index 81f67c615c94c..65aa685463bac 100644 --- a/examples/tree/plot_unveil_tree_structure.py +++ b/examples/tree/plot_unveil_tree_structure.py @@ -92,22 +92,30 @@ else: is_leaves[node_id] = True -print("The binary tree structure has {n} nodes and has " - "the following tree 
structure:\n".format(n=n_nodes)) +print( + "The binary tree structure has {n} nodes and has " + "the following tree structure:\n".format(n=n_nodes) +) for i in range(n_nodes): if is_leaves[i]: - print("{space}node={node} is a leaf node.".format( - space=node_depth[i] * "\t", node=i)) + print( + "{space}node={node} is a leaf node.".format( + space=node_depth[i] * "\t", node=i + ) + ) else: - print("{space}node={node} is a split node: " - "go to node {left} if X[:, {feature}] <= {threshold} " - "else to node {right}.".format( - space=node_depth[i] * "\t", - node=i, - left=children_left[i], - feature=feature[i], - threshold=threshold[i], - right=children_right[i])) + print( + "{space}node={node} is a split node: " + "go to node {left} if X[:, {feature}] <= {threshold} " + "else to node {right}.".format( + space=node_depth[i] * "\t", + node=i, + left=children_left[i], + feature=feature[i], + threshold=threshold[i], + right=children_right[i], + ) + ) ############################################################################## # We can compare the above output to the plot of the decision tree. 
@@ -139,29 +147,33 @@ sample_id = 0 # obtain ids of the nodes `sample_id` goes through, i.e., row `sample_id` -node_index = node_indicator.indices[node_indicator.indptr[sample_id]: - node_indicator.indptr[sample_id + 1]] +node_index = node_indicator.indices[ + node_indicator.indptr[sample_id] : node_indicator.indptr[sample_id + 1] +] -print('Rules used to predict sample {id}:\n'.format(id=sample_id)) +print("Rules used to predict sample {id}:\n".format(id=sample_id)) for node_id in node_index: # continue to the next node if it is a leaf node if leaf_id[sample_id] == node_id: continue # check if value of the split feature for sample 0 is below threshold - if (X_test[sample_id, feature[node_id]] <= threshold[node_id]): + if X_test[sample_id, feature[node_id]] <= threshold[node_id]: threshold_sign = "<=" else: threshold_sign = ">" - print("decision node {node} : (X_test[{sample}, {feature}] = {value}) " - "{inequality} {threshold})".format( - node=node_id, - sample=sample_id, - feature=feature[node_id], - value=X_test[sample_id, feature[node_id]], - inequality=threshold_sign, - threshold=threshold[node_id])) + print( + "decision node {node} : (X_test[{sample}, {feature}] = {value}) " + "{inequality} {threshold})".format( + node=node_id, + sample=sample_id, + feature=feature[node_id], + value=X_test[sample_id, feature[node_id]], + inequality=threshold_sign, + threshold=threshold[node_id], + ) + ) ############################################################################## # For a group of samples, we can determine the common nodes the samples go @@ -169,12 +181,13 @@ sample_ids = [0, 1] # boolean array indicating the nodes both samples go through -common_nodes = (node_indicator.toarray()[sample_ids].sum(axis=0) == - len(sample_ids)) +common_nodes = node_indicator.toarray()[sample_ids].sum(axis=0) == len(sample_ids) # obtain node ids using position in array common_node_id = np.arange(n_nodes)[common_nodes] -print("\nThe following samples {samples} share the node(s) 
{nodes} in the " - "tree.".format(samples=sample_ids, nodes=common_node_id)) -print("This is {prop}% of all nodes.".format( - prop=100 * len(common_node_id) / n_nodes)) +print( + "\nThe following samples {samples} share the node(s) {nodes} in the tree.".format( + samples=sample_ids, nodes=common_node_id + ) +) +print("This is {prop}% of all nodes.".format(prop=100 * len(common_node_id) / n_nodes)) diff --git a/pyproject.toml b/pyproject.toml index 71da52002cc96..3762d2f229f76 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,6 @@ exclude = ''' | \.git # root of the project | \.mypy_cache | \.vscode - | examples | build | dist | doc/tutorial From 12f46cc51865a7514b2793034e25896f13c10bb7 Mon Sep 17 00:00:00 2001 From: Pinky Date: Thu, 7 Oct 2021 16:02:10 +0530 Subject: [PATCH 11/18] DOC Ensures that SplineTransformer passes numpydoc validation (#21248) * Remove SplineTransformer from DOCSTRING_IGNORE_LIST * Fix numpydocs from SplineTransformer --- maint_tools/test_docstrings.py | 1 - sklearn/preprocessing/_polynomial.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/maint_tools/test_docstrings.py b/maint_tools/test_docstrings.py index 3388b7635d214..35ed4c515dd81 100644 --- a/maint_tools/test_docstrings.py +++ b/maint_tools/test_docstrings.py @@ -28,7 +28,6 @@ "SpectralBiclustering", "SpectralCoclustering", "SpectralEmbedding", - "SplineTransformer", "StackingRegressor", "TransformedTargetRegressor", ] diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 66d5a06773077..dbe78c0967582 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -693,6 +693,7 @@ def get_feature_names(self, input_features=None): Returns ------- output_feature_names : list of str of shape (n_output_features,) + Transformed feature names. 
""" n_splines = self.bsplines_[0].c.shape[0] if input_features is None: From e11e8208a977e6498ca7c2aec893808f9119f729 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 7 Oct 2021 10:19:55 -0400 Subject: [PATCH 12/18] BLD Fixes osx build by downgrading to 11.X (#21227) --- build_tools/github/build_wheels.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/github/build_wheels.sh b/build_tools/github/build_wheels.sh index c484513b648e3..2671c50f66bb3 100755 --- a/build_tools/github/build_wheels.sh +++ b/build_tools/github/build_wheels.sh @@ -12,7 +12,7 @@ if [[ "$RUNNER_OS" == "macOS" ]]; then # supported macos version is: High Sierra / 10.13. When upgrading this, be # sure to update the MACOSX_DEPLOYMENT_TARGET environment variable in # wheels.yml accordingly. Note that Darwin_17 == High Sierra / 10.13. - wget https://packages.macports.org/libomp/libomp-12.0.0_0+universal.darwin_17.i386-x86_64.tbz2 -O libomp.tbz2 + wget https://packages.macports.org/libomp/libomp-11.0.1_0+universal.darwin_17.i386-x86_64.tbz2 -O libomp.tbz2 sudo tar -C / -xvjf libomp.tbz2 opt export CC=/usr/bin/clang From ff7d9c6f2cdeb33310aff53e9bced27a6d5bea21 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 7 Oct 2021 17:39:15 +0200 Subject: [PATCH 13/18] DOC Cross-link check_estimator and parametrize_with_checks (#21269) --- sklearn/utils/estimator_checks.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 37537bc1b0498..ccc6ff23ed8fc 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -472,6 +472,10 @@ def parametrize_with_checks(estimators): ------- decorator : `pytest.mark.parametrize` + See Also + -------- + check_estimator : Check if estimator adheres to scikit-learn conventions. 
+ Examples -------- >>> from sklearn.utils.estimator_checks import parametrize_with_checks @@ -547,6 +551,11 @@ def check_estimator(Estimator, generate_only=False): checks_generator : generator Generator that yields (estimator, check) tuples. Returned when `generate_only=True`. + + See Also + -------- + parametrize_with_checks : Pytest specific decorator for parametrizing estimator + checks. """ if isinstance(Estimator, type): msg = ( From 39fd93f84ff09e0e3c7d1f3bbee0919d5b4f80a6 Mon Sep 17 00:00:00 2001 From: Shao Yang Hong Date: Thu, 7 Oct 2021 23:42:58 +0800 Subject: [PATCH 14/18] DOC Clarify use_idf in TfidfTransformer/TfidfVectorizer docstrings (#21213) --- sklearn/feature_extraction/text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 8dd743813fa27..82582de8a5b60 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1504,7 +1504,7 @@ class TfidfTransformer(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator): See :func:`preprocessing.normalize`. use_idf : bool, default=True - Enable inverse-document-frequency reweighting. + Enable inverse-document-frequency reweighting. If False, idf(t) = 1. smooth_idf : bool, default=True Smooth idf weights by adding one to document frequencies, as if an @@ -1842,7 +1842,7 @@ class TfidfVectorizer(CountVectorizer): See :func:`preprocessing.normalize`. use_idf : bool, default=True - Enable inverse-document-frequency reweighting. + Enable inverse-document-frequency reweighting. If False, idf(t) = 1. 
smooth_idf : bool, default=True Smooth idf weights by adding one to document frequencies, as if an From 4abc00bfd34731ee0f538d7299276b38b7ac018f Mon Sep 17 00:00:00 2001 From: Juan Martin Loyola Date: Fri, 8 Oct 2021 06:14:11 -0300 Subject: [PATCH 15/18] DOC Ensures that SelfTrainingClassifier passes numpydoc validation (#21277) * Remove SelfTrainingClassifier from DOCSTRING_IGNORE_LIST * Fix numpydocs from SelfTrainingClassifier * Change docstrings to maintain consistency --- maint_tools/test_docstrings.py | 1 - sklearn/semi_supervised/_self_training.py | 55 ++++++++++++----------- 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/maint_tools/test_docstrings.py b/maint_tools/test_docstrings.py index 35ed4c515dd81..bb73b3ad3e22f 100644 --- a/maint_tools/test_docstrings.py +++ b/maint_tools/test_docstrings.py @@ -23,7 +23,6 @@ "PassiveAggressiveClassifier", "PassiveAggressiveRegressor", "QuadraticDiscriminantAnalysis", - "SelfTrainingClassifier", "SparseRandomProjection", "SpectralBiclustering", "SpectralCoclustering", diff --git a/sklearn/semi_supervised/_self_training.py b/sklearn/semi_supervised/_self_training.py index ad627c6f98574..71d2a7b32236b 100644 --- a/sklearn/semi_supervised/_self_training.py +++ b/sklearn/semi_supervised/_self_training.py @@ -37,30 +37,30 @@ class SelfTrainingClassifier(MetaEstimatorMixin, BaseEstimator): Parameters ---------- base_estimator : estimator object - An estimator object implementing ``fit`` and ``predict_proba``. - Invoking the ``fit`` method will fit a clone of the passed estimator, - which will be stored in the ``base_estimator_`` attribute. + An estimator object implementing `fit` and `predict_proba`. + Invoking the `fit` method will fit a clone of the passed estimator, + which will be stored in the `base_estimator_` attribute. threshold : float, default=0.75 The decision threshold for use with `criterion='threshold'`. - Should be in [0, 1). When using the 'threshold' criterion, a + Should be in [0, 1). 
When using the `'threshold'` criterion, a :ref:`well calibrated classifier ` should be used. criterion : {'threshold', 'k_best'}, default='threshold' The selection criterion used to select which labels to add to the - training set. If 'threshold', pseudo-labels with prediction - probabilities above `threshold` are added to the dataset. If 'k_best', + training set. If `'threshold'`, pseudo-labels with prediction + probabilities above `threshold` are added to the dataset. If `'k_best'`, the `k_best` pseudo-labels with highest prediction probabilities are added to the dataset. When using the 'threshold' criterion, a :ref:`well calibrated classifier ` should be used. k_best : int, default=10 The amount of samples to add in each iteration. Only used when - `criterion` is k_best'. + `criterion='k_best'`. max_iter : int or None, default=10 Maximum number of iterations allowed. Should be greater than or equal - to 0. If it is ``None``, the classifier will continue to predict labels + to 0. If it is `None`, the classifier will continue to predict labels until no new pseudo-labels are added, or all unlabeled samples have been labeled. @@ -74,7 +74,7 @@ class SelfTrainingClassifier(MetaEstimatorMixin, BaseEstimator): classes_ : ndarray or list of ndarray of shape (n_classes,) Class labels for each output. (Taken from the trained - ``base_estimator_``). + `base_estimator_`). transduction_ : ndarray of shape (n_samples,) The labels used for the final fit of the classifier, including @@ -104,11 +104,24 @@ class SelfTrainingClassifier(MetaEstimatorMixin, BaseEstimator): termination_condition_ : {'max_iter', 'no_change', 'all_labeled'} The reason that fitting was stopped. - - 'max_iter': `n_iter_` reached `max_iter`. - - 'no_change': no new labels were predicted. - - 'all_labeled': all unlabeled samples were labeled before `max_iter` + - `'max_iter'`: `n_iter_` reached `max_iter`. + - `'no_change'`: no new labels were predicted. 
+ - `'all_labeled'`: all unlabeled samples were labeled before `max_iter` was reached. + See Also + -------- + LabelPropagation : Label propagation classifier. + LabelSpreading : Label spreading model for semi-supervised learning. + + References + ---------- + David Yarowsky. 1995. Unsupervised word sense disambiguation rivaling + supervised methods. In Proceedings of the 33rd annual meeting on + Association for Computational Linguistics (ACL '95). Association for + Computational Linguistics, Stroudsburg, PA, USA, 189-196. DOI: + https://doi.org/10.3115/981658.981684 + Examples -------- >>> import numpy as np @@ -123,14 +136,6 @@ class SelfTrainingClassifier(MetaEstimatorMixin, BaseEstimator): >>> self_training_model = SelfTrainingClassifier(svc) >>> self_training_model.fit(iris.data, iris.target) SelfTrainingClassifier(...) - - References - ---------- - David Yarowsky. 1995. Unsupervised word sense disambiguation rivaling - supervised methods. In Proceedings of the 33rd annual meeting on - Association for Computational Linguistics (ACL '95). Association for - Computational Linguistics, Stroudsburg, PA, USA, 189-196. DOI: - https://doi.org/10.3115/981658.981684 """ _estimator_type = "classifier" @@ -153,7 +158,7 @@ def __init__( def fit(self, X, y): """ - Fits this ``SelfTrainingClassifier`` to a dataset. + Fit self-training classifier using `X`, `y` as training data. Parameters ---------- @@ -167,7 +172,7 @@ def fit(self, X, y): Returns ------- self : object - Returns an instance of self. + Fitted estimator. """ # we need row slicing support for sparce matrices, but costly finiteness check # can be delegated to the base estimator. @@ -281,7 +286,7 @@ def fit(self, X, y): @if_delegate_has_method(delegate="base_estimator") def predict(self, X): - """Predict the classes of X. + """Predict the classes of `X`. 
Parameters ---------- @@ -326,7 +331,7 @@ def predict_proba(self, X): @if_delegate_has_method(delegate="base_estimator") def decision_function(self, X): - """Calls decision function of the `base_estimator`. + """Call decision function of the `base_estimator`. Parameters ---------- @@ -372,7 +377,7 @@ def predict_log_proba(self, X): @if_delegate_has_method(delegate="base_estimator") def score(self, X, y): - """Calls score on the `base_estimator`. + """Call score on the `base_estimator`. Parameters ---------- From eb2b5fa767a49bf056f6ffdd253b8bea9d4328ff Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Fri, 8 Oct 2021 13:55:15 +0200 Subject: [PATCH 16/18] DOC Remove some str/unicode leftovers from Python 2 (#21270) Co-authored-by: Thomas J. Fan --- sklearn/datasets/_base.py | 4 ++-- sklearn/datasets/_california_housing.py | 2 +- sklearn/datasets/_lfw.py | 4 ++-- sklearn/datasets/_svmlight_format_io.py | 5 ++--- sklearn/datasets/_twenty_newsgroups.py | 2 +- sklearn/feature_extraction/text.py | 8 ++++---- sklearn/linear_model/_base.py | 2 +- sklearn/linear_model/_stochastic_gradient.py | 2 +- sklearn/metrics/_base.py | 2 +- sklearn/metrics/_classification.py | 2 +- sklearn/metrics/_dist_metrics.pyx | 2 +- sklearn/mixture/_base.py | 2 +- sklearn/mixture/_gaussian_mixture.py | 2 +- sklearn/preprocessing/tests/test_encoders.py | 4 ++-- sklearn/tree/_export.py | 4 ++-- sklearn/utils/graph.py | 2 +- sklearn/utils/metaestimators.py | 2 +- 17 files changed, 25 insertions(+), 26 deletions(-) diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index b5f6fd22f9c33..dab3c92d654bb 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -141,10 +141,10 @@ def load_files( Parameters ---------- - container_path : str or unicode + container_path : str Path to the main folder holding one subfolder per category - description : str or unicode, default=None + description : str, 
default=None A paragraph describing the characteristic of the dataset: its source, reference, etc. diff --git a/sklearn/datasets/_california_housing.py b/sklearn/datasets/_california_housing.py index 34a936e51cbb2..59ff356e90838 100644 --- a/sklearn/datasets/_california_housing.py +++ b/sklearn/datasets/_california_housing.py @@ -102,7 +102,7 @@ def fetch_california_housing( If ``as_frame`` is True, ``target`` is a pandas object. feature_names : list of length 8 Array of ordered feature names used in the dataset. - DESCR : string + DESCR : str Description of the California housing dataset. frame : pandas DataFrame Only present when `as_frame=True`. DataFrame with ``data`` and diff --git a/sklearn/datasets/_lfw.py b/sklearn/datasets/_lfw.py index fb7d603bfc0ff..0af8c8635bc85 100644 --- a/sklearn/datasets/_lfw.py +++ b/sklearn/datasets/_lfw.py @@ -301,7 +301,7 @@ def fetch_lfw_people( target : numpy array of shape (13233,) Labels associated to each face image. Those labels range from 0-5748 and correspond to the person IDs. - DESCR : string + DESCR : str Description of the Labeled Faces in the Wild (LFW) dataset. (data, target) : tuple if ``return_X_y`` is True @@ -486,7 +486,7 @@ def fetch_lfw_pairs( target : numpy array of shape (2200,). Shape depends on ``subset``. Labels associated to each pair of images. The two label values being different persons or the same person. - DESCR : string + DESCR : str Description of the Labeled Faces in the Wild (LFW) dataset. """ diff --git a/sklearn/datasets/_svmlight_format_io.py b/sklearn/datasets/_svmlight_format_io.py index 48e258b4e8512..6a7d9dcc1936c 100644 --- a/sklearn/datasets/_svmlight_format_io.py +++ b/sklearn/datasets/_svmlight_format_io.py @@ -446,7 +446,7 @@ def dump_svmlight_file( integer or float, or array-like objects of integer or float for multilabel classifications. - f : string or file-like in binary mode + f : str or file-like in binary mode If string, specifies the path that will contain the data. 
If file-like, data will be written to f. f should be opened in binary mode. @@ -455,7 +455,7 @@ def dump_svmlight_file( Whether column indices should be written zero-based (True) or one-based (False). - comment : string, default=None + comment : str, default=None Comment to insert at the top of the file. This should be either a Unicode string, which will be encoded as UTF-8, or an ASCII byte string. @@ -478,7 +478,6 @@ def dump_svmlight_file( # Convert comment string to list of lines in UTF-8. # If a byte string is passed, then check whether it's ASCII; # if a user wants to get fancy, they'll have to decode themselves. - # Avoid mention of str and unicode types for Python 3.x compat. if isinstance(comment, bytes): comment.decode("ascii") # just for the exception else: diff --git a/sklearn/datasets/_twenty_newsgroups.py b/sklearn/datasets/_twenty_newsgroups.py index 24046367c69c6..ef0ce6b99a25e 100644 --- a/sklearn/datasets/_twenty_newsgroups.py +++ b/sklearn/datasets/_twenty_newsgroups.py @@ -184,7 +184,7 @@ def fetch_20newsgroups( Select the dataset to load: 'train' for the training set, 'test' for the test set, 'all' for both, with shuffled ordering. - categories : array-like, dtype=str or unicode, default=None + categories : array-like, dtype=str, default=None If None (default), load all the categories. If not None, list of category names to load (other categories ignored). 
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 82582de8a5b60..02af3ff289ae2 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -156,7 +156,7 @@ def strip_accents_ascii(s): Parameters ---------- - s : string + s : str The string to strip See Also @@ -175,7 +175,7 @@ def strip_tags(s): Parameters ---------- - s : string + s : str The string to strip """ return re.compile(r"<([^>]+)>", flags=re.UNICODE).sub(" ", s) @@ -204,7 +204,7 @@ def decode(self, doc): Parameters ---------- - doc : str + doc : bytes or str The string to decode. Returns @@ -620,7 +620,7 @@ class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator): Remove accents and perform other character normalization during the preprocessing step. 'ascii' is a fast method that only works on characters that have - an direct ASCII mapping. + a direct ASCII mapping. 'unicode' is a slightly slower method that works on any characters. None (default) does nothing. diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 8b5102ecdd403..841ed6a1c1cc4 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -66,7 +66,7 @@ def _deprecate_normalize(normalize, default, estimator_name): default : bool, default normalize value used by the estimator - estimator_name : string, + estimator_name : str name of the linear estimator which calls this function. The name will be used for writing the deprecation warnings diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 23ba7c77d85ac..3ae077f4331cc 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -392,7 +392,7 @@ def fit_binary( C : float Maximum step size for passive aggressive - learning_rate : string + learning_rate : str The learning rate. Accepted values are 'constant', 'optimal', 'invscaling', 'pa1' and 'pa2'. 
diff --git a/sklearn/metrics/_base.py b/sklearn/metrics/_base.py index 5640848b1a9d4..dd0258f600ccc 100644 --- a/sklearn/metrics/_base.py +++ b/sklearn/metrics/_base.py @@ -32,7 +32,7 @@ def _average_binary_score(binary_metric, y_true, y_score, average, sample_weight Target scores, can either be probability estimates of the positive class, confidence values, or binary decisions. - average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + average : {None, 'micro', 'macro', 'samples', 'weighted'}, default='macro' If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 7237fa53fda25..b4316053c0f74 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2009,7 +2009,7 @@ def classification_report( Returns ------- - report : string / dict + report : str or dict Text summary of the precision, recall, F1 score for each class. Dictionary returned if output_dict is True. 
Dictionary has the following structure:: diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index c592c1d8c5d4a..c94cf597c0fac 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -234,7 +234,7 @@ cdef class DistanceMetric: Parameters ---------- - metric : string or class name + metric : str or class name The distance metric to use **kwargs additional arguments will be passed to the requested metric diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py index d40903899c187..bbe9699859ded 100644 --- a/sklearn/mixture/_base.py +++ b/sklearn/mixture/_base.py @@ -28,7 +28,7 @@ def _check_shape(param, param_shape, name): param_shape : tuple - name : string + name : str """ param = np.array(param) if param.shape != param_shape: diff --git a/sklearn/mixture/_gaussian_mixture.py b/sklearn/mixture/_gaussian_mixture.py index 850adfdd6d47f..f4bb194e1e33d 100644 --- a/sklearn/mixture/_gaussian_mixture.py +++ b/sklearn/mixture/_gaussian_mixture.py @@ -108,7 +108,7 @@ def _check_precisions(precisions, covariance_type, n_components, n_features): 'diag' : shape of (n_components, n_features) 'spherical' : shape of (n_components,) - covariance_type : string + covariance_type : str n_components : int Number of components. 
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 0429dc00c2322..dcc07d25af5fd 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -800,8 +800,8 @@ def test_encoder_dtypes(): for X in [ np.array([[1, 2], [3, 4]], dtype="int64"), np.array([[1, 2], [3, 4]], dtype="float64"), - np.array([["a", "b"], ["c", "d"]]), # unicode dtype - np.array([[b"a", b"b"], [b"c", b"d"]]), # string dtype + np.array([["a", "b"], ["c", "d"]]), # str dtype + np.array([[b"a", b"b"], [b"c", b"d"]]), # bytes dtype np.array([[1, "a"], [3, "b"]], dtype="object"), ]: enc.fit(X) diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index 18f98d36871b9..dc50ee70f05f0 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -839,7 +839,7 @@ def export_graphviz( Returns ------- - dot_data : string + dot_data : str String representation of the input tree in GraphViz dot format. Only returned if ``out_file`` is None. @@ -961,7 +961,7 @@ def export_text( Returns ------- - report : string + report : str Text summary of all the rules in the decision tree. Examples diff --git a/sklearn/utils/graph.py b/sklearn/utils/graph.py index 8eacb17e628c3..020227ba001a9 100644 --- a/sklearn/utils/graph.py +++ b/sklearn/utils/graph.py @@ -92,7 +92,7 @@ def graph_shortest_path(dist_matrix, directed=True, method="auto"): if False, then find the shortest path on an undirected graph: the algorithm can progress from a point to its neighbors and vice versa. - method : string ['auto'|'FW'|'D'] + method : {'auto', 'FW', 'D'}, default='auto' method to use. Options are 'auto' : attempt to choose the best method for the current problem 'FW' : Floyd-Warshall algorithm. 
O[N^3] diff --git a/sklearn/utils/metaestimators.py b/sklearn/utils/metaestimators.py index bd43eeba2a3dd..5d71d28c5ffab 100644 --- a/sklearn/utils/metaestimators.py +++ b/sklearn/utils/metaestimators.py @@ -205,7 +205,7 @@ def if_delegate_has_method(delegate): Parameters ---------- - delegate : string, list of strings or tuple of strings + delegate : str, list of str or tuple of str Name of the sub-estimator that can be accessed as an attribute of the base object. If a list or a tuple of names are provided, the first sub-estimator that is an attribute of the base object will be used. From 46a6cf29ab019afa7cf3c815cb206fa822f0ee0a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 8 Oct 2021 10:16:50 -0400 Subject: [PATCH 17/18] Re-introduce 'surrogate' for the wording and adapt docstrings accordingly Co-authored-by: Roman Yurchak Co-authored-by: Olivier Grisel Co-authored-by: Olivier Grisel --- sklearn/metrics/_dist_metrics.pyx | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index c94cf597c0fac..2698d5dea8769 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -296,8 +296,7 @@ cdef class DistanceMetric: The rank-preserving surrogate distance is any measure that yields the same rank as the distance, but is more efficient to compute. For example, for the - Euclidean metric, the rank-preserving surrogate distance is the - squared-euclidean distance. + Euclidean metric, the surrogate distance is the squared-euclidean distance. 
""" return self.dist(x1, x2, size) @@ -322,25 +321,24 @@ cdef class DistanceMetric: return 0 cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1: - """Convert the ranking-preserving distance to the distance""" + """Convert the rank-preserving surrogate distance to the distance""" return rdist cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - """Convert the distance to the ranking-preserving distance""" + """Convert the distance to the rank-preserving surrogate distance""" return dist def rdist_to_dist(self, rdist): - """Convert the ranking-preserving distance to the true distance. + """Convert the ranking-preserving surrogate distance to the distance. - The rank-preserving surrogate distance is any measure that yields the same - rank as the distance, but is more efficient to compute. For example, for the - Euclidean metric, the rank-preserving surrogate distance is the - squared-euclidean distance. + The surrogate distance is any measure that yields the same rank as the + distance, but is more efficient to compute. For example, for the + Euclidean metric, the surrogate distance is the squared-euclidean distance. Parameters ---------- rdist : double - Ranking-preserving distance. + Surrogate distance. Returns ------- @@ -352,10 +350,9 @@ cdef class DistanceMetric: def dist_to_rdist(self, dist): """Convert the true distance to the rank-preserving surrogate distance. - The rank-preserving surrogate distance is any measure that yields the same - rank as the distance, but is more efficient to compute. For example, for the - Euclidean metric, the rank-preserving surrogate distance is the - squared-euclidean distance. + The surrogate distance is any measure that yields the same rank as the + distance, but is more efficient to compute. For example, for the + Euclidean metric, the surrogate distance is the squared-euclidean distance. 
Parameters ---------- @@ -365,7 +362,7 @@ cdef class DistanceMetric: Returns ------- double - Ranking-preserving distance. + Surrogate distance. """ return dist From f00c134d770584fb79f9ecd5fbc2a78840e8f044 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 8 Oct 2021 10:56:26 -0400 Subject: [PATCH 18/18] Re-word even more for "rank-preserving surrogate" Co-authored-by: Olivier Grisel Co-authored-by: Roman Yurchak Co-authored-by: Roman Yurchak --- sklearn/metrics/_dist_metrics.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 2698d5dea8769..a8fb4c45ddd0c 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -290,7 +290,7 @@ cdef class DistanceMetric: cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: - """Compute the ranking-preserving distance between vectors x1 and x2. + """Compute the rank-preserving surrogate distance between vectors x1 and x2. This can optionally be overridden in a base class. @@ -329,7 +329,7 @@ cdef class DistanceMetric: return dist def rdist_to_dist(self, rdist): - """Convert the ranking-preserving surrogate distance to the distance. + """Convert the rank-preserving surrogate distance to the distance. The surrogate distance is any measure that yields the same rank as the distance, but is more efficient to compute. For example, for the