From e4bf4e76318fba0939d5e86a77f9365e7f5b84f7 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Tue, 7 Feb 2023 10:06:49 +0100 Subject: [PATCH 01/21] introduce PairwiseDistance --- .gitignore | 2 + setup.cfg | 2 + setup.py | 7 + .../_pairwise_distances_reduction/__init__.py | 5 + .../_base.pxd.tp | 1 + .../_datasets_pair.pxd.tp | 2 + .../_datasets_pair.pyx.tp | 44 +++-- .../_dispatcher.py | 138 ++++++++++++- .../_pairwise_distances.pxd.tp | 40 ++++ .../_pairwise_distances.pyx.tp | 181 ++++++++++++++++++ sklearn/metrics/_pairwise_fast.pyx | 80 -------- sklearn/metrics/pairwise.py | 108 +++++++---- sklearn/metrics/tests/test_pairwise.py | 48 +---- .../test_pairwise_distances_reduction.py | 39 ++++ sklearn/neighbors/_nca.py | 2 +- 15 files changed, 517 insertions(+), 182 deletions(-) create mode 100644 sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd.tp create mode 100644 sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp diff --git a/.gitignore b/.gitignore index 47ec8fa2faf79..7754773320c6e 100644 --- a/.gitignore +++ b/.gitignore @@ -95,5 +95,7 @@ sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx +sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd +sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx diff --git a/setup.cfg b/setup.cfg index 081e78c92d480..117ec77bc2e35 100644 --- a/setup.cfg +++ b/setup.cfg @@ -96,6 +96,8 @@ ignore = sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx + sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd + sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx diff --git a/setup.py b/setup.py index f252e1ec03ad8..6314a2d18b609 100755 --- a/setup.py +++ b/setup.py @@ -98,6 +98,7 @@ "sklearn.metrics._pairwise_distances_reduction._datasets_pair", "sklearn.metrics._pairwise_distances_reduction._middle_term_computer", "sklearn.metrics._pairwise_distances_reduction._base", + "sklearn.metrics._pairwise_distances_reduction._pairwise_distances", "sklearn.metrics._pairwise_distances_reduction._argkmin", "sklearn.metrics._pairwise_distances_reduction._radius_neighbors", "sklearn.metrics._pairwise_fast", @@ -343,6 +344,12 @@ def check_package_status(package, min_version): "include_np": True, "extra_compile_args": ["-std=c++11"], }, + { + "sources": ["_pairwise_distances.pyx.tp", "_pairwise_distances.pxd.tp"], + "language": "c++", + "include_np": True, + "extra_compile_args": ["-std=c++11"], + }, { "sources": ["_argkmin.pyx.tp", "_argkmin.pxd.tp"], "language": "c++", diff --git a/sklearn/metrics/_pairwise_distances_reduction/__init__.py b/sklearn/metrics/_pairwise_distances_reduction/__init__.py index 133c854682f0c..f4caf911eb898 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/__init__.py +++ b/sklearn/metrics/_pairwise_distances_reduction/__init__.py @@ -89,13 +89,18 @@ from ._dispatcher import ( 
BaseDistancesReductionDispatcher, ArgKmin, + PairwiseDistances, RadiusNeighbors, sqeuclidean_row_norms, ) +from ._pairwise_distances import _precompute_metric_params + __all__ = [ "BaseDistancesReductionDispatcher", "ArgKmin", + "PairwiseDistances", "RadiusNeighbors", "sqeuclidean_row_norms", + "_precompute_metric_params", ] diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp index be44f3a98a263..d93bef308b2a6 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp @@ -50,6 +50,7 @@ cdef class BaseDistancesReduction{{name_suffix}}: ITYPE_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk ITYPE_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk + bint X_is_Y bint execute_in_parallel_on_Y @final diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp index e220f730e7529..16521561a58dc 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp @@ -25,6 +25,8 @@ cdef class DatasetsPair{{name_suffix}}: {{DistanceMetric}} distance_metric ITYPE_t n_features + readonly bint X_is_Y + cdef ITYPE_t n_samples_X(self) nogil cdef ITYPE_t n_samples_Y(self) nogil diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index 78857341f9c97..18238d781f73a 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -1,3 +1,5 @@ +import copy + {{py: implementation_specific_values = [ @@ -91,18 +93,24 @@ cdef class DatasetsPair{{name_suffix}}: datasets_pair: DatasetsPair{{name_suffix}} The suited DatasetsPair{{name_suffix}} implementation. """ - # Y_norm_squared might be propagated down to DatasetsPairs - # via metrics_kwargs when the Euclidean specialisations - # can't be used. To prevent Y_norm_squared to be passed + # X_norm_squared and Y_norm_squared might be propagated + # down to DatasetsPairs via metrics_kwargs when the Euclidean + # specialisations can't be used. + # To prevent X_norm_squared and Y_norm_squared to be passed # down to DistanceMetrics (whose constructors would raise - # a RuntimeError), we pop it here. + # a RuntimeError), we pop them here. if metric_kwargs is not None: + # Copying metric_kwargs not to pop "X_norm_squared" + # and "Y_norm_squared" where they are used + metric_kwargs = copy.copy(metric_kwargs) + metric_kwargs.pop("X_norm_squared", None) metric_kwargs.pop("Y_norm_squared", None) cdef: {{DistanceMetric}} distance_metric = {{DistanceMetric}}.get_metric( metric, **(metric_kwargs or {}) ) + bint X_is_Y = X is Y # Metric-specific checks that do not replace nor duplicate `check_array`. 
distance_metric._validate_data(X) @@ -112,15 +120,15 @@ cdef class DatasetsPair{{name_suffix}}: Y_is_sparse = issparse(Y) if not X_is_sparse and not Y_is_sparse: - return DenseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric) + return DenseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric, X_is_Y) if X_is_sparse and Y_is_sparse: - return SparseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric) + return SparseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric, X_is_Y) if X_is_sparse and not Y_is_sparse: - return SparseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric) + return SparseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric, X_is_Y) - return DenseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric) + return DenseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric, X_is_Y) @classmethod def unpack_csr_matrix(cls, X: csr_matrix): @@ -130,8 +138,9 @@ cdef class DatasetsPair{{name_suffix}}: X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE) return X_data, X_indices, X_indptr - def __init__(self, {{DistanceMetric}} distance_metric, ITYPE_t n_features): + def __init__(self, {{DistanceMetric}} distance_metric, ITYPE_t n_features, bint X_is_Y): self.distance_metric = distance_metric + self.X_is_Y = X_is_Y self.n_features = n_features cdef ITYPE_t n_samples_X(self) nogil: @@ -179,8 +188,9 @@ cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): const {{INPUT_DTYPE_t}}[:, ::1] X, const {{INPUT_DTYPE_t}}[:, ::1] Y, {{DistanceMetric}} distance_metric, + bint X_is_Y, ): - super().__init__(distance_metric, n_features=X.shape[1]) + super().__init__(distance_metric, n_features=X.shape[1], X_is_Y=X_is_Y) # Arrays have already been checked self.X = X self.Y = Y @@ -219,8 +229,8 @@ cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): between two vectors of (X, Y). """ - def __init__(self, X, Y, {{DistanceMetric}} distance_metric): - super().__init__(distance_metric, n_features=X.shape[1]) + def __init__(self, X, Y, {{DistanceMetric}} distance_metric, bint X_is_Y): + super().__init__(distance_metric, n_features=X.shape[1], X_is_Y=X_is_Y) self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) @@ -279,8 +289,8 @@ cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): between two vectors of (X, Y). """ - def __init__(self, X, Y, {{DistanceMetric}} distance_metric): - super().__init__(distance_metric, n_features=X.shape[1]) + def __init__(self, X, Y, {{DistanceMetric}} distance_metric, bint X_is_Y): + super().__init__(distance_metric, n_features=X.shape[1], X_is_Y=X_is_Y) self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) @@ -377,10 +387,10 @@ cdef class DenseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): between two vectors of (X, Y). 
""" - def __init__(self, X, Y, {{DistanceMetric}} distance_metric): - super().__init__(distance_metric, n_features=X.shape[1]) + def __init__(self, X, Y, {{DistanceMetric}} distance_metric, bint X_is_Y): + super().__init__(distance_metric, n_features=X.shape[1], X_is_Y=X_is_Y) # Swapping arguments on the constructor - self.datasets_pair = SparseDenseDatasetsPair{{name_suffix}}(Y, X, distance_metric) + self.datasets_pair = SparseDenseDatasetsPair{{name_suffix}}(Y, X, distance_metric, X_is_Y) @final cdef ITYPE_t n_samples_X(self) nogil: diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 62403d1c334f0..5a3692a4d55e3 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -13,6 +13,12 @@ ArgKmin64, ArgKmin32, ) + +from ._pairwise_distances import ( + PairwiseDistances64, + PairwiseDistances32, +) + from ._radius_neighbors import ( RadiusNeighbors64, RadiusNeighbors32, @@ -167,6 +173,132 @@ def compute( """ +class PairwiseDistances(BaseDistancesReductionDispatcher): + """Compute the pairwise distances matrix for two sets of vectors. + + The distance function `dist` depends on the values of the `metric` + and `metric_kwargs` parameters. + + This class only computes the pairwise distances matrix without + applying any reduction on it. It shares most of the underlying + code infrastructure with reducing variants to leverage + cache-aware chunking and multi-thread parallelism. + + This class is not meant to be instantiated, one should only use + its :meth:`compute` classmethod which handles allocation and + deallocation consistently. + """ + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + Y = X if Y is None else Y + return metric != "sqeuclidean" and super().is_usable_for(X, Y, metric) + + @classmethod + def compute( + cls, + X, + Y, + metric="euclidean", + chunk_size=None, + metric_kwargs=None, + strategy=None, + ): + """Return pairwise distances matrix for the given arguments. + + Parameters + ---------- + X : ndarray or CSR matrix of shape (n_samples_X, n_features) + Input data. + + Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) + Input data. + + metric : str, default='euclidean' + The distance metric to use. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + + For both strategies the computations happens with two nested loops, + respectively on chunks of X and chunks of Y. + Strategies differs on which loop (outer or inner) is made to run + in parallel with the Cython `prange` construct: + + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures + synchronisation. + + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread processes all the chunks of X in turn. 
This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. + + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, + 'parallel_on_X' is usually the most efficient strategy. + When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y' + brings more opportunity for parallelism and is therefore more efficient. + + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + + Returns + ------- + pairwise_distances_matrix : ndarray of shape (n_samples_X, n_samples_Y) + The pairwise distances matrix. + + Notes + ----- + This public classmethod is responsible for introspecting the arguments + values to dispatch to the private dtype-specialized implementation of + :class:`PairwiseDistances`. + + All temporarily allocated datastructures necessary for the concrete + implementation are therefore freed when this classmethod returns. + + This allows entirely decoupling the API entirely from the + implementation details whilst maintaining RAII. + """ + Y = X if Y is None else Y + if X.dtype == Y.dtype == np.float64: + return PairwiseDistances64.compute( + X=X, + Y=Y, + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + ) + + if X.dtype == Y.dtype == np.float32: + return PairwiseDistances32.compute( + X=X, + Y=Y, + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + ) + + raise ValueError( + "Only float64 or float32 datasets pairs are supported, but " + f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}." + ) + + class ArgKmin(BaseDistancesReductionDispatcher): """Compute the argkmin of row vectors of X on the ones of Y. @@ -242,7 +374,7 @@ def compute( 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, 'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y' - brings more opportunity for parallelism and is therefore more efficient + brings more opportunity for parallelism and is therefore more efficient. - None (default) looks-up in scikit-learn configuration for `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. @@ -381,9 +513,7 @@ def compute( 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, 'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y' - brings more opportunity for parallelism and is therefore more efficient - despite the synchronization step at each iteration of the outer loop - on chunks of `X`. + brings more opportunity for parallelism and is therefore more efficient. - None (default) looks-up in scikit-learn configuration for `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. 
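For context, a minimal usage sketch of the dispatcher introduced above. This is illustrative only: it assumes this branch is built, and `PairwiseDistances` is private API imported from `sklearn.metrics._pairwise_distances_reduction`.

```python
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.metrics._pairwise_distances_reduction import PairwiseDistances

rng = np.random.RandomState(0)
X = rng.rand(100, 10)   # float64, C-contiguous
Y = rng.rand(300, 10)

# Both arrays are float64, so this dispatches to PairwiseDistances64.
D = PairwiseDistances.compute(X, Y, metric="manhattan", strategy="auto")
assert D.shape == (100, 300)
np.testing.assert_allclose(D, cdist(X, Y, "cityblock"))

# float32 inputs dispatch to PairwiseDistances32 and the result dtype is
# preserved; mixed dtypes raise a ValueError (see the checks above).
D32 = PairwiseDistances.compute(
    X.astype(np.float32), Y.astype(np.float32), metric="manhattan"
)
assert D32.dtype == np.float32
```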
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd.tp new file mode 100644 index 0000000000000..00562fe50d86b --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd.tp @@ -0,0 +1,40 @@ +{{py: + +implementation_specific_values = [ + # Values are the following ones: + # + # name_suffix, INPUT_DTYPE_t + # + # + ('64', 'cnp.float64_t'), + ('32', 'cnp.float32_t') +] + +}} +cimport numpy as cnp + +from ...utils._typedefs cimport DTYPE_t, ITYPE_t +{{for name_suffix, INPUT_DTYPE_t in implementation_specific_values}} + +from ._datasets_pair cimport DatasetsPair{{name_suffix}} + + +cdef class PairwiseDistances{{name_suffix}}: + """float{{name_suffix}} implementation of PairwiseDistances.""" + + cdef: + readonly DatasetsPair{{name_suffix}} datasets_pair + + ITYPE_t n_samples_X, n_samples_Y + ITYPE_t effective_n_threads + bint X_is_Y + bint execute_in_parallel_on_Y + + {{INPUT_DTYPE_t}}[:, ::1] pairwise_distances_matrix + + cdef void _parallel_on_X(self) + + cdef void _parallel_on_Y(self) + + +{{endfor}} diff --git a/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp new file mode 100644 index 0000000000000..98f25f3f50504 --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp @@ -0,0 +1,181 @@ +cimport numpy as cnp +from cython cimport final +from cython.parallel cimport prange +from ...utils._typedefs cimport ITYPE_t, DTYPE_t + +import numpy as np + +from scipy.sparse import issparse +from sklearn import get_config +from ...utils import check_array, _in_unstable_openblas_configuration +from ...utils._openmp_helpers import _openmp_effective_n_threads +from ...utils.fixes import threadpool_limits, sp_version, parse_version +from ...utils._typedefs import DTYPE + +cnp.import_array() + + +def _precompute_metric_params(X, Y, metric=None, **kwds): + """Precompute data-derived metric parameters if not provided.""" + if metric == "seuclidean" and "V" not in kwds: + # There is a bug in scipy < 1.5 that will cause a crash if + # X.dtype != np.double (float64). See PR #15730 + dtype = np.float64 if sp_version < parse_version("1.5") else None + if X is Y: + V = np.var(X, axis=0, ddof=1, dtype=dtype) + else: + raise ValueError( + "The 'V' parameter is required for the seuclidean metric " + "when Y is passed." + ) + return {"V": V} + if metric == "mahalanobis" and "VI" not in kwds: + if X is Y: + VI = np.linalg.inv(np.cov(X.T)).T + else: + raise ValueError( + "The 'VI' parameter is required for the mahalanobis metric " + "when Y is passed." + ) + return {"VI": VI} + return {} + +{{for name_suffix, INPUT_DTYPE in (('64', 'DTYPE'),('32', 'np.float32'))}} + +from ._datasets_pair cimport DatasetsPair{{name_suffix}} + + +cdef class PairwiseDistances{{name_suffix}}: + """float{{name_suffix}} implementation of PairwiseDistances.""" + + @classmethod + def compute( + cls, + X, + Y, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + ): + """Compute the pairwise-distances matrix. + + This classmethod is responsible for introspecting the arguments + values to dispatch to the most appropriate implementation of + :class:`PairwiseDistances{{name_suffix}}`. 
+ + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + + No instance should directly be created outside of this class method. + """ + # Precompute data-derived distance metric parameters + metric_kwargs = {} if metric_kwargs is None else metric_kwargs + + params = _precompute_metric_params( + X, + Y, + metric=metric, + **metric_kwargs, + ) + metric_kwargs.update(**params) + + # Fall back on a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. + pdr = PairwiseDistances{{name_suffix}}( + datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs), + strategy=strategy, + ) + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). + with threadpool_limits(limits=1, user_api="blas"): + if pdr.execute_in_parallel_on_Y: + pdr._parallel_on_Y() + else: + pdr._parallel_on_X() + + return pdr._finalize_results() + + + def __init__( + self, + DatasetsPair{{name_suffix}} datasets_pair, + strategy=None, + sort_results=False, + ): + self.datasets_pair = datasets_pair + self.n_samples_X = datasets_pair.n_samples_X() + self.n_samples_Y = datasets_pair.n_samples_Y() + + self.effective_n_threads = _openmp_effective_n_threads() + + if strategy is None: + strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') + + if strategy not in ('parallel_on_X', 'parallel_on_Y', 'auto'): + raise RuntimeError(f"strategy must be 'parallel_on_X, 'parallel_on_Y', " + f"or 'auto', but currently strategy='{self.strategy}'.") + + if strategy == 'auto': + # This is a simple heuristic whose constant for the + # comparison has been chosen based on experiments. + # parallel_on_X has less synchronization overhead than + # parallel_on_Y and should therefore be used whenever + # n_samples_X is large enough to not starve any of the + # available hardware threads. + if self.n_samples_Y < self.n_samples_X: + # No point to even consider parallelizing on Y in this case. This + # is in particular important to do this on machines with a large + # number of hardware threads. + strategy = 'parallel_on_X' + elif 4 * self.effective_n_threads < self.n_samples_X: + # If Y is larger than X, but X is still large enough to allow for + # parallelism, we might still want to favor parallelizing on X. + strategy = 'parallel_on_X' + else: + strategy = 'parallel_on_Y' + + self.execute_in_parallel_on_Y = strategy == "parallel_on_Y" + + # Distance matrix which will be complete and returned to the caller. 
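+        # The buffer is C-contiguous and uses the input dtype (float64 or
+        # float32), so that `_finalize_results` can expose it through
+        # `np.asarray` without making a copy.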
+ self.pairwise_distances_matrix = np.empty( + (self.n_samples_X, self.n_samples_Y), dtype={{INPUT_DTYPE}}, + ) + + cdef void _parallel_on_X(self): + + cdef: + ITYPE_t n_X = self.n_samples_X + ITYPE_t n_Y = self.n_samples_Y + ITYPE_t i, j + + for i in prange(n_X, nogil=True, num_threads=self.effective_n_threads): + for j in range(n_Y): + self.pairwise_distances_matrix[i, j] = self.datasets_pair.dist(i, j) + + cdef void _parallel_on_Y(self): + + cdef: + ITYPE_t n_X = self.n_samples_X + ITYPE_t n_Y = self.n_samples_Y + ITYPE_t i, j + + for i in range(n_X): + for j in prange(n_Y, nogil=True, num_threads=self.effective_n_threads): + self.pairwise_distances_matrix[i, j] = self.datasets_pair.dist(i, j) + + def _finalize_results(self): + # If X is Y, then catastrophic cancellation might have occurred for + # computations of terms on the diagonal which must equal zero. + # We enforce it by zeroing the diagonal. + distance_matrix = np.asarray(self.pairwise_distances_matrix) + if self.datasets_pair.X_is_Y: + np.fill_diagonal(distance_matrix, 0.) + + return distance_matrix + + +{{endfor}} diff --git a/sklearn/metrics/_pairwise_fast.pyx b/sklearn/metrics/_pairwise_fast.pyx index f7ddd68c46c1e..b6941dba290db 100644 --- a/sklearn/metrics/_pairwise_fast.pyx +++ b/sklearn/metrics/_pairwise_fast.pyx @@ -6,10 +6,6 @@ cimport numpy as cnp from cython cimport floating -from cython.parallel cimport prange -from libc.math cimport fabs - -from ..utils._openmp_helpers import _openmp_effective_n_threads cnp.import_array() @@ -33,79 +29,3 @@ def _chi2_kernel_fast(floating[:, :] X, if nom != 0: res += denom * denom / nom result[i, j] = -res - - -def _sparse_manhattan( - const floating[::1] X_data, - const int[:] X_indices, - const int[:] X_indptr, - const floating[::1] Y_data, - const int[:] Y_indices, - const int[:] Y_indptr, - double[:, ::1] D, -): - """Pairwise L1 distances for CSR matrices. - - Usage: - >>> D = np.zeros(X.shape[0], Y.shape[0]) - >>> _sparse_manhattan(X.data, X.indices, X.indptr, - ... Y.data, Y.indices, Y.indptr, - ... D) - """ - cdef cnp.npy_intp px, py, i, j, ix, iy - cdef double d = 0.0 - - cdef int m = D.shape[0] - cdef int n = D.shape[1] - - cdef int X_indptr_end = 0 - cdef int Y_indptr_end = 0 - - cdef int num_threads = _openmp_effective_n_threads() - - # We scan the matrices row by row. - # Given row px in X and row py in Y, we find the positions (i and j - # respectively), in .indices where the indices for the two rows start. - # If the indices (ix and iy) are the same, the corresponding data values - # are processed and the cursors i and j are advanced. - # If not, the lowest index is considered. Its associated data value is - # processed and its cursor is advanced. - # We proceed like this until one of the cursors hits the end for its row. - # Then we process all remaining data values in the other row. - - # Below the avoidance of inplace operators is intentional. - # When prange is used, the inplace operator has a special meaning, i.e. 
it - # signals a "reduction" - - for px in prange(m, nogil=True, num_threads=num_threads): - X_indptr_end = X_indptr[px + 1] - for py in range(n): - Y_indptr_end = Y_indptr[py + 1] - i = X_indptr[px] - j = Y_indptr[py] - d = 0.0 - while i < X_indptr_end and j < Y_indptr_end: - ix = X_indices[i] - iy = Y_indices[j] - - if ix == iy: - d = d + fabs(X_data[i] - Y_data[j]) - i = i + 1 - j = j + 1 - elif ix < iy: - d = d + fabs(X_data[i]) - i = i + 1 - else: - d = d + fabs(Y_data[j]) - j = j + 1 - - if i == X_indptr_end: - while j < Y_indptr_end: - d = d + fabs(Y_data[j]) - j = j + 1 - else: - while i < X_indptr_end: - d = d + fabs(X_data[i]) - i = i + 1 - - D[px, py] = d diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 4105cd367663a..a0c84f3dbabfd 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -28,11 +28,15 @@ from ..preprocessing import normalize from ..utils._mask import _get_mask from ..utils.parallel import delayed, Parallel -from ..utils.fixes import sp_base_version, sp_version, parse_version +from ..utils.fixes import sp_base_version, parse_version from ..utils._param_validation import validate_params -from ._pairwise_distances_reduction import ArgKmin -from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan +from ._pairwise_distances_reduction import ( + ArgKmin, + PairwiseDistances, + _precompute_metric_params, +) +from ._pairwise_fast import _chi2_kernel_fast from ..exceptions import DataConversionWarning @@ -338,6 +342,21 @@ def _euclidean_distances(X, Y, X_norm_squared=None, Y_norm_squared=None, squared float32, norms needs to be recomputed on upcast chunks. TODO: use a float64 accumulator in row_norms to avoid the latter. """ + metric = "sqeuclidean" if squared else "euclidean" + if PairwiseDistances.is_usable_for(X, Y, metric): + metric_kwargs = {} + if X_norm_squared is not None: + metric_kwargs["X_norm_squared"] = np.ravel(X_norm_squared) + + if Y_norm_squared is not None: + metric_kwargs["Y_norm_squared"] = np.ravel(Y_norm_squared) + + return PairwiseDistances.compute(X, Y, metric, metric_kwargs=metric_kwargs) + + # XXX: the following code is still used for list-of-lists of numbers which + # aren't converted to numpy arrays in validation steps done in `check_array`. + # TODO: convert list-of-lists to numpy arrays in `check_array`. + # See: https://github.com/scikit-learn/scikit-learn/issues/24745 if X_norm_squared is not None: if X_norm_squared.dtype == np.float32: XX = None @@ -881,6 +900,15 @@ def haversine_distances(X, Y=None): array([[ 0. , 11099.54035582], [11099.54035582, 0. ]]) """ + + if PairwiseDistances.is_usable_for(X, Y, metric="haversine"): + return PairwiseDistances.compute(X, Y, metric="haversine") + + # XXX: the following code is still used for list-of-lists of numbers which + # aren't converted to numpy arrays in validation steps done in `check_array`. + # TODO: convert list-of-lists to numpy arrays in `check_array`. + # See: https://github.com/scikit-learn/scikit-learn/issues/24745 + from ..metrics import DistanceMetric return DistanceMetric.get_metric("haversine").pairwise(X, Y) @@ -954,20 +982,30 @@ def manhattan_distances(X, Y=None, *, sum_over_features="deprecated"): X, Y = check_pairwise_arrays(X, Y) - if issparse(X) or issparse(Y): - if not sum_over_features: - raise TypeError( - "sum_over_features=%r not supported for sparse matrices" - % sum_over_features - ) - + if issparse(X): X = csr_matrix(X, copy=False) + # This also sorts indices in-place. 
+ X.sum_duplicates() + + if issparse(Y): Y = csr_matrix(Y, copy=False) - X.sum_duplicates() # this also sorts indices in-place + # This also sorts indices in-place. Y.sum_duplicates() - D = np.zeros((X.shape[0], Y.shape[0])) - _sparse_manhattan(X.data, X.indices, X.indptr, Y.data, Y.indices, Y.indptr, D) - return D + + if sum_over_features and PairwiseDistances.is_usable_for(X, Y, metric="manhattan"): + return PairwiseDistances.compute(X, Y, metric="manhattan") + + # XXX: the following code is still used for list-of-lists of numbers which + # aren't converted to numpy arrays in validation steps done in `check_array` + # and for supporting `sum_over_features` which we should probably remove. + # TODO: convert list-of-lists to numpy arrays in `check_array`. + # See: https://github.com/scikit-learn/scikit-learn/issues/24745 + # TODO: remove `sum_over_features`, see: + # https://github.com/scikit-learn/scikit-learn/issues/24597 + + if issparse(X) or issparse(Y): + if not sum_over_features: + raise TypeError("sum_over_features=False not supported for sparse matrices") if sum_over_features: return distance.cdist(X, Y, "cityblock") @@ -1687,32 +1725,6 @@ def _check_chunk_size(reduced, chunk_size): ) -def _precompute_metric_params(X, Y, metric=None, **kwds): - """Precompute data-derived metric parameters if not provided.""" - if metric == "seuclidean" and "V" not in kwds: - # There is a bug in scipy < 1.5 that will cause a crash if - # X.dtype != np.double (float64). See PR #15730 - dtype = np.float64 if sp_version < parse_version("1.5") else None - if X is Y: - V = np.var(X, axis=0, ddof=1, dtype=dtype) - else: - raise ValueError( - "The 'V' parameter is required for the seuclidean metric " - "when Y is passed." - ) - return {"V": V} - if metric == "mahalanobis" and "VI" not in kwds: - if X is Y: - VI = np.linalg.inv(np.cov(X.T)).T - else: - raise ValueError( - "The 'VI' parameter is required for the mahalanobis metric " - "when Y is passed." - ) - return {"VI": VI} - return {} - - def pairwise_distances_chunked( X, Y=None, @@ -2008,6 +2020,17 @@ def pairwise_distances( % (metric, _VALID_METRICS) ) + if PairwiseDistances.is_usable_for(X, X if Y is None else Y, metric=metric): + # This is an adaptor for one "sqeuclidean" specification. + # For this backend, we can directly use "sqeuclidean". + if kwds.get("squared", False) and metric == "euclidean": + metric = "sqeuclidean" + kwds = {} + + return PairwiseDistances.compute( + X, X if Y is None else Y, metric=metric, metric_kwargs=kwds + ) + if metric == "precomputed": X, _ = check_pairwise_arrays( X, Y, precomputed=True, force_all_finite=force_all_finite @@ -2026,6 +2049,11 @@ def pairwise_distances( _pairwise_callable, metric=metric, force_all_finite=force_all_finite, **kwds ) else: + # XXX: the following code is still used for list-of-lists of numbers which + # aren't converted to numpy arrays in validation steps done in `check_array`. + # TODO: convert list-of-lists to numpy arrays in `check_array`. 
+ # See: https://github.com/scikit-learn/scikit-learn/issues/24745 + if issparse(X) or issparse(Y): raise TypeError("scipy distance metrics do not support sparse matrices.") diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index ba583e4ddb965..c826cfd5bff9a 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -154,25 +154,14 @@ def test_pairwise_distances(global_dtype): S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan") S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo()) assert_allclose(S, S2) - if global_dtype == np.float64: - assert S.dtype == S2.dtype == global_dtype - else: - # TODO Fix manhattan_distances to preserve dtype. - # currently pairwise_distances uses manhattan_distances but converts the result - # back to the input dtype - with pytest.raises(AssertionError): - assert S.dtype == S2.dtype == global_dtype + + # pairwise_distances must preserves dtypes for the manhattan distance metric + assert S.dtype == S2.dtype == global_dtype S2 = manhattan_distances(X, Y) assert_allclose(S, S2) - if global_dtype == np.float64: - assert S.dtype == S2.dtype == global_dtype - else: - # TODO Fix manhattan_distances to preserve dtype. - # currently pairwise_distances uses manhattan_distances but converts the result - # back to the input dtype - with pytest.raises(AssertionError): - assert S.dtype == S2.dtype == global_dtype + # manhattan_distances must preserves dtypes + assert S.dtype == S2.dtype == global_dtype # Test with scipy.spatial.distance metric, with a kwd kwds = {"p": 2.0} @@ -186,12 +175,6 @@ def test_pairwise_distances(global_dtype): S2 = pairwise_distances(X, metric=minkowski, **kwds) assert_allclose(S, S2) - # Test that scipy distance metrics throw an error if sparse matrix given - with pytest.raises(TypeError): - pairwise_distances(X_sparse, metric="minkowski") - with pytest.raises(TypeError): - pairwise_distances(X, Y_sparse, metric="minkowski") - # Test that a value error is raised if the metric is unknown with pytest.raises(ValueError): pairwise_distances(X, Y, metric="blah") @@ -796,16 +779,6 @@ def test_euclidean_distances_with_norms(global_dtype, y_array_constr): assert_allclose(D3, D1) assert_allclose(D4, D1) - # check we get the wrong answer with wrong {X,Y}_norm_squared - wrong_D = euclidean_distances( - X, - Y, - X_norm_squared=np.zeros_like(X_norm_sq), - Y_norm_squared=np.zeros_like(Y_norm_sq), - ) - with pytest.raises(AssertionError): - assert_allclose(wrong_D, D1) - def test_euclidean_distances_norm_shapes(): # Check all accepted shapes for the norms or appropriate error messages. @@ -941,15 +914,10 @@ def test_euclidean_distances_upcast_sym(batch_size, x_array_constr): "dtype, eps, rtol", [ (np.float32, 1e-4, 1e-5), - pytest.param( - np.float64, - 1e-8, - 0.99, - marks=pytest.mark.xfail(reason="failing due to lack of precision"), - ), + (np.float64, 1e-8, 0.99), ], ) -@pytest.mark.parametrize("dim", [1, 1000000]) +@pytest.mark.parametrize("dim", [1, 100000]) def test_euclidean_distances_extreme_values(dtype, eps, rtol, dim): # check that euclidean distances is correct with float32 input thanks to # upcasting. On float64 there are still precision issues. 
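The dtype-preservation behaviour asserted in the updated tests above can be exercised directly. A short illustrative snippet using only the public API (assuming this branch):

```python
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import manhattan_distances

rng = np.random.RandomState(0)
X = rng.rand(20, 5).astype(np.float32)
Y = rng.rand(30, 5).astype(np.float32)

S = pairwise_distances(X, Y, metric="manhattan")
S2 = manhattan_distances(X, Y)

# With the new backend, both code paths keep float32 instead of upcasting
# the result to float64.
assert S.dtype == S2.dtype == np.float32
np.testing.assert_allclose(S, S2, rtol=1e-4)
```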
@@ -959,7 +927,7 @@ def test_euclidean_distances_extreme_values(dtype, eps, rtol, dim): distances = euclidean_distances(X, Y) expected = cdist(X, Y) - assert_allclose(distances, expected, rtol=1e-5) + assert_allclose(distances, expected, rtol=1e-5, atol=4e-4) @pytest.mark.parametrize("squared", [True, False]) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 4fe8013cd3602..8a2f87e6d50c3 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -14,6 +14,7 @@ BaseDistancesReductionDispatcher, ArgKmin, RadiusNeighbors, + PairwiseDistances, sqeuclidean_row_norms, ) @@ -752,6 +753,44 @@ def test_radius_neighbors_factory_method_wrong_usages(): ) +def test_pairwise_distances_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + metric = "euclidean" + + msg = ( + "Only float64 or float32 datasets pairs are supported, but " + "got: X.dtype=float32 and Y.dtype=float64" + ) + with pytest.raises( + ValueError, + match=msg, + ): + PairwiseDistances.compute(X=X.astype(np.float32), Y=Y, metric=metric) + + msg = ( + "Only float64 or float32 datasets pairs are supported, but " + "got: X.dtype=float64 and Y.dtype=int32" + ) + with pytest.raises( + ValueError, + match=msg, + ): + PairwiseDistances.compute(X=X, Y=Y.astype(np.int32), metric=metric) + + with pytest.raises(ValueError, match="Unrecognized metric"): + PairwiseDistances.compute(X=X, Y=Y, metric="wrong metric") + + with pytest.raises( + ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" + ): + PairwiseDistances.compute(X=np.array([1.0, 2.0]), Y=Y, metric=metric) + + with pytest.raises(ValueError, match="ndarray is not C-contiguous"): + PairwiseDistances.compute(X=np.asfortranarray(X), Y=Y, metric=metric) + + @pytest.mark.parametrize( "n_samples_X, n_samples_Y", [(100, 100), (500, 100), (100, 500)] ) diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 4a83fcc7bc080..66e103810c6fa 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -486,7 +486,7 @@ def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): X_embedded = np.dot(X, transformation.T) # (n_samples, n_components) # Compute softmax distances - p_ij = pairwise_distances(X_embedded, squared=True) + p_ij = pairwise_distances(X_embedded, metric="sqeuclidean") np.fill_diagonal(p_ij, np.inf) p_ij = softmax(-p_ij) # (n_samples, n_samples) From ddc464ba818dee21cd854d0f9c3957c3c91ed98b Mon Sep 17 00:00:00 2001 From: Vincent M Date: Tue, 7 Feb 2023 14:01:52 +0100 Subject: [PATCH 02/21] temporarily swap 'sqeuclidean' with 'euclidean' to fix tests --- sklearn/metrics/pairwise.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index a0c84f3dbabfd..8a974e57cc835 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -2024,7 +2024,9 @@ def pairwise_distances( # This is an adaptor for one "sqeuclidean" specification. # For this backend, we can directly use "sqeuclidean". 
if kwds.get("squared", False) and metric == "euclidean": - metric = "sqeuclidean" + # TODO: use 'sqeuclidean' instead of 'euclidean' + # with EuclideanPairwiseDistance + metric = "euclidean" kwds = {} return PairwiseDistances.compute( From 2a8b47a74180c91d7ae6c9f2314227e196514674 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Wed, 8 Feb 2023 15:35:35 +0100 Subject: [PATCH 03/21] Add nogil for benchmark purposes Co-authored-by: Julien Jerphanion --- .../_pairwise_distances_reduction/_pairwise_distances.pxd.tp | 4 ++-- .../_pairwise_distances_reduction/_pairwise_distances.pyx.tp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd.tp index 00562fe50d86b..ed2ce850af11c 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd.tp @@ -32,9 +32,9 @@ cdef class PairwiseDistances{{name_suffix}}: {{INPUT_DTYPE_t}}[:, ::1] pairwise_distances_matrix - cdef void _parallel_on_X(self) + cdef void _parallel_on_X(self) nogil - cdef void _parallel_on_Y(self) + cdef void _parallel_on_Y(self) nogil {{endfor}} diff --git a/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp index 98f25f3f50504..c8c7755e4dc29 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp @@ -145,7 +145,7 @@ cdef class PairwiseDistances{{name_suffix}}: (self.n_samples_X, self.n_samples_Y), dtype={{INPUT_DTYPE}}, ) - cdef void _parallel_on_X(self): + cdef void _parallel_on_X(self) nogil: cdef: ITYPE_t n_X = self.n_samples_X @@ -156,7 +156,7 @@ cdef class PairwiseDistances{{name_suffix}}: for j in range(n_Y): self.pairwise_distances_matrix[i, j] = self.datasets_pair.dist(i, j) - cdef void _parallel_on_Y(self): + cdef void _parallel_on_Y(self) nogil: cdef: ITYPE_t n_X = self.n_samples_X From fd4117b9660ac35c5eee77269b5dbf0952752831 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Sat, 11 Feb 2023 10:49:57 +0100 Subject: [PATCH 04/21] update is_usable_for and docstrings --- .../_dispatcher.py | 57 ++++++++++++------- .../_pairwise_distances.pyx.tp | 2 + sklearn/metrics/pairwise.py | 4 +- sklearn/metrics/tests/test_pairwise.py | 12 +++- sklearn/neighbors/_nca.py | 2 +- 5 files changed, 51 insertions(+), 26 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 5a3692a4d55e3..357f6ce55cf29 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -7,6 +7,7 @@ from scipy.sparse import isspmatrix_csr from .._dist_metrics import BOOL_METRICS, METRIC_MAPPING +from ...utils._openmp_helpers import _openmp_effective_n_threads from ._base import _sqeuclidean_row_norms32, _sqeuclidean_row_norms64 from ._argkmin import ( @@ -181,8 +182,10 @@ class PairwiseDistances(BaseDistancesReductionDispatcher): This class only computes the pairwise distances matrix without applying any reduction on it. It shares most of the underlying - code infrastructure with reducing variants to leverage - cache-aware chunking and multi-thread parallelism. 
+ code infrastructure with reducing variants to leverage multi-thread + parallelism. However contrary to the reducing variants, no chunking + is applied to allow for contiguous write access to the final distance + array that is not expected to fit in the CPU cache in general. This class is not meant to be instantiated, one should only use its :meth:`compute` classmethod which handles allocation and @@ -190,9 +193,27 @@ class PairwiseDistances(BaseDistancesReductionDispatcher): """ @classmethod - def is_usable_for(cls, X, Y, metric) -> bool: + def is_usable_for(cls, X, Y, metric, metric_kwargs=None) -> bool: + + effective_n_threads = _openmp_effective_n_threads() + + def is_euclidean(metric, metric_kwargs): + metric_kwargs = metric_kwargs or dict() + euclidean_metrics = [ + "euclidean", + "sqeuclidean", + "l2", + ] + return metric in euclidean_metrics or ( + metric == "minkowski" and metric_kwargs.get("p") == 2 + ) + Y = X if Y is None else Y - return metric != "sqeuclidean" and super().is_usable_for(X, Y, metric) + return ( + not is_euclidean(metric, metric_kwargs) + and super().is_usable_for(X, Y, metric) + and effective_n_threads != 1 + ) @classmethod def compute( @@ -219,29 +240,24 @@ def compute( For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. - chunk_size : int, default=None, - The number of vectors per chunk. If None (default) looks-up in - scikit-learn configuration for `pairwise_dist_chunk_size`, - and use 256 if it is not set. - metric_kwargs : dict, default=None Keyword arguments to pass to specified metric function. strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None - The chunking strategy defining which dataset parallelization are made on. + The strategy defining which dataset parallelization are made on. For both strategies the computations happens with two nested loops, - respectively on chunks of X and chunks of Y. + respectively on rows of X and rows of Y. Strategies differs on which loop (outer or inner) is made to run in parallel with the Cython `prange` construct: - - 'parallel_on_X' dispatches chunks of X uniformly on threads. - Each thread then iterates on all the chunks of Y. This strategy is + - 'parallel_on_X' dispatches rows of X uniformly on threads. + Each thread then iterates on all the rows of Y. This strategy is embarrassingly parallel and comes with no datastructures synchronisation. - - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. - Each thread processes all the chunks of X in turn. This strategy is + - 'parallel_on_Y' dispatches rows of Y uniformly on threads. + Each thread processes all the rows of X in turn. This strategy is a sequence of embarrassingly parallel subtasks (the inner loop on Y chunks) with intermediate datastructures synchronisation at each iteration of the sequential outer loop on X chunks. @@ -267,7 +283,7 @@ def compute( :class:`PairwiseDistances`. All temporarily allocated datastructures necessary for the concrete - implementation are therefore freed when this classmethod returns. + implementations are therefore freed when this classmethod returns. This allows entirely decoupling the API entirely from the implementation details whilst maintaining RAII. @@ -343,11 +359,6 @@ def compute( For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. - chunk_size : int, default=None, - The number of vectors per chunk. 
If None (default) looks-up in - scikit-learn configuration for `pairwise_dist_chunk_size`, - and use 256 if it is not set. - metric_kwargs : dict, default=None Keyword arguments to pass to specified metric function. @@ -513,7 +524,9 @@ def compute( 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, 'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y' - brings more opportunity for parallelism and is therefore more efficient. + brings more opportunity for parallelism and is therefore more efficient + despite the synchronization step at each iteration of the outer loop + on chunks of `X`. - None (default) looks-up in scikit-learn configuration for `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. diff --git a/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp index c8c7755e4dc29..8a3e0e82e6a80 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp @@ -120,6 +120,8 @@ cdef class PairwiseDistances{{name_suffix}}: f"or 'auto', but currently strategy='{self.strategy}'.") if strategy == 'auto': + # TODO: inspect if the current heuristic is relevant + # for PairwiseDistances # This is a simple heuristic whose constant for the # comparison has been chosen based on experiments. # parallel_on_X has less synchronization overhead than diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 8a974e57cc835..54ab7aaf0c750 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -2020,7 +2020,9 @@ def pairwise_distances( % (metric, _VALID_METRICS) ) - if PairwiseDistances.is_usable_for(X, X if Y is None else Y, metric=metric): + if PairwiseDistances.is_usable_for( + X, X if Y is None else Y, metric=metric, metric_kwargs=kwds + ): # This is an adaptor for one "sqeuclidean" specification. # For this backend, we can directly use "sqeuclidean". if kwds.get("squared", False) and metric == "euclidean": diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index c826cfd5bff9a..794209f5dbac7 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -155,12 +155,10 @@ def test_pairwise_distances(global_dtype): S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo()) assert_allclose(S, S2) - # pairwise_distances must preserves dtypes for the manhattan distance metric assert S.dtype == S2.dtype == global_dtype S2 = manhattan_distances(X, Y) assert_allclose(S, S2) - # manhattan_distances must preserves dtypes assert S.dtype == S2.dtype == global_dtype # Test with scipy.spatial.distance metric, with a kwd @@ -779,6 +777,16 @@ def test_euclidean_distances_with_norms(global_dtype, y_array_constr): assert_allclose(D3, D1) assert_allclose(D4, D1) + # check we get the wrong answer with wrong {X,Y}_norm_squared + wrong_D = euclidean_distances( + X, + Y, + X_norm_squared=np.zeros_like(X_norm_sq), + Y_norm_squared=np.zeros_like(Y_norm_sq), + ) + with pytest.raises(AssertionError): + assert_allclose(wrong_D, D1) + def test_euclidean_distances_norm_shapes(): # Check all accepted shapes for the norms or appropriate error messages. 
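As additional context for the `{X,Y}_norm_squared` check restored above, here is how callers typically precompute and forward the squared norms. Illustrative only, combining the public `euclidean_distances` with the existing `sklearn.utils.extmath.row_norms` helper:

```python
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils.extmath import row_norms

rng = np.random.RandomState(0)
X = rng.rand(50, 8)
Y = rng.rand(60, 8)

# The precomputed squared norms are raveled and forwarded to the backend as
# metric_kwargs={"X_norm_squared": ..., "Y_norm_squared": ...}.
D = euclidean_distances(
    X,
    Y,
    X_norm_squared=row_norms(X, squared=True),
    Y_norm_squared=row_norms(Y, squared=True),
)
np.testing.assert_allclose(D, euclidean_distances(X, Y))
```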
diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 66e103810c6fa..4a83fcc7bc080 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -486,7 +486,7 @@ def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): X_embedded = np.dot(X, transformation.T) # (n_samples, n_components) # Compute softmax distances - p_ij = pairwise_distances(X_embedded, metric="sqeuclidean") + p_ij = pairwise_distances(X_embedded, squared=True) np.fill_diagonal(p_ij, np.inf) p_ij = softmax(-p_ij) # (n_samples, n_samples) From cfa7d8c2beda3c5129bb841db36c80b05d4d4ddd Mon Sep 17 00:00:00 2001 From: Vincent M Date: Sat, 11 Feb 2023 11:09:01 +0100 Subject: [PATCH 05/21] update whats_new --- doc/whats_new/v1.3.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 3ef8a7653b5f7..897cb31f91e3a 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -155,6 +155,16 @@ Changelog - |Fix| :func:`metric.manhattan_distances` now supports readonly sparse datasets. :pr:`25432` by :user:`Julien Jerphanion `. +- |Efficiency| :func:`pairwise.pairwise_distances` performances has been improved + when OpenMP runs with n_threads > 1. + :pr:`25561` by :user:`Julien Jerphanion ` and + :user:`Vincent Maladiere `. + +- |Feature| :func:`pairwise.pairwise_distances` now support CSR-CSR, Dense-CSR and + CSR-Dense computations. + :pr:`25561` by :user:`Julien Jerphanion ` and + :user:`Vincent Maladiere `. + :mod:`sklearn.naive_bayes` .......................... From 5d9dcdb4897a298dc86bf3a6b43f5c486c4295a3 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Tue, 14 Feb 2023 12:08:52 +0100 Subject: [PATCH 06/21] remove chunksize and extend tests --- .../_dispatcher.py | 14 +- .../_pairwise_distances.pyx.tp | 1 - sklearn/metrics/tests/test_pairwise.py | 6 + .../test_pairwise_distances_reduction.py | 137 ++++++++++++++++++ 4 files changed, 151 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 357f6ce55cf29..d48b1b1dc5fa5 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -204,8 +204,9 @@ def is_euclidean(metric, metric_kwargs): "sqeuclidean", "l2", ] + # TODO: pass `p` as a standalone argument instead of a metric_kwargs. return metric in euclidean_metrics or ( - metric == "minkowski" and metric_kwargs.get("p") == 2 + metric == "minkowski" and metric_kwargs.get("p", 2) == 2 ) Y = X if Y is None else Y @@ -221,7 +222,6 @@ def compute( X, Y, metric="euclidean", - chunk_size=None, metric_kwargs=None, strategy=None, ): @@ -259,8 +259,7 @@ def compute( - 'parallel_on_Y' dispatches rows of Y uniformly on threads. Each thread processes all the rows of X in turn. This strategy is a sequence of embarrassingly parallel subtasks (the inner loop on Y - chunks) with intermediate datastructures synchronisation at each - iteration of the sequential outer loop on X chunks. + chunks) with no intermediate datastructures synchronisation. 
- 'auto' relies on a simple heuristic to choose between 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, @@ -294,7 +293,6 @@ def compute( X=X, Y=Y, metric=metric, - chunk_size=chunk_size, metric_kwargs=metric_kwargs, strategy=strategy, ) @@ -304,7 +302,6 @@ def compute( X=X, Y=Y, metric=metric, - chunk_size=chunk_size, metric_kwargs=metric_kwargs, strategy=strategy, ) @@ -359,6 +356,11 @@ def compute( For a list of available metrics, see the documentation of :class:`~sklearn.metrics.DistanceMetric`. + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + metric_kwargs : dict, default=None Keyword arguments to pass to specified metric function. diff --git a/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp index 8a3e0e82e6a80..5e7e0e1109d37 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp @@ -54,7 +54,6 @@ cdef class PairwiseDistances{{name_suffix}}: X, Y, str metric="euclidean", - chunk_size=None, dict metric_kwargs=None, str strategy=None, ): diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 794209f5dbac7..b37df58f87925 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -173,6 +173,12 @@ def test_pairwise_distances(global_dtype): S2 = pairwise_distances(X, metric=minkowski, **kwds) assert_allclose(S, S2) + # Test PairwiseDistance + kwds = {"p": 3.0} + S = pairwise_distances(X, metric="minkowski", **kwds) + S2 = pairwise_distances(X, metric=minkowski, **kwds) + assert_allclose(S, S2) + # Test that a value error is raised if the metric is unknown with pytest.raises(ValueError): pairwise_distances(X, Y, metric="blah") diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 8a2f87e6d50c3..b9a4fcee98083 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -1,4 +1,5 @@ import itertools +from functools import partial import re import warnings from collections import defaultdict @@ -321,6 +322,11 @@ def assert_radius_neighbors_results_quasi_equality( ): assert_radius_neighbors_results_quasi_equality, } +ASSERT_RESULT_PAIRWISE = { + np.float32: partial(assert_allclose, rtol=1e-4), + np.float64: assert_array_equal, +} + def test_assert_argkmin_results_quasi_equality(): @@ -900,6 +906,29 @@ def test_n_threads_agnosticism( ) +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +def test_n_threads_agnosticism_pairwise_distances( + global_random_seed, + dtype, + n_features=100, +): + """Check that results do not depend on the number of threads.""" + # TODO: Parametrize `n_samples_X` and `n_samples_Y` when the + # strategy heuristic has been inspected. 
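+    # `threadpool_limits(limits=1, user_api="openmp")` below forces a
+    # sequential run; it must match the parallel result exactly for float64
+    # and within rtol=1e-4 for float32 (see ASSERT_RESULT_PAIRWISE above).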
+ n_samples_X, n_samples_Y = 100, 100 + rng = np.random.RandomState(global_random_seed) + spread = 100 + X = rng.rand(n_samples_X, n_features).astype(dtype) * spread + Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread + + ref_dist = PairwiseDistances.compute(X, Y) + + with threadpoolctl.threadpool_limits(limits=1, user_api="openmp"): + dist = PairwiseDistances.compute(X, Y) + + ASSERT_RESULT_PAIRWISE[dtype](ref_dist, dist) + + @pytest.mark.parametrize( "Dispatcher, dtype", [ @@ -965,6 +994,31 @@ def test_format_agnosticism( ) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_format_agnosticism_pairwise_distances( + global_random_seed, + dtype, +): + """Check that results do not depend on the format (dense, sparse) of the input.""" + rng = np.random.RandomState(global_random_seed) + spread = 100 + n_samples, n_features = 100, 100 + + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + X_csr = csr_matrix(X) + Y_csr = csr_matrix(Y) + + dist_dense = PairwiseDistances.compute(X, Y) + + for _X, _Y in itertools.product((X, X_csr), (Y, Y_csr)): + if _X is X and _Y is Y: + continue + dist = PairwiseDistances.compute(_X, _Y) + ASSERT_RESULT_PAIRWISE[dtype](dist, dist_dense) + + @pytest.mark.parametrize( "n_samples_X, n_samples_Y", [(100, 100), (100, 500), (500, 100)] ) @@ -1042,6 +1096,58 @@ def test_strategies_consistency( ) +@pytest.mark.parametrize( + "metric", + ["euclidean", "minkowski", "manhattan", "infinity", "seuclidean", "haversine"], +) +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +def test_strategies_consistency_pairwise_distances( + global_random_seed, + metric, + dtype, + n_features=10, +): + """Check that the results do not depend on the strategy used.""" + # TODO: Parametrize `n_samples_X` and `n_samples_Y` when the + # strategy heuristic has been inspected. 
+ n_samples_X, n_samples_Y = 100, 100 + rng = np.random.RandomState(global_random_seed) + spread = 100 + X = rng.rand(n_samples_X, n_features).astype(dtype) * spread + Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread + + # Haversine distance only accepts 2D data + if metric == "haversine": + X = np.ascontiguousarray(X[:, :2]) + Y = np.ascontiguousarray(Y[:, :2]) + + dist_par_X = PairwiseDistances.compute( + X, + Y, + metric=metric, + # Taking the first + metric_kwargs=_get_metric_params_list( + metric, n_features, seed=global_random_seed + )[0], + # To be sure to use parallelization + strategy="parallel_on_X", + ) + + dist_par_Y = PairwiseDistances.compute( + X, + Y, + metric=metric, + # Taking the first + metric_kwargs=_get_metric_params_list( + metric, n_features, seed=global_random_seed + )[0], + # To be sure to use parallelization + strategy="parallel_on_Y", + ) + + ASSERT_RESULT_PAIRWISE[dtype](dist_par_X, dist_par_Y) + + # "Concrete Dispatchers"-specific tests # TODO: Remove filterwarnings in 1.3 when wminkowski is removed @@ -1237,6 +1343,37 @@ def test_memmap_backed_data( ) +@pytest.mark.parametrize("metric", ["manhattan", "euclidean"]) +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +def test_memmap_backed_data_pairwise_distances( + metric, + dtype, +): + """Check that the results do not depend on the datasets writability.""" + rng = np.random.RandomState(0) + spread = 100 + n_samples, n_features = 128, 10 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + # Create read only datasets + X_mm, Y_mm = create_memmap_backed_data([X, Y]) + + ref_dist = PairwiseDistances.compute( + X, + Y, + metric=metric, + ) + + dist_mm = PairwiseDistances.compute( + X_mm, + Y_mm, + metric=metric, + ) + + ASSERT_RESULT_PAIRWISE[dtype](ref_dist, dist_mm) + + @pytest.mark.parametrize("n_samples", [100, 1000]) @pytest.mark.parametrize("n_features", [5, 10, 100]) @pytest.mark.parametrize("num_threads", [1, 2, 8]) From 878774799114061c74de9813fa99d2ee6be64b5d Mon Sep 17 00:00:00 2001 From: Vincent M Date: Tue, 14 Feb 2023 12:10:53 +0100 Subject: [PATCH 07/21] Apply suggestions from code review Co-authored-by: Julien Jerphanion --- doc/whats_new/v1.3.rst | 18 +++++++++--------- .../_dispatcher.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index f34d8b7a93eb6..bba44680fbaf5 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -159,15 +159,15 @@ Changelog - |Fix| :func:`metric.manhattan_distances` now supports readonly sparse datasets. :pr:`25432` by :user:`Julien Jerphanion `. -- |Efficiency| :func:`pairwise.pairwise_distances` performances has been improved - when OpenMP runs with n_threads > 1. - :pr:`25561` by :user:`Julien Jerphanion ` and - :user:`Vincent Maladiere `. - -- |Feature| :func:`pairwise.pairwise_distances` now support CSR-CSR, Dense-CSR and - CSR-Dense computations. - :pr:`25561` by :user:`Julien Jerphanion ` and - :user:`Vincent Maladiere `. +- |Efficiency| :func:`pairwise.pairwise_distances`' performance has been improved + when providing dense datasets. + :pr:`25561` by :user:`Vincent Maladiere ` and + :user:`Julien Jerphanion `. + +- |Feature| :func:`pairwise.pairwise_distances` now supports combination of + dense arrays and sparse CSR matrices datasets. + :pr:`25561` by :user:`Vincent Maladiere ` and + :user:`Julien Jerphanion `. :mod:`sklearn.naive_bayes` .......................... 
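The |Feature| entry above can be exercised through the public API alone. An illustrative snippet (assuming this branch):

```python
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
X = rng.rand(40, 6)
Y_csr = csr_matrix(rng.rand(50, 6))

# Dense-CSR (as well as CSR-dense and CSR-CSR) combinations are routed to
# the PairwiseDistances backend.
D = pairwise_distances(X, Y_csr, metric="manhattan")
assert D.shape == (40, 50)
```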
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index d48b1b1dc5fa5..5a6de2d4a5a3b 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -387,7 +387,7 @@ def compute( 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, 'parallel_on_X' is usually the most efficient strategy. When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y' - brings more opportunity for parallelism and is therefore more efficient. + brings more opportunity for parallelism and is therefore more efficient - None (default) looks-up in scikit-learn configuration for `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. From 3fd55b9c2118e8076460f49dbe791d0a0d368eea Mon Sep 17 00:00:00 2001 From: Vincent M Date: Tue, 14 Feb 2023 14:45:07 +0100 Subject: [PATCH 08/21] add test_pairwise_distances_is_usable_for --- .../test_pairwise_distances_reduction.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index b9a4fcee98083..decb892ac53de 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -1402,3 +1402,39 @@ def test_sqeuclidean_row_norms( with pytest.raises(ValueError): X = np.asfortranarray(X) sqeuclidean_row_norms(X, num_threads=num_threads) + + +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +def test_pairwise_distances_is_usable_for( + global_random_seed, + dtype, + monkeypatch, +): + rng = np.random.RandomState(global_random_seed) + n_samples, n_features = 100, 10 + X = rng.rand(n_samples, n_features).astype(dtype) + + assert not PairwiseDistances.is_usable_for(X, X, metric="euclidean") + assert not PairwiseDistances.is_usable_for(X, X, metric="minkowski") + assert not PairwiseDistances.is_usable_for( + X, X, metric="minkowski", metric_kwargs={"p": 2} + ) + assert PairwiseDistances.is_usable_for( + X, X, metric="minkowski", metric_kwargs={"p": 3} + ) + + def mock_openmp_effective_n_threads(): + return 4 + + monkeypatch.setattr( + "sklearn.utils._openmp_helpers._openmp_effective_n_threads", + mock_openmp_effective_n_threads, + ) + + controler = threadpoolctl.ThreadpoolController() + + with controler.limit(limits=1, user_api=None): + assert not PairwiseDistances.is_usable_for(X, X, metric="manhattan") + + with controler.limit(limits=2, user_api=None): + assert PairwiseDistances.is_usable_for(X, X, metric="manhattan") From e8dfef585cff33277c68ac5d7adfe73fb72c2ad2 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Tue, 14 Feb 2023 18:14:36 +0100 Subject: [PATCH 09/21] fix monkeypatch test and extend is_usable_for to single threaded manhattan --- .../_pairwise_distances_reduction/_dispatcher.py | 9 ++++++++- .../tests/test_pairwise_distances_reduction.py | 16 ++++++++-------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 5a6de2d4a5a3b..0a5e22c61bb7d 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -209,12 +209,19 @@ def is_euclidean(metric, metric_kwargs): metric == "minkowski" and metric_kwargs.get("p", 2) == 2 ) + 
manhattan_metrics = ["cityblock", "l1", "manhattan"] + Y = X if Y is None else Y - return ( + + is_usable = ( not is_euclidean(metric, metric_kwargs) and super().is_usable_for(X, Y, metric) and effective_n_threads != 1 ) + manhattan_single_thread_guard = ( + metric in manhattan_metrics and effective_n_threads == 1 + ) + return is_usable or manhattan_single_thread_guard @classmethod def compute( diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index decb892ac53de..b6e061e984e72 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -1414,6 +1414,14 @@ def test_pairwise_distances_is_usable_for( n_samples, n_features = 100, 10 X = rng.rand(n_samples, n_features).astype(dtype) + def mock_openmp_effective_n_threads(): + return 4 + + monkeypatch.setattr( + "sklearn.utils._openmp_helpers._openmp_effective_n_threads", + mock_openmp_effective_n_threads, + ) + assert not PairwiseDistances.is_usable_for(X, X, metric="euclidean") assert not PairwiseDistances.is_usable_for(X, X, metric="minkowski") assert not PairwiseDistances.is_usable_for( @@ -1423,14 +1431,6 @@ def test_pairwise_distances_is_usable_for( X, X, metric="minkowski", metric_kwargs={"p": 3} ) - def mock_openmp_effective_n_threads(): - return 4 - - monkeypatch.setattr( - "sklearn.utils._openmp_helpers._openmp_effective_n_threads", - mock_openmp_effective_n_threads, - ) - controler = threadpoolctl.ThreadpoolController() with controler.limit(limits=1, user_api=None): From 9eed73f82bc6859e7227f2c8e9f1df91168a20a4 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 8 Mar 2023 10:50:31 +0100 Subject: [PATCH 10/21] DOC Add comments --- .../_dispatcher.py | 11 +++++------ .../tests/test_pairwise_distances_reduction.py | 18 ++++++++++-------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 0a5e22c61bb7d..5f19fe12983e2 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -209,8 +209,6 @@ def is_euclidean(metric, metric_kwargs): metric == "minkowski" and metric_kwargs.get("p", 2) == 2 ) - manhattan_metrics = ["cityblock", "l1", "manhattan"] - Y = X if Y is None else Y is_usable = ( @@ -218,10 +216,11 @@ def is_euclidean(metric, metric_kwargs): and super().is_usable_for(X, Y, metric) and effective_n_threads != 1 ) - manhattan_single_thread_guard = ( - metric in manhattan_metrics and effective_n_threads == 1 - ) - return is_usable or manhattan_single_thread_guard + + # We need to rely on `PairwiseDistances` for manhattan anyway because + # the implementation of manhattan distances on sparse data has been removed. 
+ manhattan_metrics = ["cityblock", "l1", "manhattan"] + return is_usable or metric in manhattan_metrics @classmethod def compute( diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index b6e061e984e72..10ebf84b23439 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -1414,27 +1414,29 @@ def test_pairwise_distances_is_usable_for( n_samples, n_features = 100, 10 X = rng.rand(n_samples, n_features).astype(dtype) - def mock_openmp_effective_n_threads(): - return 4 - + # We monkey patch this interface to test the expected behavior + # when the machine running the test only uses one thread. monkeypatch.setattr( "sklearn.utils._openmp_helpers._openmp_effective_n_threads", - mock_openmp_effective_n_threads, + lambda _: 4, ) + # Equivalent specifications of the Euclidean metric. + # TODO: support Euclidean metric. assert not PairwiseDistances.is_usable_for(X, X, metric="euclidean") assert not PairwiseDistances.is_usable_for(X, X, metric="minkowski") assert not PairwiseDistances.is_usable_for( X, X, metric="minkowski", metric_kwargs={"p": 2} ) + assert PairwiseDistances.is_usable_for( X, X, metric="minkowski", metric_kwargs={"p": 3} ) - controler = threadpoolctl.ThreadpoolController() + controller = threadpoolctl.ThreadpoolController() - with controler.limit(limits=1, user_api=None): - assert not PairwiseDistances.is_usable_for(X, X, metric="manhattan") + with controller.limit(limits=1, user_api=None): + assert PairwiseDistances.is_usable_for(X, X, metric="manhattan") - with controler.limit(limits=2, user_api=None): + with controller.limit(limits=2, user_api=None): assert PairwiseDistances.is_usable_for(X, X, metric="manhattan") From 502725d4ab95a97966fae2174af16861c465076e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 8 Mar 2023 16:31:12 +0100 Subject: [PATCH 11/21] Convert sparse matrices to be CSR --- sklearn/metrics/pairwise.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 54ab7aaf0c750..402b275640a1f 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -2020,9 +2020,7 @@ def pairwise_distances( % (metric, _VALID_METRICS) ) - if PairwiseDistances.is_usable_for( - X, X if Y is None else Y, metric=metric, metric_kwargs=kwds - ): + if PairwiseDistances.is_usable_for(X, Y, metric=metric, metric_kwargs=kwds): # This is an adaptor for one "sqeuclidean" specification. # For this backend, we can directly use "sqeuclidean". if kwds.get("squared", False) and metric == "euclidean": @@ -2031,9 +2029,17 @@ def pairwise_distances( metric = "euclidean" kwds = {} - return PairwiseDistances.compute( - X, X if Y is None else Y, metric=metric, metric_kwargs=kwds - ) + if issparse(X): + X = csr_matrix(X, copy=False) + # This also sorts indices in-place. + X.sum_duplicates() + + if issparse(Y): + Y = csr_matrix(Y, copy=False) + # This also sorts indices in-place. 
+            Y.sum_duplicates()
+
+        return PairwiseDistances.compute(X, Y, metric=metric, metric_kwargs=kwds)
 
     if metric == "precomputed":
         X, _ = check_pairwise_arrays(

From face5a918079e3a4da09478784497f1907e91feb Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Wed, 8 Mar 2023 16:32:32 +0100
Subject: [PATCH 12/21] Correct sqeuclidean adaptation

---
 sklearn/metrics/pairwise.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index 402b275640a1f..13ffe3eb3f835 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -2025,8 +2025,8 @@ def pairwise_distances(
         # For this backend, we can directly use "sqeuclidean".
         if kwds.get("squared", False) and metric == "euclidean":
             # TODO: use 'sqeuclidean' instead of 'euclidean'
-            # with EuclideanPairwiseDistance
-            metric = "euclidean"
+            # with EuclideanPairwiseDistances
+            metric = "sqeuclidean"
             kwds = {}

From cdd2567ba41bde8e2152b89a2f607ebe4628203d Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 9 Mar 2023 17:54:19 +0100
Subject: [PATCH 13/21] Correct condition for PairwiseDistances.is_usable_for

---
 .../_pairwise_distances_reduction/_dispatcher.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
index 5f19fe12983e2..c1d7cb0e0b53f 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
+++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -211,16 +211,16 @@ def is_euclidean(metric, metric_kwargs):
 
         Y = X if Y is None else Y
 
-        is_usable = (
-            not is_euclidean(metric, metric_kwargs)
-            and super().is_usable_for(X, Y, metric)
-            and effective_n_threads != 1
-        )
-
         # We need to rely on `PairwiseDistances` for manhattan anyway because
         # the implementation of manhattan distances on sparse data has been removed.
         manhattan_metrics = ["cityblock", "l1", "manhattan"]
-        return is_usable or metric in manhattan_metrics
+
+        is_usable = super().is_usable_for(X, Y, metric) and (
+            (not is_euclidean(metric, metric_kwargs) and effective_n_threads != 1)
+            or metric in manhattan_metrics
+        )
+
+        return is_usable

From 32b4dadf806f3fdd0d60880e7e1f6f28ea693648 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 9 Mar 2023 18:48:18 +0100
Subject: [PATCH 14/21] TST Adapt test_pairwise_distances_is_usable_for

---
 .../tests/test_pairwise_distances_reduction.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 10ebf84b23439..90994d2f82f11 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -20,6 +20,7 @@
 )
 
 from sklearn.metrics import euclidean_distances
+from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
 from sklearn.utils.fixes import sp_version, parse_version
 from sklearn.utils._testing import (
     assert_array_equal,
@@ -1414,13 +1415,6 @@ def test_pairwise_distances_is_usable_for(
     n_samples, n_features = 100, 10
     X = rng.rand(n_samples, n_features).astype(dtype)
 
-    # We monkey patch this interface to test the expected behavior
-    # when the machine running the test only uses one thread.
-    monkeypatch.setattr(
-        "sklearn.utils._openmp_helpers._openmp_effective_n_threads",
-        lambda _: 4,
-    )
-
     # Equivalent specifications of the Euclidean metric.
     # TODO: support Euclidean metric.
     assert not PairwiseDistances.is_usable_for(X, X, metric="euclidean")
     assert not PairwiseDistances.is_usable_for(X, X, metric="minkowski")
     assert not PairwiseDistances.is_usable_for(
         X, X, metric="minkowski", metric_kwargs={"p": 2}
     )
 
+    # PairwiseDistances must not be used for sequential execution because
+    # it is not yet competitive with the previous joblib-based back-end.
+    # TODO: make PairwiseDistances competitive for sequential execution.
     assert PairwiseDistances.is_usable_for(
         X, X, metric="minkowski", metric_kwargs={"p": 3}
-    )
+    ) == (_openmp_effective_n_threads() != 1)

From 9d6cd412afd3c0e05deaf4e1068decdcc43ffc93 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 9 Mar 2023 19:59:32 +0100
Subject: [PATCH 15/21] Use threadpool_limits over ThreadpoolController

This allows using older versions of threadpoolctl.
---
 sklearn/metrics/tests/test_pairwise_distances_reduction.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index 90994d2f82f11..a693c92589fa5 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -1430,10 +1430,8 @@ def test_pairwise_distances_is_usable_for(
         X, X, metric="minkowski", metric_kwargs={"p": 3}
     ) == (_openmp_effective_n_threads() != 1)
 
-    controller = threadpoolctl.ThreadpoolController()
-
-    with controller.limit(limits=1, user_api=None):
+    with threadpoolctl.threadpool_limits(limits=1, user_api=None):
         assert PairwiseDistances.is_usable_for(X, X, metric="manhattan")
 
-    with controller.limit(limits=2, user_api=None):
+    with threadpoolctl.threadpool_limits(limits=2, user_api=None):
         assert PairwiseDistances.is_usable_for(X, X, metric="manhattan")

From 304d7d87a6270a9c0bebba6d587930e7d30126dd Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 9 Mar 2023 20:02:33 +0100
Subject: [PATCH 16/21] TST Increase atol for test_euclidean_distances_extreme_values

---
 sklearn/metrics/tests/test_pairwise.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py
index b37df58f87925..5ded1d9104db3 100644
--- a/sklearn/metrics/tests/test_pairwise.py
+++ b/sklearn/metrics/tests/test_pairwise.py
@@ -941,7 +941,7 @@ def test_euclidean_distances_extreme_values(dtype, eps, rtol, dim):
     distances = euclidean_distances(X, Y)
     expected = cdist(X, Y)
 
-    assert_allclose(distances, expected, rtol=1e-5, atol=4e-4)
+    assert_allclose(distances, expected, rtol=1e-5, atol=6e-4)
 
 
 @pytest.mark.parametrize("squared", [True, False])

From 99de9919ff8f18e00c3588db334bfa9b9a1abc63 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion
Date: Thu, 9 Mar 2023 21:26:51 +0100
Subject: [PATCH 17/21] DOC Add docstring for PairwiseDistances.is_usable_for

---
 .../_dispatcher.py | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
index c1d7cb0e0b53f..95e6b4cbc420f 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
+++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -194,7 +194,29 @@ class
PairwiseDistances(BaseDistancesReductionDispatcher):
 
     @classmethod
     def is_usable_for(cls, X, Y, metric, metric_kwargs=None) -> bool:
+        """Return True if the dispatcher can be used for the
+        given parameters.
+
+        Parameters
+        ----------
+        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
+            Input data.
+
+        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
+            Input data.
+
+        metric : str, default='euclidean'
+            The distance metric to use.
+            For a list of available metrics, see the documentation of
+            :class:`~sklearn.metrics.DistanceMetric`.
+
+        metric_kwargs : dict, default=None
+            Keyword arguments to pass to the specified metric function.
+
+        Returns
+        -------
+        True if the dispatcher can be used, else False.
+        """
         effective_n_threads = _openmp_effective_n_threads()
 
         def is_euclidean(metric, metric_kwargs):

From cb518abbbffc519223aa17bc8a41e20e1cf88325 Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Wed, 21 Jun 2023 18:01:10 +0200
Subject: [PATCH 18/21] finalize merging with main by removing deprecated
 DTYPE and ITYPE

---
 .../_pairwise_distances.pxd.tp |  6 +++---
 .../_pairwise_distances.pyx.tp | 17 ++++++++---------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd.tp
index ed2ce850af11c..797cbe97874ac 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pxd.tp
@@ -13,7 +13,7 @@ implementation_specific_values = [
 }}
 cimport numpy as cnp
 
-from ...utils._typedefs cimport DTYPE_t, ITYPE_t
+from ...utils._typedefs cimport intp_t
 
 {{for name_suffix, INPUT_DTYPE_t in implementation_specific_values}}
 from ._datasets_pair cimport DatasetsPair{{name_suffix}}
@@ -25,8 +25,8 @@ cdef class PairwiseDistances{{name_suffix}}:
     cdef:
         readonly DatasetsPair{{name_suffix}} datasets_pair
 
-        ITYPE_t n_samples_X, n_samples_Y
-        ITYPE_t effective_n_threads
+        intp_t n_samples_X, n_samples_Y
+        intp_t effective_n_threads
 
         bint X_is_Y
         bint execute_in_parallel_on_Y
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp
index 5e7e0e1109d37..bac3904b85e97 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_pairwise_distances.pyx.tp
@@ -1,7 +1,7 @@
 cimport numpy as cnp
 from cython cimport final
 from cython.parallel cimport prange
-from ...utils._typedefs cimport ITYPE_t, DTYPE_t
+from ...utils._typedefs cimport intp_t
 
 import numpy as np
@@ -10,7 +10,6 @@ from sklearn import get_config
 from ...utils import check_array, _in_unstable_openblas_configuration
 from ...utils._openmp_helpers import _openmp_effective_n_threads
 from ...utils.fixes import threadpool_limits, sp_version, parse_version
-from ...utils._typedefs import DTYPE
 
 cnp.import_array()
@@ -40,7 +39,7 @@ def _precompute_metric_params(X, Y, metric=None, **kwds):
         return {"VI": VI}
     return {}
 
-{{for name_suffix, INPUT_DTYPE in (('64', 'DTYPE'),('32', 'np.float32'))}}
+{{for name_suffix, INPUT_DTYPE in (('64', 'np.float64'),('32', 'np.float32'))}}
 
 from ._datasets_pair cimport DatasetsPair{{name_suffix}}
@@ -149,9 +148,9 @@ cdef class PairwiseDistances{{name_suffix}}:
 
     cdef void _parallel_on_X(self) nogil:
         cdef:
-            ITYPE_t n_X = self.n_samples_X
-            ITYPE_t n_Y = self.n_samples_Y
-            ITYPE_t i, j
+            intp_t n_X = 
self.n_samples_X + intp_t n_Y = self.n_samples_Y + intp_t i, j for i in prange(n_X, nogil=True, num_threads=self.effective_n_threads): for j in range(n_Y): @@ -160,9 +159,9 @@ cdef class PairwiseDistances{{name_suffix}}: cdef void _parallel_on_Y(self) nogil: cdef: - ITYPE_t n_X = self.n_samples_X - ITYPE_t n_Y = self.n_samples_Y - ITYPE_t i, j + intp_t n_X = self.n_samples_X + intp_t n_Y = self.n_samples_Y + intp_t i, j for i in range(n_X): for j in prange(n_Y, nogil=True, num_threads=self.effective_n_threads): From 8e94b7f2c840910f8c4d95974acd2995a78cb633 Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Thu, 22 Jun 2023 01:22:14 +1000 Subject: [PATCH 19/21] DOC Update link to best sphinx version for doc build (#26626) --- doc/developers/contributing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 96e0672eb10ea..d60e559ea2e50 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -750,7 +750,7 @@ To build the PDF manual, run: versions of Sphinx as possible, the different versions tend to behave slightly differently. To get the best results, you should use the same version as the one we used on CircleCI. Look at this - `GitHub search `_ + `GitHub search `_ to know the exact version. Guidelines for writing documentation From 53bdbe55cb3456a6784460bfa85b62ffe7173a56 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Wed, 21 Jun 2023 17:50:07 +0200 Subject: [PATCH 20/21] MNT add isort to ruff's rules (#26649) --- .github/scripts/label_title_regex.py | 5 +- .pre-commit-config.yaml | 1 + asv_benchmarks/benchmarks/cluster.py | 2 +- asv_benchmarks/benchmarks/common.py | 8 +- asv_benchmarks/benchmarks/datasets.py | 13 +- asv_benchmarks/benchmarks/decomposition.py | 4 +- asv_benchmarks/benchmarks/ensemble.py | 2 +- asv_benchmarks/benchmarks/linear_model.py | 4 +- benchmarks/bench_20newsgroups.py | 19 +- benchmarks/bench_covertype.py | 16 +- benchmarks/bench_feature_expansions.py | 4 +- benchmarks/bench_glm.py | 3 +- benchmarks/bench_glmnet.py | 10 +- benchmarks/bench_hist_gradient_boosting.py | 15 +- .../bench_hist_gradient_boosting_adult.py | 7 +- ...hist_gradient_boosting_categorical_only.py | 3 +- ...bench_hist_gradient_boosting_higgsboson.py | 10 +- .../bench_hist_gradient_boosting_threading.py | 17 +- benchmarks/bench_isolation_forest.py | 7 +- benchmarks/bench_isotonic.py | 10 +- ...kernel_pca_solvers_time_vs_n_components.py | 7 +- ...ch_kernel_pca_solvers_time_vs_n_samples.py | 7 +- benchmarks/bench_lasso.py | 4 +- benchmarks/bench_lof.py | 8 +- benchmarks/bench_mnist.py | 16 +- benchmarks/bench_multilabel_metrics.py | 11 +- benchmarks/bench_online_ocsvm.py | 19 +- benchmarks/bench_plot_incremental_pca.py | 8 +- benchmarks/bench_plot_lasso_path.py | 5 +- benchmarks/bench_plot_neighbors.py | 4 +- benchmarks/bench_plot_nmf.py | 19 +- benchmarks/bench_plot_omp_lars.py | 2 +- benchmarks/bench_plot_parallel_pairwise.py | 3 +- ...ch_plot_polynomial_kernel_approximation.py | 14 +- benchmarks/bench_plot_randomized_svd.py | 27 +-- benchmarks/bench_plot_svd.py | 7 +- benchmarks/bench_plot_ward.py | 2 +- benchmarks/bench_random_projections.py | 6 +- benchmarks/bench_rcv1_logreg_convergence.py | 9 +- benchmarks/bench_saga.py | 8 +- .../bench_sample_without_replacement.py | 6 +- benchmarks/bench_sgd_regression.py | 11 +- benchmarks/bench_sparsify.py | 3 +- benchmarks/bench_text_vectorizers.py | 4 +- benchmarks/bench_tree.py | 5 +- benchmarks/bench_tsne_mnist.py | 7 +- 
benchmarks/plot_tsne_mnist.py | 7 +- build_tools/azure/get_commit_message.py | 2 +- build_tools/circle/list_versions.py | 2 +- build_tools/generate_authors_table.py | 7 +- build_tools/github/check_wheels.py | 5 +- build_tools/github/vendor.py | 1 - .../update_environments_and_lock_files.py | 12 +- doc/conf.py | 13 +- doc/conftest.py | 12 +- doc/sphinxext/allow_nan_estimators.py | 9 +- doc/sphinxext/doi_role.py | 1 - doc/sphinxext/github_link.py | 4 +- .../plot_cyclical_feature_engineering.py | 15 +- .../applications/plot_digits_denoising.py | 3 +- .../applications/plot_face_recognition.py | 12 +- .../plot_model_complexity_influence.py | 10 +- .../plot_out_of_core_classification.py | 16 +- .../plot_outlier_detection_wine.py | 7 +- .../applications/plot_prediction_latency.py | 13 +- .../plot_species_distribution_modeling.py | 6 +- examples/applications/plot_stock_market.py | 1 + .../plot_tomography_l1_reconstruction.py | 9 +- .../plot_topics_extraction_with_nmf_lda.py | 5 +- examples/applications/svm_gui.py | 8 +- .../wikipedia_principal_eigenvector.py | 6 +- .../bicluster/plot_bicluster_newsgroups.py | 5 +- .../bicluster/plot_spectral_biclustering.py | 3 +- .../bicluster/plot_spectral_coclustering.py | 2 +- examples/calibration/plot_calibration.py | 2 +- .../calibration/plot_calibration_curve.py | 6 +- .../plot_calibration_multiclass.py | 1 + .../plot_classification_probability.py | 8 +- .../plot_classifier_comparison.py | 23 +-- .../plot_digits_classification.py | 2 +- examples/classification/plot_lda.py | 5 +- examples/classification/plot_lda_qda.py | 8 +- examples/cluster/plot_affinity_propagation.py | 2 +- .../cluster/plot_agglomerative_clustering.py | 1 + .../plot_agglomerative_clustering_metrics.py | 2 +- .../cluster/plot_agglomerative_dendrogram.py | 4 +- .../cluster/plot_birch_vs_minibatchkmeans.py | 8 +- examples/cluster/plot_bisect_kmeans.py | 3 +- examples/cluster/plot_cluster_comparison.py | 4 +- examples/cluster/plot_cluster_iris.py | 4 +- examples/cluster/plot_coin_segmentation.py | 5 +- examples/cluster/plot_color_quantization.py | 8 +- examples/cluster/plot_dbscan.py | 3 +- examples/cluster/plot_digits_agglomeration.py | 4 +- examples/cluster/plot_digits_linkage.py | 2 +- ...e_agglomeration_vs_univariate_selection.py | 9 +- examples/cluster/plot_hdbscan.py | 4 +- examples/cluster/plot_inductive_clustering.py | 2 +- examples/cluster/plot_kmeans_assumptions.py | 1 + examples/cluster/plot_kmeans_digits.py | 2 + examples/cluster/plot_kmeans_plusplus.py | 3 +- .../plot_kmeans_silhouette_analysis.py | 10 +- .../plot_kmeans_stability_low_dim_dense.py | 10 +- examples/cluster/plot_linkage_comparison.py | 4 +- examples/cluster/plot_mean_shift.py | 1 + examples/cluster/plot_mini_batch_kmeans.py | 2 + examples/cluster/plot_optics.py | 3 +- examples/cluster/plot_segmentation_toy.py | 3 +- .../plot_ward_structured_vs_unstructured.py | 4 - examples/compose/plot_column_transformer.py | 4 +- .../plot_column_transformer_mixed_types.py | 8 +- examples/compose/plot_compare_reduction.py | 12 +- examples/compose/plot_digits_pipe.py | 4 +- examples/compose/plot_feature_union.py | 6 +- examples/compose/plot_transformed_target.py | 2 + .../covariance/plot_covariance_estimation.py | 5 +- examples/covariance/plot_lw_vs_oas.py | 6 +- .../covariance/plot_mahalanobis_distances.py | 1 + .../plot_robust_vs_empirical_covariance.py | 4 +- examples/covariance/plot_sparse_cov.py | 1 + .../cross_decomposition/plot_pcr_vs_pls.py | 9 +- examples/datasets/plot_digits_last_image.py | 4 +- 
examples/datasets/plot_random_dataset.py | 4 +- .../plot_random_multilabel_dataset.py | 2 +- .../decomposition/plot_faces_decomposition.py | 5 +- .../plot_ica_blind_source_separation.py | 2 +- .../decomposition/plot_image_denoising.py | 1 - .../decomposition/plot_incremental_pca.py | 2 +- examples/decomposition/plot_pca_3d.py | 5 +- examples/decomposition/plot_pca_iris.py | 8 +- .../plot_pca_vs_fa_model_selection.py | 6 +- examples/decomposition/plot_sparse_coding.py | 2 +- examples/decomposition/plot_varimax_fa.py | 4 +- .../ensemble/plot_adaboost_hastie_10_2.py | 1 + examples/ensemble/plot_adaboost_multiclass.py | 1 - examples/ensemble/plot_adaboost_twoclass.py | 7 +- examples/ensemble/plot_bias_variance.py | 2 +- examples/ensemble/plot_ensemble_oob.py | 3 +- .../ensemble/plot_feature_transformation.py | 6 +- ...ot_forest_hist_grad_boosting_comparison.py | 6 +- examples/ensemble/plot_forest_importances.py | 1 + .../ensemble/plot_forest_importances_faces.py | 1 + examples/ensemble/plot_forest_iris.py | 6 +- .../plot_gradient_boosting_categorical.py | 9 +- .../plot_gradient_boosting_early_stopping.py | 5 +- .../ensemble/plot_gradient_boosting_oob.py | 8 +- .../plot_gradient_boosting_quantile.py | 3 +- .../plot_gradient_boosting_regression.py | 1 + .../plot_gradient_boosting_regularization.py | 5 +- examples/ensemble/plot_isolation_forest.py | 2 + .../ensemble/plot_monotonic_constraints.py | 6 +- .../ensemble/plot_random_forest_embedding.py | 4 +- ...ot_random_forest_regression_multioutput.py | 4 +- examples/ensemble/plot_stack_predictors.py | 7 +- .../ensemble/plot_voting_decision_regions.py | 6 +- examples/ensemble/plot_voting_probas.py | 5 +- examples/ensemble/plot_voting_regressor.py | 8 +- examples/exercises/plot_cv_digits.py | 3 +- .../plot_digits_classification_exercise.py | 2 +- examples/exercises/plot_iris_exercise.py | 3 +- .../feature_selection/plot_f_test_vs_mi.py | 3 +- .../plot_feature_selection.py | 1 + examples/feature_selection/plot_rfe_digits.py | 5 +- .../plot_rfe_with_cross_validation.py | 2 +- .../plot_select_from_model_diabetes.py | 4 +- .../gaussian_process/plot_compare_gpr_krr.py | 4 +- examples/gaussian_process/plot_gpc.py | 4 +- examples/gaussian_process/plot_gpc_iris.py | 3 +- .../plot_gpc_isoprobability.py | 6 +- examples/gaussian_process/plot_gpc_xor.py | 3 +- examples/gaussian_process/plot_gpr_co2.py | 1 + .../plot_gpr_on_structured_data.py | 7 +- ...t_iterative_imputer_variants_comparison.py | 14 +- examples/impute/plot_missing_values.py | 8 +- ...linear_model_coefficient_interpretation.py | 15 +- .../inspection/plot_partial_dependence.py | 11 +- .../inspection/plot_permutation_importance.py | 2 +- ...t_permutation_importance_multicollinear.py | 2 +- .../plot_scalable_poly_kernels.py | 3 +- examples/linear_model/plot_ard.py | 3 +- ...puted_gram_matrix_with_weighted_samples.py | 1 + examples/linear_model/plot_huber_vs_ridge.py | 2 +- examples/linear_model/plot_iris_logistic.py | 3 +- .../linear_model/plot_lasso_and_elasticnet.py | 5 +- .../plot_lasso_coordinate_descent_path.py | 6 +- .../plot_lasso_dense_vs_sparse_data.py | 5 +- examples/linear_model/plot_lasso_lars.py | 5 +- examples/linear_model/plot_lasso_lars_ic.py | 2 +- .../plot_lasso_model_selection.py | 3 +- examples/linear_model/plot_logistic.py | 1 + .../plot_logistic_l1_l2_sparsity.py | 4 +- .../linear_model/plot_logistic_multinomial.py | 5 +- .../plot_multi_task_lasso_support.py | 2 +- examples/linear_model/plot_nnls.py | 3 +- examples/linear_model/plot_ols.py | 1 + 
examples/linear_model/plot_ols_3d.py | 3 +- .../linear_model/plot_ols_ridge_variance.py | 2 +- examples/linear_model/plot_omp.py | 4 +- ...plot_poisson_regression_non_normal_loss.py | 28 +-- .../plot_polynomial_interpolation.py | 5 +- .../linear_model/plot_quantile_regression.py | 5 +- examples/linear_model/plot_ransac.py | 3 +- examples/linear_model/plot_ridge_path.py | 3 +- examples/linear_model/plot_robust_fit.py | 8 +- examples/linear_model/plot_sgd_comparison.py | 13 +- .../linear_model/plot_sgd_early_stopping.py | 10 +- examples/linear_model/plot_sgd_iris.py | 5 +- .../linear_model/plot_sgd_loss_functions.py | 2 +- examples/linear_model/plot_sgd_penalties.py | 2 +- .../plot_sgd_separating_hyperplane.py | 5 +- .../linear_model/plot_sgd_weighted_samples.py | 3 +- .../linear_model/plot_sgdocsvm_vs_ocsvm.py | 9 +- ...sparse_logistic_regression_20newsgroups.py | 2 +- .../plot_sparse_logistic_regression_mnist.py | 1 + examples/linear_model/plot_theilsen.py | 7 +- ...lot_tweedie_regression_insurance_claims.py | 26 +-- examples/manifold/plot_compare_methods.py | 4 +- examples/manifold/plot_lle_digits.py | 5 +- examples/manifold/plot_manifold_sphere.py | 10 +- examples/manifold/plot_mds.py | 3 +- examples/manifold/plot_swissroll.py | 2 +- examples/manifold/plot_t_sne_perplexity.py | 9 +- .../miscellaneous/plot_anomaly_comparison.py | 8 +- .../plot_display_object_visualization.py | 13 +- .../plot_estimator_representation.py | 7 +- .../miscellaneous/plot_isotonic_regression.py | 4 +- .../plot_johnson_lindenstrauss_bound.py | 13 +- .../plot_kernel_approximation.py | 7 +- .../plot_kernel_ridge_regression.py | 2 +- .../miscellaneous/plot_metadata_routing.py | 30 ++-- examples/miscellaneous/plot_multilabel.py | 6 +- .../plot_multioutput_face_completion.py | 8 +- .../plot_outlier_detection_bench.py | 9 +- ...ot_partial_dependence_visualization_api.py | 8 +- .../miscellaneous/plot_pipeline_display.py | 26 +-- .../plot_roc_curve_visualization_api.py | 5 +- examples/miscellaneous/plot_set_output.py | 8 +- examples/mixture/plot_concentration_prior.py | 4 +- examples/mixture/plot_gmm.py | 4 +- examples/mixture/plot_gmm_covariances.py | 1 - examples/mixture/plot_gmm_init.py | 6 +- examples/mixture/plot_gmm_pdf.py | 3 +- examples/mixture/plot_gmm_sin.py | 4 +- .../model_selection/plot_confusion_matrix.py | 6 +- examples/model_selection/plot_cv_indices.py | 15 +- examples/model_selection/plot_cv_predict.py | 1 + examples/model_selection/plot_det.py | 1 + .../plot_grid_search_refit_callable.py | 2 +- .../model_selection/plot_grid_search_stats.py | 1 + ...lot_grid_search_text_feature_extraction.py | 1 + .../model_selection/plot_learning_curve.py | 1 + .../model_selection/plot_likelihood_ratios.py | 8 +- .../plot_multi_metric_evaluation.py | 3 +- .../plot_nested_cross_validation_iris.py | 7 +- ...ot_permutation_tests_for_classification.py | 3 +- .../model_selection/plot_precision_recall.py | 7 +- .../model_selection/plot_randomized_search.py | 6 +- examples/model_selection/plot_roc.py | 4 +- examples/model_selection/plot_roc_crossval.py | 4 +- .../plot_successive_halving_heatmap.py | 6 +- .../plot_successive_halving_iterations.py | 9 +- .../plot_train_error_vs_test_error.py | 1 + .../plot_underfitting_overfitting.py | 7 +- .../model_selection/plot_validation_curve.py | 2 +- .../plot_classifier_chain_yeast.py | 9 +- .../approximate_nearest_neighbors.py | 1 + .../plot_caching_nearest_neighbors.py | 5 +- examples/neighbors/plot_classification.py | 3 +- .../neighbors/plot_digits_kde_sampling.py | 4 +- 
examples/neighbors/plot_kde_1d.py | 3 +- .../neighbors/plot_lof_novelty_detection.py | 3 +- examples/neighbors/plot_nca_classification.py | 6 +- examples/neighbors/plot_nca_dim_reduction.py | 5 +- examples/neighbors/plot_nca_illustration.py | 7 +- examples/neighbors/plot_nearest_centroid.py | 6 +- examples/neighbors/plot_regression.py | 3 +- examples/neighbors/plot_species_kde.py | 3 +- examples/neural_networks/plot_mlp_alpha.py | 5 +- .../plot_mlp_training_curves.py | 4 +- .../neural_networks/plot_mnist_filters.py | 4 +- .../plot_rbm_logistic_classification.py | 4 +- examples/preprocessing/plot_all_scaling.py | 24 +-- examples/preprocessing/plot_discretization.py | 2 +- .../plot_discretization_classification.py | 15 +- .../plot_discretization_strategies.py | 4 +- .../preprocessing/plot_map_data_to_normal.py | 6 +- .../preprocessing/plot_scaling_importance.py | 9 +- examples/preprocessing/plot_target_encoder.py | 8 +- .../plot_target_encoder_cross_val.py | 10 +- .../plot_release_highlights_0_22_0.py | 28 +-- .../plot_label_propagation_digits.py | 5 +- ...abel_propagation_digits_active_learning.py | 4 +- .../plot_label_propagation_structure.py | 1 + .../plot_self_training_varying_threshold.py | 7 +- .../plot_semi_supervised_newsgroups.py | 10 +- .../plot_semi_supervised_versus_svm_iris.py | 7 +- examples/svm/plot_custom_kernel.py | 5 +- examples/svm/plot_iris_svc.py | 4 +- .../svm/plot_linearsvc_support_vectors.py | 5 +- examples/svm/plot_oneclass.py | 5 +- examples/svm/plot_rbf_parameters.py | 3 +- examples/svm/plot_separating_hyperplane.py | 2 +- .../plot_separating_hyperplane_unbalanced.py | 1 + examples/svm/plot_svm_anova.py | 4 +- examples/svm/plot_svm_kernels.py | 4 +- examples/svm/plot_svm_margin.py | 3 +- examples/svm/plot_svm_nonlinear.py | 3 +- examples/svm/plot_svm_regression.py | 3 +- examples/svm/plot_svm_scale_c.py | 3 +- examples/svm/plot_svm_tie_breaking.py | 5 +- examples/svm/plot_weighted_samples.py | 3 +- ...ot_document_classification_20newsgroups.py | 19 +- examples/text/plot_document_clustering.py | 10 +- .../text/plot_hashing_vs_dict_vectorizer.py | 1 + examples/tree/plot_cost_complexity_pruning.py | 3 +- examples/tree/plot_iris_dtc.py | 5 +- examples/tree/plot_tree_regression.py | 3 +- .../tree/plot_tree_regression_multioutput.py | 3 +- examples/tree/plot_unveil_tree_structure.py | 4 +- maint_tools/check_pxd_in_installation.py | 5 +- maint_tools/sort_whats_new.py | 2 +- maint_tools/update_tracking_issue.py | 4 +- pyproject.toml | 4 + setup.py | 13 +- sklearn/__init__.py | 11 +- sklearn/_build_utils/__init__.py | 8 +- sklearn/_build_utils/pre_build_helpers.py | 4 +- sklearn/_config.py | 2 +- sklearn/_loss/__init__.py | 13 +- sklearn/_loss/link.py | 1 + sklearn/_loss/loss.py | 26 +-- sklearn/_loss/tests/test_link.py | 7 +- sklearn/_loss/tests/test_loss.py | 9 +- sklearn/_min_dependencies.py | 5 +- sklearn/base.py | 35 ++-- sklearn/calibration.py | 48 +++-- sklearn/cluster/__init__.py | 22 +-- sklearn/cluster/_affinity_propagation.py | 10 +- sklearn/cluster/_agglomerative.py | 15 +- sklearn/cluster/_bicluster.py | 16 +- sklearn/cluster/_birch.py | 19 +- sklearn/cluster/_bisect_k_means.py | 19 +- sklearn/cluster/_dbscan.py | 7 +- sklearn/cluster/_feature_agglomeration.py | 5 +- sklearn/cluster/_hdbscan/hdbscan.py | 11 +- .../_hdbscan/tests/test_reachibility.py | 3 +- sklearn/cluster/_kmeans.py | 58 +++--- sklearn/cluster/_mean_shift.py | 18 +- sklearn/cluster/_optics.py | 23 +-- sklearn/cluster/_spectral.py | 14 +- sklearn/cluster/tests/common.py | 1 - 
.../tests/test_affinity_propagation.py | 12 +- sklearn/cluster/tests/test_bicluster.py | 28 ++- sklearn/cluster/tests/test_birch.py | 9 +- sklearn/cluster/tests/test_bisect_k_means.py | 2 +- sklearn/cluster/tests/test_dbscan.py | 17 +- .../tests/test_feature_agglomeration.py | 7 +- sklearn/cluster/tests/test_hdbscan.py | 12 +- sklearn/cluster/tests/test_hierarchical.py | 40 ++--- sklearn/cluster/tests/test_k_means.py | 47 +++-- sklearn/cluster/tests/test_mean_shift.py | 13 +- sklearn/cluster/tests/test_optics.py | 17 +- sklearn/cluster/tests/test_spectral.py | 15 +- sklearn/compose/__init__.py | 3 +- sklearn/compose/_column_transformer.py | 29 ++- sklearn/compose/_target.py | 13 +- .../compose/tests/test_column_transformer.py | 23 ++- sklearn/compose/tests/test_target.py | 23 +-- sklearn/conftest.py | 30 ++-- sklearn/covariance/__init__.py | 15 +- sklearn/covariance/_elliptic_envelope.py | 11 +- sklearn/covariance/_empirical_covariance.py | 6 +- sklearn/covariance/_graph_lasso.py | 22 ++- sklearn/covariance/_robust_covariance.py | 7 +- sklearn/covariance/_shrunk_covariance.py | 5 +- sklearn/covariance/tests/test_covariance.py | 19 +- .../tests/test_elliptic_envelope.py | 8 +- .../covariance/tests/test_graphical_lasso.py | 19 +- .../tests/test_robust_covariance.py | 6 +- sklearn/cross_decomposition/__init__.py | 2 +- sklearn/cross_decomposition/_pls.py | 24 +-- sklearn/cross_decomposition/tests/test_pls.py | 13 +- sklearn/datasets/__init__.py | 87 ++++----- sklearn/datasets/_arff_parser.py | 1 - sklearn/datasets/_base.py | 19 +- sklearn/datasets/_california_housing.py | 24 +-- sklearn/datasets/_covtype.py | 23 +-- sklearn/datasets/_kddcup99.py | 22 +-- sklearn/datasets/_lfw.py | 14 +- sklearn/datasets/_olivetti_faces.py | 13 +- sklearn/datasets/_openml.py | 10 +- sklearn/datasets/_rcv1.py | 19 +- sklearn/datasets/_samples_generator.py | 8 +- sklearn/datasets/_species_distributions.py | 11 +- sklearn/datasets/_svmlight_format_io.py | 11 +- sklearn/datasets/_twenty_newsgroups.py | 29 +-- sklearn/datasets/tests/conftest.py | 1 + sklearn/datasets/tests/test_20news.py | 13 +- sklearn/datasets/tests/test_arff_parser.py | 2 +- sklearn/datasets/tests/test_base.py | 32 ++-- .../datasets/tests/test_california_housing.py | 3 +- sklearn/datasets/tests/test_common.py | 2 +- sklearn/datasets/tests/test_covtype.py | 2 + sklearn/datasets/tests/test_kddcup99.py | 9 +- sklearn/datasets/tests/test_lfw.py | 11 +- sklearn/datasets/tests/test_olivetti_faces.py | 3 +- sklearn/datasets/tests/test_openml.py | 20 +-- sklearn/datasets/tests/test_rcv1.py | 9 +- .../datasets/tests/test_samples_generator.py | 52 +++--- .../datasets/tests/test_svmlight_format.py | 23 +-- sklearn/decomposition/__init__.py | 31 ++-- sklearn/decomposition/_base.py | 5 +- sklearn/decomposition/_dict_learning.py | 26 +-- sklearn/decomposition/_factor_analysis.py | 14 +- sklearn/decomposition/_fastica.py | 12 +- sklearn/decomposition/_incremental_pca.py | 5 +- sklearn/decomposition/_kernel_pca.py | 25 +-- sklearn/decomposition/_lda.py | 22 ++- sklearn/decomposition/_nmf.py | 33 ++-- sklearn/decomposition/_pca.py | 10 +- sklearn/decomposition/_sparse_pca.py | 14 +- sklearn/decomposition/_truncated_svd.py | 11 +- .../decomposition/tests/test_dict_learning.py | 51 +++--- .../tests/test_factor_analysis.py | 10 +- sklearn/decomposition/tests/test_fastica.py | 9 +- .../tests/test_incremental_pca.py | 15 +- .../decomposition/tests/test_kernel_pca.py | 23 ++- sklearn/decomposition/tests/test_nmf.py | 25 ++- .../decomposition/tests/test_online_lda.py | 
19 +- sklearn/decomposition/tests/test_pca.py | 14 +- .../decomposition/tests/test_sparse_pca.py | 13 +- .../decomposition/tests/test_truncated_svd.py | 7 +- sklearn/discriminant_analysis.py | 27 +-- sklearn/dummy.py | 25 ++- sklearn/ensemble/__init__.py | 31 ++-- sklearn/ensemble/_bagging.py | 25 ++- sklearn/ensemble/_base.py | 11 +- sklearn/ensemble/_forest.py | 36 ++-- sklearn/ensemble/_gb.py | 33 ++-- sklearn/ensemble/_gb_losses.py | 6 +- .../_hist_gradient_boosting/binning.py | 8 +- .../gradient_boosting.py | 39 ++-- .../_hist_gradient_boosting/grower.py | 25 +-- .../_hist_gradient_boosting/predictor.py | 8 +- .../tests/test_binning.py | 10 +- .../tests/test_bitset.py | 4 +- .../tests/test_compare_lightgbm.py | 12 +- .../tests/test_gradient_boosting.py | 31 ++-- .../tests/test_grower.py | 19 +- .../tests/test_histogram.py | 16 +- .../tests/test_monotonic_contraints.py | 17 +- .../tests/test_predictor.py | 22 +-- .../tests/test_splitting.py | 14 +- .../tests/test_warm_start.py | 12 +- sklearn/ensemble/_iforest.py | 16 +- sklearn/ensemble/_stacking.py | 35 ++-- sklearn/ensemble/_voting.py | 27 ++- sklearn/ensemble/_weight_boosting.py | 34 ++-- sklearn/ensemble/tests/test_bagging.py | 38 ++-- sklearn/ensemble/tests/test_base.py | 12 +- sklearn/ensemble/tests/test_common.py | 30 ++-- sklearn/ensemble/tests/test_forest.py | 72 ++++---- .../ensemble/tests/test_gradient_boosting.py | 39 ++-- .../test_gradient_boosting_loss_functions.py | 25 +-- sklearn/ensemble/tests/test_iforest.py | 26 ++- sklearn/ensemble/tests/test_stacking.py | 76 ++++---- sklearn/ensemble/tests/test_voting.py | 38 ++-- .../ensemble/tests/test_weight_boosting.py | 32 ++-- .../experimental/enable_halving_search_cv.py | 5 +- .../enable_hist_gradient_boosting.py | 1 - .../experimental/enable_iterative_imputer.py | 2 +- sklearn/feature_extraction/__init__.py | 4 +- .../feature_extraction/_dict_vectorizer.py | 7 +- sklearn/feature_extraction/_hash.py | 7 +- sklearn/feature_extraction/image.py | 9 +- .../tests/test_dict_vectorizer.py | 7 +- .../tests/test_feature_hasher.py | 2 +- .../feature_extraction/tests/test_image.py | 10 +- sklearn/feature_extraction/tests/test_text.py | 48 +++-- sklearn/feature_extraction/text.py | 19 +- sklearn/feature_selection/__init__.py | 40 ++--- sklearn/feature_selection/_base.py | 6 +- sklearn/feature_selection/_from_model.py | 15 +- sklearn/feature_selection/_mutual_info.py | 9 +- sklearn/feature_selection/_rfe.py | 27 ++- sklearn/feature_selection/_sequential.py | 12 +- .../_univariate_selection.py | 13 +- .../feature_selection/_variance_threshold.py | 8 +- sklearn/feature_selection/tests/test_base.py | 3 +- sklearn/feature_selection/tests/test_chi2.py | 5 +- .../tests/test_feature_select.py | 35 ++-- .../tests/test_from_model.py | 36 ++-- .../tests/test_mutual_info.py | 6 +- sklearn/feature_selection/tests/test_rfe.py | 25 ++- .../tests/test_sequential.py | 14 +- .../tests/test_variance_threshold.py | 4 +- sklearn/gaussian_process/__init__.py | 5 +- sklearn/gaussian_process/_gpc.py | 17 +- sklearn/gaussian_process/_gpr.py | 11 +- sklearn/gaussian_process/kernels.py | 13 +- .../tests/_mini_sequence_kernel.py | 10 +- sklearn/gaussian_process/tests/test_gpc.py | 12 +- sklearn/gaussian_process/tests/test_gpr.py | 22 +-- .../gaussian_process/tests/test_kernels.py | 38 ++-- sklearn/impute/_base.py | 14 +- sklearn/impute/_iterative.py | 22 +-- sklearn/impute/_knn.py | 7 +- sklearn/impute/tests/test_base.py | 8 +- sklearn/impute/tests/test_common.py | 17 +- sklearn/impute/tests/test_impute.py | 36 
++-- sklearn/impute/tests/test_knn.py | 3 +- sklearn/inspection/__init__.py | 4 +- sklearn/inspection/_partial_dependence.py | 38 ++-- sklearn/inspection/_permutation_importance.py | 7 +- sklearn/inspection/_plot/decision_boundary.py | 7 +- .../inspection/_plot/partial_dependence.py | 18 +- .../tests/test_boundary_decision_display.py | 20 +-- .../tests/test_plot_partial_dependence.py | 25 ++- .../tests/test_partial_dependence.py | 48 +++-- sklearn/inspection/tests/test_pd_utils.py | 3 +- .../tests/test_permutation_importance.py | 26 ++- sklearn/isotonic.py | 15 +- sklearn/kernel_approximation.py | 29 +-- sklearn/kernel_ridge.py | 9 +- sklearn/linear_model/__init__.py | 46 +++-- sklearn/linear_model/_base.py | 38 ++-- sklearn/linear_model/_bayes.py | 10 +- sklearn/linear_model/_coordinate_descent.py | 16 +- sklearn/linear_model/_glm/__init__.py | 4 +- sklearn/linear_model/_glm/glm.py | 5 +- sklearn/linear_model/_glm/tests/test_glm.py | 10 +- sklearn/linear_model/_huber.py | 9 +- sklearn/linear_model/_least_angle.py | 21 +-- sklearn/linear_model/_linear_loss.py | 1 + sklearn/linear_model/_logistic.py | 40 +++-- sklearn/linear_model/_omp.py | 14 +- sklearn/linear_model/_passive_aggressive.py | 4 +- sklearn/linear_model/_perceptron.py | 2 +- sklearn/linear_model/_quantile.py | 9 +- sklearn/linear_model/_ransac.py | 29 +-- sklearn/linear_model/_ridge.py | 44 ++--- sklearn/linear_model/_sag.py | 6 +- sklearn/linear_model/_stochastic_gradient.py | 57 +++--- sklearn/linear_model/_theil_sen.py | 15 +- sklearn/linear_model/tests/test_base.py | 31 ++-- sklearn/linear_model/tests/test_bayes.py | 14 +- .../tests/test_coordinate_descent.py | 52 +++--- sklearn/linear_model/tests/test_huber.py | 11 +- .../linear_model/tests/test_least_angle.py | 29 +-- .../linear_model/tests/test_linear_loss.py | 3 +- sklearn/linear_model/tests/test_logistic.py | 47 ++--- sklearn/linear_model/tests/test_omp.py | 23 +-- .../tests/test_passive_aggressive.py | 16 +- sklearn/linear_model/tests/test_perceptron.py | 7 +- sklearn/linear_model/tests/test_quantile.py | 5 +- sklearn/linear_model/tests/test_ransac.py | 19 +- sklearn/linear_model/tests/test_ridge.py | 87 ++++----- sklearn/linear_model/tests/test_sag.py | 25 +-- sklearn/linear_model/tests/test_sgd.py | 35 ++-- .../tests/test_sparse_coordinate_descent.py | 16 +- sklearn/linear_model/tests/test_theil_sen.py | 16 +- sklearn/manifold/__init__.py | 2 +- sklearn/manifold/_isomap.py | 25 +-- sklearn/manifold/_locally_linear.py | 16 +- sklearn/manifold/_mds.py | 14 +- sklearn/manifold/_spectral_embedding.py | 14 +- sklearn/manifold/_t_sne.py | 25 +-- sklearn/manifold/tests/test_isomap.py | 11 +- sklearn/manifold/tests/test_locally_linear.py | 16 +- sklearn/manifold/tests/test_mds.py | 3 +- .../manifold/tests/test_spectral_embedding.py | 26 ++- sklearn/manifold/tests/test_t_sne.py | 58 +++--- sklearn/metrics/__init__.py | 169 +++++++++--------- sklearn/metrics/_classification.py | 29 ++- .../_pairwise_distances_reduction/__init__.py | 2 +- .../_dispatcher.py | 18 +- sklearn/metrics/_plot/confusion_matrix.py | 4 +- sklearn/metrics/_plot/det_curve.py | 2 +- .../metrics/_plot/precision_recall_curve.py | 3 +- sklearn/metrics/_plot/regression.py | 4 +- sklearn/metrics/_plot/roc_curve.py | 3 +- .../_plot/tests/test_common_curve_display.py | 9 +- .../tests/test_confusion_matrix_display.py | 11 +- .../_plot/tests/test_det_curve_display.py | 6 +- .../tests/test_precision_recall_display.py | 8 +- .../_plot/tests/test_predict_error_display.py | 2 - 
.../_plot/tests/test_roc_curve_display.py | 14 +- sklearn/metrics/_ranking.py | 22 +-- sklearn/metrics/_regression.py | 11 +- sklearn/metrics/_scorer.py | 84 ++++----- sklearn/metrics/cluster/__init__.py | 40 +++-- sklearn/metrics/cluster/_bicluster.py | 2 +- sklearn/metrics/cluster/_supervised.py | 5 +- sklearn/metrics/cluster/_unsupervised.py | 12 +- .../metrics/cluster/tests/test_bicluster.py | 5 +- sklearn/metrics/cluster/tests/test_common.py | 30 ++-- .../metrics/cluster/tests/test_supervised.py | 37 ++-- .../cluster/tests/test_unsupervised.py | 18 +- sklearn/metrics/pairwise.py | 40 +++-- sklearn/metrics/tests/test_classification.py | 85 +++++---- sklearn/metrics/tests/test_common.py | 114 ++++++------ sklearn/metrics/tests/test_dist_metrics.py | 6 +- sklearn/metrics/tests/test_pairwise.py | 91 +++++----- .../test_pairwise_distances_reduction.py | 8 +- sklearn/metrics/tests/test_ranking.py | 67 +++---- sklearn/metrics/tests/test_regression.py | 52 +++--- sklearn/metrics/tests/test_score_objects.py | 66 +++---- sklearn/mixture/__init__.py | 3 +- sklearn/mixture/_base.py | 8 +- sklearn/mixture/_bayesian_mixture.py | 19 +- sklearn/mixture/_gaussian_mixture.py | 6 +- .../mixture/tests/test_bayesian_mixture.py | 20 +-- .../mixture/tests/test_gaussian_mixture.py | 31 ++-- sklearn/mixture/tests/test_mixture.py | 5 +- sklearn/model_selection/__init__.py | 65 ++++--- sklearn/model_selection/_plot.py | 4 +- sklearn/model_selection/_search.py | 38 ++-- .../_search_successive_halving.py | 13 +- sklearn/model_selection/_split.py | 28 +-- sklearn/model_selection/_validation.py | 29 ++- sklearn/model_selection/tests/test_plot.py | 9 +- sklearn/model_selection/tests/test_search.py | 112 ++++++------ sklearn/model_selection/tests/test_split.py | 82 +++++---- .../tests/test_successive_halving.py | 25 +-- .../model_selection/tests/test_validation.py | 137 +++++++------- sklearn/multiclass.py | 30 ++-- sklearn/multioutput.py | 10 +- sklearn/naive_bayes.py | 15 +- sklearn/neighbors/__init__.py | 19 +- sklearn/neighbors/_base.py | 27 ++- sklearn/neighbors/_classification.py | 17 +- sklearn/neighbors/_graph.py | 8 +- sklearn/neighbors/_kde.py | 6 +- sklearn/neighbors/_lof.py | 12 +- sklearn/neighbors/_nca.py | 26 +-- sklearn/neighbors/_nearest_centroid.py | 15 +- sklearn/neighbors/_regression.py | 6 +- sklearn/neighbors/_unsupervised.py | 4 +- sklearn/neighbors/tests/test_ball_tree.py | 3 +- sklearn/neighbors/tests/test_kd_tree.py | 2 +- sklearn/neighbors/tests/test_kde.py | 13 +- sklearn/neighbors/tests/test_lof.py | 22 +-- sklearn/neighbors/tests/test_nca.py | 13 +- .../neighbors/tests/test_nearest_centroid.py | 4 +- sklearn/neighbors/tests/test_neighbors.py | 28 ++- .../tests/test_neighbors_pipeline.py | 25 ++- .../neighbors/tests/test_neighbors_tree.py | 21 ++- sklearn/neighbors/tests/test_quad_tree.py | 2 +- sklearn/neural_network/__init__.py | 4 +- sklearn/neural_network/_base.py | 1 - .../neural_network/_multilayer_perceptron.py | 42 +++-- sklearn/neural_network/_rbm.py | 18 +- sklearn/neural_network/tests/test_base.py | 5 +- sklearn/neural_network/tests/test_mlp.py | 28 +-- sklearn/neural_network/tests/test_rbm.py | 14 +- .../tests/test_stochastic_optimizers.py | 3 +- sklearn/pipeline.py | 22 +-- sklearn/preprocessing/__init__.py | 57 +++--- sklearn/preprocessing/_data.py | 24 ++- sklearn/preprocessing/_discretization.py | 25 +-- sklearn/preprocessing/_encoders.py | 17 +- .../preprocessing/_function_transformer.py | 5 +- sklearn/preprocessing/_label.py | 19 +- sklearn/preprocessing/_polynomial.py | 
19 +- sklearn/preprocessing/_target_encoder.py | 16 +- sklearn/preprocessing/tests/test_common.py | 38 ++-- sklearn/preprocessing/tests/test_data.py | 83 ++++----- .../tests/test_discretization.py | 12 +- sklearn/preprocessing/tests/test_encoders.py | 14 +- .../tests/test_function_transformer.py | 10 +- sklearn/preprocessing/tests/test_label.py | 41 ++--- .../preprocessing/tests/test_polynomial.py | 13 +- .../tests/test_target_encoder.py | 25 +-- sklearn/random_projection.py | 13 +- sklearn/semi_supervised/_label_propagation.py | 9 +- sklearn/semi_supervised/_self_training.py | 7 +- .../tests/test_label_propagation.py | 13 +- .../tests/test_self_training.py | 11 +- sklearn/svm/__init__.py | 2 +- sklearn/svm/_base.py | 35 ++-- sklearn/svm/_bounds.py | 4 +- sklearn/svm/_classes.py | 13 +- sklearn/svm/tests/test_bounds.py | 10 +- sklearn/svm/tests/test_sparse.py | 12 +- sklearn/svm/tests/test_svm.py | 45 +++-- sklearn/tests/random_seed.py | 3 +- sklearn/tests/test_base.py | 37 ++-- sklearn/tests/test_build.py | 3 +- sklearn/tests/test_calibration.py | 59 +++--- sklearn/tests/test_common.py | 85 ++++----- sklearn/tests/test_config.py | 4 +- sklearn/tests/test_discriminant_analysis.py | 33 ++-- sklearn/tests/test_docstring_parameters.py | 42 ++--- sklearn/tests/test_docstrings.py | 14 +- sklearn/tests/test_dummy.py | 16 +- sklearn/tests/test_isotonic.py | 20 +-- sklearn/tests/test_kernel_approximation.py | 31 ++-- sklearn/tests/test_kernel_ridge.py | 7 +- sklearn/tests/test_metadata_routing.py | 45 +++-- sklearn/tests/test_metaestimators.py | 27 +-- sklearn/tests/test_min_dependencies_readme.py | 3 +- sklearn/tests/test_multiclass.py | 59 +++--- sklearn/tests/test_multioutput.py | 68 ++++--- sklearn/tests/test_naive_bayes.py | 31 ++-- sklearn/tests/test_pipeline.py | 51 +++--- sklearn/tests/test_public_functions.py | 12 +- sklearn/tests/test_random_projection.py | 32 ++-- sklearn/tree/__init__.py | 14 +- sklearn/tree/_classes.py | 53 +++--- sklearn/tree/_export.py | 12 +- sklearn/tree/tests/test_export.py | 13 +- sklearn/tree/tests/test_reingold_tilford.py | 3 +- sklearn/tree/tests/test_tree.py | 90 +++++----- sklearn/utils/__init__.py | 37 ++-- sklearn/utils/_array_api.py | 2 +- sklearn/utils/_available_if.py | 3 +- sklearn/utils/_encode.py | 3 +- sklearn/utils/_estimator_html_repr.py | 4 +- sklearn/utils/_joblib.py | 21 ++- sklearn/utils/_mask.py | 3 +- sklearn/utils/_mocking.py | 5 +- sklearn/utils/_param_validation.py | 15 +- sklearn/utils/_plotting.py | 2 +- sklearn/utils/_pprint.py | 2 +- sklearn/utils/_set_output.py | 2 +- sklearn/utils/_show_versions.py | 7 +- sklearn/utils/_testing.py | 47 +++-- sklearn/utils/class_weight.py | 1 - sklearn/utils/deprecation.py | 3 +- sklearn/utils/discovery.py | 8 +- sklearn/utils/estimator_checks.py | 107 ++++++----- sklearn/utils/extmath.py | 2 +- sklearn/utils/fixes.py | 10 +- sklearn/utils/metaestimators.py | 8 +- sklearn/utils/multiclass.py | 9 +- sklearn/utils/optimize.py | 5 +- sklearn/utils/random.py | 3 +- sklearn/utils/sparsefuncs.py | 10 +- sklearn/utils/tests/test_array_api.py | 19 +- sklearn/utils/tests/test_arrayfuncs.py | 2 +- sklearn/utils/tests/test_class_weight.py | 7 +- sklearn/utils/tests/test_cython_blas.py | 33 ++-- sklearn/utils/tests/test_cython_templating.py | 2 + sklearn/utils/tests/test_deprecation.py | 4 +- sklearn/utils/tests/test_encode.py | 5 +- sklearn/utils/tests/test_estimator_checks.py | 57 +++--- .../utils/tests/test_estimator_html_repr.py | 40 ++--- sklearn/utils/tests/test_extmath.py | 52 +++--- 
sklearn/utils/tests/test_fast_dict.py | 2 +- sklearn/utils/tests/test_fixes.py | 1 - sklearn/utils/tests/test_graph.py | 4 +- sklearn/utils/tests/test_metaestimators.py | 4 +- sklearn/utils/tests/test_mocking.py | 12 +- sklearn/utils/tests/test_multiclass.py | 50 +++--- sklearn/utils/tests/test_murmurhash.py | 4 +- sklearn/utils/tests/test_optimize.py | 3 +- sklearn/utils/tests/test_parallel.py | 3 +- sklearn/utils/tests/test_param_validation.py | 53 +++--- sklearn/utils/tests/test_plotting.py | 2 +- sklearn/utils/tests/test_random.py | 4 +- sklearn/utils/tests/test_response.py | 4 +- sklearn/utils/tests/test_seq_dataset.py | 4 +- sklearn/utils/tests/test_set_output.py | 14 +- sklearn/utils/tests/test_shortest_path.py | 1 + sklearn/utils/tests/test_show_versions.py | 6 +- sklearn/utils/tests/test_sparsefuncs.py | 19 +- sklearn/utils/tests/test_testing.py | 37 ++-- sklearn/utils/tests/test_utils.py | 51 +++--- sklearn/utils/tests/test_validation.py | 93 +++++----- sklearn/utils/tests/test_weight_vector.py | 1 + sklearn/utils/validation.py | 25 ++- 771 files changed, 5515 insertions(+), 5559 deletions(-) diff --git a/.github/scripts/label_title_regex.py b/.github/scripts/label_title_regex.py index ddf9bda3492de..a022c3c4dd2a7 100644 --- a/.github/scripts/label_title_regex.py +++ b/.github/scripts/label_title_regex.py @@ -1,10 +1,11 @@ """Labels PRs based on title. Must be run in a github action with the pull_request_target event.""" -from github import Github -import os import json +import os import re +from github import Github + context_dict = json.loads(os.getenv("CONTEXT_GITHUB")) repo = context_dict["repository"] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 643c2141819d5..abffbbe149f2c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,6 +14,7 @@ repos: rev: v0.0.272 hooks: - id: ruff + args: ["--fix", "--show-source"] - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.3.0 hooks: diff --git a/asv_benchmarks/benchmarks/cluster.py b/asv_benchmarks/benchmarks/cluster.py index ba460e6b503a6..457a15dd938e9 100644 --- a/asv_benchmarks/benchmarks/cluster.py +++ b/asv_benchmarks/benchmarks/cluster.py @@ -1,7 +1,7 @@ from sklearn.cluster import KMeans, MiniBatchKMeans from .common import Benchmark, Estimator, Predictor, Transformer -from .datasets import _blobs_dataset, _20newsgroups_highdim_dataset +from .datasets import _20newsgroups_highdim_dataset, _blobs_dataset from .utils import neg_mean_inertia diff --git a/asv_benchmarks/benchmarks/common.py b/asv_benchmarks/benchmarks/common.py index c3e114a212047..aeea558844587 100644 --- a/asv_benchmarks/benchmarks/common.py +++ b/asv_benchmarks/benchmarks/common.py @@ -1,11 +1,11 @@ -import os +import itertools import json -import timeit +import os import pickle -import itertools +import timeit from abc import ABC, abstractmethod -from pathlib import Path from multiprocessing import cpu_count +from pathlib import Path import numpy as np diff --git a/asv_benchmarks/benchmarks/datasets.py b/asv_benchmarks/benchmarks/datasets.py index dbe0eac0b822c..8f0c915c95e63 100644 --- a/asv_benchmarks/benchmarks/datasets.py +++ b/asv_benchmarks/benchmarks/datasets.py @@ -1,21 +1,22 @@ +from pathlib import Path + import numpy as np import scipy.sparse as sp from joblib import Memory -from pathlib import Path -from sklearn.decomposition import TruncatedSVD from sklearn.datasets import ( - make_blobs, fetch_20newsgroups, + fetch_olivetti_faces, fetch_openml, load_digits, - make_regression, + make_blobs, 
make_classification, - fetch_olivetti_faces, + make_regression, ) -from sklearn.preprocessing import MaxAbsScaler, StandardScaler +from sklearn.decomposition import TruncatedSVD from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.model_selection import train_test_split +from sklearn.preprocessing import MaxAbsScaler, StandardScaler # memory location for caching datasets M = Memory(location=str(Path(__file__).resolve().parent / "cache")) diff --git a/asv_benchmarks/benchmarks/decomposition.py b/asv_benchmarks/benchmarks/decomposition.py index 02a7862caeb69..0a7bb7ad07f3e 100644 --- a/asv_benchmarks/benchmarks/decomposition.py +++ b/asv_benchmarks/benchmarks/decomposition.py @@ -1,8 +1,8 @@ from sklearn.decomposition import PCA, DictionaryLearning, MiniBatchDictionaryLearning from .common import Benchmark, Estimator, Transformer -from .datasets import _olivetti_faces_dataset, _mnist_dataset -from .utils import make_pca_scorers, make_dict_learning_scorers +from .datasets import _mnist_dataset, _olivetti_faces_dataset +from .utils import make_dict_learning_scorers, make_pca_scorers class PCABenchmark(Transformer, Estimator, Benchmark): diff --git a/asv_benchmarks/benchmarks/ensemble.py b/asv_benchmarks/benchmarks/ensemble.py index 8c5a28e3da90f..c336d1e5f8805 100644 --- a/asv_benchmarks/benchmarks/ensemble.py +++ b/asv_benchmarks/benchmarks/ensemble.py @@ -1,7 +1,7 @@ from sklearn.ensemble import ( - RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, + RandomForestClassifier, ) from .common import Benchmark, Estimator, Predictor diff --git a/asv_benchmarks/benchmarks/linear_model.py b/asv_benchmarks/benchmarks/linear_model.py index b694a109329f0..7e7b9d33540c6 100644 --- a/asv_benchmarks/benchmarks/linear_model.py +++ b/asv_benchmarks/benchmarks/linear_model.py @@ -1,9 +1,9 @@ from sklearn.linear_model import ( - LogisticRegression, - Ridge, ElasticNet, Lasso, LinearRegression, + LogisticRegression, + Ridge, SGDRegressor, ) diff --git a/benchmarks/bench_20newsgroups.py b/benchmarks/bench_20newsgroups.py index c542349839178..a559bc59b5f8a 100644 --- a/benchmarks/bench_20newsgroups.py +++ b/benchmarks/bench_20newsgroups.py @@ -1,18 +1,19 @@ -from time import time import argparse -import numpy as np +from time import time -from sklearn.dummy import DummyClassifier +import numpy as np from sklearn.datasets import fetch_20newsgroups_vectorized -from sklearn.metrics import accuracy_score -from sklearn.utils.validation import check_array - -from sklearn.ensemble import RandomForestClassifier -from sklearn.ensemble import ExtraTreesClassifier -from sklearn.ensemble import AdaBoostClassifier +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import ( + AdaBoostClassifier, + ExtraTreesClassifier, + RandomForestClassifier, +) from sklearn.linear_model import LogisticRegression +from sklearn.metrics import accuracy_score from sklearn.naive_bayes import MultinomialNB +from sklearn.utils.validation import check_array ESTIMATORS = { "dummy": DummyClassifier(), diff --git a/benchmarks/bench_covertype.py b/benchmarks/bench_covertype.py index 8a13a2d9806c6..5b8cdd588c8ee 100644 --- a/benchmarks/bench_covertype.py +++ b/benchmarks/bench_covertype.py @@ -45,20 +45,24 @@ # Arnaud Joly # License: BSD 3 clause +import argparse import os from time import time -import argparse + import numpy as np from joblib import Memory from sklearn.datasets import fetch_covtype, get_data_home -from sklearn.svm import LinearSVC -from sklearn.linear_model import 
SGDClassifier, LogisticRegression +from sklearn.ensemble import ( + ExtraTreesClassifier, + GradientBoostingClassifier, + RandomForestClassifier, +) +from sklearn.linear_model import LogisticRegression, SGDClassifier +from sklearn.metrics import zero_one_loss from sklearn.naive_bayes import GaussianNB +from sklearn.svm import LinearSVC from sklearn.tree import DecisionTreeClassifier -from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.metrics import zero_one_loss from sklearn.utils import check_array # Memoize the data extraction and memory map the resulting diff --git a/benchmarks/bench_feature_expansions.py b/benchmarks/bench_feature_expansions.py index fd5a4f0ebccff..b9d9efbdea4f1 100644 --- a/benchmarks/bench_feature_expansions.py +++ b/benchmarks/bench_feature_expansions.py @@ -1,8 +1,10 @@ +from time import time + import matplotlib.pyplot as plt import numpy as np import scipy.sparse as sparse + from sklearn.preprocessing import PolynomialFeatures -from time import time degree = 2 trials = 3 diff --git a/benchmarks/bench_glm.py b/benchmarks/bench_glm.py index c6c2a6f5fa117..803043398d1ac 100644 --- a/benchmarks/bench_glm.py +++ b/benchmarks/bench_glm.py @@ -5,9 +5,10 @@ """ from datetime import datetime + import numpy as np -from sklearn import linear_model +from sklearn import linear_model if __name__ == "__main__": import matplotlib.pyplot as plt diff --git a/benchmarks/bench_glmnet.py b/benchmarks/bench_glmnet.py index 8a0a0545bb627..7b111f95044e2 100644 --- a/benchmarks/bench_glmnet.py +++ b/benchmarks/bench_glmnet.py @@ -16,9 +16,11 @@ In both cases, only 10% of the features are informative. """ -import numpy as np import gc from time import time + +import numpy as np + from sklearn.datasets import make_regression alpha = 0.1 @@ -45,11 +47,11 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef): if __name__ == "__main__": - from glmnet.elastic_net import Lasso as GlmnetLasso - from sklearn.linear_model import Lasso as ScikitLasso - # Delayed import of matplotlib.pyplot import matplotlib.pyplot as plt + from glmnet.elastic_net import Lasso as GlmnetLasso + + from sklearn.linear_model import Lasso as ScikitLasso scikit_results = [] glmnet_results = [] diff --git a/benchmarks/bench_hist_gradient_boosting.py b/benchmarks/bench_hist_gradient_boosting.py index 163e21f98ed0d..c1dfffabe71c2 100644 --- a/benchmarks/bench_hist_gradient_boosting.py +++ b/benchmarks/bench_hist_gradient_boosting.py @@ -1,15 +1,16 @@ -from time import time import argparse +from time import time import matplotlib.pyplot as plt import numpy as np -from sklearn.model_selection import train_test_split -from sklearn.ensemble import HistGradientBoostingRegressor -from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.datasets import make_classification -from sklearn.datasets import make_regression -from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator +from sklearn.datasets import make_classification, make_regression +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) +from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator +from sklearn.model_selection import train_test_split parser = argparse.ArgumentParser() parser.add_argument("--n-leaf-nodes", type=int, default=31) diff --git a/benchmarks/bench_hist_gradient_boosting_adult.py b/benchmarks/bench_hist_gradient_boosting_adult.py index 
1b5905b1cf4e8..5fa5bbae0c35c 100644 --- a/benchmarks/bench_hist_gradient_boosting_adult.py +++ b/benchmarks/bench_hist_gradient_boosting_adult.py @@ -4,15 +4,14 @@ import numpy as np import pandas as pd -from sklearn.model_selection import train_test_split -from sklearn.compose import make_column_transformer, make_column_selector +from sklearn.compose import make_column_selector, make_column_transformer from sklearn.datasets import fetch_openml -from sklearn.metrics import accuracy_score, roc_auc_score from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator +from sklearn.metrics import accuracy_score, roc_auc_score +from sklearn.model_selection import train_test_split from sklearn.preprocessing import OrdinalEncoder - parser = argparse.ArgumentParser() parser.add_argument("--n-leaf-nodes", type=int, default=31) parser.add_argument("--n-trees", type=int, default=100) diff --git a/benchmarks/bench_hist_gradient_boosting_categorical_only.py b/benchmarks/bench_hist_gradient_boosting_categorical_only.py index e8d215170f9c8..1085bbc49f4f8 100644 --- a/benchmarks/bench_hist_gradient_boosting_categorical_only.py +++ b/benchmarks/bench_hist_gradient_boosting_categorical_only.py @@ -1,11 +1,10 @@ import argparse from time import time -from sklearn.preprocessing import KBinsDiscretizer from sklearn.datasets import make_classification from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator - +from sklearn.preprocessing import KBinsDiscretizer parser = argparse.ArgumentParser() parser.add_argument("--n-leaf-nodes", type=int, default=31) diff --git a/benchmarks/bench_hist_gradient_boosting_higgsboson.py b/benchmarks/bench_hist_gradient_boosting_higgsboson.py index d6ed3b8e9700f..65be02ec0c4b9 100644 --- a/benchmarks/bench_hist_gradient_boosting_higgsboson.py +++ b/benchmarks/bench_hist_gradient_boosting_higgsboson.py @@ -1,17 +1,17 @@ -from urllib.request import urlretrieve +import argparse import os from gzip import GzipFile from time import time -import argparse +from urllib.request import urlretrieve import numpy as np import pandas as pd from joblib import Memory -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score, roc_auc_score + from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator - +from sklearn.metrics import accuracy_score, roc_auc_score +from sklearn.model_selection import train_test_split parser = argparse.ArgumentParser() parser.add_argument("--n-leaf-nodes", type=int, default=31) diff --git a/benchmarks/bench_hist_gradient_boosting_threading.py b/benchmarks/bench_hist_gradient_boosting_threading.py index 70787fd2eb479..9acf65bdbaf6a 100644 --- a/benchmarks/bench_hist_gradient_boosting_threading.py +++ b/benchmarks/bench_hist_gradient_boosting_threading.py @@ -1,18 +1,19 @@ -from time import time import argparse import os from pprint import pprint +from time import time import numpy as np from threadpoolctl import threadpool_limits + import sklearn -from sklearn.model_selection import train_test_split -from sklearn.ensemble import HistGradientBoostingRegressor -from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.datasets import make_classification -from sklearn.datasets import make_regression +from sklearn.datasets import make_classification, make_regression 
+from sklearn.ensemble import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator - +from sklearn.model_selection import train_test_split parser = argparse.ArgumentParser() parser.add_argument("--n-leaf-nodes", type=int, default=31) @@ -290,8 +291,8 @@ def one_run(n_threads, n_samples): if args.plot or args.plot_filename: - import matplotlib.pyplot as plt import matplotlib + import matplotlib.pyplot as plt fig, axs = plt.subplots(2, figsize=(12, 12)) diff --git a/benchmarks/bench_isolation_forest.py b/benchmarks/bench_isolation_forest.py index 1c85cfb79d321..021114af56ea6 100644 --- a/benchmarks/bench_isolation_forest.py +++ b/benchmarks/bench_isolation_forest.py @@ -17,12 +17,13 @@ """ from time import time -import numpy as np + import matplotlib.pyplot as plt +import numpy as np +from sklearn.datasets import fetch_covtype, fetch_kddcup99, fetch_openml from sklearn.ensemble import IsolationForest -from sklearn.metrics import roc_curve, auc -from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml +from sklearn.metrics import auc, roc_curve from sklearn.preprocessing import LabelBinarizer from sklearn.utils import shuffle as sh diff --git a/benchmarks/bench_isotonic.py b/benchmarks/bench_isotonic.py index 458a04a463303..221e6fb12da75 100644 --- a/benchmarks/bench_isotonic.py +++ b/benchmarks/bench_isotonic.py @@ -10,13 +10,15 @@ This allows the scaling of the algorithm with the problem size to be visualized and understood. """ -import numpy as np +import argparse import gc from datetime import datetime -from sklearn.isotonic import isotonic_regression -from scipy.special import expit + import matplotlib.pyplot as plt -import argparse +import numpy as np +from scipy.special import expit + +from sklearn.isotonic import isotonic_regression def generate_perturbed_logarithm_dataset(size): diff --git a/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py b/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py index 00721aa7f18a9..6551cb74ff86e 100644 --- a/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py +++ b/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py @@ -39,13 +39,12 @@ import time -import numpy as np import matplotlib.pyplot as plt - +import numpy as np from numpy.testing import assert_array_almost_equal -from sklearn.decomposition import KernelPCA -from sklearn.datasets import make_circles +from sklearn.datasets import make_circles +from sklearn.decomposition import KernelPCA print(__doc__) diff --git a/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py b/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py index a40ddea4506dd..26a45ca9f09ca 100644 --- a/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py +++ b/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py @@ -41,13 +41,12 @@ import time -import numpy as np import matplotlib.pyplot as plt - +import numpy as np from numpy.testing import assert_array_almost_equal -from sklearn.decomposition import KernelPCA -from sklearn.datasets import make_circles +from sklearn.datasets import make_circles +from sklearn.decomposition import KernelPCA print(__doc__) diff --git a/benchmarks/bench_lasso.py b/benchmarks/bench_lasso.py index 9a893545fbb28..1c49c6f5cabdf 100644 --- a/benchmarks/bench_lasso.py +++ b/benchmarks/bench_lasso.py @@ -13,6 +13,7 @@ """ import gc from time import time + import numpy as np from sklearn.datasets import make_regression @@ -59,9 +60,10 @@ def 
compute_bench(alpha, n_samples, n_features, precompute): if __name__ == "__main__": - from sklearn.linear_model import Lasso, LassoLars import matplotlib.pyplot as plt + from sklearn.linear_model import Lasso, LassoLars + alpha = 0.01 # regularization parameter n_features = 10 diff --git a/benchmarks/bench_lof.py b/benchmarks/bench_lof.py index 31057e2e4067b..8652073a7203d 100644 --- a/benchmarks/bench_lof.py +++ b/benchmarks/bench_lof.py @@ -18,11 +18,13 @@ """ from time import time -import numpy as np + import matplotlib.pyplot as plt +import numpy as np + +from sklearn.datasets import fetch_covtype, fetch_kddcup99, fetch_openml +from sklearn.metrics import auc, roc_curve from sklearn.neighbors import LocalOutlierFactor -from sklearn.metrics import roc_curve, auc -from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml from sklearn.preprocessing import LabelBinarizer print(__doc__) diff --git a/benchmarks/bench_mnist.py b/benchmarks/bench_mnist.py index 4bc28ea1a165d..4ba17cb1003c3 100644 --- a/benchmarks/bench_mnist.py +++ b/benchmarks/bench_mnist.py @@ -30,26 +30,24 @@ # Arnaud Joly # License: BSD 3 clause +import argparse import os from time import time -import argparse + import numpy as np from joblib import Memory -from sklearn.datasets import fetch_openml -from sklearn.datasets import get_data_home -from sklearn.ensemble import ExtraTreesClassifier -from sklearn.ensemble import RandomForestClassifier +from sklearn.datasets import fetch_openml, get_data_home from sklearn.dummy import DummyClassifier -from sklearn.kernel_approximation import Nystroem -from sklearn.kernel_approximation import RBFSampler +from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier +from sklearn.kernel_approximation import Nystroem, RBFSampler +from sklearn.linear_model import LogisticRegression from sklearn.metrics import zero_one_loss +from sklearn.neural_network import MLPClassifier from sklearn.pipeline import make_pipeline from sklearn.svm import LinearSVC from sklearn.tree import DecisionTreeClassifier from sklearn.utils import check_array -from sklearn.linear_model import LogisticRegression -from sklearn.neural_network import MLPClassifier # Memoize the data extraction and memory map the resulting # train / test splits in readonly mode diff --git a/benchmarks/bench_multilabel_metrics.py b/benchmarks/bench_multilabel_metrics.py index 2a87b388e91a2..1b8449a24da51 100755 --- a/benchmarks/bench_multilabel_metrics.py +++ b/benchmarks/bench_multilabel_metrics.py @@ -3,26 +3,25 @@ A comparison of multilabel target formats and metrics over them """ -from timeit import timeit -from functools import partial -import itertools import argparse +import itertools import sys +from functools import partial +from timeit import timeit import matplotlib.pyplot as plt -import scipy.sparse as sp import numpy as np +import scipy.sparse as sp from sklearn.datasets import make_multilabel_classification from sklearn.metrics import ( - f1_score, accuracy_score, + f1_score, hamming_loss, jaccard_similarity_score, ) from sklearn.utils._testing import ignore_warnings - METRICS = { "f1": partial(f1_score, average="micro"), "f1-by-sample": partial(f1_score, average="samples"), diff --git a/benchmarks/bench_online_ocsvm.py b/benchmarks/bench_online_ocsvm.py index 37af2fdd76562..9f92150e079dd 100644 --- a/benchmarks/bench_online_ocsvm.py +++ b/benchmarks/bench_online_ocsvm.py @@ -15,21 +15,20 @@ """ from time import time -import numpy as np +import matplotlib +import matplotlib.pyplot as plt +import 
numpy as np from scipy.interpolate import interp1d -from sklearn.metrics import roc_curve, auc -from sklearn.datasets import fetch_kddcup99, fetch_covtype -from sklearn.preprocessing import LabelBinarizer, StandardScaler -from sklearn.pipeline import make_pipeline -from sklearn.utils import shuffle +from sklearn.datasets import fetch_covtype, fetch_kddcup99 from sklearn.kernel_approximation import Nystroem -from sklearn.svm import OneClassSVM from sklearn.linear_model import SGDOneClassSVM - -import matplotlib.pyplot as plt -import matplotlib +from sklearn.metrics import auc, roc_curve +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import LabelBinarizer, StandardScaler +from sklearn.svm import OneClassSVM +from sklearn.utils import shuffle font = {"weight": "normal", "size": 15} diff --git a/benchmarks/bench_plot_incremental_pca.py b/benchmarks/bench_plot_incremental_pca.py index 0f42e4b630f1d..49b87c8c7060a 100644 --- a/benchmarks/bench_plot_incremental_pca.py +++ b/benchmarks/bench_plot_incremental_pca.py @@ -7,13 +7,15 @@ """ -import numpy as np import gc -from time import time from collections import defaultdict +from time import time + import matplotlib.pyplot as plt +import numpy as np + from sklearn.datasets import fetch_lfw_people -from sklearn.decomposition import IncrementalPCA, PCA +from sklearn.decomposition import PCA, IncrementalPCA def plot_results(X, y, label): diff --git a/benchmarks/bench_plot_lasso_path.py b/benchmarks/bench_plot_lasso_path.py index c372ee07117fc..c996c9c09520f 100644 --- a/benchmarks/bench_plot_lasso_path.py +++ b/benchmarks/bench_plot_lasso_path.py @@ -2,16 +2,15 @@ The input data is mostly low rank but is a fat infinite tail. """ -from collections import defaultdict import gc import sys +from collections import defaultdict from time import time import numpy as np -from sklearn.linear_model import lars_path, lars_path_gram -from sklearn.linear_model import lasso_path from sklearn.datasets import make_regression +from sklearn.linear_model import lars_path, lars_path_gram, lasso_path def compute_bench(samples_range, features_range): diff --git a/benchmarks/bench_plot_neighbors.py b/benchmarks/bench_plot_neighbors.py index c6e5541eda6f3..2d9cf2b08b71d 100644 --- a/benchmarks/bench_plot_neighbors.py +++ b/benchmarks/bench_plot_neighbors.py @@ -3,11 +3,11 @@ """ from time import time -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib import ticker -from sklearn import neighbors, datasets +from sklearn import datasets, neighbors def get_data(N, D, dataset="dense"): diff --git a/benchmarks/bench_plot_nmf.py b/benchmarks/bench_plot_nmf.py index 78d6ad875cc34..d23191df0fbc9 100644 --- a/benchmarks/bench_plot_nmf.py +++ b/benchmarks/bench_plot_nmf.py @@ -6,28 +6,25 @@ # Anthony Di Franco (projected gradient, Python and NumPy port) # License: BSD 3 clause -from time import time +import numbers import sys import warnings -import numbers +from time import time -import numpy as np import matplotlib.pyplot as plt -from joblib import Memory +import numpy as np import pandas +from joblib import Memory -from sklearn.utils._testing import ignore_warnings -from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.decomposition import NMF -from sklearn.decomposition._nmf import _initialize_nmf -from sklearn.decomposition._nmf import _beta_divergence -from sklearn.decomposition._nmf import _check_init +from sklearn.decomposition._nmf import _beta_divergence, _check_init, _initialize_nmf from 
sklearn.exceptions import ConvergenceWarning -from sklearn.utils.extmath import safe_sparse_dot, squared_norm +from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.utils import check_array +from sklearn.utils._testing import ignore_warnings +from sklearn.utils.extmath import safe_sparse_dot, squared_norm from sklearn.utils.validation import check_is_fitted, check_non_negative - mem = Memory(cachedir=".", verbose=0) ################### diff --git a/benchmarks/bench_plot_omp_lars.py b/benchmarks/bench_plot_omp_lars.py index a800b3ebe2ba9..ec1bf3281f3a4 100644 --- a/benchmarks/bench_plot_omp_lars.py +++ b/benchmarks/bench_plot_omp_lars.py @@ -9,8 +9,8 @@ import numpy as np -from sklearn.linear_model import lars_path, lars_path_gram, orthogonal_mp from sklearn.datasets import make_sparse_coded_signal +from sklearn.linear_model import lars_path, lars_path_gram, orthogonal_mp def compute_bench(samples_range, features_range): diff --git a/benchmarks/bench_plot_parallel_pairwise.py b/benchmarks/bench_plot_parallel_pairwise.py index a41e3fab20589..ca12972f9be6c 100644 --- a/benchmarks/bench_plot_parallel_pairwise.py +++ b/benchmarks/bench_plot_parallel_pairwise.py @@ -4,9 +4,8 @@ import matplotlib.pyplot as plt +from sklearn.metrics.pairwise import pairwise_distances, pairwise_kernels from sklearn.utils import check_random_state -from sklearn.metrics.pairwise import pairwise_distances -from sklearn.metrics.pairwise import pairwise_kernels def plot(func): diff --git a/benchmarks/bench_plot_polynomial_kernel_approximation.py b/benchmarks/bench_plot_polynomial_kernel_approximation.py index b21589263a49f..ad89d974f3d93 100644 --- a/benchmarks/bench_plot_polynomial_kernel_approximation.py +++ b/benchmarks/bench_plot_polynomial_kernel_approximation.py @@ -42,21 +42,21 @@ # License: BSD 3 clause # Load data manipulation functions -from sklearn.datasets import load_digits -from sklearn.model_selection import train_test_split +# Will use this for timing results +from time import time # Some common libraries import matplotlib.pyplot as plt import numpy as np -# Will use this for timing results -from time import time - -# Import SVM classifiers and feature map approximation algorithms -from sklearn.svm import LinearSVC, SVC +from sklearn.datasets import load_digits from sklearn.kernel_approximation import Nystroem, PolynomialCountSketch +from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline +# Import SVM classifiers and feature map approximation algorithms +from sklearn.svm import SVC, LinearSVC + # Split data in train and test sets X, y = load_digits()["data"], load_digits()["target"] X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7) diff --git a/benchmarks/bench_plot_randomized_svd.py b/benchmarks/bench_plot_randomized_svd.py index 2020096a21b88..9ac4e714cb7dc 100644 --- a/benchmarks/bench_plot_randomized_svd.py +++ b/benchmarks/bench_plot_randomized_svd.py @@ -65,28 +65,29 @@ # Author: Giorgio Patrini -import numpy as np -import scipy as sp -import matplotlib.pyplot as plt - import gc +import os.path import pickle -from time import time from collections import defaultdict -import os.path +from time import time + +import matplotlib.pyplot as plt +import numpy as np +import scipy as sp -from sklearn.utils._arpack import _init_arpack_v0 -from sklearn.utils import gen_batches -from sklearn.utils.validation import check_random_state -from sklearn.utils.extmath import randomized_svd -from sklearn.datasets import 
make_low_rank_matrix, make_sparse_uncorrelated from sklearn.datasets import ( - fetch_lfw_people, - fetch_openml, fetch_20newsgroups_vectorized, + fetch_lfw_people, fetch_olivetti_faces, + fetch_openml, fetch_rcv1, + make_low_rank_matrix, + make_sparse_uncorrelated, ) +from sklearn.utils import gen_batches +from sklearn.utils._arpack import _init_arpack_v0 +from sklearn.utils.extmath import randomized_svd +from sklearn.utils.validation import check_random_state try: import fbpca diff --git a/benchmarks/bench_plot_svd.py b/benchmarks/bench_plot_svd.py index fc370d1073be1..abd2c6fe9d4d4 100644 --- a/benchmarks/bench_plot_svd.py +++ b/benchmarks/bench_plot_svd.py @@ -3,13 +3,14 @@ The data is mostly low rank but is a fat infinite tail. """ import gc -from time import time -import numpy as np from collections import defaultdict +from time import time +import numpy as np from scipy.linalg import svd -from sklearn.utils.extmath import randomized_svd + from sklearn.datasets import make_low_rank_matrix +from sklearn.utils.extmath import randomized_svd def compute_bench(samples_range, features_range, n_iter=3, rank=50): diff --git a/benchmarks/bench_plot_ward.py b/benchmarks/bench_plot_ward.py index 696e833eede20..fe5cee201dff4 100644 --- a/benchmarks/bench_plot_ward.py +++ b/benchmarks/bench_plot_ward.py @@ -4,9 +4,9 @@ import time +import matplotlib.pyplot as plt import numpy as np from scipy.cluster import hierarchy -import matplotlib.pyplot as plt from sklearn.cluster import AgglomerativeClustering diff --git a/benchmarks/bench_random_projections.py b/benchmarks/bench_random_projections.py index 89a4550944f3f..bd8c62ecba484 100644 --- a/benchmarks/bench_random_projections.py +++ b/benchmarks/bench_random_projections.py @@ -6,19 +6,19 @@ Benchmarks for random projections. """ +import collections import gc -import sys import optparse +import sys from datetime import datetime -import collections import numpy as np import scipy.sparse as sp from sklearn import clone from sklearn.random_projection import ( - SparseRandomProjection, GaussianRandomProjection, + SparseRandomProjection, johnson_lindenstrauss_min_dim, ) diff --git a/benchmarks/bench_rcv1_logreg_convergence.py b/benchmarks/bench_rcv1_logreg_convergence.py index 2254ab81f30a4..166c6c2f5f9d1 100644 --- a/benchmarks/bench_rcv1_logreg_convergence.py +++ b/benchmarks/bench_rcv1_logreg_convergence.py @@ -3,14 +3,15 @@ # # License: BSD 3 clause -import matplotlib.pyplot as plt -from joblib import Memory -import numpy as np import gc import time -from sklearn.linear_model import LogisticRegression, SGDClassifier +import matplotlib.pyplot as plt +import numpy as np +from joblib import Memory + from sklearn.datasets import fetch_rcv1 +from sklearn.linear_model import LogisticRegression, SGDClassifier from sklearn.linear_model._sag import get_auto_step_size try: diff --git a/benchmarks/bench_saga.py b/benchmarks/bench_saga.py index 340549ef240e1..dc2ed093f11d0 100644 --- a/benchmarks/bench_saga.py +++ b/benchmarks/bench_saga.py @@ -4,24 +4,24 @@ in using multinomial logistic regression in term of learning time. 
""" import json -import time import os +import time -from sklearn.utils.parallel import delayed, Parallel import matplotlib.pyplot as plt import numpy as np from sklearn.datasets import ( + fetch_20newsgroups_vectorized, fetch_rcv1, - load_iris, load_digits, - fetch_20newsgroups_vectorized, + load_iris, ) from sklearn.linear_model import LogisticRegression from sklearn.metrics import log_loss from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelBinarizer, LabelEncoder from sklearn.utils.extmath import safe_sparse_dot, softmax +from sklearn.utils.parallel import Parallel, delayed def fit_single( diff --git a/benchmarks/bench_sample_without_replacement.py b/benchmarks/bench_sample_without_replacement.py index 10baad5a8495f..743292ca5fa61 100644 --- a/benchmarks/bench_sample_without_replacement.py +++ b/benchmarks/bench_sample_without_replacement.py @@ -3,14 +3,14 @@ """ import gc -import sys +import operator import optparse +import random +import sys from datetime import datetime -import operator import matplotlib.pyplot as plt import numpy as np -import random from sklearn.utils.random import sample_without_replacement diff --git a/benchmarks/bench_sgd_regression.py b/benchmarks/bench_sgd_regression.py index 47dd9e9fc758b..4b1b902795feb 100644 --- a/benchmarks/bench_sgd_regression.py +++ b/benchmarks/bench_sgd_regression.py @@ -1,16 +1,15 @@ # Author: Peter Prettenhofer # License: BSD 3 clause -import numpy as np -import matplotlib.pyplot as plt - import gc - from time import time -from sklearn.linear_model import Ridge, SGDRegressor, ElasticNet -from sklearn.metrics import mean_squared_error +import matplotlib.pyplot as plt +import numpy as np + from sklearn.datasets import make_regression +from sklearn.linear_model import ElasticNet, Ridge, SGDRegressor +from sklearn.metrics import mean_squared_error """ Benchmark for SGD regression diff --git a/benchmarks/bench_sparsify.py b/benchmarks/bench_sparsify.py index f1aa482b8b732..1832ca40c6ddb 100644 --- a/benchmarks/bench_sparsify.py +++ b/benchmarks/bench_sparsify.py @@ -43,8 +43,9 @@ 60 300 381409 1271.4 97.1 clf.predict(X_test_sparse) """ -from scipy.sparse import csr_matrix import numpy as np +from scipy.sparse import csr_matrix + from sklearn.linear_model import SGDRegressor from sklearn.metrics import r2_score diff --git a/benchmarks/bench_text_vectorizers.py b/benchmarks/bench_text_vectorizers.py index 6d75d57658500..31d4141d1af97 100644 --- a/benchmarks/bench_text_vectorizers.py +++ b/benchmarks/bench_text_vectorizers.py @@ -8,8 +8,8 @@ * psutil (optional, but recommended) """ -import timeit import itertools +import timeit import numpy as np import pandas as pd @@ -18,8 +18,8 @@ from sklearn.datasets import fetch_20newsgroups from sklearn.feature_extraction.text import ( CountVectorizer, - TfidfVectorizer, HashingVectorizer, + TfidfVectorizer, ) n_repeat = 3 diff --git a/benchmarks/bench_tree.py b/benchmarks/bench_tree.py index c23ef627e237e..29cd7584432b7 100644 --- a/benchmarks/bench_tree.py +++ b/benchmarks/bench_tree.py @@ -13,11 +13,12 @@ training set, classify a sample and plot the time taken as a function of the number of dimensions. 
""" -import numpy as np -import matplotlib.pyplot as plt import gc from datetime import datetime +import matplotlib.pyplot as plt +import numpy as np + # to store the results scikit_classifier_results = [] scikit_regressor_results = [] diff --git a/benchmarks/bench_tsne_mnist.py b/benchmarks/bench_tsne_mnist.py index e399e891cb94e..39462b33d9655 100644 --- a/benchmarks/bench_tsne_mnist.py +++ b/benchmarks/bench_tsne_mnist.py @@ -7,18 +7,19 @@ # License: BSD 3 clause +import argparse +import json import os import os.path as op from time import time + import numpy as np -import json -import argparse from joblib import Memory from sklearn.datasets import fetch_openml +from sklearn.decomposition import PCA from sklearn.manifold import TSNE from sklearn.neighbors import NearestNeighbors -from sklearn.decomposition import PCA from sklearn.utils import check_array from sklearn.utils import shuffle as _shuffle from sklearn.utils._openmp_helpers import _openmp_effective_n_threads diff --git a/benchmarks/plot_tsne_mnist.py b/benchmarks/plot_tsne_mnist.py index d32e3dd769d6a..fff71eed0a26c 100644 --- a/benchmarks/plot_tsne_mnist.py +++ b/benchmarks/plot_tsne_mnist.py @@ -1,9 +1,8 @@ -import matplotlib.pyplot as plt -import numpy as np -import os.path as op - import argparse +import os.path as op +import matplotlib.pyplot as plt +import numpy as np LOG_DIR = "mnist_tsne_output" diff --git a/build_tools/azure/get_commit_message.py b/build_tools/azure/get_commit_message.py index 239da5b8c4498..0b1246b8d2724 100644 --- a/build_tools/azure/get_commit_message.py +++ b/build_tools/azure/get_commit_message.py @@ -1,6 +1,6 @@ +import argparse import os import subprocess -import argparse def get_commit_message(): diff --git a/build_tools/circle/list_versions.py b/build_tools/circle/list_versions.py index dfcc600957469..345e08b4bece4 100755 --- a/build_tools/circle/list_versions.py +++ b/build_tools/circle/list_versions.py @@ -4,9 +4,9 @@ import json import re import sys +from urllib.request import urlopen from sklearn.utils.fixes import parse_version -from urllib.request import urlopen def json_urlread(url): diff --git a/build_tools/generate_authors_table.py b/build_tools/generate_authors_table.py index d4da0db5be3c1..c930a9b2956d1 100644 --- a/build_tools/generate_authors_table.py +++ b/build_tools/generate_authors_table.py @@ -6,12 +6,13 @@ The table should be updated for each new inclusion in the teams. Generating the table requires admin rights. 
""" -import sys -import requests import getpass +import sys import time -from pathlib import Path from os import path +from pathlib import Path + +import requests print("user:", file=sys.stderr) user = input() diff --git a/build_tools/github/check_wheels.py b/build_tools/github/check_wheels.py index 99d319cba4dc5..3860d3e81adb7 100644 --- a/build_tools/github/check_wheels.py +++ b/build_tools/github/check_wheels.py @@ -1,8 +1,9 @@ """Checks that dist/* contains the number of wheels built from the .github/workflows/wheels.yml config.""" -import yaml -from pathlib import Path import sys +from pathlib import Path + +import yaml gh_wheel_path = Path.cwd() / ".github" / "workflows" / "wheels.yml" with gh_wheel_path.open("r") as f: diff --git a/build_tools/github/vendor.py b/build_tools/github/vendor.py index 2997688423b84..3bc1aceb3437c 100644 --- a/build_tools/github/vendor.py +++ b/build_tools/github/vendor.py @@ -7,7 +7,6 @@ import sys import textwrap - TARGET_FOLDER = op.join("sklearn", ".libs") DISTRIBUTOR_INIT = op.join("sklearn", "_distributor_init.py") VCOMP140_SRC_PATH = "C:\\Windows\\System32\\vcomp140.dll" diff --git a/build_tools/update_environments_and_lock_files.py b/build_tools/update_environments_and_lock_files.py index a4a5c92b077d2..960c01d4383b8 100644 --- a/build_tools/update_environments_and_lock_files.py +++ b/build_tools/update_environments_and_lock_files.py @@ -29,20 +29,18 @@ """ +import json +import logging import re +import shlex import subprocess import sys -from pathlib import Path -import shlex -import json -import logging from importlib.metadata import version - -from packaging.version import Version +from pathlib import Path import click - from jinja2 import Environment +from packaging.version import Version logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) diff --git a/doc/conf.py b/doc/conf.py index 6ac1b4e231822..73281f02899d4 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -10,14 +10,15 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys import os -import warnings import re +import sys +import warnings from datetime import datetime -from sklearn.externals._packaging.version import parse -from pathlib import Path from io import StringIO +from pathlib import Path + +from sklearn.externals._packaging.version import parse # If extensions (or modules to document with autodoc) are in another # directory, add these directories to sys.path here. If the directory @@ -25,10 +26,10 @@ # absolute, like shown here. 
sys.path.insert(0, os.path.abspath("sphinxext")) -from github_link import make_linkcode_resolve import sphinx_gallery +from github_link import make_linkcode_resolve +from sphinx_gallery.notebook import add_code_cell, add_markdown_cell from sphinx_gallery.sorting import ExampleTitleSortKey -from sphinx_gallery.notebook import add_markdown_cell, add_code_cell try: # Configure plotly to integrate its output into the HTML pages generated by diff --git a/doc/conftest.py b/doc/conftest.py index 73848ccf392fb..ca94ad1ed2b60 100644 --- a/doc/conftest.py +++ b/doc/conftest.py @@ -1,16 +1,14 @@ import os -from os.path import exists -from os.path import join -from os import environ import warnings +from os import environ +from os.path import exists, join -from sklearn.utils import IS_PYPY -from sklearn.utils._testing import SkipTest -from sklearn.utils._testing import check_skip_network -from sklearn.utils.fixes import parse_version from sklearn.datasets import get_data_home from sklearn.datasets._base import _pkl_filepath from sklearn.datasets._twenty_newsgroups import CACHE_NAME +from sklearn.utils import IS_PYPY +from sklearn.utils._testing import SkipTest, check_skip_network +from sklearn.utils.fixes import parse_version def setup_labeled_faces(): diff --git a/doc/sphinxext/allow_nan_estimators.py b/doc/sphinxext/allow_nan_estimators.py index e8f94506daaa5..89d7077bce2b5 100755 --- a/doc/sphinxext/allow_nan_estimators.py +++ b/doc/sphinxext/allow_nan_estimators.py @@ -1,11 +1,12 @@ -from sklearn.utils import all_estimators -from sklearn.utils.estimator_checks import _construct_instance -from sklearn.utils._testing import SkipTest -from docutils import nodes from contextlib import suppress +from docutils import nodes from docutils.parsers.rst import Directive +from sklearn.utils import all_estimators +from sklearn.utils._testing import SkipTest +from sklearn.utils.estimator_checks import _construct_instance + class AllowNanEstimators(Directive): @staticmethod diff --git a/doc/sphinxext/doi_role.py b/doc/sphinxext/doi_role.py index 7d188969bb778..32e905fe650ea 100644 --- a/doc/sphinxext/doi_role.py +++ b/doc/sphinxext/doi_role.py @@ -15,7 +15,6 @@ """ from docutils import nodes, utils - from sphinx.util.nodes import split_explicit_title diff --git a/doc/sphinxext/github_link.py b/doc/sphinxext/github_link.py index 3992d814b825e..d3e43c8ed0f5e 100644 --- a/doc/sphinxext/github_link.py +++ b/doc/sphinxext/github_link.py @@ -1,9 +1,9 @@ -from operator import attrgetter import inspect -import subprocess import os +import subprocess import sys from functools import partial +from operator import attrgetter REVISION_CMD = "git rev-parse --short HEAD" diff --git a/examples/applications/plot_cyclical_feature_engineering.py b/examples/applications/plot_cyclical_feature_engineering.py index ecd270354ab76..12e285096726d 100644 --- a/examples/applications/plot_cyclical_feature_engineering.py +++ b/examples/applications/plot_cyclical_feature_engineering.py @@ -35,7 +35,6 @@ # demand around the middle of the days: import matplotlib.pyplot as plt - fig, ax = plt.subplots(figsize=(12, 4)) average_week_demand = df.groupby(["weekday", "hour"])["count"].mean() average_week_demand.plot(ax=ax) @@ -181,12 +180,11 @@ # # The numerical variables need no preprocessing and, for the sake of simplicity, # we only try the default hyper-parameters for this model: -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import OrdinalEncoder from sklearn.compose import ColumnTransformer from sklearn.ensemble import 
HistGradientBoostingRegressor from sklearn.model_selection import cross_validate - +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import OrdinalEncoder categorical_columns = [ "weather", @@ -262,11 +260,10 @@ def evaluate(model, X, y, cv): # For consistency, we scale the numerical features to the same 0-1 range using # class:`sklearn.preprocessing.MinMaxScaler`, although in this case it does not # impact the results much because they are already on comparable scales: -from sklearn.preprocessing import OneHotEncoder -from sklearn.preprocessing import MinMaxScaler -from sklearn.linear_model import RidgeCV import numpy as np +from sklearn.linear_model import RidgeCV +from sklearn.preprocessing import MinMaxScaler, OneHotEncoder one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False) alphas = np.logspace(-6, 6, 25) @@ -619,9 +616,8 @@ def periodic_spline_transformer(period, n_splines=None, degree=3): # However, it is possible to use the `PolynomialFeatures` class on coarse # grained spline encoded hours to model the "workingday"/"hours" interaction # explicitly without introducing too many new variables: -from sklearn.preprocessing import PolynomialFeatures from sklearn.pipeline import FeatureUnion - +from sklearn.preprocessing import PolynomialFeatures hour_workday_interaction = make_pipeline( ColumnTransformer( @@ -668,7 +664,6 @@ def periodic_spline_transformer(period, n_splines=None, degree=3): # polynomial kernel expansion. Let us try the latter: from sklearn.kernel_approximation import Nystroem - cyclic_spline_poly_pipeline = make_pipeline( cyclic_spline_transformer, Nystroem(kernel="poly", degree=2, n_components=300, random_state=0), diff --git a/examples/applications/plot_digits_denoising.py b/examples/applications/plot_digits_denoising.py index 72637b6ab036f..bd8d5b1b7b037 100644 --- a/examples/applications/plot_digits_denoising.py +++ b/examples/applications/plot_digits_denoising.py @@ -32,9 +32,10 @@ # :func:`~sklearn.datasets.fetch_openml` to get this dataset. In addition, we # normalize the dataset such that all pixel values are in the range (0, 1). 
import numpy as np + from sklearn.datasets import fetch_openml -from sklearn.preprocessing import MinMaxScaler from sklearn.model_selection import train_test_split +from sklearn.preprocessing import MinMaxScaler X, y = fetch_openml(data_id=41082, as_frame=False, return_X_y=True, parser="pandas") X = MinMaxScaler().fit_transform(X) diff --git a/examples/applications/plot_face_recognition.py b/examples/applications/plot_face_recognition.py index 878d889f52ce3..1ff4399d60739 100644 --- a/examples/applications/plot_face_recognition.py +++ b/examples/applications/plot_face_recognition.py @@ -13,18 +13,16 @@ """ # %% from time import time + import matplotlib.pyplot as plt +from scipy.stats import loguniform -from sklearn.model_selection import train_test_split -from sklearn.model_selection import RandomizedSearchCV from sklearn.datasets import fetch_lfw_people -from sklearn.metrics import classification_report -from sklearn.metrics import ConfusionMatrixDisplay -from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA +from sklearn.metrics import ConfusionMatrixDisplay, classification_report +from sklearn.model_selection import RandomizedSearchCV, train_test_split +from sklearn.preprocessing import StandardScaler from sklearn.svm import SVC -from scipy.stats import loguniform - # %% # Download the data, if not already on disk and load it as numpy arrays diff --git a/examples/applications/plot_model_complexity_influence.py b/examples/applications/plot_model_complexity_influence.py index 812539aa1ff46..f83be241230c3 100644 --- a/examples/applications/plot_model_complexity_influence.py +++ b/examples/applications/plot_model_complexity_influence.py @@ -42,16 +42,16 @@ # License: BSD 3 clause import time -import numpy as np + import matplotlib.pyplot as plt +import numpy as np from sklearn import datasets -from sklearn.model_selection import train_test_split -from sklearn.metrics import mean_squared_error -from sklearn.svm import NuSVR from sklearn.ensemble import GradientBoostingRegressor from sklearn.linear_model import SGDClassifier -from sklearn.metrics import hamming_loss +from sklearn.metrics import hamming_loss, mean_squared_error +from sklearn.model_selection import train_test_split +from sklearn.svm import NuSVR # Initialize random generator np.random.seed(0) diff --git a/examples/applications/plot_out_of_core_classification.py b/examples/applications/plot_out_of_core_classification.py index 212cbda9cc71e..08ae3000c391c 100644 --- a/examples/applications/plot_out_of_core_classification.py +++ b/examples/applications/plot_out_of_core_classification.py @@ -19,24 +19,22 @@ # License: BSD 3 clause import itertools -from pathlib import Path -from hashlib import sha256 import re +import sys import tarfile import time -import sys +from hashlib import sha256 +from html.parser import HTMLParser +from pathlib import Path +from urllib.request import urlretrieve -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib import rcParams -from html.parser import HTMLParser -from urllib.request import urlretrieve from sklearn.datasets import get_data_home from sklearn.feature_extraction.text import HashingVectorizer -from sklearn.linear_model import SGDClassifier -from sklearn.linear_model import PassiveAggressiveClassifier -from sklearn.linear_model import Perceptron +from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron, SGDClassifier from sklearn.naive_bayes import MultinomialNB diff --git 
a/examples/applications/plot_outlier_detection_wine.py b/examples/applications/plot_outlier_detection_wine.py index 45e4c64d9fcc4..c4adfa222a5dd 100644 --- a/examples/applications/plot_outlier_detection_wine.py +++ b/examples/applications/plot_outlier_detection_wine.py @@ -37,12 +37,13 @@ # Author: Virgile Fritsch # License: BSD 3 clause +import matplotlib.font_manager +import matplotlib.pyplot as plt import numpy as np + from sklearn.covariance import EllipticEnvelope -from sklearn.svm import OneClassSVM -import matplotlib.pyplot as plt -import matplotlib.font_manager from sklearn.datasets import load_wine +from sklearn.svm import OneClassSVM # Define "classifiers" to be used classifiers = { diff --git a/examples/applications/plot_prediction_latency.py b/examples/applications/plot_prediction_latency.py index 9b99bcbfdfaf1..8fce81fb9fb4e 100644 --- a/examples/applications/plot_prediction_latency.py +++ b/examples/applications/plot_prediction_latency.py @@ -16,19 +16,18 @@ # Authors: Eustache Diemert # License: BSD 3 clause +import gc +import time from collections import defaultdict -import time -import gc -import numpy as np import matplotlib.pyplot as plt +import numpy as np -from sklearn.preprocessing import StandardScaler -from sklearn.model_selection import train_test_split from sklearn.datasets import make_regression from sklearn.ensemble import RandomForestRegressor -from sklearn.linear_model import Ridge -from sklearn.linear_model import SGDRegressor +from sklearn.linear_model import Ridge, SGDRegressor +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler from sklearn.svm import SVR from sklearn.utils import shuffle diff --git a/examples/applications/plot_species_distribution_modeling.py b/examples/applications/plot_species_distribution_modeling.py index e3d5778f3307d..bdf50918840c2 100644 --- a/examples/applications/plot_species_distribution_modeling.py +++ b/examples/applications/plot_species_distribution_modeling.py @@ -43,12 +43,12 @@ from time import time -import numpy as np import matplotlib.pyplot as plt +import numpy as np -from sklearn.utils import Bunch +from sklearn import metrics, svm from sklearn.datasets import fetch_species_distributions -from sklearn import svm, metrics +from sklearn.utils import Bunch # if basemap is available, we'll use it. # otherwise, we'll improvise later... diff --git a/examples/applications/plot_stock_market.py b/examples/applications/plot_stock_market.py index 91168f434338e..cdf5a36074923 100644 --- a/examples/applications/plot_stock_market.py +++ b/examples/applications/plot_stock_market.py @@ -24,6 +24,7 @@ # `alphavantage.co `_. 
import sys + import numpy as np import pandas as pd diff --git a/examples/applications/plot_tomography_l1_reconstruction.py b/examples/applications/plot_tomography_l1_reconstruction.py index 9ac351c12206c..d851613402571 100644 --- a/examples/applications/plot_tomography_l1_reconstruction.py +++ b/examples/applications/plot_tomography_l1_reconstruction.py @@ -39,12 +39,11 @@ class :class:`~sklearn.linear_model.Lasso`, that uses the coordinate descent # Author: Emmanuelle Gouillart # License: BSD 3 clause -import numpy as np -from scipy import sparse -from scipy import ndimage -from sklearn.linear_model import Lasso -from sklearn.linear_model import Ridge import matplotlib.pyplot as plt +import numpy as np +from scipy import ndimage, sparse + +from sklearn.linear_model import Lasso, Ridge def _weights(x, dx=1, orig=0): diff --git a/examples/applications/plot_topics_extraction_with_nmf_lda.py b/examples/applications/plot_topics_extraction_with_nmf_lda.py index 38945241ab68b..0385fd7c89333 100644 --- a/examples/applications/plot_topics_extraction_with_nmf_lda.py +++ b/examples/applications/plot_topics_extraction_with_nmf_lda.py @@ -27,11 +27,12 @@ # License: BSD 3 clause from time import time + import matplotlib.pyplot as plt -from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer -from sklearn.decomposition import NMF, MiniBatchNMF, LatentDirichletAllocation from sklearn.datasets import fetch_20newsgroups +from sklearn.decomposition import NMF, LatentDirichletAllocation, MiniBatchNMF +from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer n_samples = 2000 n_features = 1000 diff --git a/examples/applications/svm_gui.py b/examples/applications/svm_gui.py index c8019fa72ae91..cd0e3b6101bb6 100644 --- a/examples/applications/svm_gui.py +++ b/examples/applications/svm_gui.py @@ -30,13 +30,13 @@ from matplotlib.backends.backend_tkagg import ( NavigationToolbar2TkAgg as NavigationToolbar2Tk, ) -from matplotlib.figure import Figure -from matplotlib.contour import ContourSet - import sys -import numpy as np import tkinter as Tk +import numpy as np +from matplotlib.contour import ContourSet +from matplotlib.figure import Figure + from sklearn import svm from sklearn.datasets import dump_svmlight_file diff --git a/examples/applications/wikipedia_principal_eigenvector.py b/examples/applications/wikipedia_principal_eigenvector.py index fcc337b0a4e00..0be1661d7ed5c 100644 --- a/examples/applications/wikipedia_principal_eigenvector.py +++ b/examples/applications/wikipedia_principal_eigenvector.py @@ -33,19 +33,17 @@ # Author: Olivier Grisel # License: BSD 3 clause -from bz2 import BZ2File import os +from bz2 import BZ2File from datetime import datetime from pprint import pprint from time import time +from urllib.request import urlopen import numpy as np - from scipy import sparse from sklearn.decomposition import randomized_svd -from urllib.request import urlopen - # %% # Download data, if not already on disk diff --git a/examples/bicluster/plot_bicluster_newsgroups.py b/examples/bicluster/plot_bicluster_newsgroups.py index a54f7099c9a74..0fef820bb9f2a 100644 --- a/examples/bicluster/plot_bicluster_newsgroups.py +++ b/examples/bicluster/plot_bicluster_newsgroups.py @@ -23,14 +23,13 @@ """ -from collections import defaultdict import operator +from collections import defaultdict from time import time import numpy as np -from sklearn.cluster import SpectralCoclustering -from sklearn.cluster import MiniBatchKMeans +from sklearn.cluster import MiniBatchKMeans, 
SpectralCoclustering from sklearn.datasets import fetch_20newsgroups from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.cluster import v_measure_score diff --git a/examples/bicluster/plot_spectral_biclustering.py b/examples/bicluster/plot_spectral_biclustering.py index baf0f0ccbc58f..18ff7ae67b83a 100644 --- a/examples/bicluster/plot_spectral_biclustering.py +++ b/examples/bicluster/plot_spectral_biclustering.py @@ -32,9 +32,10 @@ # # As you can see, the data is distributed over 12 cluster cells and is # relatively well distinguishable. -from sklearn.datasets import make_checkerboard from matplotlib import pyplot as plt +from sklearn.datasets import make_checkerboard + n_clusters = (4, 3) data, rows, columns = make_checkerboard( shape=(300, 300), n_clusters=n_clusters, noise=10, shuffle=False, random_state=42 diff --git a/examples/bicluster/plot_spectral_coclustering.py b/examples/bicluster/plot_spectral_coclustering.py index 0df275e83e3bd..92b10d93956e7 100644 --- a/examples/bicluster/plot_spectral_coclustering.py +++ b/examples/bicluster/plot_spectral_coclustering.py @@ -21,8 +21,8 @@ import numpy as np from matplotlib import pyplot as plt -from sklearn.datasets import make_biclusters from sklearn.cluster import SpectralCoclustering +from sklearn.datasets import make_biclusters from sklearn.metrics import consensus_score data, rows, columns = make_biclusters( diff --git a/examples/calibration/plot_calibration.py b/examples/calibration/plot_calibration.py index 75d1ea15b8fbd..f928ae631b78b 100644 --- a/examples/calibration/plot_calibration.py +++ b/examples/calibration/plot_calibration.py @@ -91,8 +91,8 @@ # %% # Plot data and the predicted probabilities # ----------------------------------------- -from matplotlib import cm import matplotlib.pyplot as plt +from matplotlib import cm plt.figure() y_unique = np.unique(y) diff --git a/examples/calibration/plot_calibration_curve.py b/examples/calibration/plot_calibration_curve.py index dc4e85a5f1678..915d3b7c20cc9 100644 --- a/examples/calibration/plot_calibration_curve.py +++ b/examples/calibration/plot_calibration_curve.py @@ -140,11 +140,11 @@ import pandas as pd from sklearn.metrics import ( - precision_score, - recall_score, - f1_score, brier_score_loss, + f1_score, log_loss, + precision_score, + recall_score, roc_auc_score, ) diff --git a/examples/calibration/plot_calibration_multiclass.py b/examples/calibration/plot_calibration_multiclass.py index 24962a786ea03..fc6349f3dea5f 100644 --- a/examples/calibration/plot_calibration_multiclass.py +++ b/examples/calibration/plot_calibration_multiclass.py @@ -31,6 +31,7 @@ class of an instance (red: class 1, green: class 2, blue: class 3). # License: BSD Style. 
 import numpy as np
+
 from sklearn.datasets import make_blobs

 np.random.seed(0)
diff --git a/examples/classification/plot_classification_probability.py b/examples/classification/plot_classification_probability.py
index 87c3f51db5eb2..ec5887b63914d 100644
--- a/examples/classification/plot_classification_probability.py
+++ b/examples/classification/plot_classification_probability.py
@@ -23,12 +23,12 @@
 import matplotlib.pyplot as plt
 import numpy as np

-from sklearn.metrics import accuracy_score
-from sklearn.linear_model import LogisticRegression
-from sklearn.svm import SVC
+from sklearn import datasets
 from sklearn.gaussian_process import GaussianProcessClassifier
 from sklearn.gaussian_process.kernels import RBF
-from sklearn import datasets
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score
+from sklearn.svm import SVC

 iris = datasets.load_iris()
 X = iris.data[:, 0:2] # we only take the first two features for visualization
diff --git a/examples/classification/plot_classifier_comparison.py b/examples/classification/plot_classifier_comparison.py
index 71e8318aa0acb..75164cff8b492 100644
--- a/examples/classification/plot_classifier_comparison.py
+++ b/examples/classification/plot_classifier_comparison.py
@@ -24,23 +24,24 @@
 # Modified for documentation by Jaques Grobler
 # License: BSD 3 clause

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
 from matplotlib.colors import ListedColormap
+
+from sklearn.datasets import make_circles, make_classification, make_moons
+from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
+from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
+from sklearn.gaussian_process import GaussianProcessClassifier
+from sklearn.gaussian_process.kernels import RBF
+from sklearn.inspection import DecisionBoundaryDisplay
 from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler
-from sklearn.pipeline import make_pipeline
-from sklearn.datasets import make_moons, make_circles, make_classification
-from sklearn.neural_network import MLPClassifier
+from sklearn.naive_bayes import GaussianNB
 from sklearn.neighbors import KNeighborsClassifier
+from sklearn.neural_network import MLPClassifier
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC
-from sklearn.gaussian_process import GaussianProcessClassifier
-from sklearn.gaussian_process.kernels import RBF
 from sklearn.tree import DecisionTreeClassifier
-from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
-from sklearn.naive_bayes import GaussianNB
-from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
-from sklearn.inspection import DecisionBoundaryDisplay

 names = [
     "Nearest Neighbors",
diff --git a/examples/classification/plot_digits_classification.py b/examples/classification/plot_digits_classification.py
index f760916d1f66e..d6208400d5416 100644
--- a/examples/classification/plot_digits_classification.py
+++ b/examples/classification/plot_digits_classification.py
@@ -15,7 +15,7 @@
 import matplotlib.pyplot as plt

 # Import datasets, classifiers and performance metrics
-from sklearn import datasets, svm, metrics
+from sklearn import datasets, metrics, svm
 from sklearn.model_selection import train_test_split

 ###############################################################################
diff --git a/examples/classification/plot_lda.py b/examples/classification/plot_lda.py
index 322cc8bb4007c..b24479b91f5ea 100644
--- a/examples/classification/plot_lda.py
+++ b/examples/classification/plot_lda.py
@@ -8,13 +8,12 @@

 """

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np

+from sklearn.covariance import OAS
 from sklearn.datasets import make_blobs
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
-from sklearn.covariance import OAS
-
 n_train = 20 # samples for training
 n_test = 200 # samples for testing
diff --git a/examples/classification/plot_lda_qda.py b/examples/classification/plot_lda_qda.py
index 712354f7f7f44..71230d0a9bcd9 100644
--- a/examples/classification/plot_lda_qda.py
+++ b/examples/classification/plot_lda_qda.py
@@ -15,8 +15,8 @@ class has its own standard deviation with QDA.
 # Colormap
 # --------

-import matplotlib.pyplot as plt
 import matplotlib as mpl
+import matplotlib.pyplot as plt
 from matplotlib import colors

 cmap = colors.LinearSegmentedColormap(
@@ -172,8 +172,10 @@ def plot_qda_cov(qda, splot):
         fontsize=15,
     )

-from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
-from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
+from sklearn.discriminant_analysis import (
+    LinearDiscriminantAnalysis,
+    QuadraticDiscriminantAnalysis,
+)

 for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
     # Linear Discriminant Analysis
diff --git a/examples/cluster/plot_affinity_propagation.py b/examples/cluster/plot_affinity_propagation.py
index d2bc345c00b3e..5816ae298f419 100644
--- a/examples/cluster/plot_affinity_propagation.py
+++ b/examples/cluster/plot_affinity_propagation.py
@@ -10,8 +10,8 @@
 """

 import numpy as np

-from sklearn.cluster import AffinityPropagation
 from sklearn import metrics
+from sklearn.cluster import AffinityPropagation
 from sklearn.datasets import make_blobs

 # %%
diff --git a/examples/cluster/plot_agglomerative_clustering.py b/examples/cluster/plot_agglomerative_clustering.py
index 5bb87a9386bf8..d5e7a8168a648 100644
--- a/examples/cluster/plot_agglomerative_clustering.py
+++ b/examples/cluster/plot_agglomerative_clustering.py
@@ -28,6 +28,7 @@
 # License: BSD 3 clause

 import time
+
 import matplotlib.pyplot as plt
 import numpy as np
diff --git a/examples/cluster/plot_agglomerative_clustering_metrics.py b/examples/cluster/plot_agglomerative_clustering_metrics.py
index f1a77d442dbe8..8eb2ea3f7285f 100644
--- a/examples/cluster/plot_agglomerative_clustering_metrics.py
+++ b/examples/cluster/plot_agglomerative_clustering_metrics.py
@@ -37,8 +37,8 @@
 # Author: Gael Varoquaux
 # License: BSD 3-Clause or CC-0

-import matplotlib.pyplot as plt
 import matplotlib.patheffects as PathEffects
+import matplotlib.pyplot as plt
 import numpy as np

 from sklearn.cluster import AgglomerativeClustering
diff --git a/examples/cluster/plot_agglomerative_dendrogram.py b/examples/cluster/plot_agglomerative_dendrogram.py
index 2de5030d68f6d..20c22f4f0bb39 100644
--- a/examples/cluster/plot_agglomerative_dendrogram.py
+++ b/examples/cluster/plot_agglomerative_dendrogram.py
@@ -10,11 +10,11 @@
 """
 import numpy as np
-
 from matplotlib import pyplot as plt
 from scipy.cluster.hierarchy import dendrogram
-from sklearn.datasets import load_iris
+
 from sklearn.cluster import AgglomerativeClustering
+from sklearn.datasets import load_iris


 def plot_dendrogram(model, **kwargs):
diff --git a/examples/cluster/plot_birch_vs_minibatchkmeans.py b/examples/cluster/plot_birch_vs_minibatchkmeans.py
index 3d4185dc9368a..c9c213c948913 100644
--- a/examples/cluster/plot_birch_vs_minibatchkmeans.py
+++ b/examples/cluster/plot_birch_vs_minibatchkmeans.py
@@ -25,17 +25,17 @@
 # Alexandre Gramfort
 # License: BSD 3 clause

-from joblib import cpu_count
 from itertools import cycle
 from time import time
-import numpy as np
-import matplotlib.pyplot as plt
+
 import matplotlib.colors as colors
+import matplotlib.pyplot as plt
+import numpy as np
+from joblib import cpu_count

 from sklearn.cluster import Birch, MiniBatchKMeans
 from sklearn.datasets import make_blobs
-
 # Generate centers for the blobs so that it forms a 10 X 10 grid.
 xx = np.linspace(-22, 22, 10)
 yy = np.linspace(-22, 22, 10)
diff --git a/examples/cluster/plot_bisect_kmeans.py b/examples/cluster/plot_bisect_kmeans.py
index a6be3545e0b27..3aebdffddaf63 100644
--- a/examples/cluster/plot_bisect_kmeans.py
+++ b/examples/cluster/plot_bisect_kmeans.py
@@ -15,9 +15,8 @@
 """
 import matplotlib.pyplot as plt

-from sklearn.datasets import make_blobs
 from sklearn.cluster import BisectingKMeans, KMeans
-
+from sklearn.datasets import make_blobs

 print(__doc__)
diff --git a/examples/cluster/plot_cluster_comparison.py b/examples/cluster/plot_cluster_comparison.py
index 843c629374828..27b4a1c46c415 100644
--- a/examples/cluster/plot_cluster_comparison.py
+++ b/examples/cluster/plot_cluster_comparison.py
@@ -26,14 +26,14 @@
 import time
 import warnings
+from itertools import cycle, islice

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np

 from sklearn import cluster, datasets, mixture
 from sklearn.neighbors import kneighbors_graph
 from sklearn.preprocessing import StandardScaler
-from itertools import cycle, islice

 np.random.seed(0)
diff --git a/examples/cluster/plot_cluster_iris.py b/examples/cluster/plot_cluster_iris.py
index 4078d139f8064..b20bc8e38dd78 100644
--- a/examples/cluster/plot_cluster_iris.py
+++ b/examples/cluster/plot_cluster_iris.py
@@ -22,15 +22,15 @@
 # Modified for documentation by Jaques Grobler
 # License: BSD 3 clause

-import numpy as np
 import matplotlib.pyplot as plt

 # Though the following import is not directly being used, it is required
 # for 3D projection to work with matplotlib < 3.2
 import mpl_toolkits.mplot3d # noqa: F401
+import numpy as np

-from sklearn.cluster import KMeans
 from sklearn import datasets
+from sklearn.cluster import KMeans

 np.random.seed(5)
diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py
index bec68d1221646..c965dc2bd7ace 100644
--- a/examples/cluster/plot_coin_segmentation.py
+++ b/examples/cluster/plot_coin_segmentation.py
@@ -27,15 +27,14 @@
 import time

+import matplotlib.pyplot as plt
 import numpy as np
 from scipy.ndimage import gaussian_filter
-import matplotlib.pyplot as plt
 from skimage.data import coins
 from skimage.transform import rescale

-from sklearn.feature_extraction import image
 from sklearn.cluster import spectral_clustering
-
+from sklearn.feature_extraction import image

 # load the coins as a numpy array
 orig_coins = coins()
diff --git a/examples/cluster/plot_color_quantization.py b/examples/cluster/plot_color_quantization.py
index ae37673808e56..cc8849b64ab6f 100644
--- a/examples/cluster/plot_color_quantization.py
+++ b/examples/cluster/plot_color_quantization.py
@@ -25,13 +25,15 @@
 #
 # License: BSD 3 clause

-import numpy as np
+from time import time
+
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn.cluster import KMeans
-from sklearn.metrics import pairwise_distances_argmin
 from sklearn.datasets import load_sample_image
+from sklearn.metrics import pairwise_distances_argmin
 from sklearn.utils import shuffle
-from time import time

 n_colors = 64
diff --git a/examples/cluster/plot_dbscan.py b/examples/cluster/plot_dbscan.py
index c762e0bceae08..0b0bd64ecf62b 100644
--- a/examples/cluster/plot_dbscan.py
+++ b/examples/cluster/plot_dbscan.py
@@ -44,8 +44,9 @@
 # the `labels_` attribute. Noisy samples are given the label math:`-1`.
 import numpy as np
-from sklearn.cluster import DBSCAN
+
 from sklearn import metrics
+from sklearn.cluster import DBSCAN

 db = DBSCAN(eps=0.3, min_samples=10).fit(X)
 labels = db.labels_
diff --git a/examples/cluster/plot_digits_agglomeration.py b/examples/cluster/plot_digits_agglomeration.py
index 627a9a28d7665..faedefb8aeed8 100644
--- a/examples/cluster/plot_digits_agglomeration.py
+++ b/examples/cluster/plot_digits_agglomeration.py
@@ -12,10 +12,10 @@
 # Modified for documentation by Jaques Grobler
 # License: BSD 3 clause

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np

-from sklearn import datasets, cluster
+from sklearn import cluster, datasets
 from sklearn.feature_extraction.image import grid_to_graph

 digits = datasets.load_digits()
diff --git a/examples/cluster/plot_digits_linkage.py b/examples/cluster/plot_digits_linkage.py
index 730f85c543356..ae67bd5d8e0f4 100644
--- a/examples/cluster/plot_digits_linkage.py
+++ b/examples/cluster/plot_digits_linkage.py
@@ -35,7 +35,7 @@
 import numpy as np
 from matplotlib import pyplot as plt

-from sklearn import manifold, datasets
+from sklearn import datasets, manifold

 digits = datasets.load_digits()
 X, y = digits.data, digits.target
diff --git a/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py b/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
index e2273326b9a12..577d65f314337 100644
--- a/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
+++ b/examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
@@ -21,18 +21,17 @@
 import shutil
 import tempfile

-import numpy as np
 import matplotlib.pyplot as plt
-from scipy import linalg, ndimage
+import numpy as np
 from joblib import Memory
+from scipy import linalg, ndimage

-from sklearn.feature_extraction.image import grid_to_graph
 from sklearn import feature_selection
 from sklearn.cluster import FeatureAgglomeration
+from sklearn.feature_extraction.image import grid_to_graph
 from sklearn.linear_model import BayesianRidge
+from sklearn.model_selection import GridSearchCV, KFold
 from sklearn.pipeline import Pipeline
-from sklearn.model_selection import GridSearchCV
-from sklearn.model_selection import KFold

 # %%
 # Set parameters
diff --git a/examples/cluster/plot_hdbscan.py b/examples/cluster/plot_hdbscan.py
index b97858ff156e8..a4dfcb6c42bbc 100644
--- a/examples/cluster/plot_hdbscan.py
+++ b/examples/cluster/plot_hdbscan.py
@@ -13,11 +13,11 @@ We first define a couple utility functions for convenience.
""" # %% +import matplotlib.pyplot as plt import numpy as np -from sklearn.cluster import HDBSCAN, DBSCAN +from sklearn.cluster import DBSCAN, HDBSCAN from sklearn.datasets import make_blobs -import matplotlib.pyplot as plt def plot(X, labels, probabilities=None, parameters=None, ground_truth=False, ax=None): diff --git a/examples/cluster/plot_inductive_clustering.py b/examples/cluster/plot_inductive_clustering.py index e395571a1caad..b6464459160e3 100644 --- a/examples/cluster/plot_inductive_clustering.py +++ b/examples/cluster/plot_inductive_clustering.py @@ -24,6 +24,7 @@ # Christos Aridas import matplotlib.pyplot as plt + from sklearn.base import BaseEstimator, clone from sklearn.cluster import AgglomerativeClustering from sklearn.datasets import make_blobs @@ -32,7 +33,6 @@ from sklearn.utils.metaestimators import available_if from sklearn.utils.validation import check_is_fitted - N_SAMPLES = 5000 RANDOM_STATE = 42 diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py index bc1f01cb1cdd7..46a7ec6fa58b5 100644 --- a/examples/cluster/plot_kmeans_assumptions.py +++ b/examples/cluster/plot_kmeans_assumptions.py @@ -21,6 +21,7 @@ # one has to define a linear `transformation`. import numpy as np + from sklearn.datasets import make_blobs n_samples = 1500 diff --git a/examples/cluster/plot_kmeans_digits.py b/examples/cluster/plot_kmeans_digits.py index 94bba2a5c52d9..d61ec91d13d52 100644 --- a/examples/cluster/plot_kmeans_digits.py +++ b/examples/cluster/plot_kmeans_digits.py @@ -34,6 +34,7 @@ # to group images such that the handwritten digits on the image are the same. import numpy as np + from sklearn.datasets import load_digits data, labels = load_digits(return_X_y=True) @@ -53,6 +54,7 @@ # * train and time the pipeline fitting; # * measure the performance of the clustering obtained via different metrics. from time import time + from sklearn import metrics from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler diff --git a/examples/cluster/plot_kmeans_plusplus.py b/examples/cluster/plot_kmeans_plusplus.py index 1f3507c0062ac..69ea738635ddf 100644 --- a/examples/cluster/plot_kmeans_plusplus.py +++ b/examples/cluster/plot_kmeans_plusplus.py @@ -10,9 +10,10 @@ """ +import matplotlib.pyplot as plt + from sklearn.cluster import kmeans_plusplus from sklearn.datasets import make_blobs -import matplotlib.pyplot as plt # Generate sample data n_samples = 4000 diff --git a/examples/cluster/plot_kmeans_silhouette_analysis.py b/examples/cluster/plot_kmeans_silhouette_analysis.py index c7d0dc31d4873..c5817a750c2bb 100644 --- a/examples/cluster/plot_kmeans_silhouette_analysis.py +++ b/examples/cluster/plot_kmeans_silhouette_analysis.py @@ -31,14 +31,14 @@ """ -from sklearn.datasets import make_blobs -from sklearn.cluster import KMeans -from sklearn.metrics import silhouette_samples, silhouette_score - -import matplotlib.pyplot as plt import matplotlib.cm as cm +import matplotlib.pyplot as plt import numpy as np +from sklearn.cluster import KMeans +from sklearn.datasets import make_blobs +from sklearn.metrics import silhouette_samples, silhouette_score + # Generating the sample data from make_blobs # This particular setting has one distinct cluster and 3 clusters placed close # together. 
diff --git a/examples/cluster/plot_kmeans_stability_low_dim_dense.py b/examples/cluster/plot_kmeans_stability_low_dim_dense.py
index c88cf864506f7..9340239a3d00e 100644
--- a/examples/cluster/plot_kmeans_stability_low_dim_dense.py
+++ b/examples/cluster/plot_kmeans_stability_low_dim_dense.py
@@ -26,14 +26,12 @@
 # Author: Olivier Grisel
 # License: BSD 3 clause

-import numpy as np
-import matplotlib.pyplot as plt
 import matplotlib.cm as cm
+import matplotlib.pyplot as plt
+import numpy as np

-from sklearn.utils import shuffle
-from sklearn.utils import check_random_state
-from sklearn.cluster import MiniBatchKMeans
-from sklearn.cluster import KMeans
+from sklearn.cluster import KMeans, MiniBatchKMeans
+from sklearn.utils import check_random_state, shuffle

 random_state = np.random.RandomState(0)
diff --git a/examples/cluster/plot_linkage_comparison.py b/examples/cluster/plot_linkage_comparison.py
index af4c3cd2894af..dc009d0110f7c 100644
--- a/examples/cluster/plot_linkage_comparison.py
+++ b/examples/cluster/plot_linkage_comparison.py
@@ -25,13 +25,13 @@
 import time
 import warnings
+from itertools import cycle, islice

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np

 from sklearn import cluster, datasets
 from sklearn.preprocessing import StandardScaler
-from itertools import cycle, islice

 np.random.seed(0)
diff --git a/examples/cluster/plot_mean_shift.py b/examples/cluster/plot_mean_shift.py
index 46ded7bc43421..aacbc7f216405 100644
--- a/examples/cluster/plot_mean_shift.py
+++ b/examples/cluster/plot_mean_shift.py
@@ -12,6 +12,7 @@
 """

 import numpy as np
+
 from sklearn.cluster import MeanShift, estimate_bandwidth
 from sklearn.datasets import make_blobs
diff --git a/examples/cluster/plot_mini_batch_kmeans.py b/examples/cluster/plot_mini_batch_kmeans.py
index 7a9d599704059..3a6e8aa63786b 100644
--- a/examples/cluster/plot_mini_batch_kmeans.py
+++ b/examples/cluster/plot_mini_batch_kmeans.py
@@ -21,6 +21,7 @@
 # We start by generating the blobs of data to be clustered.

 import numpy as np
+
 from sklearn.datasets import make_blobs

 np.random.seed(0)
@@ -35,6 +36,7 @@
 # ------------------------------

 import time
+
 from sklearn.cluster import KMeans

 k_means = KMeans(init="k-means++", n_clusters=3, n_init=10)
diff --git a/examples/cluster/plot_optics.py b/examples/cluster/plot_optics.py
index 7915abd20ce53..5765f8089f3b2 100644
--- a/examples/cluster/plot_optics.py
+++ b/examples/cluster/plot_optics.py
@@ -20,11 +20,12 @@
 # Adrin Jalali
 # License: BSD 3 clause

-from sklearn.cluster import OPTICS, cluster_optics_dbscan
 import matplotlib.gridspec as gridspec
 import matplotlib.pyplot as plt
 import numpy as np

+from sklearn.cluster import OPTICS, cluster_optics_dbscan
+
 # Generate sample data

 np.random.seed(0)
diff --git a/examples/cluster/plot_segmentation_toy.py b/examples/cluster/plot_segmentation_toy.py
index 0880cdb893839..6fc41f7a5daf2 100644
--- a/examples/cluster/plot_segmentation_toy.py
+++ b/examples/cluster/plot_segmentation_toy.py
@@ -78,9 +78,10 @@
 # %%
 # Here we perform spectral clustering using the arpack solver since amg is
 # numerically unstable on this example. We then plot the results.
-from sklearn.cluster import spectral_clustering
 import matplotlib.pyplot as plt

+from sklearn.cluster import spectral_clustering
+
 labels = spectral_clustering(graph, n_clusters=4, eigen_solver="arpack")
 label_im = np.full(mask.shape, -1.0)
 label_im[mask] = labels
diff --git a/examples/cluster/plot_ward_structured_vs_unstructured.py b/examples/cluster/plot_ward_structured_vs_unstructured.py
index 430d00a8b3730..446d744b31e78 100644
--- a/examples/cluster/plot_ward_structured_vs_unstructured.py
+++ b/examples/cluster/plot_ward_structured_vs_unstructured.py
@@ -29,18 +29,14 @@
 # The following import is required
 # for 3D projection to work with matplotlib < 3.2
-
 import mpl_toolkits.mplot3d # noqa: F401
-
 import numpy as np
-
 # %%
 # Generate data
 # -------------
 #
 # We start by generating the Swiss Roll dataset.
-
 from sklearn.datasets import make_swiss_roll

 n_samples = 1500
diff --git a/examples/compose/plot_column_transformer.py b/examples/compose/plot_column_transformer.py
index d4798d828b321..669e817cbf81d 100644
--- a/examples/compose/plot_column_transformer.py
+++ b/examples/compose/plot_column_transformer.py
@@ -24,14 +24,14 @@

 import numpy as np

-from sklearn.preprocessing import FunctionTransformer
+from sklearn.compose import ColumnTransformer
 from sklearn.datasets import fetch_20newsgroups
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics import classification_report
 from sklearn.pipeline import Pipeline
-from sklearn.compose import ColumnTransformer
+from sklearn.preprocessing import FunctionTransformer
 from sklearn.svm import LinearSVC

 ##############################################################################
diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py
index 5ed3239db8478..d7efd033440ce 100644
--- a/examples/compose/plot_column_transformer_mixed_types.py
+++ b/examples/compose/plot_column_transformer_mixed_types.py
@@ -34,12 +34,12 @@

 from sklearn.compose import ColumnTransformer
 from sklearn.datasets import fetch_openml
-from sklearn.pipeline import Pipeline
+from sklearn.feature_selection import SelectPercentile, chi2
 from sklearn.impute import SimpleImputer
-from sklearn.preprocessing import StandardScaler, OneHotEncoder
 from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import train_test_split, RandomizedSearchCV
-from sklearn.feature_selection import SelectPercentile, chi2
+from sklearn.model_selection import RandomizedSearchCV, train_test_split
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder, StandardScaler

 np.random.seed(0)
diff --git a/examples/compose/plot_compare_reduction.py b/examples/compose/plot_compare_reduction.py
index 47975f84325b8..529366c6244f2 100644
--- a/examples/compose/plot_compare_reduction.py
+++ b/examples/compose/plot_compare_reduction.py
@@ -28,15 +28,16 @@
 # Illustration of ``Pipeline`` and ``GridSearchCV``
 ###############################################################################

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn.datasets import load_digits
+from sklearn.decomposition import NMF, PCA
+from sklearn.feature_selection import SelectKBest, mutual_info_classif
 from sklearn.model_selection import GridSearchCV
 from sklearn.pipeline import Pipeline
-from sklearn.svm import LinearSVC
-from sklearn.decomposition import PCA, NMF
-from sklearn.feature_selection import SelectKBest, mutual_info_classif
 from sklearn.preprocessing import MinMaxScaler
+from sklearn.svm import LinearSVC

 X, y = load_digits(return_X_y=True)
@@ -103,9 +104,10 @@
 # cache. Hence, use the ``memory`` constructor parameter when the fitting
 # of a transformer is costly.

-from joblib import Memory
 from shutil import rmtree

+from joblib import Memory
+
 # Create a temporary folder to store the transformers of the pipeline
 location = "cachedir"
 memory = Memory(location=location, verbose=10)
diff --git a/examples/compose/plot_digits_pipe.py b/examples/compose/plot_digits_pipe.py
index 640cd6e529a8d..2769422c404a4 100644
--- a/examples/compose/plot_digits_pipe.py
+++ b/examples/compose/plot_digits_pipe.py
@@ -14,15 +14,15 @@
 # Modified for documentation by Jaques Grobler
 # License: BSD 3 clause

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd

 from sklearn import datasets
 from sklearn.decomposition import PCA
 from sklearn.linear_model import LogisticRegression
-from sklearn.pipeline import Pipeline
 from sklearn.model_selection import GridSearchCV
+from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler

 # Define a pipeline to search for the best combination of PCA truncation
diff --git a/examples/compose/plot_feature_union.py b/examples/compose/plot_feature_union.py
index e014b8b8808b9..01f7e02bfe44f 100644
--- a/examples/compose/plot_feature_union.py
+++ b/examples/compose/plot_feature_union.py
@@ -20,12 +20,12 @@
 #
 # License: BSD 3 clause

-from sklearn.pipeline import Pipeline, FeatureUnion
-from sklearn.model_selection import GridSearchCV
-from sklearn.svm import SVC
 from sklearn.datasets import load_iris
 from sklearn.decomposition import PCA
 from sklearn.feature_selection import SelectKBest
+from sklearn.model_selection import GridSearchCV
+from sklearn.pipeline import FeatureUnion, Pipeline
+from sklearn.svm import SVC

 iris = load_iris()
diff --git a/examples/compose/plot_transformed_target.py b/examples/compose/plot_transformed_target.py
index 1e550ca0ea837..b01c9fbe37934 100644
--- a/examples/compose/plot_transformed_target.py
+++ b/examples/compose/plot_transformed_target.py
@@ -32,6 +32,7 @@
 # (`np.expm1`) will be used to transform the targets before training a linear
 # regression model and using it for prediction.
 import numpy as np
+
 from sklearn.datasets import make_regression

 X, y = make_regression(n_samples=10_000, noise=100, random_state=0)
@@ -42,6 +43,7 @@
 # Below we plot the probability density functions of the target
 # before and after applying the logarithmic functions.
 import matplotlib.pyplot as plt
+
 from sklearn.model_selection import train_test_split

 f, (ax0, ax1) = plt.subplots(1, 2)
diff --git a/examples/covariance/plot_covariance_estimation.py b/examples/covariance/plot_covariance_estimation.py
index be3bf4837eb9f..df9af8ea330ba 100644
--- a/examples/covariance/plot_covariance_estimation.py
+++ b/examples/covariance/plot_covariance_estimation.py
@@ -37,9 +37,10 @@
 # Compute the likelihood on test data
 # -----------------------------------

-from sklearn.covariance import ShrunkCovariance, empirical_covariance, log_likelihood
 from scipy import linalg

+from sklearn.covariance import ShrunkCovariance, empirical_covariance, log_likelihood
+
 # spanning a range of possible shrinkage coefficient values
 shrinkages = np.logspace(-2, 0, 30)
 negative_logliks = [
@@ -73,8 +74,8 @@
 # are Gaussian, in particular for small samples.
+from sklearn.covariance import OAS, LedoitWolf
 from sklearn.model_selection import GridSearchCV
-from sklearn.covariance import LedoitWolf, OAS

 # GridSearch for an optimal shrinkage coefficient
 tuned_parameters = [{"shrinkage": shrinkages}]
diff --git a/examples/covariance/plot_lw_vs_oas.py b/examples/covariance/plot_lw_vs_oas.py
index 1fd84b180f50a..107f6bd1c29cc 100644
--- a/examples/covariance/plot_lw_vs_oas.py
+++ b/examples/covariance/plot_lw_vs_oas.py
@@ -21,11 +21,11 @@

 """

-import numpy as np
 import matplotlib.pyplot as plt
-from scipy.linalg import toeplitz, cholesky
+import numpy as np
+from scipy.linalg import cholesky, toeplitz

-from sklearn.covariance import LedoitWolf, OAS
+from sklearn.covariance import OAS, LedoitWolf

 np.random.seed(0)
 # %%
diff --git a/examples/covariance/plot_mahalanobis_distances.py b/examples/covariance/plot_mahalanobis_distances.py
index b93d68a269706..bd61e5af22147 100644
--- a/examples/covariance/plot_mahalanobis_distances.py
+++ b/examples/covariance/plot_mahalanobis_distances.py
@@ -103,6 +103,7 @@
 # designed to have a much larger variance in feature 2.

 import matplotlib.pyplot as plt
+
 from sklearn.covariance import EmpiricalCovariance, MinCovDet

 # fit a MCD robust estimator to data
diff --git a/examples/covariance/plot_robust_vs_empirical_covariance.py b/examples/covariance/plot_robust_vs_empirical_covariance.py
index 9111ec82bcbf3..c61a97ddd979b 100644
--- a/examples/covariance/plot_robust_vs_empirical_covariance.py
+++ b/examples/covariance/plot_robust_vs_empirical_covariance.py
@@ -53,9 +53,9 @@

 """

-import numpy as np
-import matplotlib.pyplot as plt
 import matplotlib.font_manager
+import matplotlib.pyplot as plt
+import numpy as np

 from sklearn.covariance import EmpiricalCovariance, MinCovDet
diff --git a/examples/covariance/plot_sparse_cov.py b/examples/covariance/plot_sparse_cov.py
index 96a5486dc964e..a088aeb7e69c0 100644
--- a/examples/covariance/plot_sparse_cov.py
+++ b/examples/covariance/plot_sparse_cov.py
@@ -59,6 +59,7 @@
 # -----------------
 import numpy as np
 from scipy import linalg
+
 from sklearn.datasets import make_sparse_spd_matrix

 n_samples = 60
diff --git a/examples/cross_decomposition/plot_pcr_vs_pls.py b/examples/cross_decomposition/plot_pcr_vs_pls.py
index 529225d11eead..895c75dc1a728 100644
--- a/examples/cross_decomposition/plot_pcr_vs_pls.py
+++ b/examples/cross_decomposition/plot_pcr_vs_pls.py
@@ -41,8 +41,9 @@
 # into PCR and PLS, we fit a PCA estimator to display the two principal
 # components of this dataset, i.e. the two directions that explain the most
 # variance in the data.
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn.decomposition import PCA

 rng = np.random.RandomState(0)
@@ -99,12 +100,12 @@
 # For both models, we plot the projected data onto the first component against
 # the target. In both cases, this projected data is what the regressors will
 # use as training data.
+from sklearn.cross_decomposition import PLSRegression
+from sklearn.decomposition import PCA
+from sklearn.linear_model import LinearRegression
 from sklearn.model_selection import train_test_split
 from sklearn.pipeline import make_pipeline
-from sklearn.linear_model import LinearRegression
 from sklearn.preprocessing import StandardScaler
-from sklearn.decomposition import PCA
-from sklearn.cross_decomposition import PLSRegression

 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
diff --git a/examples/datasets/plot_digits_last_image.py b/examples/datasets/plot_digits_last_image.py
index 95ce867011a9a..0fde32cc674a8 100644
--- a/examples/datasets/plot_digits_last_image.py
+++ b/examples/datasets/plot_digits_last_image.py
@@ -18,10 +18,10 @@
 # Modified for documentation by Jaques Grobler
 # License: BSD 3 clause

-from sklearn import datasets
-
 import matplotlib.pyplot as plt

+from sklearn import datasets
+
 # Load the digits dataset
 digits = datasets.load_digits()
diff --git a/examples/datasets/plot_random_dataset.py b/examples/datasets/plot_random_dataset.py
index 4f3fdbbb11ef5..e5cbdb080b59f 100644
--- a/examples/datasets/plot_random_dataset.py
+++ b/examples/datasets/plot_random_dataset.py
@@ -16,9 +16,7 @@

 import matplotlib.pyplot as plt

-from sklearn.datasets import make_classification
-from sklearn.datasets import make_blobs
-from sklearn.datasets import make_gaussian_quantiles
+from sklearn.datasets import make_blobs, make_classification, make_gaussian_quantiles

 plt.figure(figsize=(8, 8))
 plt.subplots_adjust(bottom=0.05, top=0.9, left=0.05, right=0.95)
diff --git a/examples/datasets/plot_random_multilabel_dataset.py b/examples/datasets/plot_random_multilabel_dataset.py
index f22c7b9695c42..e6e2d6ad9edcf 100644
--- a/examples/datasets/plot_random_multilabel_dataset.py
+++ b/examples/datasets/plot_random_multilabel_dataset.py
@@ -35,8 +35,8 @@

 """

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np

 from sklearn.datasets import make_multilabel_classification as make_ml_clf
diff --git a/examples/decomposition/plot_faces_decomposition.py b/examples/decomposition/plot_faces_decomposition.py
index 12c091c8e14cb..c29c99b7f3c67 100644
--- a/examples/decomposition/plot_faces_decomposition.py
+++ b/examples/decomposition/plot_faces_decomposition.py
@@ -21,12 +21,11 @@

 import logging

-from numpy.random import RandomState
 import matplotlib.pyplot as plt
+from numpy.random import RandomState

+from sklearn import cluster, decomposition
 from sklearn.datasets import fetch_olivetti_faces
-from sklearn import cluster
-from sklearn import decomposition

 rng = RandomState(0)
diff --git a/examples/decomposition/plot_ica_blind_source_separation.py b/examples/decomposition/plot_ica_blind_source_separation.py
index 8c1529a3256fb..584d6b9509589 100644
--- a/examples/decomposition/plot_ica_blind_source_separation.py
+++ b/examples/decomposition/plot_ica_blind_source_separation.py
@@ -41,7 +41,7 @@
 # Fit ICA and PCA models
 # ----------------------

-from sklearn.decomposition import FastICA, PCA
+from sklearn.decomposition import PCA, FastICA

 # Compute ICA
 ica = FastICA(n_components=3, whiten="arbitrary-variance")
diff --git a/examples/decomposition/plot_image_denoising.py b/examples/decomposition/plot_image_denoising.py
index 2840905f0f604..646669d1469ff 100644
--- a/examples/decomposition/plot_image_denoising.py
+++ b/examples/decomposition/plot_image_denoising.py
@@ -37,7 +37,6 @@
 # ------------------------

 import numpy as np
-
 try:  # Scipy >= 1.10
     from scipy.datasets import face
 except ImportError:
diff --git a/examples/decomposition/plot_incremental_pca.py b/examples/decomposition/plot_incremental_pca.py
index adc7f83f3cda0..8e5aeccfddc8a 100644
--- a/examples/decomposition/plot_incremental_pca.py
+++ b/examples/decomposition/plot_incremental_pca.py
@@ -22,8 +22,8 @@
 # Authors: Kyle Kastner
 # License: BSD 3 clause

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np

 from sklearn.datasets import load_iris
 from sklearn.decomposition import PCA, IncrementalPCA
diff --git a/examples/decomposition/plot_pca_3d.py b/examples/decomposition/plot_pca_3d.py
index 692b9983ed55e..61ce5dde75c89 100644
--- a/examples/decomposition/plot_pca_3d.py
+++ b/examples/decomposition/plot_pca_3d.py
@@ -19,7 +19,6 @@
 # ---------------

 import numpy as np
-
 from scipy import stats

 e = np.exp(1)
@@ -52,13 +51,13 @@ def pdf(x):
 # Plot the figures
 # ----------------

-from sklearn.decomposition import PCA
-
 import matplotlib.pyplot as plt

 # unused but required import for doing 3d projections with matplotlib < 3.2
 import mpl_toolkits.mplot3d # noqa: F401

+from sklearn.decomposition import PCA
+

 def plot_figs(fig_num, elev, azim):
     fig = plt.figure(fig_num, figsize=(4, 3))
diff --git a/examples/decomposition/plot_pca_iris.py b/examples/decomposition/plot_pca_iris.py
index 7c3e69580d298..d025ba34adc27 100644
--- a/examples/decomposition/plot_pca_iris.py
+++ b/examples/decomposition/plot_pca_iris.py
@@ -13,15 +13,13 @@
 # Code source: Gaël Varoquaux
 # License: BSD 3 clause

-import numpy as np
 import matplotlib.pyplot as plt
-
-from sklearn import decomposition
-from sklearn import datasets
-
 # unused but required import for doing 3d projections with matplotlib < 3.2
 import mpl_toolkits.mplot3d # noqa: F401
+import numpy as np
+
+from sklearn import datasets, decomposition

 np.random.seed(5)
diff --git a/examples/decomposition/plot_pca_vs_fa_model_selection.py b/examples/decomposition/plot_pca_vs_fa_model_selection.py
index 4c934ab756c3e..e269fc6b5c278 100644
--- a/examples/decomposition/plot_pca_vs_fa_model_selection.py
+++ b/examples/decomposition/plot_pca_vs_fa_model_selection.py
@@ -34,7 +34,6 @@
 # ---------------

 import numpy as np
-
 from scipy import linalg

 n_samples, n_features, rank = 500, 25, 5
@@ -56,10 +55,9 @@

 import matplotlib.pyplot as plt

+from sklearn.covariance import LedoitWolf, ShrunkCovariance
 from sklearn.decomposition import PCA, FactorAnalysis
-from sklearn.covariance import ShrunkCovariance, LedoitWolf
-from sklearn.model_selection import cross_val_score
-from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import GridSearchCV, cross_val_score

 n_components = np.arange(0, n_features, 5) # options for n_components
diff --git a/examples/decomposition/plot_sparse_coding.py b/examples/decomposition/plot_sparse_coding.py
index 4f4602f1ff1ac..c45cd3c83b04f 100644
--- a/examples/decomposition/plot_sparse_coding.py
+++ b/examples/decomposition/plot_sparse_coding.py
@@ -16,8 +16,8 @@

 """

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np

 from sklearn.decomposition import SparseCoder
diff --git a/examples/decomposition/plot_varimax_fa.py b/examples/decomposition/plot_varimax_fa.py
index 6e50709620325..9d4c3b9ed1ee7 100644
--- a/examples/decomposition/plot_varimax_fa.py
+++ b/examples/decomposition/plot_varimax_fa.py
@@ -22,9 +22,9 @@
 import matplotlib.pyplot as plt
 import numpy as np

-from sklearn.decomposition import FactorAnalysis, PCA
-from sklearn.preprocessing import StandardScaler
 from sklearn.datasets import load_iris
+from sklearn.decomposition import PCA, FactorAnalysis
+from sklearn.preprocessing import StandardScaler

 # %%
 # Load Iris data
diff --git a/examples/ensemble/plot_adaboost_hastie_10_2.py b/examples/ensemble/plot_adaboost_hastie_10_2.py
index 13d3a90d3b05c..313056286f6ba 100644
--- a/examples/ensemble/plot_adaboost_hastie_10_2.py
+++ b/examples/ensemble/plot_adaboost_hastie_10_2.py
@@ -94,6 +94,7 @@
 # added to the ensemble.

 import numpy as np
+
 from sklearn.metrics import zero_one_loss

 ada_discrete_err = np.zeros((n_estimators,))
diff --git a/examples/ensemble/plot_adaboost_multiclass.py b/examples/ensemble/plot_adaboost_multiclass.py
index fae87b4a42d3d..f12aa8c75e213 100644
--- a/examples/ensemble/plot_adaboost_multiclass.py
+++ b/examples/ensemble/plot_adaboost_multiclass.py
@@ -35,7 +35,6 @@
 from sklearn.metrics import accuracy_score
 from sklearn.tree import DecisionTreeClassifier

-
 X, y = make_gaussian_quantiles(
     n_samples=13000, n_features=10, n_classes=3, random_state=1
 )
diff --git a/examples/ensemble/plot_adaboost_twoclass.py b/examples/ensemble/plot_adaboost_twoclass.py
index 19679c6285d3b..d1e89c47b7fcf 100644
--- a/examples/ensemble/plot_adaboost_twoclass.py
+++ b/examples/ensemble/plot_adaboost_twoclass.py
@@ -21,14 +21,13 @@
 #
 # License: BSD 3 clause

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np

-from sklearn.ensemble import AdaBoostClassifier
-from sklearn.tree import DecisionTreeClassifier
 from sklearn.datasets import make_gaussian_quantiles
+from sklearn.ensemble import AdaBoostClassifier
 from sklearn.inspection import DecisionBoundaryDisplay
-
+from sklearn.tree import DecisionTreeClassifier

 # Construct dataset
 X1, y1 = make_gaussian_quantiles(
diff --git a/examples/ensemble/plot_bias_variance.py b/examples/ensemble/plot_bias_variance.py
index 4f57b90019e94..9239603115db1 100644
--- a/examples/ensemble/plot_bias_variance.py
+++ b/examples/ensemble/plot_bias_variance.py
@@ -66,8 +66,8 @@
 # Author: Gilles Louppe
 # License: BSD 3 clause

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np

 from sklearn.ensemble import BaggingRegressor
 from sklearn.tree import DecisionTreeRegressor
diff --git a/examples/ensemble/plot_ensemble_oob.py b/examples/ensemble/plot_ensemble_oob.py
index bd678af42a7d1..972ca1f6259aa 100644
--- a/examples/ensemble/plot_ensemble_oob.py
+++ b/examples/ensemble/plot_ensemble_oob.py
@@ -26,9 +26,10 @@
 #
 # License: BSD 3 Clause

+from collections import OrderedDict
+
 import matplotlib.pyplot as plt

-from collections import OrderedDict
 from sklearn.datasets import make_classification
 from sklearn.ensemble import RandomForestClassifier
diff --git a/examples/ensemble/plot_feature_transformation.py b/examples/ensemble/plot_feature_transformation.py
index 36eb87bb757cd..8a17dd9d74194 100644
--- a/examples/ensemble/plot_feature_transformation.py
+++ b/examples/ensemble/plot_feature_transformation.py
@@ -59,7 +59,7 @@
 # First, we will start by training the random forest and gradient boosting on
 # the separated training set

-from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

 random_forest = RandomForestClassifier(
     n_estimators=n_estimators, max_depth=max_depth, random_state=10
@@ -105,8 +105,7 @@
 # method `apply`. The pipeline in scikit-learn expects a call to `transform`.
 # Therefore, we wrapped the call to `apply` within a `FunctionTransformer`.
-from sklearn.preprocessing import FunctionTransformer
-from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing import FunctionTransformer, OneHotEncoder


 def rf_apply(X, model):
@@ -143,6 +142,7 @@ def gbdt_apply(X, model):
 # We can finally show the different ROC curves for all the models.

 import matplotlib.pyplot as plt
+
 from sklearn.metrics import RocCurveDisplay

 fig, ax = plt.subplots()
diff --git a/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py b/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py
index b4a1993471474..cc48e47e9ad34 100644
--- a/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py
+++ b/examples/ensemble/plot_forest_hist_grad_boosting_comparison.py
@@ -78,8 +78,8 @@
 # here to keep the example simple.

 import pandas as pd
-from sklearn.ensemble import HistGradientBoostingRegressor
-from sklearn.ensemble import RandomForestRegressor
+
+from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
 from sklearn.model_selection import GridSearchCV, KFold

 models = {
@@ -123,8 +123,8 @@
 # Error bars correspond to one standard deviation as computed in the different
 # folds of the cross-validation.

-import plotly.express as px
 import plotly.colors as colors
+import plotly.express as px
 from plotly.subplots import make_subplots

 fig = make_subplots(
diff --git a/examples/ensemble/plot_forest_importances.py b/examples/ensemble/plot_forest_importances.py
index fbda63b26faee..269451168dd7a 100644
--- a/examples/ensemble/plot_forest_importances.py
+++ b/examples/ensemble/plot_forest_importances.py
@@ -57,6 +57,7 @@
 # cardinality** features (many unique values). See
 # :ref:`permutation_importance` as an alternative below.
 import time
+
 import numpy as np

 start_time = time.time()
diff --git a/examples/ensemble/plot_forest_importances_faces.py b/examples/ensemble/plot_forest_importances_faces.py
index 3848873c297de..8b8e8751ec5a2 100644
--- a/examples/ensemble/plot_forest_importances_faces.py
+++ b/examples/ensemble/plot_forest_importances_faces.py
@@ -59,6 +59,7 @@
 # cardinality** features (many unique values). See
 # :ref:`permutation_importance` as an alternative.
 import time
+
 import matplotlib.pyplot as plt

 start_time = time.time()
diff --git a/examples/ensemble/plot_forest_iris.py b/examples/ensemble/plot_forest_iris.py
index ee414db7125dc..6aaceea88efd2 100644
--- a/examples/ensemble/plot_forest_iris.py
+++ b/examples/ensemble/plot_forest_iris.py
@@ -42,15 +42,15 @@

 """

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
 from matplotlib.colors import ListedColormap

 from sklearn.datasets import load_iris
 from sklearn.ensemble import (
-    RandomForestClassifier,
-    ExtraTreesClassifier,
     AdaBoostClassifier,
+    ExtraTreesClassifier,
+    RandomForestClassifier,
 )
 from sklearn.tree import DecisionTreeClassifier
diff --git a/examples/ensemble/plot_gradient_boosting_categorical.py b/examples/ensemble/plot_gradient_boosting_categorical.py
index fa4b68be9cbb7..0dd0a84243b4d 100644
--- a/examples/ensemble/plot_gradient_boosting_categorical.py
+++ b/examples/ensemble/plot_gradient_boosting_categorical.py
@@ -77,10 +77,9 @@
 # As a baseline, we create an estimator where the categorical features are
 # dropped:

+from sklearn.compose import make_column_selector, make_column_transformer
 from sklearn.ensemble import HistGradientBoostingRegressor
 from sklearn.pipeline import make_pipeline
-from sklearn.compose import make_column_transformer
-from sklearn.compose import make_column_selector

 dropper = make_column_transformer(
     ("drop", make_column_selector(dtype_include="category")), remainder="passthrough"
@@ -114,9 +113,10 @@
 # were ordered quantities, i.e. the categories will be encoded as 0, 1, 2,
 # etc., and treated as continuous features.

-from sklearn.preprocessing import OrdinalEncoder
 import numpy as np

+from sklearn.preprocessing import OrdinalEncoder
+
 ordinal_encoder = make_column_transformer(
     (
         OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan),
@@ -166,9 +166,10 @@
 # models performance in terms of
 # :func:`~metrics.mean_absolute_percentage_error` and fit times.

-from sklearn.model_selection import cross_validate
 import matplotlib.pyplot as plt

+from sklearn.model_selection import cross_validate
+
 scoring = "neg_mean_absolute_percentage_error"
 n_cv_folds = 3
diff --git a/examples/ensemble/plot_gradient_boosting_early_stopping.py b/examples/ensemble/plot_gradient_boosting_early_stopping.py
index 6f1013eed9564..f271f80a07c55 100644
--- a/examples/ensemble/plot_gradient_boosting_early_stopping.py
+++ b/examples/ensemble/plot_gradient_boosting_early_stopping.py
@@ -38,11 +38,10 @@

 import time

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np

-from sklearn import ensemble
-from sklearn import datasets
+from sklearn import datasets, ensemble
 from sklearn.model_selection import train_test_split

 data_list = [
diff --git a/examples/ensemble/plot_gradient_boosting_oob.py b/examples/ensemble/plot_gradient_boosting_oob.py
index dd7f19a1fe245..0cb40ad2c11ea 100644
--- a/examples/ensemble/plot_gradient_boosting_oob.py
+++ b/examples/ensemble/plot_gradient_boosting_oob.py
@@ -26,15 +26,13 @@
 #
 # License: BSD 3 clause

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+from scipy.special import expit

 from sklearn import ensemble
-from sklearn.model_selection import KFold
-from sklearn.model_selection import train_test_split
 from sklearn.metrics import log_loss
-
-from scipy.special import expit
+from sklearn.model_selection import KFold, train_test_split

 # Generate data (adapted from G. Ridgeway's gbm example)
 n_samples = 1000
diff --git a/examples/ensemble/plot_gradient_boosting_quantile.py b/examples/ensemble/plot_gradient_boosting_quantile.py
index 2aa04c3988d9e..36a78dfeb94ca 100644
--- a/examples/ensemble/plot_gradient_boosting_quantile.py
+++ b/examples/ensemble/plot_gradient_boosting_quantile.py
@@ -12,6 +12,7 @@
 # Generate some data for a synthetic regression problem by applying the
 # function f to uniformly sampled random inputs.
 import numpy as np
+
 from sklearn.model_selection import train_test_split

@@ -58,7 +59,6 @@ def f(x):
 from sklearn.ensemble import GradientBoostingRegressor
 from sklearn.metrics import mean_pinball_loss, mean_squared_error

-
 all_models = {}
 common_params = dict(
     learning_rate=0.05,
@@ -93,7 +93,6 @@ def f(x):
 # 90% interval (from 5th to 95th conditional percentiles).
 import matplotlib.pyplot as plt

-
 y_pred = all_models["mse"].predict(xx)
 y_lower = all_models["q 0.05"].predict(xx)
 y_upper = all_models["q 0.95"].predict(xx)
diff --git a/examples/ensemble/plot_gradient_boosting_regression.py b/examples/ensemble/plot_gradient_boosting_regression.py
index 3e378e8af7203..94705ccfeca24 100644
--- a/examples/ensemble/plot_gradient_boosting_regression.py
+++ b/examples/ensemble/plot_gradient_boosting_regression.py
@@ -23,6 +23,7 @@

 import matplotlib.pyplot as plt
 import numpy as np
+
 from sklearn import datasets, ensemble
 from sklearn.inspection import permutation_importance
 from sklearn.metrics import mean_squared_error
diff --git a/examples/ensemble/plot_gradient_boosting_regularization.py b/examples/ensemble/plot_gradient_boosting_regularization.py
index a4ac69a822b92..218d69d5ac7d7 100644
--- a/examples/ensemble/plot_gradient_boosting_regularization.py
+++ b/examples/ensemble/plot_gradient_boosting_regularization.py
@@ -25,11 +25,10 @@
 #
 # License: BSD 3 clause

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np

-from sklearn import ensemble
-from sklearn import datasets
+from sklearn import datasets, ensemble
 from sklearn.metrics import log_loss
 from sklearn.model_selection import train_test_split
diff --git a/examples/ensemble/plot_isolation_forest.py b/examples/ensemble/plot_isolation_forest.py
index aeabb60203ac6..f5fad1d7b9ea9 100644
--- a/examples/ensemble/plot_isolation_forest.py
+++ b/examples/ensemble/plot_isolation_forest.py
@@ -31,6 +31,7 @@
 # the label `-1`.

 import numpy as np
+
 from sklearn.model_selection import train_test_split

 n_samples, n_outliers = 120, 40
@@ -78,6 +79,7 @@
 # or not. The scatter plot displays the true labels.
 import matplotlib.pyplot as plt
+
 from sklearn.inspection import DecisionBoundaryDisplay

 disp = DecisionBoundaryDisplay.from_estimator(
diff --git a/examples/ensemble/plot_monotonic_constraints.py b/examples/ensemble/plot_monotonic_constraints.py
index b1f7ca8ed24ed..15ad8e9524243 100644
--- a/examples/ensemble/plot_monotonic_constraints.py
+++ b/examples/ensemble/plot_monotonic_constraints.py
@@ -20,11 +20,11 @@

 """

 # %%
-from sklearn.ensemble import HistGradientBoostingRegressor
-from sklearn.inspection import PartialDependenceDisplay
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np

+from sklearn.ensemble import HistGradientBoostingRegressor
+from sklearn.inspection import PartialDependenceDisplay

 rng = np.random.RandomState(0)
diff --git a/examples/ensemble/plot_random_forest_embedding.py b/examples/ensemble/plot_random_forest_embedding.py
index 000b83e67b92a..fe26e04ca7789 100644
--- a/examples/ensemble/plot_random_forest_embedding.py
+++ b/examples/ensemble/plot_random_forest_embedding.py
@@ -26,12 +26,12 @@

 """

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np

 from sklearn.datasets import make_circles
-from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
 from sklearn.decomposition import TruncatedSVD
+from sklearn.ensemble import ExtraTreesClassifier, RandomTreesEmbedding
 from sklearn.naive_bayes import BernoulliNB

 # make a synthetic dataset
diff --git a/examples/ensemble/plot_random_forest_regression_multioutput.py b/examples/ensemble/plot_random_forest_regression_multioutput.py
index 4b3d4f4a9a728..ce8346c329127 100644
--- a/examples/ensemble/plot_random_forest_regression_multioutput.py
+++ b/examples/ensemble/plot_random_forest_regression_multioutput.py
@@ -25,13 +25,13 @@
 #
 # License: BSD 3 clause

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.model_selection import train_test_split
 from sklearn.multioutput import MultiOutputRegressor

-
 # Create a random dataset
 rng = np.random.RandomState(1)
 X = np.sort(200 * rng.rand(600, 1) - 100, axis=0)
diff --git a/examples/ensemble/plot_stack_predictors.py b/examples/ensemble/plot_stack_predictors.py
index 56a82ded5b725..aac7ccc8a3ef8 100644
--- a/examples/ensemble/plot_stack_predictors.py
+++ b/examples/ensemble/plot_stack_predictors.py
@@ -131,8 +131,7 @@ def load_ames_housing():
 # Then, we will now define the preprocessor used when the ending regressor
 # is a linear model.
-from sklearn.preprocessing import OneHotEncoder
-from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import OneHotEncoder, StandardScaler

 cat_linear_processor = OneHotEncoder(handle_unknown="ignore")
 num_linear_processor = make_pipeline(
@@ -206,9 +205,11 @@ def load_ames_housing():

 import time
+
 import matplotlib.pyplot as plt
+
 from sklearn.metrics import PredictionErrorDisplay
-from sklearn.model_selection import cross_validate, cross_val_predict
+from sklearn.model_selection import cross_val_predict, cross_validate

 fig, axs = plt.subplots(2, 2, figsize=(9, 7))
 axs = np.ravel(axs)
diff --git a/examples/ensemble/plot_voting_decision_regions.py b/examples/ensemble/plot_voting_decision_regions.py
index e6dc68eeadf98..90441c6d28339 100644
--- a/examples/ensemble/plot_voting_decision_regions.py
+++ b/examples/ensemble/plot_voting_decision_regions.py
@@ -28,11 +28,11 @@
 import matplotlib.pyplot as plt

 from sklearn import datasets
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.svm import SVC
 from sklearn.ensemble import VotingClassifier
 from sklearn.inspection import DecisionBoundaryDisplay
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier

 # Loading some example data
 iris = datasets.load_iris()
diff --git a/examples/ensemble/plot_voting_probas.py b/examples/ensemble/plot_voting_probas.py
index 54c290c3073e0..14f4f4330c045 100644
--- a/examples/ensemble/plot_voting_probas.py
+++ b/examples/ensemble/plot_voting_probas.py
@@ -23,13 +23,12 @@

 """

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np

+from sklearn.ensemble import RandomForestClassifier, VotingClassifier
 from sklearn.linear_model import LogisticRegression
 from sklearn.naive_bayes import GaussianNB
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.ensemble import VotingClassifier

 clf1 = LogisticRegression(max_iter=1000, random_state=123)
 clf2 = RandomForestClassifier(n_estimators=100, random_state=123)
diff --git a/examples/ensemble/plot_voting_regressor.py b/examples/ensemble/plot_voting_regressor.py
index 23e709cc9e62a..d33becca505e3 100644
--- a/examples/ensemble/plot_voting_regressor.py
+++ b/examples/ensemble/plot_voting_regressor.py
@@ -26,10 +26,12 @@

 import matplotlib.pyplot as plt

 from sklearn.datasets import load_diabetes
-from sklearn.ensemble import GradientBoostingRegressor
-from sklearn.ensemble import RandomForestRegressor
+from sklearn.ensemble import (
+    GradientBoostingRegressor,
+    RandomForestRegressor,
+    VotingRegressor,
+)
 from sklearn.linear_model import LinearRegression
-from sklearn.ensemble import VotingRegressor

 # %%
 # Training classifiers
diff --git a/examples/exercises/plot_cv_digits.py b/examples/exercises/plot_cv_digits.py
index e43bbd86bb027..ebad3a55098b5 100644
--- a/examples/exercises/plot_cv_digits.py
+++ b/examples/exercises/plot_cv_digits.py
@@ -11,8 +11,9 @@
 """

 import numpy as np
-from sklearn.model_selection import cross_val_score
+
 from sklearn import datasets, svm
+from sklearn.model_selection import cross_val_score

 X, y = datasets.load_digits(return_X_y=True)
diff --git a/examples/exercises/plot_digits_classification_exercise.py b/examples/exercises/plot_digits_classification_exercise.py
index 877e615659743..25b0171c66421 100644
--- a/examples/exercises/plot_digits_classification_exercise.py
+++ b/examples/exercises/plot_digits_classification_exercise.py
@@ -12,7 +12,7 @@

 """

-from sklearn import datasets, neighbors, linear_model
+from sklearn import datasets, linear_model, neighbors

 X_digits, y_digits = datasets.load_digits(return_X_y=True)
 X_digits = X_digits / X_digits.max()
diff --git a/examples/exercises/plot_iris_exercise.py b/examples/exercises/plot_iris_exercise.py
index 74da8c27889c9..07687b920e1b8 100644
--- a/examples/exercises/plot_iris_exercise.py
+++ b/examples/exercises/plot_iris_exercise.py
@@ -10,8 +10,9 @@

 """

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn import datasets, svm

 iris = datasets.load_iris()
diff --git a/examples/feature_selection/plot_f_test_vs_mi.py b/examples/feature_selection/plot_f_test_vs_mi.py
index ba82625a7cfaf..5c015e7e4fd58 100644
--- a/examples/feature_selection/plot_f_test_vs_mi.py
+++ b/examples/feature_selection/plot_f_test_vs_mi.py
@@ -23,8 +23,9 @@

 """

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn.feature_selection import f_regression, mutual_info_regression

 np.random.seed(0)
diff --git a/examples/feature_selection/plot_feature_selection.py b/examples/feature_selection/plot_feature_selection.py
index ce2bad8626a79..c57a2d5d6b6f9 100644
--- a/examples/feature_selection/plot_feature_selection.py
+++ b/examples/feature_selection/plot_feature_selection.py
@@ -21,6 +21,7 @@
 # --------------------
 #
 import numpy as np
+
 from sklearn.datasets import load_iris
 from sklearn.model_selection import train_test_split
diff --git a/examples/feature_selection/plot_rfe_digits.py b/examples/feature_selection/plot_rfe_digits.py
index 9684f5fabd383..553f38f9c674f 100644
--- a/examples/feature_selection/plot_rfe_digits.py
+++ b/examples/feature_selection/plot_rfe_digits.py
@@ -12,10 +12,11 @@

 """ # noqa: E501

-from sklearn.svm import SVC
+import matplotlib.pyplot as plt
+
 from sklearn.datasets import load_digits
 from sklearn.feature_selection import RFE
-import matplotlib.pyplot as plt
+from sklearn.svm import SVC

 # Load the digits dataset
 digits = load_digits()
diff --git a/examples/feature_selection/plot_rfe_with_cross_validation.py b/examples/feature_selection/plot_rfe_with_cross_validation.py
index 2d52ea5a3fdf3..693e21fe21787 100644
--- a/examples/feature_selection/plot_rfe_with_cross_validation.py
+++ b/examples/feature_selection/plot_rfe_with_cross_validation.py
@@ -39,8 +39,8 @@
 # strategy "accuracy" optimizes the proportion of correctly classified samples.

 from sklearn.feature_selection import RFECV
-from sklearn.model_selection import StratifiedKFold
 from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import StratifiedKFold

 min_features_to_select = 1 # Minimum number of features to consider
 clf = LogisticRegression()
diff --git a/examples/feature_selection/plot_select_from_model_diabetes.py b/examples/feature_selection/plot_select_from_model_diabetes.py
index 6e8a6434e1079..38276efcbd770 100644
--- a/examples/feature_selection/plot_select_from_model_diabetes.py
+++ b/examples/feature_selection/plot_select_from_model_diabetes.py
@@ -46,6 +46,7 @@
 # :ref:`sphx_glr_auto_examples_inspection_plot_linear_model_coefficient_interpretation.py`. # noqa: E501
 import matplotlib.pyplot as plt
 import numpy as np
+
 from sklearn.linear_model import RidgeCV

 ridge = RidgeCV(alphas=np.logspace(-6, 6, num=5)).fit(X, y)
@@ -67,9 +68,10 @@
 #
 # Since we want to select only 2 features, we will set this threshold slightly
 # above the coefficient of third most important feature.
-from sklearn.feature_selection import SelectFromModel
 from time import time

+from sklearn.feature_selection import SelectFromModel
+
 threshold = np.sort(importance)[-3] + 0.01

 tic = time()
diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py
index 7a58ba437278f..8379baf148256 100644
--- a/examples/gaussian_process/plot_compare_gpr_krr.py
+++ b/examples/gaussian_process/plot_compare_gpr_krr.py
@@ -125,6 +125,7 @@
 #
 # Thus, let's use such a :class:`~sklearn.kernel_ridge.KernelRidge`.
 import time
+
 from sklearn.gaussian_process.kernels import ExpSineSquared
 from sklearn.kernel_ridge import KernelRidge

@@ -176,9 +177,10 @@
 # parameter and the kernel parameters.

 # %%
-from sklearn.model_selection import RandomizedSearchCV
 from scipy.stats import loguniform

+from sklearn.model_selection import RandomizedSearchCV
+
 param_distributions = {
     "alpha": loguniform(1e0, 1e3),
     "kernel__length_scale": loguniform(1e-2, 1e2),
diff --git a/examples/gaussian_process/plot_gpc.py b/examples/gaussian_process/plot_gpc.py
index e2d78fa23f09e..21a99065e06ce 100644
--- a/examples/gaussian_process/plot_gpc.py
+++ b/examples/gaussian_process/plot_gpc.py
@@ -27,13 +27,11 @@
 # License: BSD 3 clause

 import numpy as np
-
 from matplotlib import pyplot as plt

-from sklearn.metrics import accuracy_score, log_loss
 from sklearn.gaussian_process import GaussianProcessClassifier
 from sklearn.gaussian_process.kernels import RBF
-
+from sklearn.metrics import accuracy_score, log_loss

 # Generate data
 train_size = 50
diff --git a/examples/gaussian_process/plot_gpc_iris.py b/examples/gaussian_process/plot_gpc_iris.py
index ce0ed066a1377..88c536d8824c8 100644
--- a/examples/gaussian_process/plot_gpc_iris.py
+++ b/examples/gaussian_process/plot_gpc_iris.py
@@ -10,8 +10,9 @@

 """

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn import datasets
 from sklearn.gaussian_process import GaussianProcessClassifier
 from sklearn.gaussian_process.kernels import RBF
diff --git a/examples/gaussian_process/plot_gpc_isoprobability.py b/examples/gaussian_process/plot_gpc_isoprobability.py
index cc036244bc17a..a986d285632b7 100644
--- a/examples/gaussian_process/plot_gpc_isoprobability.py
+++ b/examples/gaussian_process/plot_gpc_isoprobability.py
@@ -14,12 +14,12 @@
 # License: BSD 3 clause

 import numpy as np
-
-from matplotlib import pyplot as plt
 from matplotlib import cm
+from matplotlib import pyplot as plt

 from sklearn.gaussian_process import GaussianProcessClassifier
-from sklearn.gaussian_process.kernels import DotProduct, ConstantKernel as C
+from sklearn.gaussian_process.kernels import ConstantKernel as C
+from sklearn.gaussian_process.kernels import DotProduct

 # A few constants
 lim = 8
diff --git a/examples/gaussian_process/plot_gpc_xor.py b/examples/gaussian_process/plot_gpc_xor.py
index 6e6217dba8b9e..4439a5ee722b6 100644
--- a/examples/gaussian_process/plot_gpc_xor.py
+++ b/examples/gaussian_process/plot_gpc_xor.py
@@ -15,13 +15,12 @@
 #
 # License: BSD 3 clause

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np

 from sklearn.gaussian_process import GaussianProcessClassifier
 from sklearn.gaussian_process.kernels import RBF, DotProduct
-
 xx, yy = np.meshgrid(np.linspace(-3, 3, 50), np.linspace(-3, 3, 50))
 rng = np.random.RandomState(0)
 X = rng.randn(200, 2)
diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
index bfc1c21631b26..a3acd1dbfcbd3 100644
--- a/examples/gaussian_process/plot_gpr_co2.py
+++ b/examples/gaussian_process/plot_gpr_co2.py
@@ -172,6 +172,7 @@
 # Thus, we create synthetic data from 1958 to the current month. In addition,
 # we need to add the subtracted mean computed during training.
 import datetime
+
 import numpy as np

 today = datetime.datetime.now()
diff --git a/examples/gaussian_process/plot_gpr_on_structured_data.py b/examples/gaussian_process/plot_gpr_on_structured_data.py
index ada50a0edf06b..e702f1fe0769a 100644
--- a/examples/gaussian_process/plot_gpr_on_structured_data.py
+++ b/examples/gaussian_process/plot_gpr_on_structured_data.py
@@ -40,11 +40,10 @@

 # %%
 import numpy as np
-from sklearn.gaussian_process.kernels import Kernel, Hyperparameter
-from sklearn.gaussian_process.kernels import GenericKernelMixin
-from sklearn.gaussian_process import GaussianProcessRegressor
-from sklearn.gaussian_process import GaussianProcessClassifier
+
 from sklearn.base import clone
+from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import GenericKernelMixin, Hyperparameter, Kernel


 class SequenceKernel(GenericKernelMixin, Kernel):
diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py
index d83922817e5de..9dc8b6c831710 100644
--- a/examples/impute/plot_iterative_imputer_variants_comparison.py
+++ b/examples/impute/plot_iterative_imputer_variants_comparison.py
@@ -44,21 +44,21 @@

 """

-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd

+from sklearn.datasets import fetch_california_housing
+from sklearn.ensemble import RandomForestRegressor
+
 # To use this experimental feature, we need to explicitly ask for it:
 from sklearn.experimental import enable_iterative_imputer # noqa
-from sklearn.datasets import fetch_california_housing
-from sklearn.impute import SimpleImputer
-from sklearn.impute import IterativeImputer
-from sklearn.linear_model import BayesianRidge, Ridge
+from sklearn.impute import IterativeImputer, SimpleImputer
 from sklearn.kernel_approximation import Nystroem
-from sklearn.ensemble import RandomForestRegressor
+from sklearn.linear_model import BayesianRidge, Ridge
+from sklearn.model_selection import cross_val_score
 from sklearn.neighbors import KNeighborsRegressor
 from sklearn.pipeline import make_pipeline
-from sklearn.model_selection import cross_val_score

 N_SPLITS = 5
diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py
index f6350ad2544dd..4b9f8ae079d8a 100644
--- a/examples/impute/plot_missing_values.py
+++ b/examples/impute/plot_missing_values.py
@@ -44,9 +44,7 @@

 import numpy as np

-from sklearn.datasets import fetch_california_housing
-from sklearn.datasets import load_diabetes
-
+from sklearn.datasets import fetch_california_housing, load_diabetes

 rng = np.random.RandomState(42)

@@ -95,11 +93,10 @@ def add_missing_values(X_full, y_full):
 # To use the experimental IterativeImputer, we need to explicitly ask for it:
 from sklearn.experimental import enable_iterative_imputer # noqa
-from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
+from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
 from sklearn.model_selection import cross_val_score
 from sklearn.pipeline import make_pipeline

-
 N_SPLITS = 4

 regressor = RandomForestRegressor(random_state=0)
@@ -260,7 +257,6 @@ def get_impute_iterative(X_missing, y_missing):

 import matplotlib.pyplot
as plt - n_bars = len(mses_diabetes) xval = np.arange(n_bars) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index d978ee860636c..eb935ee41ae67 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -40,10 +40,10 @@ """ # %% +import matplotlib.pyplot as plt import numpy as np -import scipy as sp import pandas as pd -import matplotlib.pyplot as plt +import scipy as sp import seaborn as sns # %% @@ -53,7 +53,6 @@ # We fetch the data from `OpenML `_. # Note that setting the parameter `as_frame` to True will retrieve the data # as a pandas dataframe. - from sklearn.datasets import fetch_openml survey = fetch_openml(data_id=534, as_frame=True, parser="pandas") @@ -154,9 +153,9 @@ # To describe the dataset as a linear model we use a ridge regressor # with a very small regularization and to model the logarithm of the WAGE. -from sklearn.pipeline import make_pipeline -from sklearn.linear_model import Ridge from sklearn.compose import TransformedTargetRegressor +from sklearn.linear_model import Ridge +from sklearn.pipeline import make_pipeline model = make_pipeline( preprocessor, @@ -178,8 +177,7 @@ # on the test set and computing, # for example, the median absolute error of the model. -from sklearn.metrics import median_absolute_error -from sklearn.metrics import PredictionErrorDisplay +from sklearn.metrics import PredictionErrorDisplay, median_absolute_error mae_train = median_absolute_error(y_train, model.predict(X_train)) y_pred = model.predict(X_test) @@ -319,8 +317,7 @@ # their robustness is not guaranteed, and they should probably be interpreted # with caution. -from sklearn.model_selection import cross_validate -from sklearn.model_selection import RepeatedKFold +from sklearn.model_selection import RepeatedKFold, cross_validate cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=0) cv_model = cross_validate( diff --git a/examples/inspection/plot_partial_dependence.py b/examples/inspection/plot_partial_dependence.py index 43404b356d829..ed7a656da9926 100644 --- a/examples/inspection/plot_partial_dependence.py +++ b/examples/inspection/plot_partial_dependence.py @@ -100,8 +100,9 @@ # We plot the average number of bike rentals by grouping the data by season and # by year. from itertools import product -import numpy as np + import matplotlib.pyplot as plt +import numpy as np days = ("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat") hours = tuple(range(24)) @@ -157,8 +158,7 @@ # numerical features and encode the categorical features with a # :class:`~sklearn.preprocessing.OneHotEncoder`. from sklearn.compose import ColumnTransformer -from sklearn.preprocessing import QuantileTransformer -from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import OneHotEncoder, QuantileTransformer mlp_preprocessor = ColumnTransformer( transformers=[ @@ -203,6 +203,7 @@ # Let's fit a :class:`~sklearn.neural_network.MLPRegressor` and compute # single-variable partial dependence plots. from time import time + from sklearn.neural_network import MLPRegressor from sklearn.pipeline import make_pipeline @@ -242,6 +243,7 @@ # # We will plot the averaged partial dependence. 
import matplotlib.pyplot as plt + from sklearn.inspection import PartialDependenceDisplay common_params = { @@ -529,10 +531,9 @@ # # Let's make the same partial dependence plot for the 2 features interaction, # this time in 3 dimensions. -import numpy as np - # unused but required import for doing 3d projections with matplotlib < 3.2 import mpl_toolkits.mplot3d # noqa: F401 +import numpy as np from sklearn.inspection import partial_dependence diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index cf0907ce3fd37..789506e892e3a 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -64,9 +64,9 @@ # categorical features; # - use :class:`~sklearn.impute.SimpleImputer` to fill missing values for # numerical features using a mean strategy. +from sklearn.compose import ColumnTransformer from sklearn.ensemble import RandomForestClassifier from sklearn.impute import SimpleImputer -from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline from sklearn.preprocessing import OrdinalEncoder diff --git a/examples/inspection/plot_permutation_importance_multicollinear.py b/examples/inspection/plot_permutation_importance_multicollinear.py index 59871c00946a6..e14916e808af9 100644 --- a/examples/inspection/plot_permutation_importance_multicollinear.py +++ b/examples/inspection/plot_permutation_importance_multicollinear.py @@ -22,9 +22,9 @@ import matplotlib.pyplot as plt import numpy as np -from scipy.stats import spearmanr from scipy.cluster import hierarchy from scipy.spatial.distance import squareform +from scipy.stats import spearmanr from sklearn.datasets import load_breast_cancer from sklearn.ensemble import RandomForestClassifier diff --git a/examples/kernel_approximation/plot_scalable_poly_kernels.py b/examples/kernel_approximation/plot_scalable_poly_kernels.py index 1a46e4bc2aa9c..8e5854bd8500c 100644 --- a/examples/kernel_approximation/plot_scalable_poly_kernels.py +++ b/examples/kernel_approximation/plot_scalable_poly_kernels.py @@ -64,8 +64,8 @@ # the LIBSVM webpage, and then normalize to unit length as done in the # original Tensor Sketch paper [1]. -from sklearn.preprocessing import MinMaxScaler, Normalizer from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import MinMaxScaler, Normalizer mm = make_pipeline(MinMaxScaler(), Normalizer()) X_train = mm.fit_transform(X_train) @@ -80,6 +80,7 @@ # plot them later. import time + from sklearn.svm import LinearSVC results = {} diff --git a/examples/linear_model/plot_ard.py b/examples/linear_model/plot_ard.py index 261fec8aeee3b..79b49fb76ef9a 100644 --- a/examples/linear_model/plot_ard.py +++ b/examples/linear_model/plot_ard.py @@ -54,7 +54,8 @@ # coefficients. 
import pandas as pd -from sklearn.linear_model import ARDRegression, LinearRegression, BayesianRidge + +from sklearn.linear_model import ARDRegression, BayesianRidge, LinearRegression olr = LinearRegression().fit(X, y) brr = BayesianRidge(compute_score=True, n_iter=30).fit(X, y) diff --git a/examples/linear_model/plot_elastic_net_precomputed_gram_matrix_with_weighted_samples.py b/examples/linear_model/plot_elastic_net_precomputed_gram_matrix_with_weighted_samples.py index 3bca3101758ff..8313b0b56922e 100644 --- a/examples/linear_model/plot_elastic_net_precomputed_gram_matrix_with_weighted_samples.py +++ b/examples/linear_model/plot_elastic_net_precomputed_gram_matrix_with_weighted_samples.py @@ -20,6 +20,7 @@ # %% # Let's start by loading the dataset and creating some sample weights. import numpy as np + from sklearn.datasets import make_regression rng = np.random.RandomState(0) diff --git a/examples/linear_model/plot_huber_vs_ridge.py b/examples/linear_model/plot_huber_vs_ridge.py index 2ea5a190e35d8..7c0222b71a721 100644 --- a/examples/linear_model/plot_huber_vs_ridge.py +++ b/examples/linear_model/plot_huber_vs_ridge.py @@ -16,8 +16,8 @@ # Authors: Manoj Kumar mks542@nyu.edu # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np from sklearn.datasets import make_regression from sklearn.linear_model import HuberRegressor, Ridge diff --git a/examples/linear_model/plot_iris_logistic.py b/examples/linear_model/plot_iris_logistic.py index faf547c783609..b1e4d76c7f221 100644 --- a/examples/linear_model/plot_iris_logistic.py +++ b/examples/linear_model/plot_iris_logistic.py @@ -15,9 +15,10 @@ # License: BSD 3 clause import matplotlib.pyplot as plt -from sklearn.linear_model import LogisticRegression + from sklearn import datasets from sklearn.inspection import DecisionBoundaryDisplay +from sklearn.linear_model import LogisticRegression # import some data to play with iris = datasets.load_iris() diff --git a/examples/linear_model/plot_lasso_and_elasticnet.py b/examples/linear_model/plot_lasso_and_elasticnet.py index b08837304730a..075d8a50d2f62 100644 --- a/examples/linear_model/plot_lasso_and_elasticnet.py +++ b/examples/linear_model/plot_lasso_and_elasticnet.py @@ -112,9 +112,10 @@ # :class:`~sklearn.model_selection.TimeSeriesSplit` cross-validation strategy to a # :class:`~sklearn.linear_model.LassoCV`. To keep the example simple and fast to # execute, we directly set the optimal value for alpha here. +from time import time + from sklearn.linear_model import Lasso from sklearn.metrics import r2_score -from time import time t0 = time() lasso = Lasso(alpha=0.14).fit(X_train, y_train) @@ -181,8 +182,8 @@ # and estimated coefficients of the respective linear models. 
import matplotlib.pyplot as plt -import seaborn as sns import pandas as pd +import seaborn as sns from matplotlib.colors import SymLogNorm df = pd.DataFrame( diff --git a/examples/linear_model/plot_lasso_coordinate_descent_path.py b/examples/linear_model/plot_lasso_coordinate_descent_path.py index 1796dc5011644..ee2f09f000d23 100644 --- a/examples/linear_model/plot_lasso_coordinate_descent_path.py +++ b/examples/linear_model/plot_lasso_coordinate_descent_path.py @@ -14,12 +14,12 @@ # License: BSD 3 clause from itertools import cycle -import numpy as np + import matplotlib.pyplot as plt +import numpy as np -from sklearn.linear_model import lasso_path, enet_path from sklearn import datasets - +from sklearn.linear_model import enet_path, lasso_path X, y = datasets.load_diabetes(return_X_y=True) diff --git a/examples/linear_model/plot_lasso_dense_vs_sparse_data.py b/examples/linear_model/plot_lasso_dense_vs_sparse_data.py index 8da1820c0b0c4..a797d5d708160 100644 --- a/examples/linear_model/plot_lasso_dense_vs_sparse_data.py +++ b/examples/linear_model/plot_lasso_dense_vs_sparse_data.py @@ -9,13 +9,12 @@ """ from time import time -from scipy import sparse -from scipy import linalg + +from scipy import linalg, sparse from sklearn.datasets import make_regression from sklearn.linear_model import Lasso - # %% # Comparing the two Lasso implementations on Dense data # ----------------------------------------------------- diff --git a/examples/linear_model/plot_lasso_lars.py b/examples/linear_model/plot_lasso_lars.py index 6788b8b1d1598..5444aeec90c65 100644 --- a/examples/linear_model/plot_lasso_lars.py +++ b/examples/linear_model/plot_lasso_lars.py @@ -14,11 +14,10 @@ # Alexandre Gramfort # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np -from sklearn import linear_model -from sklearn import datasets +from sklearn import datasets, linear_model X, y = datasets.load_diabetes(return_X_y=True) diff --git a/examples/linear_model/plot_lasso_lars_ic.py b/examples/linear_model/plot_lasso_lars_ic.py index 95c0d0d66608d..8b265130f2f10 100644 --- a/examples/linear_model/plot_lasso_lars_ic.py +++ b/examples/linear_model/plot_lasso_lars_ic.py @@ -45,9 +45,9 @@ # # In the following, we are going to fit two models to compare the values # reported by AIC and BIC. -from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LassoLarsIC from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler lasso_lars_ic = make_pipeline(StandardScaler(), LassoLarsIC(criterion="aic")).fit(X, y) diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index 7735f01987aa9..169d85ed81644 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -59,9 +59,10 @@ # # We will first fit a Lasso model with the AIC criterion. 
import time -from sklearn.preprocessing import StandardScaler + from sklearn.linear_model import LassoLarsIC from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler start_time = time.time() lasso_lars_ic = make_pipeline(StandardScaler(), LassoLarsIC(criterion="aic")).fit(X, y) diff --git a/examples/linear_model/plot_logistic.py b/examples/linear_model/plot_logistic.py index 801c893e5e28e..6ed3c86e8c27b 100644 --- a/examples/linear_model/plot_logistic.py +++ b/examples/linear_model/plot_logistic.py @@ -15,6 +15,7 @@ import matplotlib.pyplot as plt import numpy as np from scipy.special import expit + from sklearn.linear_model import LinearRegression, LogisticRegression # Generate a toy dataset, it's just a straight line with some Gaussian noise: diff --git a/examples/linear_model/plot_logistic_l1_l2_sparsity.py b/examples/linear_model/plot_logistic_l1_l2_sparsity.py index e8f5a2d51b637..80374d3833151 100644 --- a/examples/linear_model/plot_logistic_l1_l2_sparsity.py +++ b/examples/linear_model/plot_logistic_l1_l2_sparsity.py @@ -20,11 +20,11 @@ # Andreas Mueller # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np -from sklearn.linear_model import LogisticRegression from sklearn import datasets +from sklearn.linear_model import LogisticRegression from sklearn.preprocessing import StandardScaler X, y = datasets.load_digits(return_X_y=True) diff --git a/examples/linear_model/plot_logistic_multinomial.py b/examples/linear_model/plot_logistic_multinomial.py index 814eeadaa68c4..791a788b2238b 100644 --- a/examples/linear_model/plot_logistic_multinomial.py +++ b/examples/linear_model/plot_logistic_multinomial.py @@ -12,11 +12,12 @@ # Authors: Tom Dupre la Tour # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np + from sklearn.datasets import make_blobs -from sklearn.linear_model import LogisticRegression from sklearn.inspection import DecisionBoundaryDisplay +from sklearn.linear_model import LogisticRegression # make 3-class dataset for classification centers = [[-5, 0], [0, 1.5], [5, -1]] diff --git a/examples/linear_model/plot_multi_task_lasso_support.py b/examples/linear_model/plot_multi_task_lasso_support.py index a30b51ed7a7fe..9b6ea64ce4d85 100644 --- a/examples/linear_model/plot_multi_task_lasso_support.py +++ b/examples/linear_model/plot_multi_task_lasso_support.py @@ -39,7 +39,7 @@ # Fit models # ---------- -from sklearn.linear_model import MultiTaskLasso, Lasso +from sklearn.linear_model import Lasso, MultiTaskLasso coef_lasso_ = np.array([Lasso(alpha=0.5).fit(X, y).coef_ for y in Y.T]) coef_multi_task_lasso_ = MultiTaskLasso(alpha=1.0).fit(X, Y).coef_ diff --git a/examples/linear_model/plot_nnls.py b/examples/linear_model/plot_nnls.py index c8ba2914d783a..05a8550ec166b 100644 --- a/examples/linear_model/plot_nnls.py +++ b/examples/linear_model/plot_nnls.py @@ -9,8 +9,9 @@ """ -import numpy as np import matplotlib.pyplot as plt +import numpy as np + from sklearn.metrics import r2_score # %% diff --git a/examples/linear_model/plot_ols.py b/examples/linear_model/plot_ols.py index 0618f545306db..244bd86387474 100644 --- a/examples/linear_model/plot_ols.py +++ b/examples/linear_model/plot_ols.py @@ -19,6 +19,7 @@ import matplotlib.pyplot as plt import numpy as np + from sklearn import datasets, linear_model from sklearn.metrics import mean_squared_error, r2_score diff --git a/examples/linear_model/plot_ols_3d.py b/examples/linear_model/plot_ols_3d.py index 
7288cc9ae6594..0c95d483f1bf3 100644 --- a/examples/linear_model/plot_ols_3d.py +++ b/examples/linear_model/plot_ols_3d.py @@ -16,9 +16,10 @@ # %% # First we load the diabetes dataset. -from sklearn import datasets import numpy as np +from sklearn import datasets + X, y = datasets.load_diabetes(return_X_y=True) indices = (0, 1) diff --git a/examples/linear_model/plot_ols_ridge_variance.py b/examples/linear_model/plot_ols_ridge_variance.py index b02ab193842d4..a03d9c253c1cf 100644 --- a/examples/linear_model/plot_ols_ridge_variance.py +++ b/examples/linear_model/plot_ols_ridge_variance.py @@ -24,8 +24,8 @@ # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np from sklearn import linear_model diff --git a/examples/linear_model/plot_omp.py b/examples/linear_model/plot_omp.py index 9329962cce4f6..aa6044173b8ce 100644 --- a/examples/linear_model/plot_omp.py +++ b/examples/linear_model/plot_omp.py @@ -10,9 +10,9 @@ import matplotlib.pyplot as plt import numpy as np -from sklearn.linear_model import OrthogonalMatchingPursuit -from sklearn.linear_model import OrthogonalMatchingPursuitCV + from sklearn.datasets import make_sparse_coded_signal +from sklearn.linear_model import OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV n_components, n_features = 512, 100 n_nonzero_coefs = 17 diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 46f5c23578b55..cf38ca520f076 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -41,21 +41,18 @@ # Olivier Grisel # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np import pandas as pd - ############################################################################## # The French Motor Third-Party Liability Claims dataset # ----------------------------------------------------- # # Let's load the motor claim dataset from OpenML: # https://www.openml.org/d/41214 - from sklearn.datasets import fetch_openml - df = fetch_openml(data_id=41214, as_frame=True, parser="pandas").frame df @@ -97,11 +94,14 @@ # In order to fit linear models with those predictors it is therefore # necessary to perform standard feature transformations as follows: -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import FunctionTransformer, OneHotEncoder -from sklearn.preprocessing import StandardScaler, KBinsDiscretizer from sklearn.compose import ColumnTransformer - +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import ( + FunctionTransformer, + KBinsDiscretizer, + OneHotEncoder, + StandardScaler, +) log_scale_transformer = make_pipeline( FunctionTransformer(np.log, validate=False), StandardScaler() @@ -139,8 +139,8 @@ # the training sample. 
from sklearn.dummy import DummyRegressor -from sklearn.pipeline import Pipeline from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline df_train, df_test = train_test_split(df, test_size=0.33, random_state=0) @@ -156,9 +156,11 @@ # Let's compute the performance of this constant prediction baseline with 3 # different regression metrics: -from sklearn.metrics import mean_squared_error -from sklearn.metrics import mean_absolute_error -from sklearn.metrics import mean_poisson_deviance +from sklearn.metrics import ( + mean_absolute_error, + mean_poisson_deviance, + mean_squared_error, +) def score_estimator(estimator, df_test): @@ -213,7 +215,6 @@ def score_estimator(estimator, df_test): from sklearn.linear_model import Ridge - ridge_glm = Pipeline( [ ("preprocessor", linear_model_preprocessor), @@ -285,7 +286,6 @@ def score_estimator(estimator, df_test): from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.preprocessing import OrdinalEncoder - tree_preprocessor = ColumnTransformer( [ ( diff --git a/examples/linear_model/plot_polynomial_interpolation.py b/examples/linear_model/plot_polynomial_interpolation.py index ac2fe28de870d..f648b7aea762d 100644 --- a/examples/linear_model/plot_polynomial_interpolation.py +++ b/examples/linear_model/plot_polynomial_interpolation.py @@ -42,13 +42,12 @@ # Malte Londschien # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np from sklearn.linear_model import Ridge -from sklearn.preprocessing import PolynomialFeatures, SplineTransformer from sklearn.pipeline import make_pipeline - +from sklearn.preprocessing import PolynomialFeatures, SplineTransformer # %% # We start by defining a function that we intend to approximate and prepare diff --git a/examples/linear_model/plot_quantile_regression.py b/examples/linear_model/plot_quantile_regression.py index b66434fa1c0c1..715e6129cdef8 100644 --- a/examples/linear_model/plot_quantile_regression.py +++ b/examples/linear_model/plot_quantile_regression.py @@ -111,7 +111,7 @@ # # We will use the quantiles at 5% and 95% to find the outliers in the training # sample beyond the central 90% interval. -from sklearn.utils.fixes import sp_version, parse_version +from sklearn.utils.fixes import parse_version, sp_version # This line is to avoid incompatibility with older SciPy versions. # You should use `solver="highs"` with recent versions of SciPy. @@ -253,8 +253,7 @@ # distributed target to make it more interesting as mean and median are not # equal.
from sklearn.linear_model import LinearRegression -from sklearn.metrics import mean_absolute_error -from sklearn.metrics import mean_squared_error +from sklearn.metrics import mean_absolute_error, mean_squared_error linear_regression = LinearRegression() quantile_regression = QuantileRegressor(quantile=0.5, alpha=0, solver=solver) diff --git a/examples/linear_model/plot_ransac.py b/examples/linear_model/plot_ransac.py index 0301dd0ba0088..7b89150c4bd20 100644 --- a/examples/linear_model/plot_ransac.py +++ b/examples/linear_model/plot_ransac.py @@ -18,8 +18,7 @@ import numpy as np from matplotlib import pyplot as plt -from sklearn import linear_model, datasets - +from sklearn import datasets, linear_model n_samples = 1000 n_outliers = 50 diff --git a/examples/linear_model/plot_ridge_path.py b/examples/linear_model/plot_ridge_path.py index 66f8fd9eb6c23..01f9d45a63f8d 100644 --- a/examples/linear_model/plot_ridge_path.py +++ b/examples/linear_model/plot_ridge_path.py @@ -30,8 +30,9 @@ # Author: Fabian Pedregosa -- # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np + from sklearn import linear_model # X is the 10x10 Hilbert matrix diff --git a/examples/linear_model/plot_robust_fit.py b/examples/linear_model/plot_robust_fit.py index c9fe49fc0d416..79213c9a8e83e 100644 --- a/examples/linear_model/plot_robust_fit.py +++ b/examples/linear_model/plot_robust_fit.py @@ -30,18 +30,18 @@ """ -from matplotlib import pyplot as plt import numpy as np +from matplotlib import pyplot as plt from sklearn.linear_model import ( + HuberRegressor, LinearRegression, - TheilSenRegressor, RANSACRegressor, - HuberRegressor, + TheilSenRegressor, ) from sklearn.metrics import mean_squared_error -from sklearn.preprocessing import PolynomialFeatures from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import PolynomialFeatures np.random.seed(42) diff --git a/examples/linear_model/plot_sgd_comparison.py b/examples/linear_model/plot_sgd_comparison.py index 5ab0d6b1b2827..0477e42cf5947 100644 --- a/examples/linear_model/plot_sgd_comparison.py +++ b/examples/linear_model/plot_sgd_comparison.py @@ -9,14 +9,17 @@ # Author: Rob Zinkov # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt -from sklearn import datasets +import numpy as np +from sklearn import datasets +from sklearn.linear_model import ( + LogisticRegression, + PassiveAggressiveClassifier, + Perceptron, + SGDClassifier, +) from sklearn.model_selection import train_test_split -from sklearn.linear_model import SGDClassifier, Perceptron -from sklearn.linear_model import PassiveAggressiveClassifier -from sklearn.linear_model import LogisticRegression heldout = [0.95, 0.90, 0.75, 0.50, 0.01] # Number of rounds to fit and evaluate an estimator. 
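Every hunk in this patch applies one convention: imports are regrouped into standard-library, third-party, and scikit-learn blocks, in that order and separated by a blank line, with names alphabetized inside each block and repeated `from module import ...` statements merged into one. A minimal sketch of the target style, using illustrative names rather than any particular file from this patch:

import time  # standard-library imports come first

import matplotlib.pyplot as plt  # third-party packages next, alphabetized
import numpy as np

from sklearn.linear_model import (  # first-party (scikit-learn) imports last;
    LogisticRegression,  # several names from one module are merged into a
    SGDClassifier,  # single, possibly parenthesized, statement
)
from sklearn.pipeline import make_pipeline

This is the ordering an import sorter such as isort produces with its black-compatible profile; whether these hunks were generated by such a tool or by hand is an assumption, as the patch itself does not say.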
diff --git a/examples/linear_model/plot_sgd_early_stopping.py b/examples/linear_model/plot_sgd_early_stopping.py index 4fb884804492d..6713a74342ba2 100644 --- a/examples/linear_model/plot_sgd_early_stopping.py +++ b/examples/linear_model/plot_sgd_early_stopping.py @@ -41,19 +41,19 @@ # # License: BSD 3 clause -import time import sys +import time -import pandas as pd -import numpy as np import matplotlib.pyplot as plt +import numpy as np +import pandas as pd from sklearn import linear_model from sklearn.datasets import fetch_openml -from sklearn.model_selection import train_test_split -from sklearn.utils._testing import ignore_warnings from sklearn.exceptions import ConvergenceWarning +from sklearn.model_selection import train_test_split from sklearn.utils import shuffle +from sklearn.utils._testing import ignore_warnings def load_mnist(n_samples=None, class_0="0", class_1="8"): diff --git a/examples/linear_model/plot_sgd_iris.py b/examples/linear_model/plot_sgd_iris.py index 64dca07396d54..5d9b923f9b444 100644 --- a/examples/linear_model/plot_sgd_iris.py +++ b/examples/linear_model/plot_sgd_iris.py @@ -9,11 +9,12 @@ """ -import numpy as np import matplotlib.pyplot as plt +import numpy as np + from sklearn import datasets -from sklearn.linear_model import SGDClassifier from sklearn.inspection import DecisionBoundaryDisplay +from sklearn.linear_model import SGDClassifier # import some data to play with iris = datasets.load_iris() diff --git a/examples/linear_model/plot_sgd_loss_functions.py b/examples/linear_model/plot_sgd_loss_functions.py index a1f74dca4d6af..140562184b946 100644 --- a/examples/linear_model/plot_sgd_loss_functions.py +++ b/examples/linear_model/plot_sgd_loss_functions.py @@ -8,8 +8,8 @@ """ -import numpy as np import matplotlib.pyplot as plt +import numpy as np def modified_huber_loss(y_true, y_pred): diff --git a/examples/linear_model/plot_sgd_penalties.py b/examples/linear_model/plot_sgd_penalties.py index 0413751fb41a9..ff71dba5f20a3 100644 --- a/examples/linear_model/plot_sgd_penalties.py +++ b/examples/linear_model/plot_sgd_penalties.py @@ -11,8 +11,8 @@ """ -import numpy as np import matplotlib.pyplot as plt +import numpy as np l1_color = "navy" l2_color = "c" diff --git a/examples/linear_model/plot_sgd_separating_hyperplane.py b/examples/linear_model/plot_sgd_separating_hyperplane.py index af288fcd3dde0..e84ab7c519ae9 100644 --- a/examples/linear_model/plot_sgd_separating_hyperplane.py +++ b/examples/linear_model/plot_sgd_separating_hyperplane.py @@ -9,10 +9,11 @@ """ -import numpy as np import matplotlib.pyplot as plt -from sklearn.linear_model import SGDClassifier +import numpy as np + from sklearn.datasets import make_blobs +from sklearn.linear_model import SGDClassifier # we create 50 separable points X, Y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60) diff --git a/examples/linear_model/plot_sgd_weighted_samples.py b/examples/linear_model/plot_sgd_weighted_samples.py index 2db52042b075f..4d605e99b4e49 100644 --- a/examples/linear_model/plot_sgd_weighted_samples.py +++ b/examples/linear_model/plot_sgd_weighted_samples.py @@ -8,8 +8,9 @@ """ -import numpy as np import matplotlib.pyplot as plt +import numpy as np + from sklearn import linear_model # we create 20 points diff --git a/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py b/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py index c25f4a84d91e0..2f03768f50532 100644 --- a/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py +++ b/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py @@ -19,13 
+19,14 @@ """ # noqa: E501 -import numpy as np -import matplotlib.pyplot as plt import matplotlib -from sklearn.svm import OneClassSVM -from sklearn.linear_model import SGDOneClassSVM +import matplotlib.pyplot as plt +import numpy as np + from sklearn.kernel_approximation import Nystroem +from sklearn.linear_model import SGDOneClassSVM from sklearn.pipeline import make_pipeline +from sklearn.svm import OneClassSVM font = {"weight": "normal", "size": 15} diff --git a/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py b/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py index 507dda5c76901..f62208aab154a 100644 --- a/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py +++ b/examples/linear_model/plot_sparse_logistic_regression_20newsgroups.py @@ -29,9 +29,9 @@ import numpy as np from sklearn.datasets import fetch_20newsgroups_vectorized +from sklearn.exceptions import ConvergenceWarning from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split -from sklearn.exceptions import ConvergenceWarning warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn") t0 = timeit.default_timer() diff --git a/examples/linear_model/plot_sparse_logistic_regression_mnist.py b/examples/linear_model/plot_sparse_logistic_regression_mnist.py index 37327aeaa4cb7..e6746b8fb0896 100644 --- a/examples/linear_model/plot_sparse_logistic_regression_mnist.py +++ b/examples/linear_model/plot_sparse_logistic_regression_mnist.py @@ -21,6 +21,7 @@ # License: BSD 3 clause import time + import matplotlib.pyplot as plt import numpy as np diff --git a/examples/linear_model/plot_theilsen.py b/examples/linear_model/plot_theilsen.py index b380baf705a76..eb0ac4966841d 100644 --- a/examples/linear_model/plot_theilsen.py +++ b/examples/linear_model/plot_theilsen.py @@ -39,10 +39,11 @@ # License: BSD 3 clause import time -import numpy as np + import matplotlib.pyplot as plt -from sklearn.linear_model import LinearRegression, TheilSenRegressor -from sklearn.linear_model import RANSACRegressor +import numpy as np + +from sklearn.linear_model import LinearRegression, RANSACRegressor, TheilSenRegressor estimators = [ ("OLS", LinearRegression()), diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 1d7a5c5ed179f..2ee4b4b18fd7b 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -46,14 +46,16 @@ from functools import partial -import numpy as np import matplotlib.pyplot as plt +import numpy as np import pandas as pd from sklearn.datasets import fetch_openml -from sklearn.metrics import mean_tweedie_deviance -from sklearn.metrics import mean_absolute_error -from sklearn.metrics import mean_squared_error +from sklearn.metrics import ( + mean_absolute_error, + mean_squared_error, + mean_tweedie_deviance, +) def load_mtpl2(n_samples=None): @@ -209,11 +211,14 @@ def score_estimator( # containing the number of claims (``ClaimNb``), with the freMTPL2sev table, # containing the claim amount (``ClaimAmount``) for the same policy ids # (``IDpol``). 
-from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import FunctionTransformer, OneHotEncoder -from sklearn.preprocessing import StandardScaler, KBinsDiscretizer from sklearn.compose import ColumnTransformer - +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import ( + FunctionTransformer, + KBinsDiscretizer, + OneHotEncoder, + StandardScaler, +) df = load_mtpl2() @@ -274,9 +279,8 @@ def score_estimator( # constant rate in a given time interval (``Exposure``, in units of years). # Here we model the frequency ``y = ClaimNb / Exposure``, which is still a # (scaled) Poisson distribution, and use ``Exposure`` as `sample_weight`. -from sklearn.model_selection import train_test_split from sklearn.linear_model import PoissonRegressor - +from sklearn.model_selection import train_test_split df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) @@ -396,7 +400,6 @@ def score_estimator( # more than one claim. from sklearn.linear_model import GammaRegressor - mask_train = df_train["ClaimAmount"] > 0 mask_test = df_test["ClaimAmount"] > 0 @@ -540,7 +543,6 @@ def score_estimator( # regardless of `power`. from sklearn.linear_model import TweedieRegressor - glm_pure_premium = TweedieRegressor(power=1.9, alpha=0.1, solver="newton-cholesky") glm_pure_premium.fit( X_train, df_train["PurePremium"], sample_weight=df_train["Exposure"] diff --git a/examples/manifold/plot_compare_methods.py b/examples/manifold/plot_compare_methods.py index 3773f11605241..88ce0f26b8dde 100644 --- a/examples/manifold/plot_compare_methods.py +++ b/examples/manifold/plot_compare_methods.py @@ -29,12 +29,12 @@ # We start by generating the S-curve dataset. import matplotlib.pyplot as plt -from matplotlib import ticker # unused but required import for doing 3d projections with matplotlib < 3.2 import mpl_toolkits.mplot3d # noqa: F401 +from matplotlib import ticker -from sklearn import manifold, datasets +from sklearn import datasets, manifold n_samples = 1500 S_points, S_color = datasets.make_s_curve(n_samples, random_state=0) diff --git a/examples/manifold/plot_lle_digits.py b/examples/manifold/plot_lle_digits.py index 7d4b6610cee49..4424d700789ff 100644 --- a/examples/manifold/plot_lle_digits.py +++ b/examples/manifold/plot_lle_digits.py @@ -45,6 +45,7 @@ # scattered across it. 
import numpy as np from matplotlib import offsetbox + from sklearn.preprocessing import MinMaxScaler @@ -103,11 +104,11 @@ def plot_embedding(X, title): from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.ensemble import RandomTreesEmbedding from sklearn.manifold import ( + MDS, + TSNE, Isomap, LocallyLinearEmbedding, - MDS, SpectralEmbedding, - TSNE, ) from sklearn.neighbors import NeighborhoodComponentsAnalysis from sklearn.pipeline import make_pipeline diff --git a/examples/manifold/plot_manifold_sphere.py b/examples/manifold/plot_manifold_sphere.py index 46db3f9f60e6d..624206ff4d5e0 100644 --- a/examples/manifold/plot_manifold_sphere.py +++ b/examples/manifold/plot_manifold_sphere.py @@ -29,14 +29,16 @@ # License: BSD 3 clause from time import time -import numpy as np + import matplotlib.pyplot as plt -from matplotlib.ticker import NullFormatter -from sklearn import manifold -from sklearn.utils import check_random_state # Unused but required import for doing 3d projections with matplotlib < 3.2 import mpl_toolkits.mplot3d # noqa: F401 +import numpy as np +from matplotlib.ticker import NullFormatter + +from sklearn import manifold +from sklearn.utils import check_random_state # Variables for manifold learning. n_neighbors = 10 diff --git a/examples/manifold/plot_mds.py b/examples/manifold/plot_mds.py index 51f9745a33f59..2bb56f1f4ed2a 100644 --- a/examples/manifold/plot_mds.py +++ b/examples/manifold/plot_mds.py @@ -14,13 +14,12 @@ # License: BSD import numpy as np - from matplotlib import pyplot as plt from matplotlib.collections import LineCollection from sklearn import manifold -from sklearn.metrics import euclidean_distances from sklearn.decomposition import PCA +from sklearn.metrics import euclidean_distances EPSILON = np.finfo(np.float32).eps n_samples = 20 diff --git a/examples/manifold/plot_swissroll.py b/examples/manifold/plot_swissroll.py index 4a71eb83cc972..fe17d9f80030f 100644 --- a/examples/manifold/plot_swissroll.py +++ b/examples/manifold/plot_swissroll.py @@ -15,8 +15,8 @@ # We start by generating the Swiss Roll dataset. 
import matplotlib.pyplot as plt -from sklearn import manifold, datasets +from sklearn import datasets, manifold sr_points, sr_color = datasets.make_swiss_roll(n_samples=1500, random_state=0) diff --git a/examples/manifold/plot_t_sne_perplexity.py b/examples/manifold/plot_t_sne_perplexity.py index 014114a8a37d7..314458427f593 100644 --- a/examples/manifold/plot_t_sne_perplexity.py +++ b/examples/manifold/plot_t_sne_perplexity.py @@ -27,12 +27,13 @@ # Author: Narine Kokhlikyan # License: BSD -import numpy as np -import matplotlib.pyplot as plt +from time import time + +import matplotlib.pyplot as plt +import numpy as np from matplotlib.ticker import NullFormatter -from sklearn import manifold, datasets -from time import time + +from sklearn import datasets, manifold n_samples = 150 n_components = 2 diff --git a/examples/miscellaneous/plot_anomaly_comparison.py b/examples/miscellaneous/plot_anomaly_comparison.py index ef274bf98fbe5..7fb6b71e2a5c6 100644 --- a/examples/miscellaneous/plot_anomaly_comparison.py +++ b/examples/miscellaneous/plot_anomaly_comparison.py @@ -68,17 +68,17 @@ import time -import numpy as np import matplotlib import matplotlib.pyplot as plt +import numpy as np from sklearn import svm -from sklearn.datasets import make_moons, make_blobs from sklearn.covariance import EllipticEnvelope +from sklearn.datasets import make_blobs, make_moons from sklearn.ensemble import IsolationForest -from sklearn.neighbors import LocalOutlierFactor -from sklearn.linear_model import SGDOneClassSVM from sklearn.kernel_approximation import Nystroem +from sklearn.linear_model import SGDOneClassSVM +from sklearn.neighbors import LocalOutlierFactor from sklearn.pipeline import make_pipeline matplotlib.rcParams["contour.negative_linestyle"] = "solid" diff --git a/examples/miscellaneous/plot_display_object_visualization.py b/examples/miscellaneous/plot_display_object_visualization.py index f108beced7a00..24095de3b5cae 100644 --- a/examples/miscellaneous/plot_display_object_visualization.py +++ b/examples/miscellaneous/plot_display_object_visualization.py @@ -24,10 +24,10 @@ # data is split into a train and test dataset and a logistic regression is # fitted with the train dataset. from sklearn.datasets import fetch_openml -from sklearn.preprocessing import StandardScaler -from sklearn.pipeline import make_pipeline from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler X, y = fetch_openml(data_id=1464, return_X_y=True, parser="pandas") X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y) @@ -41,8 +41,7 @@ # With the fitted model, we compute the predictions of the model on the test # dataset. These predictions are used to compute the confusion matrix which # is plotted with the :class:`ConfusionMatrixDisplay` -from sklearn.metrics import confusion_matrix -from sklearn.metrics import ConfusionMatrixDisplay +from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix y_pred = clf.predict(X_test) cm = confusion_matrix(y_test, y_pred) @@ -56,8 +55,7 @@ # The roc curve requires either the probabilities or the non-thresholded # decision values from the estimator.
Since the logistic regression provides # a decision function, we will use it to plot the roc curve: -from sklearn.metrics import roc_curve -from sklearn.metrics import RocCurveDisplay +from sklearn.metrics import RocCurveDisplay, roc_curve y_score = clf.decision_function(X_test) @@ -69,8 +67,7 @@ ############################################################################## # Similarly, the precision recall curve can be plotted using `y_score` from # the previous sections. -from sklearn.metrics import precision_recall_curve -from sklearn.metrics import PrecisionRecallDisplay +from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve prec, recall, _ = precision_recall_curve(y_test, y_score, pos_label=clf.classes_[1]) pr_display = PrecisionRecallDisplay(precision=prec, recall=recall).plot() diff --git a/examples/miscellaneous/plot_estimator_representation.py b/examples/miscellaneous/plot_estimator_representation.py index 304bb055e6762..1c9e3745db0de 100644 --- a/examples/miscellaneous/plot_estimator_representation.py +++ b/examples/miscellaneous/plot_estimator_representation.py @@ -7,12 +7,11 @@ displayed. """ -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import OneHotEncoder, StandardScaler -from sklearn.impute import SimpleImputer from sklearn.compose import make_column_transformer +from sklearn.impute import SimpleImputer from sklearn.linear_model import LogisticRegression - +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import OneHotEncoder, StandardScaler # %% # Compact text representation diff --git a/examples/miscellaneous/plot_isotonic_regression.py b/examples/miscellaneous/plot_isotonic_regression.py index 0240a8dec34b5..a1c1174c9e9de 100644 --- a/examples/miscellaneous/plot_isotonic_regression.py +++ b/examples/miscellaneous/plot_isotonic_regression.py @@ -23,12 +23,12 @@ # Alexandre Gramfort # License: BSD -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib.collections import LineCollection -from sklearn.linear_model import LinearRegression from sklearn.isotonic import IsotonicRegression +from sklearn.linear_model import LinearRegression from sklearn.utils import check_random_state n = 100 diff --git a/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py b/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py index 6fd9d3614804c..85161a6ee51bb 100644 --- a/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py +++ b/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py @@ -15,13 +15,16 @@ import sys from time import time -import numpy as np + import matplotlib.pyplot as plt -from sklearn.random_projection import johnson_lindenstrauss_min_dim -from sklearn.random_projection import SparseRandomProjection -from sklearn.datasets import fetch_20newsgroups_vectorized -from sklearn.datasets import load_digits +import numpy as np + +from sklearn.datasets import fetch_20newsgroups_vectorized, load_digits from sklearn.metrics.pairwise import euclidean_distances +from sklearn.random_projection import ( + SparseRandomProjection, + johnson_lindenstrauss_min_dim, +) # %% # Theoretical bounds diff --git a/examples/miscellaneous/plot_kernel_approximation.py b/examples/miscellaneous/plot_kernel_approximation.py index ffb5d3940a055..7faf7900c7eb5 100644 --- a/examples/miscellaneous/plot_kernel_approximation.py +++ b/examples/miscellaneous/plot_kernel_approximation.py @@ -39,14 +39,15 @@ # License: BSD 3 clause # Standard scientific Python imports +from time import time +
import matplotlib.pyplot as plt import numpy as np -from time import time # Import datasets, classifiers and performance metrics -from sklearn import datasets, svm, pipeline -from sklearn.kernel_approximation import RBFSampler, Nystroem +from sklearn import datasets, pipeline, svm from sklearn.decomposition import PCA +from sklearn.kernel_approximation import Nystroem, RBFSampler # The digits dataset digits = datasets.load_digits(n_class=9) diff --git a/examples/miscellaneous/plot_kernel_ridge_regression.py b/examples/miscellaneous/plot_kernel_ridge_regression.py index fa7cb15446473..6d2288936179a 100644 --- a/examples/miscellaneous/plot_kernel_ridge_regression.py +++ b/examples/miscellaneous/plot_kernel_ridge_regression.py @@ -40,9 +40,9 @@ # Construct the kernel-based regression models # -------------------------------------------- +from sklearn.kernel_ridge import KernelRidge from sklearn.model_selection import GridSearchCV from sklearn.svm import SVR -from sklearn.kernel_ridge import KernelRidge train_size = 100 diff --git a/examples/miscellaneous/plot_metadata_routing.py b/examples/miscellaneous/plot_metadata_routing.py index 81e3b6fc9a01d..b189f320cdd24 100644 --- a/examples/miscellaneous/plot_metadata_routing.py +++ b/examples/miscellaneous/plot_metadata_routing.py @@ -22,23 +22,29 @@ """ # %% -import numpy as np import warnings from pprint import pprint + +import numpy as np + from sklearn import set_config -from sklearn.base import BaseEstimator -from sklearn.base import ClassifierMixin -from sklearn.base import RegressorMixin -from sklearn.base import MetaEstimatorMixin -from sklearn.base import TransformerMixin -from sklearn.base import clone +from sklearn.base import ( + BaseEstimator, + ClassifierMixin, + MetaEstimatorMixin, + RegressorMixin, + TransformerMixin, + clone, +) +from sklearn.linear_model import LinearRegression from sklearn.utils import metadata_routing -from sklearn.utils.metadata_routing import get_routing_for_object -from sklearn.utils.metadata_routing import MetadataRouter -from sklearn.utils.metadata_routing import MethodMapping -from sklearn.utils.metadata_routing import process_routing +from sklearn.utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + get_routing_for_object, + process_routing, +) from sklearn.utils.validation import check_is_fitted -from sklearn.linear_model import LinearRegression n_samples, n_features = 100, 4 rng = np.random.RandomState(42) diff --git a/examples/miscellaneous/plot_multilabel.py b/examples/miscellaneous/plot_multilabel.py index aded595258fea..b424c3253104a 100644 --- a/examples/miscellaneous/plot_multilabel.py +++ b/examples/miscellaneous/plot_multilabel.py @@ -32,14 +32,14 @@ # Authors: Vlad Niculae, Mathieu Blondel # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt +import numpy as np +from sklearn.cross_decomposition import CCA from sklearn.datasets import make_multilabel_classification +from sklearn.decomposition import PCA from sklearn.multiclass import OneVsRestClassifier from sklearn.svm import SVC -from sklearn.decomposition import PCA -from sklearn.cross_decomposition import CCA def plot_hyperplane(clf, min_x, max_x, linestyle, label): diff --git a/examples/miscellaneous/plot_multioutput_face_completion.py b/examples/miscellaneous/plot_multioutput_face_completion.py index 31e73195747a5..62070bc05e488 100644 --- a/examples/miscellaneous/plot_multioutput_face_completion.py +++ b/examples/miscellaneous/plot_multioutput_face_completion.py @@ -12,16 +12,14 @@ """ -import numpy 
as np import matplotlib.pyplot as plt +import numpy as np from sklearn.datasets import fetch_olivetti_faces -from sklearn.utils.validation import check_random_state - from sklearn.ensemble import ExtraTreesRegressor +from sklearn.linear_model import LinearRegression, RidgeCV from sklearn.neighbors import KNeighborsRegressor -from sklearn.linear_model import LinearRegression -from sklearn.linear_model import RidgeCV +from sklearn.utils.validation import check_random_state # Load the faces datasets data, targets = fetch_olivetti_faces(return_X_y=True) diff --git a/examples/miscellaneous/plot_outlier_detection_bench.py b/examples/miscellaneous/plot_outlier_detection_bench.py index 781fa515f50e8..9b530ccab0807 100644 --- a/examples/miscellaneous/plot_outlier_detection_bench.py +++ b/examples/miscellaneous/plot_outlier_detection_bench.py @@ -32,10 +32,11 @@ # The `preprocess_dataset` function returns data and target. import numpy as np -from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml -from sklearn.preprocessing import LabelBinarizer import pandas as pd +from sklearn.datasets import fetch_covtype, fetch_kddcup99, fetch_openml +from sklearn.preprocessing import LabelBinarizer + rng = np.random.RandomState(42) @@ -117,8 +118,8 @@ def preprocess_dataset(dataset_name): # `compute_prediction` function returns average outlier score of X. -from sklearn.neighbors import LocalOutlierFactor from sklearn.ensemble import IsolationForest +from sklearn.neighbors import LocalOutlierFactor def compute_prediction(X, model_name): @@ -145,7 +146,9 @@ def compute_prediction(X, model_name): import math + import matplotlib.pyplot as plt + from sklearn.metrics import RocCurveDisplay datasets_name = [ diff --git a/examples/miscellaneous/plot_partial_dependence_visualization_api.py b/examples/miscellaneous/plot_partial_dependence_visualization_api.py index 336d7c36d1661..38a984fa5b0cd 100644 --- a/examples/miscellaneous/plot_partial_dependence_visualization_api.py +++ b/examples/miscellaneous/plot_partial_dependence_visualization_api.py @@ -13,15 +13,15 @@ """ # noqa: E501 -import pandas as pd import matplotlib.pyplot as plt +import pandas as pd + from sklearn.datasets import load_diabetes +from sklearn.inspection import PartialDependenceDisplay from sklearn.neural_network import MLPRegressor -from sklearn.preprocessing import StandardScaler from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler from sklearn.tree import DecisionTreeRegressor -from sklearn.inspection import PartialDependenceDisplay - # %% # Train models on the diabetes dataset diff --git a/examples/miscellaneous/plot_pipeline_display.py b/examples/miscellaneous/plot_pipeline_display.py index f0fea8d2f3a27..9642bb56b903f 100755 --- a/examples/miscellaneous/plot_pipeline_display.py +++ b/examples/miscellaneous/plot_pipeline_display.py @@ -19,10 +19,10 @@ # :class:`~sklearn.linear_model.LogisticRegression`, and displays its visual # representation. +from sklearn import set_config +from sklearn.linear_model import LogisticRegression from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler -from sklearn.linear_model import LogisticRegression -from sklearn import set_config steps = [ ("preprocessing", StandardScaler()), @@ -53,9 +53,9 @@ # :class:`~sklearn.linear_model.LogisticRegression`, and displays its visual # representation. 
-from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler, PolynomialFeatures from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import PolynomialFeatures, StandardScaler steps = [ ("standard_scaler", StandardScaler()), @@ -73,9 +73,9 @@ # a classifier, :class:`~sklearn.svm.SVC`, and displays its visual # representation. +from sklearn.decomposition import PCA from sklearn.pipeline import Pipeline from sklearn.svm import SVC -from sklearn.decomposition import PCA steps = [("reduce_dim", PCA(n_components=4)), ("classifier", SVC(kernel="linear"))] pipe = Pipeline(steps) @@ -90,12 +90,12 @@ # representation. import numpy as np -from sklearn.pipeline import make_pipeline -from sklearn.pipeline import Pipeline -from sklearn.impute import SimpleImputer + from sklearn.compose import ColumnTransformer -from sklearn.preprocessing import OneHotEncoder, StandardScaler +from sklearn.impute import SimpleImputer from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.preprocessing import OneHotEncoder, StandardScaler numeric_preprocessor = Pipeline( steps=[ @@ -133,13 +133,13 @@ # representation. import numpy as np -from sklearn.pipeline import make_pipeline -from sklearn.pipeline import Pipeline -from sklearn.impute import SimpleImputer + from sklearn.compose import ColumnTransformer -from sklearn.preprocessing import OneHotEncoder, StandardScaler from sklearn.ensemble import RandomForestClassifier +from sklearn.impute import SimpleImputer from sklearn.model_selection import GridSearchCV +from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.preprocessing import OneHotEncoder, StandardScaler numeric_preprocessor = Pipeline( steps=[ diff --git a/examples/miscellaneous/plot_roc_curve_visualization_api.py b/examples/miscellaneous/plot_roc_curve_visualization_api.py index b4e08493c77d4..7fc8df9724337 100644 --- a/examples/miscellaneous/plot_roc_curve_visualization_api.py +++ b/examples/miscellaneous/plot_roc_curve_visualization_api.py @@ -15,11 +15,12 @@ # First, we load the wine dataset and convert it to a binary classification # problem. Then, we train a support vector classifier on a training dataset. import matplotlib.pyplot as plt -from sklearn.svm import SVC + +from sklearn.datasets import load_wine from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import RocCurveDisplay -from sklearn.datasets import load_wine from sklearn.model_selection import train_test_split +from sklearn.svm import SVC X, y = load_wine(return_X_y=True) y = y == 2 diff --git a/examples/miscellaneous/plot_set_output.py b/examples/miscellaneous/plot_set_output.py index a2088ae48adc3..725e04894614c 100644 --- a/examples/miscellaneous/plot_set_output.py +++ b/examples/miscellaneous/plot_set_output.py @@ -48,9 +48,9 @@ # %% # In a :class:`pipeline.Pipeline`, `set_output` configures all steps to output # DataFrames. -from sklearn.pipeline import make_pipeline -from sklearn.linear_model import LogisticRegression from sklearn.feature_selection import SelectPercentile +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import make_pipeline clf = make_pipeline( StandardScaler(), SelectPercentile(percentile=75), LogisticRegression() @@ -76,10 +76,10 @@ # %% # The `set_output` API can be configured globally by using :func:`set_config` and # setting `transform_output` to `"pandas"`. 
+from sklearn import set_config
 from sklearn.compose import ColumnTransformer
-from sklearn.preprocessing import OneHotEncoder, StandardScaler
 from sklearn.impute import SimpleImputer
-from sklearn import set_config
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
 
 set_config(transform_output="pandas")
diff --git a/examples/mixture/plot_concentration_prior.py b/examples/mixture/plot_concentration_prior.py
index a56ec6325068b..6561186adb119 100644
--- a/examples/mixture/plot_concentration_prior.py
+++ b/examples/mixture/plot_concentration_prior.py
@@ -32,10 +32,10 @@
 # Author: Thierry Guillemot
 # License: BSD 3 clause
 
-import numpy as np
 import matplotlib as mpl
-import matplotlib.pyplot as plt
 import matplotlib.gridspec as gridspec
+import matplotlib.pyplot as plt
+import numpy as np
 
 from sklearn.mixture import BayesianGaussianMixture
diff --git a/examples/mixture/plot_gmm.py b/examples/mixture/plot_gmm.py
index efc89baa8159a..82e48a8d13eb0 100644
--- a/examples/mixture/plot_gmm.py
+++ b/examples/mixture/plot_gmm.py
@@ -26,10 +26,10 @@
 
 import itertools
 
+import matplotlib as mpl
+import matplotlib.pyplot as plt
 import numpy as np
 from scipy import linalg
-import matplotlib.pyplot as plt
-import matplotlib as mpl
 
 from sklearn import mixture
diff --git a/examples/mixture/plot_gmm_covariances.py b/examples/mixture/plot_gmm_covariances.py
index aa0b78ab42a0b..9466e11749966 100644
--- a/examples/mixture/plot_gmm_covariances.py
+++ b/examples/mixture/plot_gmm_covariances.py
@@ -33,7 +33,6 @@
 
 import matplotlib as mpl
 import matplotlib.pyplot as plt
-
 import numpy as np
 
 from sklearn import datasets
diff --git a/examples/mixture/plot_gmm_init.py b/examples/mixture/plot_gmm_init.py
index 3b4beefe8c99a..aa0266c98ff7a 100644
--- a/examples/mixture/plot_gmm_init.py
+++ b/examples/mixture/plot_gmm_init.py
@@ -37,12 +37,14 @@
 # Author: Gordon Walsh
 # Data generation code from Jake Vanderplas
 
+from timeit import default_timer as timer
+
 import matplotlib.pyplot as plt
 import numpy as np
+
+from sklearn.datasets._samples_generator import make_blobs
 from sklearn.mixture import GaussianMixture
 from sklearn.utils.extmath import row_norms
-from sklearn.datasets._samples_generator import make_blobs
-from timeit import default_timer as timer
 
 print(__doc__)
diff --git a/examples/mixture/plot_gmm_pdf.py b/examples/mixture/plot_gmm_pdf.py
index 70d58f22f8f41..062bdfd4d6d67 100644
--- a/examples/mixture/plot_gmm_pdf.py
+++ b/examples/mixture/plot_gmm_pdf.py
@@ -9,9 +9,10 @@
 
 """
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
 from matplotlib.colors import LogNorm
+
 from sklearn import mixture
 
 n_samples = 300
diff --git a/examples/mixture/plot_gmm_sin.py b/examples/mixture/plot_gmm_sin.py
index c8656a69fe9fb..34af17b8920bc 100644
--- a/examples/mixture/plot_gmm_sin.py
+++ b/examples/mixture/plot_gmm_sin.py
@@ -41,10 +41,10 @@
 
 import itertools
 
+import matplotlib as mpl
+import matplotlib.pyplot as plt
 import numpy as np
 from scipy import linalg
-import matplotlib.pyplot as plt
-import matplotlib as mpl
 
 from sklearn import mixture
diff --git a/examples/model_selection/plot_confusion_matrix.py b/examples/model_selection/plot_confusion_matrix.py
index b891564db4025..278083a994e58 100644
--- a/examples/model_selection/plot_confusion_matrix.py
+++ b/examples/model_selection/plot_confusion_matrix.py
@@ -24,12 +24,12 @@
 
 """
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
 
-from sklearn import svm, datasets
-from sklearn.model_selection import train_test_split
+from sklearn import datasets, svm
 from sklearn.metrics import ConfusionMatrixDisplay
+from sklearn.model_selection import train_test_split
 
 # import some data to play with
 iris = datasets.load_iris()
diff --git a/examples/model_selection/plot_cv_indices.py b/examples/model_selection/plot_cv_indices.py
index 8b70191e4abd1..e6c3580c787f0 100644
--- a/examples/model_selection/plot_cv_indices.py
+++ b/examples/model_selection/plot_cv_indices.py
@@ -12,19 +12,20 @@
 
 """
 
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib.patches import Patch
+
 from sklearn.model_selection import (
-    TimeSeriesSplit,
+    GroupKFold,
+    GroupShuffleSplit,
     KFold,
     ShuffleSplit,
+    StratifiedGroupKFold,
     StratifiedKFold,
-    GroupShuffleSplit,
-    GroupKFold,
     StratifiedShuffleSplit,
-    StratifiedGroupKFold,
+    TimeSeriesSplit,
 )
-import numpy as np
-import matplotlib.pyplot as plt
-from matplotlib.patches import Patch
 
 rng = np.random.RandomState(1338)
 cmap_data = plt.cm.Paired
diff --git a/examples/model_selection/plot_cv_predict.py b/examples/model_selection/plot_cv_predict.py
index 7fd843c535c85..65517d85f3fd1 100644
--- a/examples/model_selection/plot_cv_predict.py
+++ b/examples/model_selection/plot_cv_predict.py
@@ -37,6 +37,7 @@
 # residuals (i.e. the difference between the observed values and the predicted
 # values) vs. the predicted values.
 import matplotlib.pyplot as plt
+
 from sklearn.metrics import PredictionErrorDisplay
 
 fig, axs = plt.subplots(ncols=2, figsize=(8, 4))
diff --git a/examples/model_selection/plot_det.py b/examples/model_selection/plot_det.py
index 97dbe771e6407..7f7a5be32f976 100644
--- a/examples/model_selection/plot_det.py
+++ b/examples/model_selection/plot_det.py
@@ -82,6 +82,7 @@
 # :func:`scipy.stats.norm`.
 
 import matplotlib.pyplot as plt
+
 from sklearn.metrics import DetCurveDisplay, RocCurveDisplay
 
 fig, [ax_roc, ax_det] = plt.subplots(1, 2, figsize=(11, 5))
diff --git a/examples/model_selection/plot_grid_search_refit_callable.py b/examples/model_selection/plot_grid_search_refit_callable.py
index 7a7dd8ea3e463..a8dab986a48d2 100644
--- a/examples/model_selection/plot_grid_search_refit_callable.py
+++ b/examples/model_selection/plot_grid_search_refit_callable.py
@@ -20,8 +20,8 @@
 
 # Author: Wenhao Zhang
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
 
 from sklearn.datasets import load_digits
 from sklearn.decomposition import PCA
diff --git a/examples/model_selection/plot_grid_search_stats.py b/examples/model_selection/plot_grid_search_stats.py
index 179d860b42128..fbeb485d8db44 100644
--- a/examples/model_selection/plot_grid_search_stats.py
+++ b/examples/model_selection/plot_grid_search_stats.py
@@ -16,6 +16,7 @@
 
 import matplotlib.pyplot as plt
 import seaborn as sns
+
 from sklearn.datasets import make_moons
 
 X, y = make_moons(noise=0.352, random_state=1, n_samples=100)
diff --git a/examples/model_selection/plot_grid_search_text_feature_extraction.py b/examples/model_selection/plot_grid_search_text_feature_extraction.py
index 9ad4296aad9b4..17c2e2bfd5d99 100644
--- a/examples/model_selection/plot_grid_search_text_feature_extraction.py
+++ b/examples/model_selection/plot_grid_search_text_feature_extraction.py
@@ -105,6 +105,7 @@
 # via the parameter `n_jobs`.
 from pprint import pprint
+
 from sklearn.model_selection import RandomizedSearchCV
 
 random_search = RandomizedSearchCV(
diff --git a/examples/model_selection/plot_learning_curve.py b/examples/model_selection/plot_learning_curve.py
index 956c70aaabd82..450392679095f 100644
--- a/examples/model_selection/plot_learning_curve.py
+++ b/examples/model_selection/plot_learning_curve.py
@@ -38,6 +38,7 @@
 # a cross-validation procedure.
 import matplotlib.pyplot as plt
 import numpy as np
+
 from sklearn.model_selection import LearningCurveDisplay, ShuffleSplit
 
 fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 6), sharey=True)
diff --git a/examples/model_selection/plot_likelihood_ratios.py b/examples/model_selection/plot_likelihood_ratios.py
index e6ec94fc50cf9..9a3f29def9e98 100644
--- a/examples/model_selection/plot_likelihood_ratios.py
+++ b/examples/model_selection/plot_likelihood_ratios.py
@@ -55,8 +55,8 @@ class proportion than the target application.
 # ratio to evaluate the usefulness of this classifier as a disease diagnosis
 # tool:
 
-from sklearn.metrics import class_likelihood_ratios
 from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import class_likelihood_ratios
 
 estimator = LogisticRegression().fit(X_train, y_train)
 y_pred = estimator.predict(X_test)
@@ -166,10 +166,12 @@ def extract_score(cv_results):
 # label `1` corresponds to the positive class "disease", whereas the label `0`
 # stands for "no-disease".
 
-import numpy as np
+from collections import defaultdict
+
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn.inspection import DecisionBoundaryDisplay
-from collections import defaultdict
 
 populations = defaultdict(list)
 common_params = {
diff --git a/examples/model_selection/plot_multi_metric_evaluation.py b/examples/model_selection/plot_multi_metric_evaluation.py
index e47e67e086ccb..674bf8bc1b07c 100644
--- a/examples/model_selection/plot_multi_metric_evaluation.py
+++ b/examples/model_selection/plot_multi_metric_evaluation.py
@@ -23,9 +23,8 @@
 from matplotlib import pyplot as plt
 
 from sklearn.datasets import make_hastie_10_2
+from sklearn.metrics import accuracy_score, make_scorer
 from sklearn.model_selection import GridSearchCV
-from sklearn.metrics import make_scorer
-from sklearn.metrics import accuracy_score
 from sklearn.tree import DecisionTreeClassifier
 
 # %%
diff --git a/examples/model_selection/plot_nested_cross_validation_iris.py b/examples/model_selection/plot_nested_cross_validation_iris.py
index b6f45255e8a09..7513a078b68ce 100644
--- a/examples/model_selection/plot_nested_cross_validation_iris.py
+++ b/examples/model_selection/plot_nested_cross_validation_iris.py
@@ -44,11 +44,12 @@
 
 """
 
-from sklearn.datasets import load_iris
+import numpy as np
 from matplotlib import pyplot as plt
+
+from sklearn.datasets import load_iris
+from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
 from sklearn.svm import SVC
-from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
-import numpy as np
 
 # Number of random trials
 NUM_TRIALS = 30
diff --git a/examples/model_selection/plot_permutation_tests_for_classification.py b/examples/model_selection/plot_permutation_tests_for_classification.py
index c9fcaebb549fe..a02f6d188f006 100644
--- a/examples/model_selection/plot_permutation_tests_for_classification.py
+++ b/examples/model_selection/plot_permutation_tests_for_classification.py
@@ -58,9 +58,8 @@
 # the percentage of permutations for which the score obtained is greater
 # that the score obtained using the original data.
+from sklearn.model_selection import StratifiedKFold, permutation_test_score
 from sklearn.svm import SVC
-from sklearn.model_selection import StratifiedKFold
-from sklearn.model_selection import permutation_test_score
 
 clf = SVC(kernel="linear", random_state=7)
 cv = StratifiedKFold(2, shuffle=True, random_state=0)
diff --git a/examples/model_selection/plot_precision_recall.py b/examples/model_selection/plot_precision_recall.py
index d11d6e10cdff6..52d85691af439 100644
--- a/examples/model_selection/plot_precision_recall.py
+++ b/examples/model_selection/plot_precision_recall.py
@@ -100,6 +100,7 @@
 #
 # We will use a Linear SVC classifier to differentiate two types of irises.
 import numpy as np
+
 from sklearn.datasets import load_iris
 from sklearn.model_selection import train_test_split
 
@@ -198,8 +199,7 @@
 # %%
 # The average precision score in multi-label settings
 # ...................................................
-from sklearn.metrics import precision_recall_curve
-from sklearn.metrics import average_precision_score
+from sklearn.metrics import average_precision_score, precision_recall_curve
 
 # For each class
 precision = dict()
@@ -232,9 +232,10 @@
 # %%
 # Plot Precision-Recall curve for each class and iso-f1 curves
 # ............................................................
-import matplotlib.pyplot as plt
 from itertools import cycle
 
+import matplotlib.pyplot as plt
+
 # setup plot details
 colors = cycle(["navy", "turquoise", "darkorange", "cornflowerblue", "teal"])
diff --git a/examples/model_selection/plot_randomized_search.py b/examples/model_selection/plot_randomized_search.py
index 9ffc26a5abc84..140b359ff1934 100644
--- a/examples/model_selection/plot_randomized_search.py
+++ b/examples/model_selection/plot_randomized_search.py
@@ -20,14 +20,14 @@
 
 """
 
-import numpy as np
-
 from time import time
+
+import numpy as np
 import scipy.stats as stats
 
-from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 from sklearn.datasets import load_digits
 from sklearn.linear_model import SGDClassifier
+from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 
 # get some data
 X, y = load_digits(return_X_y=True, n_class=3)
diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py
index 34346780def26..3fa1374f1b8a0 100644
--- a/examples/model_selection/plot_roc.py
+++ b/examples/model_selection/plot_roc.py
@@ -44,6 +44,7 @@
 # Here we binarize the output and add noisy features to make the problem harder.
 
 import numpy as np
+
 from sklearn.datasets import load_iris
 from sklearn.model_selection import train_test_split
 
@@ -118,6 +119,7 @@
 
 # %%
 import matplotlib.pyplot as plt
+
 from sklearn.metrics import RocCurveDisplay
 
 RocCurveDisplay.from_predictions(
@@ -191,7 +193,7 @@
 # :class:`~sklearn.metrics.roc_curve` and then the area under the curve with
 # :class:`~sklearn.metrics.auc` for the raveled true and predicted classes.
 
-from sklearn.metrics import roc_curve, auc
+from sklearn.metrics import auc, roc_curve
 
 # store the fpr, tpr, and roc_auc for all averaging strategies
 fpr, tpr, roc_auc = dict(), dict(), dict()
diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py
index cf4c0496f54fb..a3663aa040b56 100644
--- a/examples/model_selection/plot_roc_crossval.py
+++ b/examples/model_selection/plot_roc_crossval.py
@@ -41,6 +41,7 @@
 # (`class_id=0`).
 import numpy as np
+
 from sklearn.datasets import load_iris
 
 iris = load_iris()
@@ -66,8 +67,7 @@
 import matplotlib.pyplot as plt
 
 from sklearn import svm
-from sklearn.metrics import auc
-from sklearn.metrics import RocCurveDisplay
+from sklearn.metrics import RocCurveDisplay, auc
 from sklearn.model_selection import StratifiedKFold
 
 n_splits = 6
diff --git a/examples/model_selection/plot_successive_halving_heatmap.py b/examples/model_selection/plot_successive_halving_heatmap.py
index ecdae48e64011..9b079e4b1351f 100644
--- a/examples/model_selection/plot_successive_halving_heatmap.py
+++ b/examples/model_selection/plot_successive_halving_heatmap.py
@@ -14,12 +14,10 @@
 import numpy as np
 import pandas as pd
 
-from sklearn.svm import SVC
 from sklearn import datasets
-from sklearn.model_selection import GridSearchCV
 from sklearn.experimental import enable_halving_search_cv  # noqa
-from sklearn.model_selection import HalvingGridSearchCV
-
+from sklearn.model_selection import GridSearchCV, HalvingGridSearchCV
+from sklearn.svm import SVC
 
 # %%
 # We first define the parameter space for an :class:`~sklearn.svm.SVC`
diff --git a/examples/model_selection/plot_successive_halving_iterations.py b/examples/model_selection/plot_successive_halving_iterations.py
index bd2d5635e376e..31805d308e269 100644
--- a/examples/model_selection/plot_successive_halving_iterations.py
+++ b/examples/model_selection/plot_successive_halving_iterations.py
@@ -10,16 +10,15 @@
 
 """
 
-import pandas as pd
-from sklearn import datasets
 import matplotlib.pyplot as plt
-from scipy.stats import randint
 import numpy as np
+import pandas as pd
+from scipy.stats import randint
 
+from sklearn import datasets
+from sklearn.ensemble import RandomForestClassifier
 from sklearn.experimental import enable_halving_search_cv  # noqa
 from sklearn.model_selection import HalvingRandomSearchCV
-from sklearn.ensemble import RandomForestClassifier
-
 
 # %%
 # We first define the parameter space and train a
diff --git a/examples/model_selection/plot_train_error_vs_test_error.py b/examples/model_selection/plot_train_error_vs_test_error.py
index 1aba6f4892cbe..af7e7d14cdac0 100644
--- a/examples/model_selection/plot_train_error_vs_test_error.py
+++ b/examples/model_selection/plot_train_error_vs_test_error.py
@@ -19,6 +19,7 @@
 # Generate sample data
 # --------------------
 import numpy as np
+
 from sklearn import linear_model
 from sklearn.datasets import make_regression
 from sklearn.model_selection import train_test_split
diff --git a/examples/model_selection/plot_underfitting_overfitting.py b/examples/model_selection/plot_underfitting_overfitting.py
index ae8450b50cea9..412946fc9ca8b 100644
--- a/examples/model_selection/plot_underfitting_overfitting.py
+++ b/examples/model_selection/plot_underfitting_overfitting.py
@@ -21,12 +21,13 @@
 
 """
 
-import numpy as np
 import matplotlib.pyplot as plt
-from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import PolynomialFeatures
+import numpy as np
+
 from sklearn.linear_model import LinearRegression
 from sklearn.model_selection import cross_val_score
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import PolynomialFeatures
 
 
 def true_fun(X):
diff --git a/examples/model_selection/plot_validation_curve.py b/examples/model_selection/plot_validation_curve.py
index 48aa19dfbc556..947d8ac2b2fdb 100644
--- a/examples/model_selection/plot_validation_curve.py
+++ b/examples/model_selection/plot_validation_curve.py
@@ -17,8 +17,8 @@
 import numpy as np
 
 from sklearn.datasets import load_digits
-from sklearn.svm import SVC
 from sklearn.model_selection import ValidationCurveDisplay
+from sklearn.svm import SVC
 
 X, y = load_digits(return_X_y=True)
 subset_mask = np.isin(y, [1, 2])  # binary classification: 1 vs 2
diff --git a/examples/multioutput/plot_classifier_chain_yeast.py b/examples/multioutput/plot_classifier_chain_yeast.py
index e1f9feed43a97..1df4ee3b8346b 100644
--- a/examples/multioutput/plot_classifier_chain_yeast.py
+++ b/examples/multioutput/plot_classifier_chain_yeast.py
@@ -36,14 +36,15 @@
 # Author: Adam Kleczewski
 # License: BSD 3 clause
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn.datasets import fetch_openml
-from sklearn.multioutput import ClassifierChain
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import jaccard_score
 from sklearn.model_selection import train_test_split
 from sklearn.multiclass import OneVsRestClassifier
-from sklearn.metrics import jaccard_score
-from sklearn.linear_model import LogisticRegression
+from sklearn.multioutput import ClassifierChain
 
 # Load a multi-label dataset from https://www.openml.org/d/40597
 X, Y = fetch_openml("yeast", version=4, return_X_y=True, parser="pandas")
diff --git a/examples/neighbors/approximate_nearest_neighbors.py b/examples/neighbors/approximate_nearest_neighbors.py
index ee848cdc66428..faff31d7a85c9 100644
--- a/examples/neighbors/approximate_nearest_neighbors.py
+++ b/examples/neighbors/approximate_nearest_neighbors.py
@@ -40,6 +40,7 @@
 import joblib
 import numpy as np
 from scipy.sparse import csr_matrix
+
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.datasets import fetch_openml
 from sklearn.utils import shuffle
diff --git a/examples/neighbors/plot_caching_nearest_neighbors.py b/examples/neighbors/plot_caching_nearest_neighbors.py
index 00be6470c1591..10c0d315da7af 100644
--- a/examples/neighbors/plot_caching_nearest_neighbors.py
+++ b/examples/neighbors/plot_caching_nearest_neighbors.py
@@ -22,11 +22,12 @@
 #
 # License: BSD 3 clause
 from tempfile import TemporaryDirectory
+
 import matplotlib.pyplot as plt
 
-from sklearn.neighbors import KNeighborsTransformer, KNeighborsClassifier
-from sklearn.model_selection import GridSearchCV
 from sklearn.datasets import load_digits
+from sklearn.model_selection import GridSearchCV
+from sklearn.neighbors import KNeighborsClassifier, KNeighborsTransformer
 from sklearn.pipeline import Pipeline
 
 X, y = load_digits(return_X_y=True)
diff --git a/examples/neighbors/plot_classification.py b/examples/neighbors/plot_classification.py
index cc4f0864ba926..4ed23862ae455 100644
--- a/examples/neighbors/plot_classification.py
+++ b/examples/neighbors/plot_classification.py
@@ -11,7 +11,8 @@
 import matplotlib.pyplot as plt
 import seaborn as sns
 from matplotlib.colors import ListedColormap
-from sklearn import neighbors, datasets
+
+from sklearn import datasets, neighbors
 from sklearn.inspection import DecisionBoundaryDisplay
 
 n_neighbors = 15
diff --git a/examples/neighbors/plot_digits_kde_sampling.py b/examples/neighbors/plot_digits_kde_sampling.py
index e580f9fa178bc..045058eab09cc 100644
--- a/examples/neighbors/plot_digits_kde_sampling.py
+++ b/examples/neighbors/plot_digits_kde_sampling.py
@@ -11,13 +11,13 @@
 
 """
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
 
 from sklearn.datasets import load_digits
-from sklearn.neighbors import KernelDensity
 from sklearn.decomposition import PCA
 from sklearn.model_selection import GridSearchCV
+from sklearn.neighbors import KernelDensity
 
 # load the data
 digits = load_digits()
diff --git a/examples/neighbors/plot_kde_1d.py b/examples/neighbors/plot_kde_1d.py
index 8b139d4cc2335..fc5b1914f23de 100644
--- a/examples/neighbors/plot_kde_1d.py
+++ b/examples/neighbors/plot_kde_1d.py
@@ -30,9 +30,10 @@
 # Author: Jake Vanderplas
 #
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
 from scipy.stats import norm
+
 from sklearn.neighbors import KernelDensity
 
 # ----------------------------------------------------------------------
diff --git a/examples/neighbors/plot_lof_novelty_detection.py b/examples/neighbors/plot_lof_novelty_detection.py
index 277134cc77673..91e40661c6dfe 100644
--- a/examples/neighbors/plot_lof_novelty_detection.py
+++ b/examples/neighbors/plot_lof_novelty_detection.py
@@ -25,9 +25,10 @@
 
 """
 
-import numpy as np
 import matplotlib
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn.neighbors import LocalOutlierFactor
 
 np.random.seed(42)
diff --git a/examples/neighbors/plot_nca_classification.py b/examples/neighbors/plot_nca_classification.py
index a08bbe8be3756..f76770640ed03 100644
--- a/examples/neighbors/plot_nca_classification.py
+++ b/examples/neighbors/plot_nca_classification.py
@@ -19,13 +19,13 @@
 
 import matplotlib.pyplot as plt
 from matplotlib.colors import ListedColormap
+
 from sklearn import datasets
+from sklearn.inspection import DecisionBoundaryDisplay
 from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler
 from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
 from sklearn.pipeline import Pipeline
-from sklearn.inspection import DecisionBoundaryDisplay
-
+from sklearn.preprocessing import StandardScaler
 
 n_neighbors = 1
diff --git a/examples/neighbors/plot_nca_dim_reduction.py b/examples/neighbors/plot_nca_dim_reduction.py
index d245e0223ccfa..82fd35616929e 100644
--- a/examples/neighbors/plot_nca_dim_reduction.py
+++ b/examples/neighbors/plot_nca_dim_reduction.py
@@ -30,12 +30,13 @@
 
 # License: BSD 3 clause
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn import datasets
-from sklearn.model_selection import train_test_split
 from sklearn.decomposition import PCA
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+from sklearn.model_selection import train_test_split
 from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
diff --git a/examples/neighbors/plot_nca_illustration.py b/examples/neighbors/plot_nca_illustration.py
index d722ffa5be033..e5fd2f9cb67bd 100644
--- a/examples/neighbors/plot_nca_illustration.py
+++ b/examples/neighbors/plot_nca_illustration.py
@@ -12,13 +12,14 @@
 
 # License: BSD 3 clause
 
-import numpy as np
 import matplotlib.pyplot as plt
-from sklearn.datasets import make_classification
-from sklearn.neighbors import NeighborhoodComponentsAnalysis
+import numpy as np
 from matplotlib import cm
 from scipy.special import logsumexp
 
+from sklearn.datasets import make_classification
+from sklearn.neighbors import NeighborhoodComponentsAnalysis
+
 # %%
 # Original points
 # ---------------
diff --git a/examples/neighbors/plot_nearest_centroid.py b/examples/neighbors/plot_nearest_centroid.py
index 4eb0e0388a30b..c8f710d0a0377 100644
--- a/examples/neighbors/plot_nearest_centroid.py
+++ b/examples/neighbors/plot_nearest_centroid.py
@@ -8,13 +8,13 @@
 
 """
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
 from matplotlib.colors import ListedColormap
+
 from sklearn import datasets
-from sklearn.neighbors import NearestCentroid
 from sklearn.inspection import DecisionBoundaryDisplay
-
+from sklearn.neighbors import NearestCentroid
 
 # import some data to play with
 iris = datasets.load_iris()
diff --git a/examples/neighbors/plot_regression.py b/examples/neighbors/plot_regression.py
index 78b850d1a4e2c..d5ceba8a34860 100644
--- a/examples/neighbors/plot_regression.py
+++ b/examples/neighbors/plot_regression.py
@@ -18,8 +18,9 @@
 # %%
 # Generate sample data
 # --------------------
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn import neighbors
 
 np.random.seed(0)
diff --git a/examples/neighbors/plot_species_kde.py b/examples/neighbors/plot_species_kde.py
index 35ea40158a45c..3783138dfcb76 100644
--- a/examples/neighbors/plot_species_kde.py
+++ b/examples/neighbors/plot_species_kde.py
@@ -40,8 +40,9 @@
 #
 # License: BSD 3 clause
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn.datasets import fetch_species_distributions
 from sklearn.neighbors import KernelDensity
diff --git a/examples/neural_networks/plot_mlp_alpha.py b/examples/neural_networks/plot_mlp_alpha.py
index 443d41f4707bf..b53beef54c115 100644
--- a/examples/neural_networks/plot_mlp_alpha.py
+++ b/examples/neural_networks/plot_mlp_alpha.py
@@ -23,11 +23,12 @@
 import numpy as np
 from matplotlib import pyplot as plt
 from matplotlib.colors import ListedColormap
+
+from sklearn.datasets import make_circles, make_classification, make_moons
 from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler
-from sklearn.datasets import make_moons, make_circles, make_classification
 from sklearn.neural_network import MLPClassifier
 from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
 
 h = 0.02  # step size in the mesh
diff --git a/examples/neural_networks/plot_mlp_training_curves.py b/examples/neural_networks/plot_mlp_training_curves.py
index 3fbddda879162..a9f03c2599a8e 100644
--- a/examples/neural_networks/plot_mlp_training_curves.py
+++ b/examples/neural_networks/plot_mlp_training_curves.py
@@ -18,10 +18,10 @@
 
 import matplotlib.pyplot as plt
 
-from sklearn.neural_network import MLPClassifier
-from sklearn.preprocessing import MinMaxScaler
 from sklearn import datasets
 from sklearn.exceptions import ConvergenceWarning
+from sklearn.neural_network import MLPClassifier
+from sklearn.preprocessing import MinMaxScaler
 
 # different learning rate schedules and momentum parameters
 params = [
diff --git a/examples/neural_networks/plot_mnist_filters.py b/examples/neural_networks/plot_mnist_filters.py
index 03f615786e830..43e6a171fb696 100644
--- a/examples/neural_networks/plot_mnist_filters.py
+++ b/examples/neural_networks/plot_mnist_filters.py
@@ -25,11 +25,13 @@
 """
 
 import warnings
+
 import matplotlib.pyplot as plt
+
 from sklearn.datasets import fetch_openml
 from sklearn.exceptions import ConvergenceWarning
-from sklearn.neural_network import MLPClassifier
 from sklearn.model_selection import train_test_split
+from sklearn.neural_network import MLPClassifier
 
 # Load data from https://www.openml.org/d/554
 X, y = fetch_openml(
diff --git a/examples/neural_networks/plot_rbm_logistic_classification.py b/examples/neural_networks/plot_rbm_logistic_classification.py
index de939922d9514..3ba878d4ad191 100644
--- a/examples/neural_networks/plot_rbm_logistic_classification.py
+++ b/examples/neural_networks/plot_rbm_logistic_classification.py
@@ -23,13 +23,11 @@
 # linear shifts of 1 pixel in each direction.
 
 import numpy as np
-
 from scipy.ndimage import convolve
 
 from sklearn import datasets
-from sklearn.preprocessing import minmax_scale
-
 from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import minmax_scale
 
 
 def nudge_dataset(X, Y):
diff --git a/examples/preprocessing/plot_all_scaling.py b/examples/preprocessing/plot_all_scaling.py
index 2893f5cf01ccb..c53c81a89727a 100644
--- a/examples/preprocessing/plot_all_scaling.py
+++ b/examples/preprocessing/plot_all_scaling.py
@@ -45,22 +45,22 @@
 #          Thomas Unterthiner
 # License: BSD 3 clause
 
-import numpy as np
-
 import matplotlib as mpl
-from matplotlib import pyplot as plt
+import numpy as np
 from matplotlib import cm
-
-from sklearn.preprocessing import MinMaxScaler
-from sklearn.preprocessing import minmax_scale
-from sklearn.preprocessing import MaxAbsScaler
-from sklearn.preprocessing import StandardScaler
-from sklearn.preprocessing import RobustScaler
-from sklearn.preprocessing import Normalizer
-from sklearn.preprocessing import QuantileTransformer
-from sklearn.preprocessing import PowerTransformer
+from matplotlib import pyplot as plt
 
 from sklearn.datasets import fetch_california_housing
+from sklearn.preprocessing import (
+    MaxAbsScaler,
+    MinMaxScaler,
+    Normalizer,
+    PowerTransformer,
+    QuantileTransformer,
+    RobustScaler,
+    StandardScaler,
+    minmax_scale,
+)
 
 dataset = fetch_california_housing()
 X_full, y_full = dataset.data, dataset.target
diff --git a/examples/preprocessing/plot_discretization.py b/examples/preprocessing/plot_discretization.py
index ffb3f9403634d..002d606da0c9d 100644
--- a/examples/preprocessing/plot_discretization.py
+++ b/examples/preprocessing/plot_discretization.py
@@ -31,8 +31,8 @@
 #          Hanmin Qin
 # License: BSD 3 clause
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
 
 from sklearn.linear_model import LinearRegression
 from sklearn.preprocessing import KBinsDiscretizer
diff --git a/examples/preprocessing/plot_discretization_classification.py b/examples/preprocessing/plot_discretization_classification.py
index a35c56ea683d6..71adf44474aa3 100644
--- a/examples/preprocessing/plot_discretization_classification.py
+++ b/examples/preprocessing/plot_discretization_classification.py
@@ -33,20 +33,19 @@
 #
 # License: BSD 3 clause
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
 from matplotlib.colors import ListedColormap
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler
-from sklearn.datasets import make_moons, make_circles, make_classification
+
+from sklearn.datasets import make_circles, make_classification, make_moons
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.exceptions import ConvergenceWarning
 from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import GridSearchCV, train_test_split
 from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import KBinsDiscretizer
+from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
 from sklearn.svm import SVC, LinearSVC
-from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.utils._testing import ignore_warnings
-from sklearn.exceptions import ConvergenceWarning
 
 h = 0.02  # step size in the mesh
diff --git a/examples/preprocessing/plot_discretization_strategies.py b/examples/preprocessing/plot_discretization_strategies.py
index 91904246540dd..b4c2f3ca1858d 100644
--- a/examples/preprocessing/plot_discretization_strategies.py
+++ b/examples/preprocessing/plot_discretization_strategies.py
@@ -19,11 +19,11 @@
 # Author: Tom Dupré la Tour
 # License: BSD 3 clause
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
 
-from sklearn.preprocessing import KBinsDiscretizer
 from sklearn.datasets import make_blobs
+from sklearn.preprocessing import KBinsDiscretizer
 
 strategies = ["uniform", "quantile", "kmeans"]
diff --git a/examples/preprocessing/plot_map_data_to_normal.py b/examples/preprocessing/plot_map_data_to_normal.py
index 42a61d84fa384..a521039098871 100644
--- a/examples/preprocessing/plot_map_data_to_normal.py
+++ b/examples/preprocessing/plot_map_data_to_normal.py
@@ -38,13 +38,11 @@
 #          Nicolas Hug
 # License: BSD 3 clause
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
 
-from sklearn.preprocessing import PowerTransformer
-from sklearn.preprocessing import QuantileTransformer
 from sklearn.model_selection import train_test_split
-
+from sklearn.preprocessing import PowerTransformer, QuantileTransformer
 
 N_SAMPLES = 1000
 FONT_SIZE = 6
diff --git a/examples/preprocessing/plot_scaling_importance.py b/examples/preprocessing/plot_scaling_importance.py
index 4e8f87b68b1d4..6e0ae0ae1c109 100644
--- a/examples/preprocessing/plot_scaling_importance.py
+++ b/examples/preprocessing/plot_scaling_importance.py
@@ -65,10 +65,10 @@
 # of features.
 import matplotlib.pyplot as plt
+
 from sklearn.inspection import DecisionBoundaryDisplay
 from sklearn.neighbors import KNeighborsClassifier
 
-
 X_plot = X[["proline", "hue"]]
 X_plot_scaled = scaler.fit_transform(X_plot)
 clf = KNeighborsClassifier(n_neighbors=20)
@@ -122,6 +122,7 @@ def fit_and_plot_model(X_plot, y, clf, ax):
 # We can inspect the first principal components using all the original features:
 import pandas as pd
+
 from sklearn.decomposition import PCA
 
 pca = PCA(n_components=2).fit(X_train)
@@ -199,8 +200,9 @@ def fit_and_plot_model(X_plot, y, clf, ax):
 # non-scaling of the data:
 import numpy as np
-from sklearn.pipeline import make_pipeline
+
 from sklearn.linear_model import LogisticRegressionCV
+from sklearn.pipeline import make_pipeline
 
 Cs = np.logspace(-5, 5, 20)
@@ -218,8 +220,7 @@ def fit_and_plot_model(X_plot, y, clf, ax):
 # was not scaled before applying PCA. We now evaluate the effect of scaling on
 # the accuracy and the mean log-loss of the optimal models:
-from sklearn.metrics import accuracy_score
-from sklearn.metrics import log_loss
+from sklearn.metrics import accuracy_score, log_loss
 
 y_pred = unscaled_clf.predict(X_test)
 y_pred_scaled = scaled_clf.predict(X_test)
diff --git a/examples/preprocessing/plot_target_encoder.py b/examples/preprocessing/plot_target_encoder.py
index a50f0199e5ba8..d35990cfb2a9f 100644
--- a/examples/preprocessing/plot_target_encoder.py
+++ b/examples/preprocessing/plot_target_encoder.py
@@ -55,9 +55,7 @@
 # strategies. First, we list out the encoders we will be using to preprocess
 # the categorical features:
 from sklearn.compose import ColumnTransformer
-from sklearn.preprocessing import OrdinalEncoder
-from sklearn.preprocessing import OneHotEncoder
-from sklearn.preprocessing import TargetEncoder
+from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, TargetEncoder
 
 categorical_preprocessors = [
     ("drop", "drop"),
@@ -71,9 +69,9 @@
 
 # %%
 # Next, we evaluate the models using cross validation and record the results:
-from sklearn.pipeline import make_pipeline
-from sklearn.model_selection import cross_validate
 from sklearn.ensemble import HistGradientBoostingRegressor
+from sklearn.model_selection import cross_validate
+from sklearn.pipeline import make_pipeline
 
 n_cv_folds = 3
 max_iter = 20
diff --git a/examples/preprocessing/plot_target_encoder_cross_val.py b/examples/preprocessing/plot_target_encoder_cross_val.py
index 455625cc47460..a7066eeab29cb 100644
--- a/examples/preprocessing/plot_target_encoder_cross_val.py
+++ b/examples/preprocessing/plot_target_encoder_cross_val.py
@@ -21,9 +21,10 @@
 # feature with medium cardinality, an uninformative feature with medium cardinality,
 # and an uninformative feature with high cardinality. First, we generate the informative
 # feature:
-from sklearn.preprocessing import KBinsDiscretizer
 import numpy as np
 
+from sklearn.preprocessing import KBinsDiscretizer
+
 n_samples = 50_000
 
 rng = np.random.RandomState(42)
@@ -60,9 +61,10 @@
 
 # %%
 # Finally, we assemble the dataset and perform a train test split:
-from sklearn.model_selection import train_test_split
 import pandas as pd
 
+from sklearn.model_selection import train_test_split
+
 X = pd.DataFrame(
     np.concatenate(
         [X_informative, X_shuffled, X_near_unique_categories],
@@ -80,8 +82,8 @@
 # interval cross validation. First, we see the Ridge model trained on the
 # raw features will have low performance, because the order of the informative
 # feature is not informative:
-from sklearn.linear_model import Ridge
 import sklearn
+from sklearn.linear_model import Ridge
 
 # Configure transformers to always output DataFrames
 sklearn.set_config(transform_output="pandas")
@@ -107,8 +109,8 @@
 # %%
 # The coefficients of the linear model shows that most of the weight is on the
 # feature at column index 0, which is the informative feature
-import pandas as pd
 import matplotlib.pyplot as plt
+import pandas as pd
 
 plt.rcParams["figure.constrained_layout.use"] = True
diff --git a/examples/release_highlights/plot_release_highlights_0_22_0.py b/examples/release_highlights/plot_release_highlights_0_22_0.py
index 02b99df3491ee..ca09013703592 100644
--- a/examples/release_highlights/plot_release_highlights_0_22_0.py
+++ b/examples/release_highlights/plot_release_highlights_0_22_0.py
@@ -34,15 +34,15 @@
 # :class:`~metrics.plot_confusion_matrix`. Read more about this new API in the
 # :ref:`User Guide `.
-from sklearn.model_selection import train_test_split
-from sklearn.svm import SVC
+import matplotlib.pyplot as plt
+
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
 
 # from sklearn.metrics import plot_roc_curve
 from sklearn.metrics import RocCurveDisplay
-
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.datasets import make_classification
-import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.svm import SVC
 
 X, y = make_classification(random_state=0)
 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
@@ -79,12 +79,12 @@
 # Read more in the :ref:`User Guide `.
 
 from sklearn.datasets import load_iris
-from sklearn.svm import LinearSVC
-from sklearn.linear_model import LogisticRegression
-from sklearn.preprocessing import StandardScaler
-from sklearn.pipeline import make_pipeline
 from sklearn.ensemble import StackingClassifier
+from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.svm import LinearSVC
 
 X, y = load_iris(return_X_y=True)
 estimators = [
@@ -102,8 +102,9 @@
 # The :func:`inspection.permutation_importance` can be used to get an
 # estimate of the importance of each feature, for any fitted estimator:
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn.datasets import make_classification
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.inspection import permutation_importance
@@ -155,8 +156,9 @@
 # See more details in the :ref:`User Guide `.
 
 from tempfile import TemporaryDirectory
-from sklearn.neighbors import KNeighborsTransformer
+
 from sklearn.manifold import Isomap
+from sklearn.neighbors import KNeighborsTransformer
 from sklearn.pipeline import make_pipeline
 
 X, y = make_classification(random_state=0)
@@ -272,8 +274,8 @@ def test_sklearn_compatible_estimator(estimator, check):
 
 from sklearn.datasets import make_classification
-from sklearn.svm import SVC
 from sklearn.metrics import roc_auc_score
+from sklearn.svm import SVC
 
 X, y = make_classification(n_classes=4, n_informative=16)
 clf = SVC(decision_function_shape="ovo", probability=True).fit(X, y)
diff --git a/examples/semi_supervised/plot_label_propagation_digits.py b/examples/semi_supervised/plot_label_propagation_digits.py
index f848e3b76e084..bfdff8e362e47 100644
--- a/examples/semi_supervised/plot_label_propagation_digits.py
+++ b/examples/semi_supervised/plot_label_propagation_digits.py
@@ -24,9 +24,10 @@ class will be very good.
 # ---------------
 #
 # We use the digits dataset. We only use a subset of randomly selected samples.
-from sklearn import datasets
 import numpy as np
 
+from sklearn import datasets
+
 digits = datasets.load_digits()
 rng = np.random.RandomState(2)
 indices = np.arange(len(digits.data))
@@ -59,8 +60,8 @@ class will be very good.
 #
 # We fit a :class:`~sklearn.semi_supervised.LabelSpreading` and use it to predict
 # the unknown labels.
-from sklearn.semi_supervised import LabelSpreading
 from sklearn.metrics import classification_report
+from sklearn.semi_supervised import LabelSpreading
 
 lp_model = LabelSpreading(gamma=0.25, max_iter=20)
 lp_model.fit(X, y_train)
diff --git a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py
index 3a1f533c8a281..215655a287c2d 100644
--- a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py
+++ b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py
@@ -23,13 +23,13 @@
 # Authors: Clay Woolam
 # License: BSD
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
 from scipy import stats
 
 from sklearn import datasets
-from sklearn.semi_supervised import LabelSpreading
 from sklearn.metrics import classification_report, confusion_matrix
+from sklearn.semi_supervised import LabelSpreading
 
 digits = datasets.load_digits()
 rng = np.random.RandomState(0)
diff --git a/examples/semi_supervised/plot_label_propagation_structure.py b/examples/semi_supervised/plot_label_propagation_structure.py
index 5de6e9f20a7e3..cfcd1c1bf5a54 100644
--- a/examples/semi_supervised/plot_label_propagation_structure.py
+++ b/examples/semi_supervised/plot_label_propagation_structure.py
@@ -22,6 +22,7 @@
 
 # Here, all labels but two are tagged as unknown.
 import numpy as np
+
 from sklearn.datasets import make_circles
 
 n_samples = 200
diff --git a/examples/semi_supervised/plot_self_training_varying_threshold.py b/examples/semi_supervised/plot_self_training_varying_threshold.py
index 801e48b8411f5..2c7a485d06eb0 100644
--- a/examples/semi_supervised/plot_self_training_varying_threshold.py
+++ b/examples/semi_supervised/plot_self_training_varying_threshold.py
@@ -32,13 +32,14 @@
 # Authors: Oliver Rausch
 # License: BSD
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn import datasets
-from sklearn.svm import SVC
+from sklearn.metrics import accuracy_score
 from sklearn.model_selection import StratifiedKFold
 from sklearn.semi_supervised import SelfTrainingClassifier
-from sklearn.metrics import accuracy_score
+from sklearn.svm import SVC
 from sklearn.utils import shuffle
 
 n_splits = 3
diff --git a/examples/semi_supervised/plot_semi_supervised_newsgroups.py b/examples/semi_supervised/plot_semi_supervised_newsgroups.py
index 609f5d10247c2..58c7f6e42f408 100644
--- a/examples/semi_supervised/plot_semi_supervised_newsgroups.py
+++ b/examples/semi_supervised/plot_semi_supervised_newsgroups.py
@@ -15,15 +15,13 @@
 import numpy as np
 
 from sklearn.datasets import fetch_20newsgroups
-from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.feature_extraction.text import TfidfTransformer
-from sklearn.preprocessing import FunctionTransformer
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from sklearn.linear_model import SGDClassifier
+from sklearn.metrics import f1_score
 from sklearn.model_selection import train_test_split
 from sklearn.pipeline import Pipeline
-from sklearn.semi_supervised import SelfTrainingClassifier
-from sklearn.semi_supervised import LabelSpreading
-from sklearn.metrics import f1_score
+from sklearn.preprocessing import FunctionTransformer
+from sklearn.semi_supervised import LabelSpreading, SelfTrainingClassifier
 
 # Loading dataset containing first five categories
 data = fetch_20newsgroups(
diff --git a/examples/semi_supervised/plot_semi_supervised_versus_svm_iris.py b/examples/semi_supervised/plot_semi_supervised_versus_svm_iris.py
index 402cd41d6a0f2..766f7ea0a79c6 100644
--- a/examples/semi_supervised/plot_semi_supervised_versus_svm_iris.py
+++ b/examples/semi_supervised/plot_semi_supervised_versus_svm_iris.py
@@ -18,13 +18,12 @@
 #          Oliver Rausch
 # License: BSD
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn import datasets
+from sklearn.semi_supervised import LabelSpreading, SelfTrainingClassifier
 from sklearn.svm import SVC
-from sklearn.semi_supervised import LabelSpreading
-from sklearn.semi_supervised import SelfTrainingClassifier
-
 
 iris = datasets.load_iris()
diff --git a/examples/svm/plot_custom_kernel.py b/examples/svm/plot_custom_kernel.py
index c2c3bc6e6ba28..cacd67ed056ac 100644
--- a/examples/svm/plot_custom_kernel.py
+++ b/examples/svm/plot_custom_kernel.py
@@ -8,9 +8,10 @@
 
 """
 
-import numpy as np
 import matplotlib.pyplot as plt
-from sklearn import svm, datasets
+import numpy as np
+
+from sklearn import datasets, svm
 from sklearn.inspection import DecisionBoundaryDisplay
 
 # import some data to play with
diff --git a/examples/svm/plot_iris_svc.py b/examples/svm/plot_iris_svc.py
index 5bcc81dd91d04..61aba3cc06602 100644
--- a/examples/svm/plot_iris_svc.py
+++ b/examples/svm/plot_iris_svc.py
@@ -35,9 +35,9 @@
 """
 
 import matplotlib.pyplot as plt
-from sklearn import svm, datasets
-from sklearn.inspection import DecisionBoundaryDisplay
 
+from sklearn import datasets, svm
+from sklearn.inspection import DecisionBoundaryDisplay
 
 # import some data to play with
 iris = datasets.load_iris()
diff --git a/examples/svm/plot_linearsvc_support_vectors.py b/examples/svm/plot_linearsvc_support_vectors.py
index 638579f36f3c3..60e9a3e6f32f9 100644
--- a/examples/svm/plot_linearsvc_support_vectors.py
+++ b/examples/svm/plot_linearsvc_support_vectors.py
@@ -9,11 +9,12 @@
 
 """
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn.datasets import make_blobs
-from sklearn.svm import LinearSVC
 from sklearn.inspection import DecisionBoundaryDisplay
+from sklearn.svm import LinearSVC
 
 X, y = make_blobs(n_samples=40, centers=2, random_state=0)
diff --git a/examples/svm/plot_oneclass.py b/examples/svm/plot_oneclass.py
index 082cbcd6de2be..d4348fa0ec435 100644
--- a/examples/svm/plot_oneclass.py
+++ b/examples/svm/plot_oneclass.py
@@ -11,9 +11,10 @@
 
 """
 
-import numpy as np
-import matplotlib.pyplot as plt
 import matplotlib.font_manager
+import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn import svm
 
 xx, yy = np.meshgrid(np.linspace(-5, 5, 500), np.linspace(-5, 5, 500))
diff --git a/examples/svm/plot_rbf_parameters.py b/examples/svm/plot_rbf_parameters.py
index fa4310134487a..ba0154b477b46 100644
--- a/examples/svm/plot_rbf_parameters.py
+++ b/examples/svm/plot_rbf_parameters.py
@@ -135,9 +135,8 @@ def __call__(self, value, clip=None):
 # 10 is often helpful. Using a basis of 2, a finer
 # tuning can be achieved but at a much higher cost.
+from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
 from sklearn.svm import SVC
-from sklearn.model_selection import StratifiedShuffleSplit
-from sklearn.model_selection import GridSearchCV
 
 C_range = np.logspace(-2, 10, 13)
 gamma_range = np.logspace(-9, 3, 13)
diff --git a/examples/svm/plot_separating_hyperplane.py b/examples/svm/plot_separating_hyperplane.py
index 45bacff6a2b97..23f464169f516 100644
--- a/examples/svm/plot_separating_hyperplane.py
+++ b/examples/svm/plot_separating_hyperplane.py
@@ -10,11 +10,11 @@
 """
 
 import matplotlib.pyplot as plt
+
 from sklearn import svm
 from sklearn.datasets import make_blobs
 from sklearn.inspection import DecisionBoundaryDisplay
-
 
 # we create 40 separable points
 X, y = make_blobs(n_samples=40, centers=2, random_state=6)
diff --git a/examples/svm/plot_separating_hyperplane_unbalanced.py b/examples/svm/plot_separating_hyperplane_unbalanced.py
index fe71420ffd0b3..6fd7de98f3fb6 100644
--- a/examples/svm/plot_separating_hyperplane_unbalanced.py
+++ b/examples/svm/plot_separating_hyperplane_unbalanced.py
@@ -26,6 +26,7 @@
 """
 
 import matplotlib.pyplot as plt
+
 from sklearn import svm
 from sklearn.datasets import make_blobs
 from sklearn.inspection import DecisionBoundaryDisplay
diff --git a/examples/svm/plot_svm_anova.py b/examples/svm/plot_svm_anova.py
index 3652fae3e979a..3d5a934bf4884 100644
--- a/examples/svm/plot_svm_anova.py
+++ b/examples/svm/plot_svm_anova.py
@@ -14,6 +14,7 @@
 # Load some data to play with
 # ---------------------------
 import numpy as np
+
 from sklearn.datasets import load_iris
 
 X, y = load_iris(return_X_y=True)
@@ -25,8 +26,8 @@
 # %%
 # Create the pipeline
 # -------------------
-from sklearn.pipeline import Pipeline
 from sklearn.feature_selection import SelectPercentile, f_classif
+from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC
@@ -45,6 +46,7 @@
 # Plot the cross-validation score as a function of percentile of features
 # -----------------------------------------------------------------------
 import matplotlib.pyplot as plt
+
 from sklearn.model_selection import cross_val_score
 
 score_means = list()
diff --git a/examples/svm/plot_svm_kernels.py b/examples/svm/plot_svm_kernels.py
index fac86e8a93c7a..7ff2486e1c867 100644
--- a/examples/svm/plot_svm_kernels.py
+++ b/examples/svm/plot_svm_kernels.py
@@ -13,10 +13,10 @@
 # Code source: Gaël Varoquaux
 # License: BSD 3 clause
 
-import numpy as np
 import matplotlib.pyplot as plt
-from sklearn import svm
+import numpy as np
 
+from sklearn import svm
 
 # Our dataset and targets
 X = np.c_[
diff --git a/examples/svm/plot_svm_margin.py b/examples/svm/plot_svm_margin.py
index f3717ecaa24ed..b8253264a4ad0 100644
--- a/examples/svm/plot_svm_margin.py
+++ b/examples/svm/plot_svm_margin.py
@@ -17,8 +17,9 @@
 # Modified for documentation by Jaques Grobler
 # License: BSD 3 clause
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn import svm
 
 # we create 40 separable points
diff --git a/examples/svm/plot_svm_nonlinear.py b/examples/svm/plot_svm_nonlinear.py
index f88231b4b6af4..4990e509661a1 100644
--- a/examples/svm/plot_svm_nonlinear.py
+++ b/examples/svm/plot_svm_nonlinear.py
@@ -11,8 +11,9 @@
 
 """
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn import svm
 
 xx, yy = np.meshgrid(np.linspace(-3, 3, 500), np.linspace(-3, 3, 500))
diff --git a/examples/svm/plot_svm_regression.py b/examples/svm/plot_svm_regression.py
index 75a16b571c3ea..ab34528a37af6 100644
--- a/examples/svm/plot_svm_regression.py
+++ b/examples/svm/plot_svm_regression.py
@@ -7,9 +7,10 @@
 
 """
 
+import matplotlib.pyplot as plt
 import numpy as np
+
 from sklearn.svm import SVR
-import matplotlib.pyplot as plt
 
 # %%
 # Generate sample data
diff --git a/examples/svm/plot_svm_scale_c.py b/examples/svm/plot_svm_scale_c.py
index 4ba025cffac8e..1e44fb361e6ba 100644
--- a/examples/svm/plot_svm_scale_c.py
+++ b/examples/svm/plot_svm_scale_c.py
@@ -76,7 +76,8 @@
 # We will compute the mean test score for different values of `C`.
 import numpy as np
 import pandas as pd
-from sklearn.model_selection import validation_curve, ShuffleSplit
+
+from sklearn.model_selection import ShuffleSplit, validation_curve
 
 Cs = np.logspace(-2.3, -1.3, 10)
 train_sizes = np.linspace(0.3, 0.7, 3)
diff --git a/examples/svm/plot_svm_tie_breaking.py b/examples/svm/plot_svm_tie_breaking.py
index 93148225b0bb3..848b81dee9c69 100644
--- a/examples/svm/plot_svm_tie_breaking.py
+++ b/examples/svm/plot_svm_tie_breaking.py
@@ -17,10 +17,11 @@
 # Code source: Andreas Mueller, Adrin Jalali
 # License: BSD 3 clause
 
-import numpy as np
 import matplotlib.pyplot as plt
-from sklearn.svm import SVC
+import numpy as np
+
 from sklearn.datasets import make_blobs
+from sklearn.svm import SVC
 
 X, y = make_blobs(random_state=27)
diff --git a/examples/svm/plot_weighted_samples.py b/examples/svm/plot_weighted_samples.py
index f346599300aba..c17742e091390 100644
--- a/examples/svm/plot_weighted_samples.py
+++ b/examples/svm/plot_weighted_samples.py
@@ -14,8 +14,9 @@
 
 """
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn import svm
diff --git a/examples/text/plot_document_classification_20newsgroups.py b/examples/text/plot_document_classification_20newsgroups.py
index 633a0602d421b..04aad46c8451a 100644
--- a/examples/text/plot_document_classification_20newsgroups.py
+++ b/examples/text/plot_document_classification_20newsgroups.py
@@ -36,9 +36,10 @@
 # the classification problem "too easy". This is achieved using simple
 # heuristics that are neither perfect nor standard, hence disabled by default.
 
+from time import time
+
 from sklearn.datasets import fetch_20newsgroups
 from sklearn.feature_extraction.text import TfidfVectorizer
-from time import time
 
 categories = [
     "alt.atheism",
@@ -158,6 +159,7 @@ def load_dataset(verbose=False, remove=()):
 # in the classification errors.
 
 import matplotlib.pyplot as plt
+
 from sklearn.metrics import ConfusionMatrixDisplay
 
 fig, ax = plt.subplots(figsize=(10, 5))
@@ -182,8 +184,8 @@ def load_dataset(verbose=False, remove=()):
 # We can gain a deeper understanding of how this classifier makes its decisions
 # by looking at the words with the highest average feature effects:
 
-import pandas as pd
 import numpy as np
+import pandas as pd
 
 
 def plot_feature_effects():
@@ -315,8 +317,8 @@ def plot_feature_effects():
 # training time and testing time. For such purpose we define the following
 # benchmarking utilities:
 
-from sklearn.utils.extmath import density
 from sklearn import metrics
+from sklearn.utils.extmath import density
 
 
 def benchmark(clf, custom_name=False):
@@ -361,14 +363,11 @@ def benchmark(clf, custom_name=False):
 # :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_text_feature_extraction.py`  # noqa: E501
 # for a demo on how such tuning can be done.
-from sklearn.linear_model import LogisticRegression
-from sklearn.svm import LinearSVC
-from sklearn.linear_model import SGDClassifier
-from sklearn.naive_bayes import ComplementNB
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.neighbors import NearestCentroid
 from sklearn.ensemble import RandomForestClassifier
-
+from sklearn.linear_model import LogisticRegression, SGDClassifier
+from sklearn.naive_bayes import ComplementNB
+from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
+from sklearn.svm import LinearSVC
 
 results = []
 for clf, name in (
diff --git a/examples/text/plot_document_clustering.py b/examples/text/plot_document_clustering.py
index 368cf7cea60ae..fa68b8bd312ea 100644
--- a/examples/text/plot_document_clustering.py
+++ b/examples/text/plot_document_clustering.py
@@ -46,6 +46,7 @@
 # strip those features and have a more sensible clustering problem.
 
 import numpy as np
+
 from sklearn.datasets import fetch_20newsgroups
 
 categories = [
@@ -104,9 +105,10 @@
 # For more reference, see :ref:`clustering_evaluation`.
 
 from collections import defaultdict
-from sklearn import metrics
 from time import time
 
+from sklearn import metrics
+
 evaluations = []
 evaluations_std = []
@@ -277,7 +279,6 @@ def fit_and_evaluate(km, X, name=None, n_runs=5):
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import Normalizer
 
-
 lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))
 t0 = time()
 X_lsa = lsa.fit_transform(X_tfidf)
@@ -353,8 +354,7 @@ def fit_and_evaluate(km, X, name=None, n_runs=5):
 # case we also add LSA to the pipeline to reduce the dimension and sparcity of
 # the hashed vector space.
 
-from sklearn.feature_extraction.text import HashingVectorizer
-from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
 
 lsa_vectorizer = make_pipeline(
     HashingVectorizer(stop_words="english", n_features=50_000),
@@ -394,8 +394,8 @@ def fit_and_evaluate(km, X, name=None, n_runs=5):
 # Clustering evaluation summary
 # ==============================
 
-import pandas as pd
 import matplotlib.pyplot as plt
+import pandas as pd
 
 fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(16, 6), sharey=True)
diff --git a/examples/text/plot_hashing_vs_dict_vectorizer.py b/examples/text/plot_hashing_vs_dict_vectorizer.py
index 8200c646f69ee..ce2dcc2d13c41 100644
--- a/examples/text/plot_hashing_vs_dict_vectorizer.py
+++ b/examples/text/plot_hashing_vs_dict_vectorizer.py
@@ -118,6 +118,7 @@ def token_freqs(doc):
 # both of them receive dictionaries as input.
 
 from time import time
+
 from sklearn.feature_extraction import DictVectorizer
 
 dict_count_vectorizers = defaultdict(list)
diff --git a/examples/tree/plot_cost_complexity_pruning.py b/examples/tree/plot_cost_complexity_pruning.py
index d21d163c9a1e3..b232389ea9ded 100644
--- a/examples/tree/plot_cost_complexity_pruning.py
+++ b/examples/tree/plot_cost_complexity_pruning.py
@@ -18,8 +18,9 @@
 """
 
 import matplotlib.pyplot as plt
-from sklearn.model_selection import train_test_split
+
 from sklearn.datasets import load_breast_cancer
+from sklearn.model_selection import train_test_split
 from sklearn.tree import DecisionTreeClassifier
 
 # %%
diff --git a/examples/tree/plot_iris_dtc.py b/examples/tree/plot_iris_dtc.py
index 14f6506b5810f..b3d834da5d067 100644
--- a/examples/tree/plot_iris_dtc.py
+++ b/examples/tree/plot_iris_dtc.py
@@ -23,13 +23,12 @@
 # %%
 # Display the decision functions of trees trained on all pairs of features.
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
 
 from sklearn.datasets import load_iris
-from sklearn.tree import DecisionTreeClassifier
 from sklearn.inspection import DecisionBoundaryDisplay
-
+from sklearn.tree import DecisionTreeClassifier
 
 # Parameters
 n_classes = 3
diff --git a/examples/tree/plot_tree_regression.py b/examples/tree/plot_tree_regression.py
index 6ed28a5cbfa99..5a3da0b7b6d06 100644
--- a/examples/tree/plot_tree_regression.py
+++ b/examples/tree/plot_tree_regression.py
@@ -15,9 +15,10 @@
 """
 
 # Import the necessary modules and libraries
+import matplotlib.pyplot as plt
 import numpy as np
+
 from sklearn.tree import DecisionTreeRegressor
-import matplotlib.pyplot as plt
 
 # Create a random dataset
 rng = np.random.RandomState(1)
diff --git a/examples/tree/plot_tree_regression_multioutput.py b/examples/tree/plot_tree_regression_multioutput.py
index a75652a6ddd56..b6d2800d2732d 100644
--- a/examples/tree/plot_tree_regression_multioutput.py
+++ b/examples/tree/plot_tree_regression_multioutput.py
@@ -15,8 +15,9 @@ details of the training data and learn from the noise, i.e. they overfit.
 
 """
 
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
 from sklearn.tree import DecisionTreeRegressor
 
 # Create a random dataset
diff --git a/examples/tree/plot_unveil_tree_structure.py b/examples/tree/plot_unveil_tree_structure.py
index 6313d0ccbb74f..d4009e3111f7f 100644
--- a/examples/tree/plot_unveil_tree_structure.py
+++ b/examples/tree/plot_unveil_tree_structure.py
@@ -19,10 +19,10 @@
 import numpy as np
 from matplotlib import pyplot as plt
 
-from sklearn.model_selection import train_test_split
+from sklearn import tree
 from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
 from sklearn.tree import DecisionTreeClassifier
-from sklearn import tree
 
 ##############################################################################
 # Train tree classifier
diff --git a/maint_tools/check_pxd_in_installation.py b/maint_tools/check_pxd_in_installation.py
index ac1a8f9627a95..996d45d64d42a 100644
--- a/maint_tools/check_pxd_in_installation.py
+++ b/maint_tools/check_pxd_in_installation.py
@@ -6,12 +6,11 @@
 """
 
 import os
-import sys
 import pathlib
+import subprocess
+import sys
 import tempfile
 import textwrap
-import subprocess
-
 
 sklearn_dir = pathlib.Path(sys.argv[1])
 pxd_files = list(sklearn_dir.glob("**/*.pxd"))
diff --git a/maint_tools/sort_whats_new.py b/maint_tools/sort_whats_new.py
index 178e33bc87e5f..7241059176b66 100755
--- a/maint_tools/sort_whats_new.py
+++ b/maint_tools/sort_whats_new.py
@@ -2,8 +2,8 @@
 # Sorts what's new entries with per-module headings.
 # Pass what's new entries on stdin.
 
-import sys
 import re
+import sys
 from collections import defaultdict
 
 LABEL_ORDER = ["MajorFeature", "Feature", "Efficiency", "Enhancement", "Fix", "API"]
diff --git a/maint_tools/update_tracking_issue.py b/maint_tools/update_tracking_issue.py
index 4ddc9d1bfe8e6..725802416fb6c 100644
--- a/maint_tools/update_tracking_issue.py
+++ b/maint_tools/update_tracking_issue.py
@@ -11,10 +11,10 @@ github account that does **not** have commit access to the public repo.
""" -from pathlib import Path -import sys import argparse +import sys from datetime import datetime, timezone +from pathlib import Path import defusedxml.ElementTree as ET from github import Github diff --git a/pyproject.toml b/pyproject.toml index bed85b074dbfb..efd72adf44392 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,9 @@ exclude = ''' ''' [tool.ruff] +# all rules can be found here: https://beta.ruff.rs/docs/rules/ +select = ["E", "F", "W", "I"] + # max line length for black line-length = 88 target-version = "py38" @@ -74,6 +77,7 @@ exclude=[ "examples/*"=["E402"] "doc/conf.py"=["E402"] + [tool.cython-lint] # Ignore the same error codes as ruff # + E501 (line too long) because keeping it < 88 in cython diff --git a/setup.py b/setup.py index 77b38d51b248b..0ee1cd71cebd0 100755 --- a/setup.py +++ b/setup.py @@ -4,18 +4,17 @@ # 2010 Fabian Pedregosa # License: 3-clause BSD -import sys +import importlib import os -from os.path import join import platform import shutil +import sys +import traceback +from os.path import join from setuptools import Command, Extension, setup from setuptools.command.build_ext import build_ext -import traceback -import importlib - try: import builtins except ImportError: @@ -460,10 +459,10 @@ def configure_extension_modules(): if "sdist" in sys.argv or "--help" in sys.argv: return [] - from sklearn._build_utils import cythonize_extensions - from sklearn._build_utils import gen_from_templates import numpy + from sklearn._build_utils import cythonize_extensions, gen_from_templates + is_pypy = platform.python_implementation() == "PyPy" np_include = numpy.get_include() default_optimization_level = "O2" diff --git a/sklearn/__init__.py b/sklearn/__init__.py index d38a949d38208..c021d492fe061 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -12,13 +12,12 @@ See http://scikit-learn.org for complete documentation. """ -import sys import logging import os import random +import sys - -from ._config import get_config, set_config, config_context +from ._config import config_context, get_config, set_config logger = logging.getLogger(__name__) @@ -77,8 +76,10 @@ # It is necessary to do this prior to importing show_versions as the # later is linked to the OpenMP runtime to make it possible to introspect # it and importing it first would fail if the OpenMP dll cannot be found. - from . import _distributor_init # noqa: F401 - from . import __check_build # noqa: F401 + from . 
import ( + __check_build, # noqa: F401 + _distributor_init, # noqa: F401 + ) from .base import clone from .utils._show_versions import show_versions diff --git a/sklearn/_build_utils/__init__.py b/sklearn/_build_utils/__init__.py index f84dfa09a9f94..056215e162647 100644 --- a/sklearn/_build_utils/__init__.py +++ b/sklearn/_build_utils/__init__.py @@ -5,15 +5,15 @@ # license: BSD +import contextlib import os + import sklearn -import contextlib -from .pre_build_helpers import basic_check_build -from .openmp_helpers import check_openmp_support from .._min_dependencies import CYTHON_MIN_VERSION from ..externals._packaging.version import parse - +from .openmp_helpers import check_openmp_support +from .pre_build_helpers import basic_check_build DEFAULT_ROOT = "sklearn" diff --git a/sklearn/_build_utils/pre_build_helpers.py b/sklearn/_build_utils/pre_build_helpers.py index c1d50abd3ae0c..f3eb054bb037e 100644 --- a/sklearn/_build_utils/pre_build_helpers.py +++ b/sklearn/_build_utils/pre_build_helpers.py @@ -1,11 +1,11 @@ """Helpers to check build environment before actual build of scikit-learn""" +import glob import os +import subprocess import sys -import glob import tempfile import textwrap -import subprocess from setuptools.command.build_ext import customize_compiler, new_compiler diff --git a/sklearn/_config.py b/sklearn/_config.py index 43755071e54e9..e84dc9ef5b228 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -1,8 +1,8 @@ """Global configuration state and functions for management """ import os -from contextlib import contextmanager as contextmanager import threading +from contextlib import contextmanager as contextmanager _global_config = { "assume_finite": bool(os.environ.get("SKLEARN_ASSUME_FINITE", False)), diff --git a/sklearn/_loss/__init__.py b/sklearn/_loss/__init__.py index 78b1eb8543c8d..ee15e693c16f6 100644 --- a/sklearn/_loss/__init__.py +++ b/sklearn/_loss/__init__.py @@ -4,19 +4,18 @@ """ from .loss import ( - HalfSquaredError, AbsoluteError, - PinballLoss, - HuberLoss, - HalfPoissonLoss, + HalfBinomialLoss, HalfGammaLoss, + HalfMultinomialLoss, + HalfPoissonLoss, + HalfSquaredError, HalfTweedieLoss, HalfTweedieLossIdentity, - HalfBinomialLoss, - HalfMultinomialLoss, + HuberLoss, + PinballLoss, ) - __all__ = [ "HalfSquaredError", "AbsoluteError", diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py index 510ef80c641fc..9459844f6b89a 100644 --- a/sklearn/_loss/link.py +++ b/sklearn/_loss/link.py @@ -9,6 +9,7 @@ import numpy as np from scipy.special import expit, logit from scipy.stats import gmean + from ..utils.extmath import softmax diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 037d933aa5491..f3b61da0915d5 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -16,31 +16,33 @@ # - Replace link module of GLMs. 
import numbers + import numpy as np from scipy.special import xlogy + +from ..utils import check_scalar +from ..utils.stats import _weighted_percentile from ._loss import ( - CyHalfSquaredError, CyAbsoluteError, - CyPinballLoss, - CyHuberLoss, - CyHalfPoissonLoss, + CyExponentialLoss, + CyHalfBinomialLoss, CyHalfGammaLoss, + CyHalfMultinomialLoss, + CyHalfPoissonLoss, + CyHalfSquaredError, CyHalfTweedieLoss, CyHalfTweedieLossIdentity, - CyHalfBinomialLoss, - CyHalfMultinomialLoss, - CyExponentialLoss, + CyHuberLoss, + CyPinballLoss, ) from .link import ( - Interval, + HalfLogitLink, IdentityLink, - LogLink, + Interval, LogitLink, - HalfLogitLink, + LogLink, MultinomialLogit, ) -from ..utils import check_scalar -from ..utils.stats import _weighted_percentile # Note: The shape of raw_prediction for multiclass classifications are diff --git a/sklearn/_loss/tests/test_link.py b/sklearn/_loss/tests/test_link.py index 8421fd3fd7a77..e5a665f8d48ac 100644 --- a/sklearn/_loss/tests/test_link.py +++ b/sklearn/_loss/tests/test_link.py @@ -1,16 +1,15 @@ import numpy as np -from numpy.testing import assert_allclose, assert_array_equal import pytest +from numpy.testing import assert_allclose, assert_array_equal from sklearn._loss.link import ( _LINKS, - _inclusive_low_high, HalfLogitLink, - MultinomialLogit, Interval, + MultinomialLogit, + _inclusive_low_high, ) - LINK_FUNCTIONS = list(_LINKS.values()) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index dbfe5b3829dda..d279a2f06a182 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -1,22 +1,22 @@ import pickle import numpy as np -from numpy.testing import assert_allclose, assert_array_equal import pytest +from numpy.testing import assert_allclose, assert_array_equal from pytest import approx from scipy.optimize import ( + LinearConstraint, minimize, minimize_scalar, newton, - LinearConstraint, ) from scipy.special import logsumexp -from sklearn._loss.link import _inclusive_low_high, IdentityLink +from sklearn._loss.link import IdentityLink, _inclusive_low_high from sklearn._loss.loss import ( _LOSSES, - BaseLoss, AbsoluteError, + BaseLoss, HalfBinomialLoss, HalfGammaLoss, HalfMultinomialLoss, @@ -30,7 +30,6 @@ from sklearn.utils import assert_all_finite from sklearn.utils._testing import create_memmap_backed_data, skip_if_32bit - ALL_LOSSES = list(_LOSSES.values()) LOSS_INSTANCES = [loss() for loss in ALL_LOSSES] diff --git a/sklearn/_min_dependencies.py b/sklearn/_min_dependencies.py index 72ee14d64e958..e12720dbd5b94 100644 --- a/sklearn/_min_dependencies.py +++ b/sklearn/_min_dependencies.py @@ -1,8 +1,7 @@ """All minimum dependencies for scikit-learn.""" -from collections import defaultdict -import platform import argparse - +import platform +from collections import defaultdict # scipy and cython should by in sync with pyproject.toml diff --git a/sklearn/base.py b/sklearn/base.py index 40bf041a30c13..e62a0a01214bf 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -5,33 +5,36 @@ import copy import functools -import warnings -from collections import defaultdict -import platform import inspect +import platform import re +import warnings +from collections import defaultdict import numpy as np from . 
import __version__ -from ._config import get_config, config_context +from ._config import config_context, get_config +from .exceptions import InconsistentVersionWarning from .utils import _IS_32BIT +from .utils._estimator_html_repr import estimator_html_repr +from .utils._metadata_requests import _MetadataRequester +from .utils._param_validation import validate_parameter_constraints from .utils._set_output import _SetOutputMixin from .utils._tags import ( _DEFAULT_TAGS, ) -from .exceptions import InconsistentVersionWarning -from .utils.validation import check_X_y -from .utils.validation import check_array -from .utils.validation import _check_y -from .utils.validation import _num_features -from .utils.validation import _check_feature_names_in -from .utils.validation import _generate_get_feature_names_out -from .utils.validation import _is_fitted, check_is_fitted -from .utils._metadata_requests import _MetadataRequester -from .utils.validation import _get_feature_names -from .utils._estimator_html_repr import estimator_html_repr -from .utils._param_validation import validate_parameter_constraints +from .utils.validation import ( + _check_feature_names_in, + _check_y, + _generate_get_feature_names_out, + _get_feature_names, + _is_fitted, + _num_features, + check_array, + check_is_fitted, + check_X_y, +) def clone(estimator, *, safe=True): diff --git a/sklearn/calibration.py b/sklearn/calibration.py index e4869387f4166..42df0b3248733 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -7,43 +7,51 @@ # # License: BSD 3 clause -from numbers import Integral, Real import warnings -from inspect import signature from functools import partial - +from inspect import signature from math import log -import numpy as np +from numbers import Integral, Real -from scipy.special import expit -from scipy.special import xlogy +import numpy as np from scipy.optimize import fmin_bfgs +from scipy.special import expit, xlogy + +from sklearn.utils import Bunch from .base import ( BaseEstimator, ClassifierMixin, - RegressorMixin, - clone, MetaEstimatorMixin, + RegressorMixin, _fit_context, + clone, ) -from .preprocessing import label_binarize, LabelEncoder +from .isotonic import IsotonicRegression +from .model_selection import check_cv, cross_val_predict +from .preprocessing import LabelEncoder, label_binarize +from .svm import LinearSVC from .utils import ( + _safe_indexing, column_or_1d, indexable, - _safe_indexing, ) - -from .utils.multiclass import check_classification_targets -from .utils.parallel import delayed, Parallel from .utils._param_validation import ( - StrOptions, HasMethods, Hidden, - validate_params, Interval, + StrOptions, + validate_params, ) from .utils._plotting import _BinaryClassifierCurveDisplayMixin +from .utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _routing_enabled, + process_routing, +) +from .utils.multiclass import check_classification_targets +from .utils.parallel import Parallel, delayed from .utils.validation import ( _check_fit_params, _check_pos_label_consistency, @@ -52,16 +60,6 @@ check_consistent_length, check_is_fitted, ) -from .isotonic import IsotonicRegression -from .svm import LinearSVC -from .model_selection import check_cv, cross_val_predict -from sklearn.utils import Bunch -from .utils.metadata_routing import ( - MetadataRouter, - MethodMapping, - process_routing, - _routing_enabled, -) class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): diff --git a/sklearn/cluster/__init__.py 
b/sklearn/cluster/__init__.py index 40b89ea0da8ba..f5d3104d816bf 100644 --- a/sklearn/cluster/__init__.py +++ b/sklearn/cluster/__init__.py @@ -3,27 +3,27 @@ algorithms. """ -from ._spectral import spectral_clustering, SpectralClustering -from ._mean_shift import mean_shift, MeanShift, estimate_bandwidth, get_bin_seeds -from ._affinity_propagation import affinity_propagation, AffinityPropagation +from ._affinity_propagation import AffinityPropagation, affinity_propagation from ._agglomerative import ( - ward_tree, AgglomerativeClustering, - linkage_tree, FeatureAgglomeration, + linkage_tree, + ward_tree, ) -from ._kmeans import k_means, KMeans, MiniBatchKMeans, kmeans_plusplus +from ._bicluster import SpectralBiclustering, SpectralCoclustering +from ._birch import Birch from ._bisect_k_means import BisectingKMeans -from ._dbscan import dbscan, DBSCAN +from ._dbscan import DBSCAN, dbscan +from ._hdbscan.hdbscan import HDBSCAN +from ._kmeans import KMeans, MiniBatchKMeans, k_means, kmeans_plusplus +from ._mean_shift import MeanShift, estimate_bandwidth, get_bin_seeds, mean_shift from ._optics import ( OPTICS, cluster_optics_dbscan, - compute_optics_graph, cluster_optics_xi, + compute_optics_graph, ) -from ._bicluster import SpectralBiclustering, SpectralCoclustering -from ._birch import Birch -from ._hdbscan.hdbscan import HDBSCAN +from ._spectral import SpectralClustering, spectral_clustering __all__ = [ "AffinityPropagation", diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 1ffc5f07e8c50..6e7f67fed5199 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -5,20 +5,18 @@ # License: BSD 3 clause -from numbers import Integral, Real import warnings +from numbers import Integral, Real import numpy as np +from .._config import config_context +from ..base import BaseEstimator, ClusterMixin, _fit_context from ..exceptions import ConvergenceWarning -from ..base import BaseEstimator, ClusterMixin -from ..base import _fit_context +from ..metrics import euclidean_distances, pairwise_distances_argmin from ..utils import check_random_state from ..utils._param_validation import Interval, StrOptions, validate_params from ..utils.validation import check_is_fitted -from ..metrics import euclidean_distances -from ..metrics import pairwise_distances_argmin -from .._config import config_context def _equal_similarities_and_preferences(S, preference): diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index b7d08a45dcd80..553908104c92b 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -15,22 +15,25 @@ from scipy import sparse from scipy.sparse.csgraph import connected_components -from ..base import BaseEstimator, ClusterMixin, ClassNamePrefixFeaturesOutMixin -from ..base import _fit_context -from ..metrics.pairwise import paired_distances -from ..metrics.pairwise import _VALID_METRICS +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + ClusterMixin, + _fit_context, +) from ..metrics import DistanceMetric from ..metrics._dist_metrics import METRIC_MAPPING64 +from ..metrics.pairwise import _VALID_METRICS, paired_distances from ..utils import check_array from ..utils._fast_dict import IntFloatDict -from ..utils.graph import _fix_connected_components from ..utils._param_validation import ( + HasMethods, Hidden, Interval, StrOptions, - HasMethods, validate_params, ) +from ..utils.graph import _fix_connected_components 
from ..utils.validation import check_memory # mypy error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast' diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 4133264626ebb..65280c06319d9 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -3,25 +3,19 @@ # License: BSD 3 clause from abc import ABCMeta, abstractmethod - -import numpy as np from numbers import Integral +import numpy as np from scipy.linalg import norm from scipy.sparse import dia_matrix, issparse from scipy.sparse.linalg import eigsh, svds -from . import KMeans, MiniBatchKMeans -from ..base import BaseEstimator, BiclusterMixin -from ..base import _fit_context -from ..utils import check_random_state -from ..utils import check_scalar - +from ..base import BaseEstimator, BiclusterMixin, _fit_context +from ..utils import check_random_state, check_scalar +from ..utils._param_validation import Interval, StrOptions from ..utils.extmath import make_nonnegative, randomized_svd, safe_sparse_dot - from ..utils.validation import assert_all_finite -from ..utils._param_validation import Interval, StrOptions - +from ._kmeans import KMeans, MiniBatchKMeans __all__ = ["SpectralCoclustering", "SpectralBiclustering"] diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index e74630572a014..d62fb880ba8b2 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -4,26 +4,27 @@ # License: BSD 3 clause import warnings -import numpy as np +from math import sqrt from numbers import Integral, Real + +import numpy as np from scipy import sparse -from math import sqrt -from ..metrics import pairwise_distances_argmin -from ..metrics.pairwise import euclidean_distances +from .._config import config_context from ..base import ( - TransformerMixin, - ClusterMixin, BaseEstimator, ClassNamePrefixFeaturesOutMixin, + ClusterMixin, + TransformerMixin, _fit_context, ) -from ..utils.extmath import row_norms +from ..exceptions import ConvergenceWarning +from ..metrics import pairwise_distances_argmin +from ..metrics.pairwise import euclidean_distances from ..utils._param_validation import Interval +from ..utils.extmath import row_norms from ..utils.validation import check_is_fitted -from ..exceptions import ConvergenceWarning from . 
import AgglomerativeClustering -from .._config import config_context def _iterate_sparse_X(X): diff --git a/sklearn/cluster/_bisect_k_means.py b/sklearn/cluster/_bisect_k_means.py index 959d78ae85009..9091445261f70 100644 --- a/sklearn/cluster/_bisect_k_means.py +++ b/sklearn/cluster/_bisect_k_means.py @@ -7,18 +7,17 @@ import scipy.sparse as sp from ..base import _fit_context -from ._kmeans import _BaseKMeans -from ._kmeans import _kmeans_single_elkan -from ._kmeans import _kmeans_single_lloyd -from ._kmeans import _labels_inertia_threadpool_limit -from ._k_means_common import _inertia_dense -from ._k_means_common import _inertia_sparse -from ..utils.extmath import row_norms from ..utils._openmp_helpers import _openmp_effective_n_threads -from ..utils.validation import check_is_fitted -from ..utils.validation import _check_sample_weight -from ..utils.validation import check_random_state from ..utils._param_validation import StrOptions +from ..utils.extmath import row_norms +from ..utils.validation import _check_sample_weight, check_is_fitted, check_random_state +from ._k_means_common import _inertia_dense, _inertia_sparse +from ._kmeans import ( + _BaseKMeans, + _kmeans_single_elkan, + _kmeans_single_lloyd, + _labels_inertia_threadpool_limit, +) class _BisectingTree: diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 3c753935ac046..e3ba62dbfdf01 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -14,12 +14,11 @@ import numpy as np from scipy import sparse +from ..base import BaseEstimator, ClusterMixin, _fit_context from ..metrics.pairwise import _VALID_METRICS -from ..base import BaseEstimator, ClusterMixin -from ..base import _fit_context -from ..utils.validation import _check_sample_weight -from ..utils._param_validation import Interval, StrOptions from ..neighbors import NearestNeighbors +from ..utils._param_validation import Interval, StrOptions +from ..utils.validation import _check_sample_weight from ._dbscan_inner import dbscan_inner diff --git a/sklearn/cluster/_feature_agglomeration.py b/sklearn/cluster/_feature_agglomeration.py index 55baf247a2931..f84f18c1c18b3 100644 --- a/sklearn/cluster/_feature_agglomeration.py +++ b/sklearn/cluster/_feature_agglomeration.py @@ -6,12 +6,13 @@ # License: BSD 3 clause import warnings + import numpy as np +from scipy.sparse import issparse from ..base import TransformerMixin -from ..utils.validation import check_is_fitted from ..utils import metadata_routing -from scipy.sparse import issparse +from ..utils.validation import check_is_fitted ############################################################################### # Mixin class for feature agglomeration. 
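Note on the hunks in this patch: they apply one mechanical change repeated
across the tree. Import blocks are re-sorted to satisfy the isort rules
enabled for ruff in pyproject.toml above (select = ["E", "F", "W", "I"]).
A minimal sketch of the convention the "I" rules enforce follows; it is
illustrative only and not itself part of the patch, though the module names
are taken from the hunks above:

    # Before: unsorted, one symbol per line, no grouping.
    import sys
    from sklearn.svm import LinearSVC
    import numpy as np
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegression

    # After: blocks separated by blank lines -- standard library first,
    # then third-party, then first-party (sklearn) -- each alphabetized,
    # with imports from the same module combined on one line.
    import sys

    import numpy as np

    from sklearn.linear_model import LogisticRegression, SGDClassifier
    from sklearn.svm import LinearSVC

Assuming ruff is configured as in pyproject.toml, running it with --fix
(e.g. "ruff check --fix .") performs this rewrite automatically, which is
presumably how the import-sorting hunks in this patch were generated.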
diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index f1584f46d6f82..fa6c1950b1164 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -46,16 +46,15 @@ from ...metrics._dist_metrics import DistanceMetric from ...neighbors import BallTree, KDTree, NearestNeighbors from ...utils._param_validation import Interval, StrOptions -from ...utils.validation import _assert_all_finite, _allclose_dense_sparse -from ._reachability import mutual_reachability_graph +from ...utils.validation import _allclose_dense_sparse, _assert_all_finite from ._linkage import ( + MST_edge_dtype, make_single_linkage, - mst_from_mutual_reachability, mst_from_data_matrix, - MST_edge_dtype, + mst_from_mutual_reachability, ) -from ._tree import tree_to_labels, labelling_at_cut -from ._tree import HIERARCHY_dtype +from ._reachability import mutual_reachability_graph +from ._tree import HIERARCHY_dtype, labelling_at_cut, tree_to_labels FAST_METRICS = set(KDTree.valid_metrics() + BallTree.valid_metrics()) diff --git a/sklearn/cluster/_hdbscan/tests/test_reachibility.py b/sklearn/cluster/_hdbscan/tests/test_reachibility.py index c8ba28d0af25b..c25b6baf4b65c 100644 --- a/sklearn/cluster/_hdbscan/tests/test_reachibility.py +++ b/sklearn/cluster/_hdbscan/tests/test_reachibility.py @@ -1,13 +1,12 @@ import numpy as np import pytest +from sklearn.cluster._hdbscan._reachability import mutual_reachability_graph from sklearn.utils._testing import ( _convert_container, assert_allclose, ) -from sklearn.cluster._hdbscan._reachability import mutual_reachability_graph - def test_mutual_reachability_graph_error_sparse_format(): """Check that we raise an error if the sparse format is not CSR.""" diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index b36999885a14e..79aa8b3825170 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -11,50 +11,48 @@ # Robert Layton # License: BSD 3 clause +import warnings from abc import ABC, abstractmethod from numbers import Integral, Real -import warnings import numpy as np import scipy.sparse as sp from ..base import ( BaseEstimator, + ClassNamePrefixFeaturesOutMixin, ClusterMixin, TransformerMixin, - ClassNamePrefixFeaturesOutMixin, _fit_context, ) -from ..metrics.pairwise import euclidean_distances -from ..metrics.pairwise import _euclidean_distances +from ..exceptions import ConvergenceWarning +from ..metrics.pairwise import _euclidean_distances, euclidean_distances +from ..utils import check_array, check_random_state +from ..utils._openmp_helpers import _openmp_effective_n_threads +from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params from ..utils.extmath import row_norms, stable_cumsum -from ..utils.fixes import threadpool_limits -from ..utils.fixes import threadpool_info -from ..utils.sparsefuncs_fast import assign_rows_csr +from ..utils.fixes import threadpool_info, threadpool_limits from ..utils.sparsefuncs import mean_variance_axis -from ..utils import check_array -from ..utils import check_random_state -from ..utils.validation import check_is_fitted, _check_sample_weight -from ..utils.validation import _is_arraylike_not_scalar -from ..utils._param_validation import Hidden -from ..utils._param_validation import Interval -from ..utils._param_validation import StrOptions -from ..utils._param_validation import validate_params -from ..utils._openmp_helpers import _openmp_effective_n_threads -from ..exceptions import ConvergenceWarning -from 
._k_means_common import CHUNK_SIZE -from ._k_means_common import _inertia_dense -from ._k_means_common import _inertia_sparse -from ._k_means_common import _is_same_clustering -from ._k_means_minibatch import _minibatch_update_dense -from ._k_means_minibatch import _minibatch_update_sparse -from ._k_means_lloyd import lloyd_iter_chunked_dense -from ._k_means_lloyd import lloyd_iter_chunked_sparse -from ._k_means_elkan import init_bounds_dense -from ._k_means_elkan import init_bounds_sparse -from ._k_means_elkan import elkan_iter_chunked_dense -from ._k_means_elkan import elkan_iter_chunked_sparse - +from ..utils.sparsefuncs_fast import assign_rows_csr +from ..utils.validation import ( + _check_sample_weight, + _is_arraylike_not_scalar, + check_is_fitted, +) +from ._k_means_common import ( + CHUNK_SIZE, + _inertia_dense, + _inertia_sparse, + _is_same_clustering, +) +from ._k_means_elkan import ( + elkan_iter_chunked_dense, + elkan_iter_chunked_sparse, + init_bounds_dense, + init_bounds_sparse, +) +from ._k_means_lloyd import lloyd_iter_chunked_dense, lloyd_iter_chunked_sparse +from ._k_means_minibatch import _minibatch_update_dense, _minibatch_update_sparse ############################################################################### # Initialization heuristic diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index 6b0f227d011f9..ab9b8e85beadb 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -14,20 +14,20 @@ # Gael Varoquaux # Martino Sorbaro -import numpy as np import warnings +from collections import defaultdict from numbers import Integral, Real -from collections import defaultdict +import numpy as np + +from .._config import config_context +from ..base import BaseEstimator, ClusterMixin, _fit_context +from ..metrics.pairwise import pairwise_distances_argmin +from ..neighbors import NearestNeighbors +from ..utils import check_array, check_random_state, gen_batches from ..utils._param_validation import Interval, validate_params +from ..utils.parallel import Parallel, delayed from ..utils.validation import check_is_fitted -from ..utils.parallel import delayed, Parallel -from ..utils import check_random_state, gen_batches, check_array -from ..base import BaseEstimator, ClusterMixin -from ..base import _fit_context -from ..neighbors import NearestNeighbors -from ..metrics.pairwise import pairwise_distances_argmin -from .._config import config_context @validate_params( diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index ca1c74d6f44e7..8a91cd6f5a383 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -10,23 +10,26 @@ License: BSD 3 clause """ +import warnings from numbers import Integral, Real -import warnings import numpy as np +from scipy.sparse import SparseEfficiencyWarning, issparse +from ..base import BaseEstimator, ClusterMixin, _fit_context from ..exceptions import DataConversionWarning -from ..metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS -from ..metrics.pairwise import _VALID_METRICS +from ..metrics import pairwise_distances +from ..metrics.pairwise import _VALID_METRICS, PAIRWISE_BOOLEAN_FUNCTIONS +from ..neighbors import NearestNeighbors from ..utils import gen_batches, get_chunk_n_rows -from ..utils._param_validation import Interval, HasMethods, StrOptions, validate_params -from ..utils._param_validation import RealNotInt +from ..utils._param_validation import ( + HasMethods, + Interval, + RealNotInt, + StrOptions, + validate_params, +) from ..utils.validation import 
check_memory -from ..neighbors import NearestNeighbors -from ..base import BaseEstimator, ClusterMixin -from ..base import _fit_context -from ..metrics import pairwise_distances -from scipy.sparse import issparse, SparseEfficiencyWarning class OPTICS(ClusterMixin, BaseEstimator): diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index f72db4b7c1da3..d5fc9d4fdc68f 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -6,21 +6,19 @@ # Andrew Knyazev # License: BSD 3 clause -from numbers import Integral, Real import warnings +from numbers import Integral, Real import numpy as np - from scipy.linalg import LinAlgError, qr, svd from scipy.sparse import csc_matrix -from ..base import BaseEstimator, ClusterMixin -from ..base import _fit_context -from ..utils._param_validation import Interval, StrOptions, validate_params -from ..utils import check_random_state, as_float_array -from ..metrics.pairwise import pairwise_kernels, KERNEL_PARAMS -from ..neighbors import kneighbors_graph, NearestNeighbors +from ..base import BaseEstimator, ClusterMixin, _fit_context from ..manifold import spectral_embedding +from ..metrics.pairwise import KERNEL_PARAMS, pairwise_kernels +from ..neighbors import NearestNeighbors, kneighbors_graph +from ..utils import as_float_array, check_random_state +from ..utils._param_validation import Interval, StrOptions, validate_params from ._kmeans import k_means diff --git a/sklearn/cluster/tests/common.py b/sklearn/cluster/tests/common.py index 0f4bd9e14926d..b1fe047fe230a 100644 --- a/sklearn/cluster/tests/common.py +++ b/sklearn/cluster/tests/common.py @@ -5,7 +5,6 @@ import numpy as np - ############################################################################### # Generate sample data diff --git a/sklearn/cluster/tests/test_affinity_propagation.py b/sklearn/cluster/tests/test_affinity_propagation.py index 52007c375f667..136d2fe6fd781 100644 --- a/sklearn/cluster/tests/test_affinity_propagation.py +++ b/sklearn/cluster/tests/test_affinity_propagation.py @@ -3,20 +3,18 @@ """ -import numpy as np -import pytest import warnings +import numpy as np +import pytest from scipy.sparse import csr_matrix -from sklearn.exceptions import ConvergenceWarning, NotFittedError -from sklearn.utils._testing import assert_array_equal, assert_allclose - -from sklearn.cluster import AffinityPropagation +from sklearn.cluster import AffinityPropagation, affinity_propagation from sklearn.cluster._affinity_propagation import _equal_similarities_and_preferences -from sklearn.cluster import affinity_propagation from sklearn.datasets import make_blobs +from sklearn.exceptions import ConvergenceWarning, NotFittedError from sklearn.metrics import euclidean_distances +from sklearn.utils._testing import assert_allclose, assert_array_equal n_clusters = 3 centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10 diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index 0a68e97d6fb22..6d4a1067c4048 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -4,23 +4,21 @@ import pytest from scipy.sparse import csr_matrix, issparse -from sklearn.model_selection import ParameterGrid - -from sklearn.utils._testing import assert_almost_equal -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal - from sklearn.base import BaseEstimator, BiclusterMixin - -from sklearn.cluster import SpectralCoclustering -from sklearn.cluster 
import SpectralBiclustering -from sklearn.cluster._bicluster import _scale_normalize -from sklearn.cluster._bicluster import _bistochastic_normalize -from sklearn.cluster._bicluster import _log_normalize - -from sklearn.metrics import consensus_score, v_measure_score - +from sklearn.cluster import SpectralBiclustering, SpectralCoclustering +from sklearn.cluster._bicluster import ( + _bistochastic_normalize, + _log_normalize, + _scale_normalize, +) from sklearn.datasets import make_biclusters, make_checkerboard +from sklearn.metrics import consensus_score, v_measure_score +from sklearn.model_selection import ParameterGrid +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) class MockBiclustering(BiclusterMixin, BaseEstimator): diff --git a/sklearn/cluster/tests/test_birch.py b/sklearn/cluster/tests/test_birch.py index c2f3c06d15ba7..7fb83f0803f02 100644 --- a/sklearn/cluster/tests/test_birch.py +++ b/sklearn/cluster/tests/test_birch.py @@ -2,19 +2,16 @@ Tests for the birch clustering algorithm. """ -from scipy import sparse import numpy as np import pytest +from scipy import sparse +from sklearn.cluster import AgglomerativeClustering, Birch from sklearn.cluster.tests.common import generate_clustered_data -from sklearn.cluster import Birch -from sklearn.cluster import AgglomerativeClustering from sklearn.datasets import make_blobs from sklearn.exceptions import ConvergenceWarning from sklearn.metrics import pairwise_distances_argmin, v_measure_score - -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_allclose +from sklearn.utils._testing import assert_allclose, assert_array_equal def test_n_samples_leaves_roots(global_random_seed, global_dtype): diff --git a/sklearn/cluster/tests/test_bisect_k_means.py b/sklearn/cluster/tests/test_bisect_k_means.py index c79cd0bcca3e8..01afd4be9c8b5 100644 --- a/sklearn/cluster/tests/test_bisect_k_means.py +++ b/sklearn/cluster/tests/test_bisect_k_means.py @@ -2,9 +2,9 @@ import pytest import scipy.sparse as sp -from sklearn.utils._testing import assert_array_equal, assert_allclose from sklearn.cluster import BisectingKMeans from sklearn.metrics import v_measure_score +from sklearn.utils._testing import assert_allclose, assert_array_equal @pytest.mark.parametrize("bisecting_strategy", ["biggest_inertia", "largest_cluster"]) diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py index f36eb19caeb0f..972820c6cc137 100644 --- a/sklearn/cluster/tests/test_dbscan.py +++ b/sklearn/cluster/tests/test_dbscan.py @@ -3,23 +3,18 @@ """ import pickle - -import numpy as np - import warnings -from scipy.spatial import distance -from scipy import sparse - +import numpy as np import pytest +from scipy import sparse +from scipy.spatial import distance -from sklearn.utils._testing import assert_array_equal -from sklearn.neighbors import NearestNeighbors -from sklearn.cluster import DBSCAN -from sklearn.cluster import dbscan +from sklearn.cluster import DBSCAN, dbscan from sklearn.cluster.tests.common import generate_clustered_data from sklearn.metrics.pairwise import pairwise_distances - +from sklearn.neighbors import NearestNeighbors +from sklearn.utils._testing import assert_array_equal n_clusters = 3 X = generate_clustered_data(n_clusters=n_clusters) diff --git a/sklearn/cluster/tests/test_feature_agglomeration.py b/sklearn/cluster/tests/test_feature_agglomeration.py index 3db2862384c74..121e8f2cfe400 100644 --- 
a/sklearn/cluster/tests/test_feature_agglomeration.py +++ b/sklearn/cluster/tests/test_feature_agglomeration.py @@ -3,13 +3,14 @@ """ # Authors: Sergul Aydore 2017 import warnings -import numpy as np -from numpy.testing import assert_array_equal +import numpy as np import pytest +from numpy.testing import assert_array_equal + from sklearn.cluster import FeatureAgglomeration -from sklearn.utils._testing import assert_array_almost_equal from sklearn.datasets import make_blobs +from sklearn.utils._testing import assert_array_almost_equal def test_feature_agglomeration(): diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index b652a99aa221f..d1ff6452a5a08 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -8,6 +8,12 @@ from scipy.spatial import distance from sklearn.cluster import HDBSCAN +from sklearn.cluster._hdbscan._tree import ( + CONDENSED_dtype, + _condense_tree, + _do_labelling, +) +from sklearn.cluster._hdbscan.hdbscan import _OUTLIER_ENCODING from sklearn.datasets import make_blobs from sklearn.metrics import fowlkes_mallows_score from sklearn.metrics.pairwise import _VALID_METRICS, euclidean_distances @@ -15,12 +21,6 @@ from sklearn.preprocessing import StandardScaler from sklearn.utils import shuffle from sklearn.utils._testing import assert_allclose, assert_array_equal -from sklearn.cluster._hdbscan.hdbscan import _OUTLIER_ENCODING -from sklearn.cluster._hdbscan._tree import ( - _do_labelling, - _condense_tree, - CONDENSED_dtype, -) n_clusters_true = 3 X, y = make_blobs(n_samples=200, random_state=10) diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index acaf3c27bedb1..95f28413d132d 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -6,48 +6,48 @@ # Matteo Visconti di Oleggio Castello 2014 # License: BSD 3 clause import itertools -from tempfile import mkdtemp import shutil -import pytest from functools import partial +from tempfile import mkdtemp import numpy as np +import pytest from scipy import sparse from scipy.cluster import hierarchy from scipy.sparse.csgraph import connected_components -from sklearn.metrics.cluster import adjusted_rand_score -from sklearn.metrics.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS -from sklearn.utils._testing import assert_almost_equal, create_memmap_backed_data -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import ignore_warnings - -from sklearn.cluster import ward_tree -from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration +from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration, ward_tree from sklearn.cluster._agglomerative import ( - _hc_cut, _TREE_BUILDERS, - linkage_tree, _fix_connectivity, + _hc_cut, + linkage_tree, +) +from sklearn.cluster._hierarchical_fast import ( + average_merge, + max_merge, + mst_linkage_core, ) +from sklearn.datasets import make_circles, make_moons from sklearn.feature_extraction.image import grid_to_graph from sklearn.metrics import DistanceMetric +from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score from sklearn.metrics.pairwise import ( PAIRED_DISTANCES, cosine_distances, manhattan_distances, pairwise_distances, ) -from sklearn.metrics.cluster import normalized_mutual_info_score +from sklearn.metrics.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS from sklearn.neighbors import 
kneighbors_graph -from sklearn.cluster._hierarchical_fast import ( - average_merge, - max_merge, - mst_linkage_core, -) from sklearn.utils._fast_dict import IntFloatDict -from sklearn.utils._testing import assert_array_equal -from sklearn.datasets import make_moons, make_circles +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + create_memmap_backed_data, + ignore_warnings, +) def test_linkage_misc(): diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index c11d5dd3165c0..a61f548ba11a0 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -2,37 +2,36 @@ import re import sys import warnings +from io import StringIO import numpy as np -from scipy import sparse as sp - import pytest +from scipy import sparse as sp -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_allclose -from sklearn.utils.fixes import threadpool_limits from sklearn.base import clone +from sklearn.cluster import KMeans, MiniBatchKMeans, k_means, kmeans_plusplus +from sklearn.cluster._k_means_common import ( + _euclidean_dense_dense_wrapper, + _euclidean_sparse_dense_wrapper, + _inertia_dense, + _inertia_sparse, + _is_same_clustering, + _relocate_empty_clusters_dense, + _relocate_empty_clusters_sparse, +) +from sklearn.cluster._kmeans import _labels_inertia, _mini_batch_step +from sklearn.datasets import make_blobs from sklearn.exceptions import ConvergenceWarning - -from sklearn.utils.extmath import row_norms -from sklearn.metrics import pairwise_distances -from sklearn.metrics import pairwise_distances_argmin -from sklearn.metrics.pairwise import euclidean_distances +from sklearn.metrics import pairwise_distances, pairwise_distances_argmin from sklearn.metrics.cluster import v_measure_score -from sklearn.cluster import KMeans, k_means, kmeans_plusplus -from sklearn.cluster import MiniBatchKMeans -from sklearn.cluster._kmeans import _labels_inertia -from sklearn.cluster._kmeans import _mini_batch_step -from sklearn.cluster._k_means_common import _relocate_empty_clusters_dense -from sklearn.cluster._k_means_common import _relocate_empty_clusters_sparse -from sklearn.cluster._k_means_common import _euclidean_dense_dense_wrapper -from sklearn.cluster._k_means_common import _euclidean_sparse_dense_wrapper -from sklearn.cluster._k_means_common import _inertia_dense -from sklearn.cluster._k_means_common import _inertia_sparse -from sklearn.cluster._k_means_common import _is_same_clustering -from sklearn.utils._testing import create_memmap_backed_data -from sklearn.datasets import make_blobs -from io import StringIO +from sklearn.metrics.pairwise import euclidean_distances +from sklearn.utils._testing import ( + assert_allclose, + assert_array_equal, + create_memmap_backed_data, +) +from sklearn.utils.extmath import row_norms +from sklearn.utils.fixes import threadpool_limits # TODO(1.4): Remove msg = ( diff --git a/sklearn/cluster/tests/test_mean_shift.py b/sklearn/cluster/tests/test_mean_shift.py index db13e4d18650f..265c72d0c4ce1 100644 --- a/sklearn/cluster/tests/test_mean_shift.py +++ b/sklearn/cluster/tests/test_mean_shift.py @@ -3,20 +3,15 @@ """ -import numpy as np import warnings -import pytest -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_allclose +import numpy as np +import pytest -from sklearn.cluster import MeanShift -from sklearn.cluster import mean_shift -from 
sklearn.cluster import estimate_bandwidth -from sklearn.cluster import get_bin_seeds +from sklearn.cluster import MeanShift, estimate_bandwidth, get_bin_seeds, mean_shift from sklearn.datasets import make_blobs from sklearn.metrics import v_measure_score - +from sklearn.utils._testing import assert_allclose, assert_array_equal n_clusters = 3 centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10 diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 0acf818912c0f..d7bf4034ab98a 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -1,24 +1,21 @@ # Authors: Shane Grigsby # Adrin Jalali # License: BSD 3 clause +import warnings + import numpy as np import pytest from scipy import sparse -import warnings -from sklearn.datasets import make_blobs -from sklearn.cluster import OPTICS +from sklearn.cluster import DBSCAN, OPTICS from sklearn.cluster._optics import _extend_region, _extract_xi_labels -from sklearn.exceptions import DataConversionWarning +from sklearn.cluster.tests.common import generate_clustered_data +from sklearn.datasets import make_blobs +from sklearn.exceptions import DataConversionWarning, EfficiencyWarning from sklearn.metrics.cluster import contingency_matrix from sklearn.metrics.pairwise import pairwise_distances -from sklearn.cluster import DBSCAN from sklearn.utils import shuffle -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_allclose -from sklearn.exceptions import EfficiencyWarning -from sklearn.cluster.tests.common import generate_clustered_data - +from sklearn.utils._testing import assert_allclose, assert_array_equal rng = np.random.RandomState(0) n_points_per_cluster = 10 diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index d301f06e92075..33968a542691a 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -1,24 +1,21 @@ """Testing for Spectral Clustering methods""" +import pickle import re import numpy as np +import pytest from scipy import sparse from scipy.linalg import LinAlgError -import pytest - -import pickle - -from sklearn.utils import check_random_state -from sklearn.utils._testing import assert_array_equal - from sklearn.cluster import SpectralClustering, spectral_clustering -from sklearn.cluster._spectral import discretize, cluster_qr +from sklearn.cluster._spectral import cluster_qr, discretize +from sklearn.datasets import make_blobs from sklearn.feature_extraction import img_to_graph from sklearn.metrics import adjusted_rand_score from sklearn.metrics.pairwise import kernel_metrics, rbf_kernel from sklearn.neighbors import NearestNeighbors -from sklearn.datasets import make_blobs +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_array_equal try: from pyamg import smoothed_aggregation_solver # noqa diff --git a/sklearn/compose/__init__.py b/sklearn/compose/__init__.py index 8be8d17040e82..7b137cdf9e07f 100644 --- a/sklearn/compose/__init__.py +++ b/sklearn/compose/__init__.py @@ -7,12 +7,11 @@ from ._column_transformer import ( ColumnTransformer, - make_column_transformer, make_column_selector, + make_column_transformer, ) from ._target import TransformedTargetRegressor - __all__ = [ "ColumnTransformer", "make_column_transformer", diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 14349662cfee9..1f5854eac663e 100644 --- 
a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -6,29 +6,28 @@ # Author: Andreas Mueller # Joris Van den Bossche # License: BSD -from numbers import Integral, Real -from itertools import chain from collections import Counter +from itertools import chain +from numbers import Integral, Real import numpy as np from scipy import sparse -from ..base import clone, TransformerMixin -from ..base import _fit_context -from ..utils._estimator_html_repr import _VisualBlock -from ..pipeline import _fit_transform_one, _transform_one, _name_estimators +from ..base import TransformerMixin, _fit_context, clone +from ..pipeline import _fit_transform_one, _name_estimators, _transform_one from ..preprocessing import FunctionTransformer -from ..utils import Bunch -from ..utils import _safe_indexing -from ..utils import _get_column_indices -from ..utils._param_validation import HasMethods, Interval, StrOptions, Hidden +from ..utils import Bunch, _get_column_indices, _safe_indexing, check_pandas_support +from ..utils._estimator_html_repr import _VisualBlock +from ..utils._param_validation import HasMethods, Hidden, Interval, StrOptions from ..utils._set_output import _get_output_config, _safe_set_output -from ..utils import check_pandas_support from ..utils.metaestimators import _BaseComposition -from ..utils.validation import check_array, check_is_fitted, _check_feature_names_in -from ..utils.validation import _num_samples -from ..utils.parallel import delayed, Parallel - +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_feature_names_in, + _num_samples, + check_array, + check_is_fitted, +) __all__ = ["ColumnTransformer", "make_column_transformer", "make_column_selector"] diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index e926ed7abe324..348cdda48ea1c 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -6,14 +6,13 @@ import numpy as np -from ..base import BaseEstimator, RegressorMixin, clone -from ..base import _fit_context -from ..utils.validation import check_is_fitted -from ..utils._tags import _safe_tags -from ..utils import check_array, _safe_indexing -from ..utils._param_validation import HasMethods -from ..preprocessing import FunctionTransformer +from ..base import BaseEstimator, RegressorMixin, _fit_context, clone from ..exceptions import NotFittedError +from ..preprocessing import FunctionTransformer +from ..utils import _safe_indexing, check_array +from ..utils._param_validation import HasMethods +from ..utils._tags import _safe_tags +from ..utils.validation import check_is_fitted __all__ = ["TransformedTargetRegressor"] diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index cb9ddc0b4f344..dcf84273b3f14 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1,28 +1,33 @@ """ Test the ColumnTransformer. 
""" -import re import pickle +import re import numpy as np -from scipy import sparse import pytest - from numpy.testing import assert_allclose -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_allclose_dense_sparse -from sklearn.utils._testing import assert_almost_equal +from scipy import sparse from sklearn.base import BaseEstimator, TransformerMixin from sklearn.compose import ( ColumnTransformer, - make_column_transformer, make_column_selector, + make_column_transformer, ) from sklearn.exceptions import NotFittedError -from sklearn.preprocessing import FunctionTransformer -from sklearn.preprocessing import StandardScaler, Normalizer, OneHotEncoder from sklearn.feature_selection import VarianceThreshold +from sklearn.preprocessing import ( + FunctionTransformer, + Normalizer, + OneHotEncoder, + StandardScaler, +) +from sklearn.utils._testing import ( + assert_allclose_dense_sparse, + assert_almost_equal, + assert_array_equal, +) class Trans(TransformerMixin, BaseEstimator): diff --git a/sklearn/compose/tests/test_target.py b/sklearn/compose/tests/test_target.py index f0d63c00c2772..53242b7e0277b 100644 --- a/sklearn/compose/tests/test_target.py +++ b/sklearn/compose/tests/test_target.py @@ -1,25 +1,14 @@ import numpy as np import pytest -from sklearn.base import clone -from sklearn.base import BaseEstimator -from sklearn.base import TransformerMixin - -from sklearn.dummy import DummyRegressor - -from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_no_warnings - -from sklearn.preprocessing import FunctionTransformer -from sklearn.preprocessing import StandardScaler - -from sklearn.pipeline import Pipeline - -from sklearn.linear_model import LinearRegression, OrthogonalMatchingPursuit - from sklearn import datasets - +from sklearn.base import BaseEstimator, TransformerMixin, clone from sklearn.compose import TransformedTargetRegressor +from sklearn.dummy import DummyRegressor +from sklearn.linear_model import LinearRegression, OrthogonalMatchingPursuit +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import FunctionTransformer, StandardScaler +from sklearn.utils._testing import assert_allclose, assert_no_warnings friedman = datasets.make_friedman1(random_state=0) diff --git a/sklearn/conftest.py b/sklearn/conftest.py index 5d5f80d2e22d5..3d2c73b99a801 100644 --- a/sklearn/conftest.py +++ b/sklearn/conftest.py @@ -1,29 +1,29 @@ -from os import environ -from functools import wraps import platform import sys from contextlib import suppress +from functools import wraps +from os import environ from unittest import SkipTest import joblib -import pytest import numpy as np -from threadpoolctl import threadpool_limits +import pytest from _pytest.doctest import DoctestItem +from threadpoolctl import threadpool_limits -from sklearn.utils import _IS_32BIT from sklearn._min_dependencies import PYTEST_MIN_VERSION -from sklearn.utils.fixes import sp_version -from sklearn.utils.fixes import parse_version -from sklearn.datasets import fetch_20newsgroups -from sklearn.datasets import fetch_20newsgroups_vectorized -from sklearn.datasets import fetch_california_housing -from sklearn.datasets import fetch_covtype -from sklearn.datasets import fetch_kddcup99 -from sklearn.datasets import fetch_olivetti_faces -from sklearn.datasets import fetch_rcv1 +from sklearn.datasets import ( + fetch_20newsgroups, + fetch_20newsgroups_vectorized, + fetch_california_housing, + fetch_covtype, + fetch_kddcup99, + 
     fetch_olivetti_faces,
+    fetch_rcv1,
+)
 from sklearn.tests import random_seed
-
+from sklearn.utils import _IS_32BIT
+from sklearn.utils.fixes import parse_version, sp_version
 
 if parse_version(pytest.__version__) < parse_version(PYTEST_MIN_VERSION):
     raise ImportError(
diff --git a/sklearn/covariance/__init__.py b/sklearn/covariance/__init__.py
index 011fde3647145..8fcf8c68444e5 100644
--- a/sklearn/covariance/__init__.py
+++ b/sklearn/covariance/__init__.py
@@ -6,24 +6,23 @@
 Models.
 """
 
+from ._elliptic_envelope import EllipticEnvelope
 from ._empirical_covariance import (
-    empirical_covariance,
     EmpiricalCovariance,
+    empirical_covariance,
     log_likelihood,
 )
+from ._graph_lasso import GraphicalLasso, GraphicalLassoCV, graphical_lasso
+from ._robust_covariance import MinCovDet, fast_mcd
 from ._shrunk_covariance import (
-    shrunk_covariance,
+    OAS,
+    LedoitWolf,
     ShrunkCovariance,
     ledoit_wolf,
     ledoit_wolf_shrinkage,
-    LedoitWolf,
     oas,
-    OAS,
+    shrunk_covariance,
 )
-from ._robust_covariance import fast_mcd, MinCovDet
-from ._graph_lasso import graphical_lasso, GraphicalLasso, GraphicalLassoCV
-from ._elliptic_envelope import EllipticEnvelope
-
 
 __all__ = [
     "EllipticEnvelope",
diff --git a/sklearn/covariance/_elliptic_envelope.py b/sklearn/covariance/_elliptic_envelope.py
index c99f200592580..fe109dddd5303 100644
--- a/sklearn/covariance/_elliptic_envelope.py
+++ b/sklearn/covariance/_elliptic_envelope.py
@@ -2,14 +2,15 @@
 #
 # License: BSD 3 clause
 
-import numpy as np
 from numbers import Real
-from . import MinCovDet
+
+import numpy as np
+
+from ..base import OutlierMixin, _fit_context
+from ..metrics import accuracy_score
 from ..utils._param_validation import Interval
 from ..utils.validation import check_is_fitted
-from ..metrics import accuracy_score
-from ..base import OutlierMixin
-from ..base import _fit_context
+from ._robust_covariance import MinCovDet
 
 
 class EllipticEnvelope(OutlierMixin, MinCovDet):
diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py
index 8083bfd2e1aa1..e39c18017cdf0 100644
--- a/sklearn/covariance/_empirical_covariance.py
+++ b/sklearn/covariance/_empirical_covariance.py
@@ -11,16 +11,16 @@
 # avoid division truncation
 import warnings
+
 import numpy as np
 from scipy import linalg
 
 from .. import config_context
-from ..base import BaseEstimator
-from ..base import _fit_context
+from ..base import BaseEstimator, _fit_context
+from ..metrics.pairwise import pairwise_distances
 from ..utils import check_array
 from ..utils._param_validation import validate_params
 from ..utils.extmath import fast_logdet
-from ..metrics.pairwise import pairwise_distances
 
 
 def log_likelihood(emp_cov, precision):
diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py
index 8575cc4f75801..2b3248eb0300e 100644
--- a/sklearn/covariance/_graph_lasso.py
+++ b/sklearn/covariance/_graph_lasso.py
@@ -5,32 +5,30 @@
 # Author: Gael Varoquaux
 # License: BSD 3 clause
 # Copyright: INRIA
-import warnings
 import operator
 import sys
 import time
-
+import warnings
 from numbers import Integral, Real
+
 import numpy as np
 from scipy import linalg
 
-from . import empirical_covariance, EmpiricalCovariance, log_likelihood
-
 from ..base import _fit_context
 from ..exceptions import ConvergenceWarning
-from ..utils.validation import (
-    _is_arraylike_not_scalar,
-    check_random_state,
-    check_scalar,
-)
-from ..utils.parallel import delayed, Parallel
-from ..utils._param_validation import Interval, StrOptions
-from ..utils._param_validation import validate_params
 
 # mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast'
 from ..linear_model import _cd_fast as cd_fast  # type: ignore
 from ..linear_model import lars_path_gram
 from ..model_selection import check_cv, cross_val_score
+from ..utils._param_validation import Interval, StrOptions, validate_params
+from ..utils.parallel import Parallel, delayed
+from ..utils.validation import (
+    _is_arraylike_not_scalar,
+    check_random_state,
+    check_scalar,
+)
+from . import EmpiricalCovariance, empirical_covariance, log_likelihood
 
 
 # Helper functions to compute the objective and dual objective functions
diff --git a/sklearn/covariance/_robust_covariance.py b/sklearn/covariance/_robust_covariance.py
index c723bba7a097b..a6b32e50a6c1f 100644
--- a/sklearn/covariance/_robust_covariance.py
+++ b/sklearn/covariance/_robust_covariance.py
@@ -10,15 +10,16 @@
 import warnings
 from numbers import Integral, Real
+
 import numpy as np
 from scipy import linalg
 from scipy.stats import chi2
 
-from . import empirical_covariance, EmpiricalCovariance
 from ..base import _fit_context
-from ..utils.extmath import fast_logdet
-from ..utils import check_random_state, check_array
+from ..utils import check_array, check_random_state
 from ..utils._param_validation import Interval
+from ..utils.extmath import fast_logdet
+from ._empirical_covariance import EmpiricalCovariance, empirical_covariance
 
 
 # Minimum Covariance Determinant
diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py
index 21d2e034b45d7..06d65c46faef7 100644
--- a/sklearn/covariance/_shrunk_covariance.py
+++ b/sklearn/covariance/_shrunk_covariance.py
@@ -14,13 +14,14 @@
 # avoid division truncation
 import warnings
-from numbers import Real, Integral
+from numbers import Integral, Real
+
 import numpy as np
 
-from . import empirical_covariance, EmpiricalCovariance
 from ..base import _fit_context
 from ..utils import check_array
 from ..utils._param_validation import Interval, validate_params
+from . import EmpiricalCovariance, empirical_covariance
 
 
 def _ledoit_wolf(X, *, assume_centered, block_size):
diff --git a/sklearn/covariance/tests/test_covariance.py b/sklearn/covariance/tests/test_covariance.py
index bbd3a4757a835..0866c209a10c3 100644
--- a/sklearn/covariance/tests/test_covariance.py
+++ b/sklearn/covariance/tests/test_covariance.py
@@ -7,24 +7,25 @@
 import numpy as np
 import pytest
 
-from sklearn.utils._testing import assert_allclose
-from sklearn.utils._testing import assert_almost_equal
-from sklearn.utils._testing import assert_array_almost_equal
-from sklearn.utils._testing import assert_array_equal
-
 from sklearn import datasets
 from sklearn.covariance import (
-    empirical_covariance,
+    OAS,
     EmpiricalCovariance,
-    ShrunkCovariance,
-    shrunk_covariance,
     LedoitWolf,
+    ShrunkCovariance,
+    empirical_covariance,
     ledoit_wolf,
     ledoit_wolf_shrinkage,
-    OAS,
     oas,
+    shrunk_covariance,
 )
 from sklearn.covariance._shrunk_covariance import _ledoit_wolf
+from sklearn.utils._testing import (
+    assert_allclose,
+    assert_almost_equal,
+    assert_array_almost_equal,
+    assert_array_equal,
+)
 
 from .._shrunk_covariance import _oas
diff --git a/sklearn/covariance/tests/test_elliptic_envelope.py b/sklearn/covariance/tests/test_elliptic_envelope.py
index 122d4c8bfb4cc..ca85717fb3782 100644
--- a/sklearn/covariance/tests/test_elliptic_envelope.py
+++ b/sklearn/covariance/tests/test_elliptic_envelope.py
@@ -6,10 +6,12 @@
 import pytest
 
 from sklearn.covariance import EllipticEnvelope
-from sklearn.utils._testing import assert_almost_equal
-from sklearn.utils._testing import assert_array_almost_equal
-from sklearn.utils._testing import assert_array_equal
 from sklearn.exceptions import NotFittedError
+from sklearn.utils._testing import (
+    assert_almost_equal,
+    assert_array_almost_equal,
+    assert_array_equal,
+)
 
 
 def test_elliptic_envelope(global_random_seed):
diff --git a/sklearn/covariance/tests/test_graphical_lasso.py b/sklearn/covariance/tests/test_graphical_lasso.py
index 44a60f3e05103..317bf2aa85124 100644
--- a/sklearn/covariance/tests/test_graphical_lasso.py
+++ b/sklearn/covariance/tests/test_graphical_lasso.py
@@ -1,26 +1,27 @@
 """ Test the graphical_lasso module.
""" import sys -import pytest +from io import StringIO import numpy as np -from scipy import linalg - +import pytest from numpy.testing import assert_allclose -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_array_less -from sklearn.utils._testing import _convert_container +from scipy import linalg +from sklearn import datasets from sklearn.covariance import ( - graphical_lasso, GraphicalLasso, GraphicalLassoCV, empirical_covariance, + graphical_lasso, ) from sklearn.datasets import make_sparse_spd_matrix -from io import StringIO from sklearn.utils import check_random_state -from sklearn import datasets +from sklearn.utils._testing import ( + _convert_container, + assert_array_almost_equal, + assert_array_less, +) def test_graphical_lasso(random_state=0): diff --git a/sklearn/covariance/tests/test_robust_covariance.py b/sklearn/covariance/tests/test_robust_covariance.py index 213f3d7e8f04b..44dcdbbbf8249 100644 --- a/sklearn/covariance/tests/test_robust_covariance.py +++ b/sklearn/covariance/tests/test_robust_covariance.py @@ -9,11 +9,9 @@ import numpy as np import pytest -from sklearn.utils._testing import assert_array_almost_equal - from sklearn import datasets -from sklearn.covariance import empirical_covariance, MinCovDet -from sklearn.covariance import fast_mcd +from sklearn.covariance import MinCovDet, empirical_covariance, fast_mcd +from sklearn.utils._testing import assert_array_almost_equal X = datasets.load_iris().data X_1d = X[:, 0] diff --git a/sklearn/cross_decomposition/__init__.py b/sklearn/cross_decomposition/__init__.py index ec2f5fb3049af..47b78783caf9c 100644 --- a/sklearn/cross_decomposition/__init__.py +++ b/sklearn/cross_decomposition/__init__.py @@ -1,3 +1,3 @@ -from ._pls import PLSCanonical, PLSRegression, PLSSVD, CCA +from ._pls import CCA, PLSSVD, PLSCanonical, PLSRegression __all__ = ["PLSCanonical", "PLSRegression", "PLSSVD", "CCA"] diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index da395d8f060fb..f1fc90af11d82 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -5,25 +5,27 @@ # Author: Edouard Duchesnay # License: BSD 3 clause -from numbers import Integral, Real - import warnings from abc import ABCMeta, abstractmethod +from numbers import Integral, Real import numpy as np from scipy.linalg import svd -from ..base import BaseEstimator, RegressorMixin, TransformerMixin -from ..base import MultiOutputMixin -from ..base import ClassNamePrefixFeaturesOutMixin -from ..base import _fit_context +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + MultiOutputMixin, + RegressorMixin, + TransformerMixin, + _fit_context, +) +from ..exceptions import ConvergenceWarning from ..utils import check_array, check_consistent_length -from ..utils.fixes import sp_version -from ..utils.fixes import parse_version -from ..utils.extmath import svd_flip -from ..utils.validation import check_is_fitted, FLOAT_DTYPES from ..utils._param_validation import Interval, StrOptions -from ..exceptions import ConvergenceWarning +from ..utils.extmath import svd_flip +from ..utils.fixes import parse_version, sp_version +from ..utils.validation import FLOAT_DTYPES, check_is_fitted __all__ = ["PLSCanonical", "PLSRegression", "PLSSVD"] diff --git a/sklearn/cross_decomposition/tests/test_pls.py b/sklearn/cross_decomposition/tests/test_pls.py index 8f4840c9b9f21..fcdd927efb389 100644 --- a/sklearn/cross_decomposition/tests/test_pls.py +++ 
b/sklearn/cross_decomposition/tests/test_pls.py @@ -1,21 +1,20 @@ -import pytest import warnings + import numpy as np -from numpy.testing import assert_array_almost_equal, assert_array_equal, assert_allclose +import pytest +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal -from sklearn.datasets import load_linnerud +from sklearn.cross_decomposition import CCA, PLSSVD, PLSCanonical, PLSRegression from sklearn.cross_decomposition._pls import ( _center_scale_xy, _get_first_singular_vectors_power_method, _get_first_singular_vectors_svd, _svd_flip_1d, ) -from sklearn.cross_decomposition import CCA -from sklearn.cross_decomposition import PLSSVD, PLSRegression, PLSCanonical -from sklearn.datasets import make_regression +from sklearn.datasets import load_linnerud, make_regression +from sklearn.exceptions import ConvergenceWarning from sklearn.utils import check_random_state from sklearn.utils.extmath import svd_flip -from sklearn.exceptions import ConvergenceWarning def assert_matrix_orthogonal(M): diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py index 465d4159a32c4..7ae7902f3365c 100644 --- a/sklearn/datasets/__init__.py +++ b/sklearn/datasets/__init__.py @@ -5,52 +5,55 @@ """ import textwrap -from ._base import load_breast_cancer -from ._base import load_diabetes -from ._base import load_digits -from ._base import load_files -from ._base import load_iris -from ._base import load_linnerud -from ._base import load_sample_images -from ._base import load_sample_image -from ._base import load_wine -from ._base import get_data_home -from ._base import clear_data_home +from ._base import ( + clear_data_home, + get_data_home, + load_breast_cancer, + load_diabetes, + load_digits, + load_files, + load_iris, + load_linnerud, + load_sample_image, + load_sample_images, + load_wine, +) +from ._california_housing import fetch_california_housing from ._covtype import fetch_covtype from ._kddcup99 import fetch_kddcup99 -from ._lfw import fetch_lfw_pairs -from ._lfw import fetch_lfw_people -from ._twenty_newsgroups import fetch_20newsgroups -from ._twenty_newsgroups import fetch_20newsgroups_vectorized -from ._openml import fetch_openml -from ._samples_generator import make_classification -from ._samples_generator import make_multilabel_classification -from ._samples_generator import make_hastie_10_2 -from ._samples_generator import make_regression -from ._samples_generator import make_blobs -from ._samples_generator import make_moons -from ._samples_generator import make_circles -from ._samples_generator import make_friedman1 -from ._samples_generator import make_friedman2 -from ._samples_generator import make_friedman3 -from ._samples_generator import make_low_rank_matrix -from ._samples_generator import make_sparse_coded_signal -from ._samples_generator import make_sparse_uncorrelated -from ._samples_generator import make_spd_matrix -from ._samples_generator import make_swiss_roll -from ._samples_generator import make_s_curve -from ._samples_generator import make_sparse_spd_matrix -from ._samples_generator import make_gaussian_quantiles -from ._samples_generator import make_biclusters -from ._samples_generator import make_checkerboard -from ._svmlight_format_io import load_svmlight_file -from ._svmlight_format_io import load_svmlight_files -from ._svmlight_format_io import dump_svmlight_file +from ._lfw import fetch_lfw_pairs, fetch_lfw_people from ._olivetti_faces import fetch_olivetti_faces -from ._species_distributions import 
fetch_species_distributions -from ._california_housing import fetch_california_housing +from ._openml import fetch_openml from ._rcv1 import fetch_rcv1 - +from ._samples_generator import ( + make_biclusters, + make_blobs, + make_checkerboard, + make_circles, + make_classification, + make_friedman1, + make_friedman2, + make_friedman3, + make_gaussian_quantiles, + make_hastie_10_2, + make_low_rank_matrix, + make_moons, + make_multilabel_classification, + make_regression, + make_s_curve, + make_sparse_coded_signal, + make_sparse_spd_matrix, + make_sparse_uncorrelated, + make_spd_matrix, + make_swiss_roll, +) +from ._species_distributions import fetch_species_distributions +from ._svmlight_format_io import ( + dump_svmlight_file, + load_svmlight_file, + load_svmlight_files, +) +from ._twenty_newsgroups import fetch_20newsgroups, fetch_20newsgroups_vectorized __all__ = [ "clear_data_home", diff --git a/sklearn/datasets/_arff_parser.py b/sklearn/datasets/_arff_parser.py index bba06fbb74021..d9cc42de71f66 100644 --- a/sklearn/datasets/_arff_parser.py +++ b/sklearn/datasets/_arff_parser.py @@ -8,7 +8,6 @@ import numpy as np import scipy as sp - from ..externals import _arff from ..externals._arff import ArffSparseDataType from ..utils import ( diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index 014a37ae30b09..7dad2f1eb7cd1 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -7,26 +7,23 @@ # 2010 Olivier Grisel # License: BSD 3 clause import csv -import hashlib import gzip +import hashlib +import os import shutil from collections import namedtuple -import os +from numbers import Integral from os import environ, listdir, makedirs from os.path import expanduser, isdir, join, splitext from pathlib import Path -from numbers import Integral - -from ..preprocessing import scale -from ..utils import Bunch -from ..utils import check_random_state -from ..utils import check_pandas_support -from ..utils.fixes import _open_binary, _open_text, _read_text, _contents -from ..utils._param_validation import validate_params, Interval, StrOptions +from urllib.request import urlretrieve import numpy as np -from urllib.request import urlretrieve +from ..preprocessing import scale +from ..utils import Bunch, check_pandas_support, check_random_state +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.fixes import _contents, _open_binary, _open_text, _read_text DATA_MODULE = "sklearn.datasets.data" DESCR_MODULE = "sklearn.datasets.descr" diff --git a/sklearn/datasets/_california_housing.py b/sklearn/datasets/_california_housing.py index 96443c95f9979..0c06544e88317 100644 --- a/sklearn/datasets/_california_housing.py +++ b/sklearn/datasets/_california_housing.py @@ -21,24 +21,24 @@ # Authors: Peter Prettenhofer # License: BSD 3 clause -from os.path import exists -from os import makedirs, remove -import tarfile - -import numpy as np import logging +import tarfile +from os import makedirs, remove +from os.path import exists import joblib +import numpy as np -from . import get_data_home -from ._base import _convert_data_dataframe -from ._base import _fetch_remote -from ._base import _pkl_filepath -from ._base import RemoteFileMetadata -from ._base import load_descr from ..utils import Bunch from ..utils._param_validation import validate_params - +from . 
import get_data_home +from ._base import ( + RemoteFileMetadata, + _convert_data_dataframe, + _fetch_remote, + _pkl_filepath, + load_descr, +) # The original data can be found at: # https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 83bd8ad229924..236e69727b7ef 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -14,24 +14,25 @@ # Peter Prettenhofer # License: BSD 3 clause -from gzip import GzipFile import logging -from os.path import exists, join import os +from gzip import GzipFile +from os.path import exists, join from tempfile import TemporaryDirectory -import numpy as np import joblib +import numpy as np -from . import get_data_home -from ._base import _convert_data_dataframe -from ._base import _fetch_remote -from ._base import RemoteFileMetadata -from ._base import load_descr -from ..utils import Bunch -from ._base import _pkl_filepath -from ..utils import check_random_state +from ..utils import Bunch, check_random_state from ..utils._param_validation import validate_params +from . import get_data_home +from ._base import ( + RemoteFileMetadata, + _convert_data_dataframe, + _fetch_remote, + _pkl_filepath, + load_descr, +) # The original data can be found in: # https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz diff --git a/sklearn/datasets/_kddcup99.py b/sklearn/datasets/_kddcup99.py index 749e15cd53522..30a535c1f4cd4 100644 --- a/sklearn/datasets/_kddcup99.py +++ b/sklearn/datasets/_kddcup99.py @@ -9,24 +9,24 @@ """ import errno -from gzip import GzipFile import logging import os +from gzip import GzipFile from os.path import exists, join -import numpy as np import joblib +import numpy as np -from ._base import _fetch_remote -from ._base import _convert_data_dataframe -from . import get_data_home -from ._base import RemoteFileMetadata -from ._base import load_descr -from ..utils._param_validation import StrOptions, validate_params -from ..utils import Bunch -from ..utils import check_random_state +from ..utils import Bunch, check_random_state from ..utils import shuffle as shuffle_method - +from ..utils._param_validation import StrOptions, validate_params +from . 
import get_data_home +from ._base import ( + RemoteFileMetadata, + _convert_data_dataframe, + _fetch_remote, + load_descr, +) # The original data can be found at: # https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data.gz diff --git a/sklearn/datasets/_lfw.py b/sklearn/datasets/_lfw.py index 7f6cf8f235d3f..e758eef5dc427 100644 --- a/sklearn/datasets/_lfw.py +++ b/sklearn/datasets/_lfw.py @@ -8,22 +8,22 @@ # Copyright (c) 2011 Olivier Grisel # License: BSD 3 clause -from os import listdir, makedirs, remove -from os.path import join, exists, isdir -from ..utils._param_validation import validate_params, Interval, Hidden, StrOptions -from numbers import Integral, Real import logging +from numbers import Integral, Real +from os import listdir, makedirs, remove +from os.path import exists, isdir, join import numpy as np from joblib import Memory +from ..utils import Bunch +from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params from ._base import ( - get_data_home, - _fetch_remote, RemoteFileMetadata, + _fetch_remote, + get_data_home, load_descr, ) -from ..utils import Bunch logger = logging.getLogger(__name__) diff --git a/sklearn/datasets/_olivetti_faces.py b/sklearn/datasets/_olivetti_faces.py index 55f4b856c6cf0..5ef5cb6286c9f 100644 --- a/sklearn/datasets/_olivetti_faces.py +++ b/sklearn/datasets/_olivetti_faces.py @@ -13,20 +13,17 @@ # Copyright (c) 2011 David Warde-Farley # License: BSD 3 clause -from os.path import exists from os import makedirs, remove +from os.path import exists +import joblib import numpy as np from scipy.io import loadmat -import joblib -from . import get_data_home -from ._base import _fetch_remote -from ._base import RemoteFileMetadata -from ._base import _pkl_filepath -from ._base import load_descr -from ..utils import check_random_state, Bunch +from ..utils import Bunch, check_random_state from ..utils._param_validation import validate_params +from . import get_data_home +from ._base import RemoteFileMetadata, _fetch_remote, _pkl_filepath, load_descr # The original data can be found at: # https://cs.nyu.edu/~roweis/data/olivettifaces.mat diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 21d8eb99858bb..3f525c3433a90 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -7,18 +7,20 @@ from contextlib import closing from functools import wraps from os.path import join -from typing import Callable, Optional, Dict, Tuple, List, Any, Union from tempfile import TemporaryDirectory +from typing import Any, Callable, Dict, List, Optional, Tuple, Union from urllib.error import HTTPError, URLError -from urllib.request import urlopen, Request +from urllib.request import Request, urlopen from warnings import warn import numpy as np +from ..utils import ( + Bunch, + check_pandas_support, # noqa +) from . 
import get_data_home from ._arff_parser import load_arff_from_gzip_file -from ..utils import Bunch -from ..utils import check_pandas_support # noqa __all__ = ["fetch_openml"] diff --git a/sklearn/datasets/_rcv1.py b/sklearn/datasets/_rcv1.py index ae391edbad113..a0780edaba9da 100644 --- a/sklearn/datasets/_rcv1.py +++ b/sklearn/datasets/_rcv1.py @@ -9,25 +9,20 @@ # License: BSD 3 clause import logging - -from os import remove, makedirs -from os.path import exists, join from gzip import GzipFile +from os import makedirs, remove +from os.path import exists, join +import joblib import numpy as np import scipy.sparse as sp -import joblib +from ..utils import Bunch +from ..utils import shuffle as shuffle_ +from ..utils._param_validation import StrOptions, validate_params from . import get_data_home -from ._base import _pkl_filepath -from ._base import _fetch_remote -from ._base import RemoteFileMetadata -from ._base import load_descr +from ._base import RemoteFileMetadata, _fetch_remote, _pkl_filepath, load_descr from ._svmlight_format_io import load_svmlight_files -from ..utils import shuffle as shuffle_ -from ..utils import Bunch -from ..utils._param_validation import validate_params, StrOptions - # The original vectorized data can be found at: # http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_test_pt0.dat.gz diff --git a/sklearn/datasets/_samples_generator.py b/sklearn/datasets/_samples_generator.py index cb3b36d944eb2..9a34c995c0546 100644 --- a/sklearn/datasets/_samples_generator.py +++ b/sklearn/datasets/_samples_generator.py @@ -6,20 +6,20 @@ # G. Louppe, J. Nothman # License: BSD 3 clause -from numbers import Integral, Real -import numbers import array +import numbers import warnings from collections.abc import Iterable +from numbers import Integral, Real import numpy as np -from scipy import linalg import scipy.sparse as sp +from scipy import linalg from ..preprocessing import MultiLabelBinarizer from ..utils import check_array, check_random_state -from ..utils._param_validation import Interval, validate_params, Hidden, StrOptions from ..utils import shuffle as util_shuffle +from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params from ..utils.random import sample_without_replacement diff --git a/sklearn/datasets/_species_distributions.py b/sklearn/datasets/_species_distributions.py index 3387217349e20..8f5a0881bdf6b 100644 --- a/sklearn/datasets/_species_distributions.py +++ b/sklearn/datasets/_species_distributions.py @@ -37,21 +37,18 @@ # # License: BSD 3 clause +import logging from io import BytesIO from os import makedirs, remove from os.path import exists -import logging -import numpy as np - import joblib +import numpy as np -from . import get_data_home -from ._base import _fetch_remote -from ._base import RemoteFileMetadata from ..utils import Bunch -from ._base import _pkl_filepath from ..utils._param_validation import validate_params +from . 
import get_data_home +from ._base import RemoteFileMetadata, _fetch_remote, _pkl_filepath # The original data can be found at: # https://biodiversityinformatics.amnh.org/open_source/maxent/samples.zip diff --git a/sklearn/datasets/_svmlight_format_io.py b/sklearn/datasets/_svmlight_format_io.py index e04d90e15dceb..a48eab7938336 100644 --- a/sklearn/datasets/_svmlight_format_io.py +++ b/sklearn/datasets/_svmlight_format_io.py @@ -15,22 +15,21 @@ # Olivier Grisel # License: BSD 3 clause -from contextlib import closing import os.path +from contextlib import closing +from numbers import Integral import numpy as np import scipy.sparse as sp -from numbers import Integral from .. import __version__ - -from ..utils import check_array, IS_PYPY -from ..utils._param_validation import validate_params, HasMethods, Interval, StrOptions +from ..utils import IS_PYPY, check_array +from ..utils._param_validation import HasMethods, Interval, StrOptions, validate_params if not IS_PYPY: from ._svmlight_format_fast import ( - _load_svmlight_file, _dump_svmlight_file, + _load_svmlight_file, ) else: diff --git a/sklearn/datasets/_twenty_newsgroups.py b/sklearn/datasets/_twenty_newsgroups.py index 512b7974a497d..2e2dd6aa73234 100644 --- a/sklearn/datasets/_twenty_newsgroups.py +++ b/sklearn/datasets/_twenty_newsgroups.py @@ -24,29 +24,30 @@ # Copyright (c) 2011 Olivier Grisel # License: BSD 3 clause -import os +import codecs import logging -import tarfile +import os import pickle -import shutil import re -import codecs +import shutil +import tarfile +import joblib import numpy as np import scipy.sparse as sp -import joblib -from . import get_data_home -from . import load_files -from ._base import _convert_data_dataframe -from ._base import _pkl_filepath -from ._base import _fetch_remote -from ._base import RemoteFileMetadata -from ._base import load_descr -from ..feature_extraction.text import CountVectorizer from .. import preprocessing -from ..utils import check_random_state, Bunch +from ..feature_extraction.text import CountVectorizer +from ..utils import Bunch, check_random_state from ..utils._param_validation import StrOptions, validate_params +from . 
import get_data_home, load_files +from ._base import ( + RemoteFileMetadata, + _convert_data_dataframe, + _fetch_remote, + _pkl_filepath, + load_descr, +) logger = logging.getLogger(__name__) diff --git a/sklearn/datasets/tests/conftest.py b/sklearn/datasets/tests/conftest.py index ef1280f6218b1..c8ab1cd04ee6e 100644 --- a/sklearn/datasets/tests/conftest.py +++ b/sklearn/datasets/tests/conftest.py @@ -1,6 +1,7 @@ """ Network tests are only run, if data is already locally available, or if download is specifically requested by environment variable.""" import builtins + import pytest diff --git a/sklearn/datasets/tests/test_20news.py b/sklearn/datasets/tests/test_20news.py index e30348c894559..af308e49c5ebf 100644 --- a/sklearn/datasets/tests/test_20news.py +++ b/sklearn/datasets/tests/test_20news.py @@ -4,16 +4,17 @@ from functools import partial from unittest.mock import patch -import pytest - import numpy as np +import pytest import scipy.sparse as sp -from sklearn.datasets.tests.test_common import check_as_frame -from sklearn.datasets.tests.test_common import check_pandas_dependency_message -from sklearn.datasets.tests.test_common import check_return_X_y -from sklearn.utils._testing import assert_allclose_dense_sparse +from sklearn.datasets.tests.test_common import ( + check_as_frame, + check_pandas_dependency_message, + check_return_X_y, +) from sklearn.preprocessing import normalize +from sklearn.utils._testing import assert_allclose_dense_sparse def test_20news(fetch_20newsgroups_fxt): diff --git a/sklearn/datasets/tests/test_arff_parser.py b/sklearn/datasets/tests/test_arff_parser.py index 8465289d187ee..b675439cd2e9d 100644 --- a/sklearn/datasets/tests/test_arff_parser.py +++ b/sklearn/datasets/tests/test_arff_parser.py @@ -1,5 +1,5 @@ -from io import BytesIO import textwrap +from io import BytesIO import pytest diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index 23dc78570fc9d..f31f20636c0c1 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -2,31 +2,33 @@ import shutil import tempfile import warnings -from pickle import loads -from pickle import dumps from functools import partial +from pickle import dumps, loads -import pytest import numpy as np -from sklearn.datasets import get_data_home -from sklearn.datasets import clear_data_home -from sklearn.datasets import load_files -from sklearn.datasets import load_sample_images -from sklearn.datasets import load_sample_image -from sklearn.datasets import load_digits -from sklearn.datasets import load_diabetes -from sklearn.datasets import load_linnerud -from sklearn.datasets import load_iris -from sklearn.datasets import load_breast_cancer -from sklearn.datasets import load_wine +import pytest + +from sklearn.datasets import ( + clear_data_home, + get_data_home, + load_breast_cancer, + load_diabetes, + load_digits, + load_files, + load_iris, + load_linnerud, + load_sample_image, + load_sample_images, + load_wine, +) from sklearn.datasets._base import ( load_csv_data, load_gzip_compressed_csv_data, ) +from sklearn.datasets.tests.test_common import check_as_frame from sklearn.preprocessing import scale from sklearn.utils import Bunch from sklearn.utils.fixes import _is_resource -from sklearn.datasets.tests.test_common import check_as_frame def _remove_dir(path): diff --git a/sklearn/datasets/tests/test_california_housing.py b/sklearn/datasets/tests/test_california_housing.py index 495becccd820f..ef6fc95db80bf 100644 --- 
a/sklearn/datasets/tests/test_california_housing.py +++ b/sklearn/datasets/tests/test_california_housing.py @@ -1,10 +1,11 @@ """Test the california_housing loader, if the data is available, or if specifically requested via environment variable (e.g. for CI jobs).""" +from functools import partial + import pytest from sklearn.datasets.tests.test_common import check_return_X_y -from functools import partial def test_fetch(fetch_california_housing_fxt): diff --git a/sklearn/datasets/tests/test_common.py b/sklearn/datasets/tests/test_common.py index 5f21bdc66b4dc..8048a31041ddc 100644 --- a/sklearn/datasets/tests/test_common.py +++ b/sklearn/datasets/tests/test_common.py @@ -2,8 +2,8 @@ import inspect import os -import pytest import numpy as np +import pytest import sklearn.datasets diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py index 2cc2fed81bad6..e44fdaae69ec3 100644 --- a/sklearn/datasets/tests/test_covtype.py +++ b/sklearn/datasets/tests/test_covtype.py @@ -2,7 +2,9 @@ or if specifically requested via environment variable (e.g. for CI jobs).""" from functools import partial + import pytest + from sklearn.datasets.tests.test_common import check_return_X_y diff --git a/sklearn/datasets/tests/test_kddcup99.py b/sklearn/datasets/tests/test_kddcup99.py index 8eb1d6ec71eb3..5f6e9c83a30b8 100644 --- a/sklearn/datasets/tests/test_kddcup99.py +++ b/sklearn/datasets/tests/test_kddcup99.py @@ -7,11 +7,14 @@ """ from functools import partial + import pytest -from sklearn.datasets.tests.test_common import check_as_frame -from sklearn.datasets.tests.test_common import check_pandas_dependency_message -from sklearn.datasets.tests.test_common import check_return_X_y +from sklearn.datasets.tests.test_common import ( + check_as_frame, + check_pandas_dependency_message, + check_return_X_y, +) @pytest.mark.parametrize("as_frame", [True, False]) diff --git a/sklearn/datasets/tests/test_lfw.py b/sklearn/datasets/tests/test_lfw.py index 36f33d8a10289..92edb99ce3b0b 100644 --- a/sklearn/datasets/tests/test_lfw.py +++ b/sklearn/datasets/tests/test_lfw.py @@ -8,19 +8,18 @@ joblib, successive runs will be fast (less than 200ms). 
""" -import random import os +import random import shutil import tempfile +from functools import partial + import numpy as np import pytest -from functools import partial -from sklearn.datasets import fetch_lfw_pairs -from sklearn.datasets import fetch_lfw_people -from sklearn.utils._testing import assert_array_equal +from sklearn.datasets import fetch_lfw_pairs, fetch_lfw_people from sklearn.datasets.tests.test_common import check_return_X_y - +from sklearn.utils._testing import assert_array_equal SCIKIT_LEARN_DATA = None SCIKIT_LEARN_EMPTY_DATA = None diff --git a/sklearn/datasets/tests/test_olivetti_faces.py b/sklearn/datasets/tests/test_olivetti_faces.py index 18fceb0ed8b0e..e5d6c853aa454 100644 --- a/sklearn/datasets/tests/test_olivetti_faces.py +++ b/sklearn/datasets/tests/test_olivetti_faces.py @@ -4,9 +4,8 @@ import numpy as np -from sklearn.utils import Bunch from sklearn.datasets.tests.test_common import check_return_X_y - +from sklearn.utils import Bunch from sklearn.utils._testing import assert_array_equal diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index c13b82dd769d3..8c78b753f336f 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -8,28 +8,26 @@ from urllib.error import HTTPError import numpy as np -import scipy.sparse import pytest +import scipy.sparse import sklearn from sklearn import config_context +from sklearn.datasets import fetch_openml as fetch_openml_orig +from sklearn.datasets._openml import ( + _OPENML_PREFIX, + _get_local_path, + _open_openml_url, + _retry_with_clean_cache, +) from sklearn.utils import Bunch, check_pandas_support -from sklearn.utils.fixes import _open_binary from sklearn.utils._testing import ( SkipTest, assert_allclose, assert_array_equal, fails_if_pypy, ) - -from sklearn.datasets import fetch_openml as fetch_openml_orig -from sklearn.datasets._openml import ( - _OPENML_PREFIX, - _open_openml_url, - _get_local_path, - _retry_with_clean_cache, -) - +from sklearn.utils.fixes import _open_binary OPENML_TEST_DATA_MODULE = "sklearn.datasets.tests.data.openml" # if True, urlopen will be monkey patched to only use local files diff --git a/sklearn/datasets/tests/test_rcv1.py b/sklearn/datasets/tests/test_rcv1.py index 11d0335f4fb8c..fbb9d67015a30 100644 --- a/sklearn/datasets/tests/test_rcv1.py +++ b/sklearn/datasets/tests/test_rcv1.py @@ -2,12 +2,13 @@ or if specifically requested via environment variable (e.g. 
for CI jobs).""" -import scipy.sparse as sp -import numpy as np from functools import partial + +import numpy as np +import scipy.sparse as sp + from sklearn.datasets.tests.test_common import check_return_X_y -from sklearn.utils._testing import assert_almost_equal -from sklearn.utils._testing import assert_array_equal +from sklearn.utils._testing import assert_almost_equal, assert_array_equal def test_fetch_rcv1(fetch_rcv1_fxt, global_random_seed): diff --git a/sklearn/datasets/tests/test_samples_generator.py b/sklearn/datasets/tests/test_samples_generator.py index cd23fc5016672..ad6569f0863bf 100644 --- a/sklearn/datasets/tests/test_samples_generator.py +++ b/sklearn/datasets/tests/test_samples_generator.py @@ -6,31 +6,33 @@ import pytest import scipy.sparse as sp -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_almost_equal -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import ignore_warnings - -from sklearn.datasets import make_classification -from sklearn.datasets import make_multilabel_classification -from sklearn.datasets import make_hastie_10_2 -from sklearn.datasets import make_regression -from sklearn.datasets import make_blobs -from sklearn.datasets import make_friedman1 -from sklearn.datasets import make_friedman2 -from sklearn.datasets import make_friedman3 -from sklearn.datasets import make_low_rank_matrix -from sklearn.datasets import make_moons -from sklearn.datasets import make_circles -from sklearn.datasets import make_sparse_coded_signal -from sklearn.datasets import make_sparse_uncorrelated -from sklearn.datasets import make_spd_matrix -from sklearn.datasets import make_swiss_roll -from sklearn.datasets import make_s_curve -from sklearn.datasets import make_biclusters -from sklearn.datasets import make_checkerboard - +from sklearn.datasets import ( + make_biclusters, + make_blobs, + make_checkerboard, + make_circles, + make_classification, + make_friedman1, + make_friedman2, + make_friedman3, + make_hastie_10_2, + make_low_rank_matrix, + make_moons, + make_multilabel_classification, + make_regression, + make_s_curve, + make_sparse_coded_signal, + make_sparse_uncorrelated, + make_spd_matrix, + make_swiss_roll, +) +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) from sklearn.utils.validation import assert_all_finite diff --git a/sklearn/datasets/tests/test_svmlight_format.py b/sklearn/datasets/tests/test_svmlight_format.py index 0b76cce3c5a4d..213e9095a73da 100644 --- a/sklearn/datasets/tests/test_svmlight_format.py +++ b/sklearn/datasets/tests/test_svmlight_format.py @@ -1,22 +1,23 @@ -from bz2 import BZ2File import gzip -from io import BytesIO -import numpy as np -import scipy.sparse as sp import os import shutil +from bz2 import BZ2File +from io import BytesIO from tempfile import NamedTemporaryFile +import numpy as np import pytest - -from sklearn.utils.fixes import _open_binary, _path -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal, assert_allclose -from sklearn.utils._testing import fails_if_pypy +import scipy.sparse as sp import sklearn -from sklearn.datasets import load_svmlight_file, load_svmlight_files, dump_svmlight_file - +from sklearn.datasets import dump_svmlight_file, load_svmlight_file, load_svmlight_files +from sklearn.utils._testing import ( + 
assert_allclose, + assert_array_almost_equal, + assert_array_equal, + fails_if_pypy, +) +from sklearn.utils.fixes import _open_binary, _path TEST_DATA_MODULE = "sklearn.datasets.tests.data" datafile = "svmlight_classification.txt" diff --git a/sklearn/decomposition/__init__.py b/sklearn/decomposition/__init__.py index c5f323d3c5d72..1f9cfe07dc0e8 100644 --- a/sklearn/decomposition/__init__.py +++ b/sklearn/decomposition/__init__.py @@ -5,29 +5,28 @@ """ -from ._nmf import ( - NMF, - MiniBatchNMF, - non_negative_factorization, -) -from ._pca import PCA -from ._incremental_pca import IncrementalPCA -from ._kernel_pca import KernelPCA -from ._sparse_pca import SparsePCA, MiniBatchSparsePCA -from ._truncated_svd import TruncatedSVD -from ._fastica import FastICA, fastica +from ..utils.extmath import randomized_svd from ._dict_learning import ( - dict_learning, - dict_learning_online, - sparse_encode, DictionaryLearning, MiniBatchDictionaryLearning, SparseCoder, + dict_learning, + dict_learning_online, + sparse_encode, ) from ._factor_analysis import FactorAnalysis -from ..utils.extmath import randomized_svd +from ._fastica import FastICA, fastica +from ._incremental_pca import IncrementalPCA +from ._kernel_pca import KernelPCA from ._lda import LatentDirichletAllocation - +from ._nmf import ( + NMF, + MiniBatchNMF, + non_negative_factorization, +) +from ._pca import PCA +from ._sparse_pca import MiniBatchSparsePCA, SparsePCA +from ._truncated_svd import TruncatedSVD __all__ = [ "DictionaryLearning", diff --git a/sklearn/decomposition/_base.py b/sklearn/decomposition/_base.py index 20bf7af4f284a..9634395a335ba 100644 --- a/sklearn/decomposition/_base.py +++ b/sklearn/decomposition/_base.py @@ -8,12 +8,13 @@ # # License: BSD 3 clause +from abc import ABCMeta, abstractmethod + import numpy as np from scipy import linalg -from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import BaseEstimator, ClassNamePrefixFeaturesOutMixin, TransformerMixin from ..utils.validation import check_is_fitted -from abc import ABCMeta, abstractmethod class _BasePCA( diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 54b3590f5b62e..b6972235dca7d 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -3,27 +3,29 @@ # Author: Vlad Niculae, Gael Varoquaux, Alexandre Gramfort # License: BSD 3 clause -import time -import sys import itertools -from numbers import Integral, Real +import sys +import time import warnings - from math import ceil +from numbers import Integral, Real import numpy as np -from scipy import linalg from joblib import effective_n_jobs +from scipy import linalg -from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin -from ..base import _fit_context -from ..utils import check_array, check_random_state, gen_even_slices, gen_batches -from ..utils._param_validation import Hidden, Interval, StrOptions -from ..utils._param_validation import validate_params +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..linear_model import Lars, Lasso, LassoLars, orthogonal_mp_gram +from ..utils import check_array, check_random_state, gen_batches, gen_even_slices +from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params from ..utils.extmath import randomized_svd, row_norms, svd_flip +from ..utils.parallel import Parallel, delayed from ..utils.validation import 
check_is_fitted -from ..utils.parallel import delayed, Parallel -from ..linear_model import Lasso, orthogonal_mp_gram, LassoLars, Lars def _check_positive_coding(method, positive): diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index 8c3d590b2c814..af3498d534483 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -20,19 +20,23 @@ # License: BSD3 import warnings -from math import sqrt, log +from math import log, sqrt from numbers import Integral, Real + import numpy as np from scipy import linalg - -from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin -from ..base import _fit_context +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..exceptions import ConvergenceWarning from ..utils import check_random_state from ..utils._param_validation import Interval, StrOptions from ..utils.extmath import fast_logdet, randomized_svd, squared_norm from ..utils.validation import check_is_fitted -from ..exceptions import ConvergenceWarning class FactorAnalysis(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 6dcf62c0ace3b..da7f6393c2b7f 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -15,12 +15,16 @@ import numpy as np from scipy import linalg -from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin -from ..base import _fit_context +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) from ..exceptions import ConvergenceWarning -from ..utils import check_array, as_float_array, check_random_state +from ..utils import as_float_array, check_array, check_random_state +from ..utils._param_validation import Interval, Options, StrOptions, validate_params from ..utils.validation import check_is_fitted -from ..utils._param_validation import Interval, StrOptions, Options, validate_params __all__ = ["fastica", "FastICA"] diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index 5ae5d58b06ca4..f05e2dacc66b2 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -5,14 +5,15 @@ # License: BSD 3 clause from numbers import Integral + import numpy as np from scipy import linalg, sparse -from ._base import _BasePCA from ..base import _fit_context from ..utils import gen_batches from ..utils._param_validation import Interval -from ..utils.extmath import svd_flip, _incremental_mean_and_var +from ..utils.extmath import _incremental_mean_and_var, svd_flip +from ._base import _BasePCA class IncrementalPCA(_BasePCA): diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 61d502a006c5e..ccf79e896f210 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -4,24 +4,29 @@ # Sylvain Marie # License: BSD 3 clause -import numpy as np from numbers import Integral, Real + +import numpy as np from scipy import linalg -from scipy.sparse.linalg import eigsh from scipy.linalg import eigh +from scipy.sparse.linalg import eigsh +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..exceptions import NotFittedError +from ..metrics.pairwise import pairwise_kernels +from 
..preprocessing import KernelCenterer from ..utils._arpack import _init_arpack_v0 -from ..utils.extmath import svd_flip, _randomized_eigsh +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import _randomized_eigsh, svd_flip from ..utils.validation import ( - check_is_fitted, _check_psd_eigenvalues, + check_is_fitted, ) -from ..utils._param_validation import Interval, StrOptions -from ..exceptions import NotFittedError -from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin -from ..base import _fit_context -from ..preprocessing import KernelCenterer -from ..metrics.pairwise import pairwise_kernels class KernelPCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index ab1ea5ebb5460..9e161c178b9e3 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -14,22 +14,28 @@ import numpy as np import scipy.sparse as sp -from scipy.special import gammaln, logsumexp from joblib import effective_n_jobs +from scipy.special import gammaln, logsumexp -from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin -from ..base import _fit_context +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) from ..utils import check_random_state, gen_batches, gen_even_slices -from ..utils.validation import check_non_negative -from ..utils.validation import check_is_fitted -from ..utils.parallel import delayed, Parallel from ..utils._param_validation import Interval, StrOptions - +from ..utils.parallel import Parallel, delayed +from ..utils.validation import check_is_fitted, check_non_negative from ._online_lda_fast import ( - mean_change as cy_mean_change, _dirichlet_expectation_1d as cy_dirichlet_expectation_1d, +) +from ._online_lda_fast import ( _dirichlet_expectation_2d, ) +from ._online_lda_fast import ( + mean_change as cy_mean_change, +) EPS = np.finfo(float).eps diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index d561583dec205..40db8edd0b2fd 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -6,34 +6,37 @@ # Tom Dupre la Tour # License: BSD 3 clause +import itertools +import time +import warnings from abc import ABC +from math import sqrt from numbers import Integral, Real + import numpy as np import scipy.sparse as sp -import time -import itertools -import warnings -from math import sqrt from scipy import linalg -from ._cdnmf_fast import _update_cdnmf_fast from .._config import config_context -from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin -from ..base import _fit_context -from ..exceptions import ConvergenceWarning -from ..utils import check_random_state, check_array, gen_batches -from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm -from ..utils.validation import ( - check_is_fitted, - check_non_negative, +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, ) +from ..exceptions import ConvergenceWarning +from ..utils import check_array, check_random_state, gen_batches, metadata_routing from ..utils._param_validation import ( Interval, StrOptions, validate_params, ) -from ..utils import metadata_routing - +from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm +from ..utils.validation import ( + check_is_fitted, + check_non_negative, +) +from ._cdnmf_fast 
import _update_cdnmf_fast EPSILON = np.finfo(np.float32).eps diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 1d3c0678aca89..96931324d7cae 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -15,20 +15,18 @@ import numpy as np from scipy import linalg -from scipy.special import gammaln from scipy.sparse import issparse from scipy.sparse.linalg import svds +from scipy.special import gammaln -from ._base import _BasePCA from ..base import _fit_context from ..utils import check_random_state from ..utils._arpack import _init_arpack_v0 +from ..utils._param_validation import Interval, RealNotInt, StrOptions from ..utils.deprecation import deprecated -from ..utils.extmath import fast_logdet, randomized_svd, svd_flip -from ..utils.extmath import stable_cumsum +from ..utils.extmath import fast_logdet, randomized_svd, stable_cumsum, svd_flip from ..utils.validation import check_is_fitted -from ..utils._param_validation import Interval, StrOptions -from ..utils._param_validation import RealNotInt +from ._base import _BasePCA def _assess_dimension(spectrum, rank, n_samples): diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 93e4a2164a87f..aa4dec2fb7ee9 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -6,14 +6,18 @@ import numpy as np +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..linear_model import ridge_regression from ..utils import check_random_state -from ..utils.extmath import svd_flip from ..utils._param_validation import Hidden, Interval, StrOptions +from ..utils.extmath import svd_flip from ..utils.validation import check_array, check_is_fitted -from ..linear_model import ridge_regression -from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin -from ..base import _fit_context -from ._dict_learning import dict_learning, MiniBatchDictionaryLearning +from ._dict_learning import MiniBatchDictionaryLearning, dict_learning class _BaseSparsePCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 67f5c73028f15..725683e8d46c6 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -7,18 +7,23 @@ # License: 3-clause BSD. 
 from numbers import Integral, Real
+
 import numpy as np
 import scipy.sparse as sp
 from scipy.sparse.linalg import svds
 
-from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin
-from ..base import _fit_context
+from ..base import (
+    BaseEstimator,
+    ClassNamePrefixFeaturesOutMixin,
+    TransformerMixin,
+    _fit_context,
+)
 from ..utils import check_array, check_random_state
 from ..utils._arpack import _init_arpack_v0
+from ..utils._param_validation import Interval, StrOptions
 from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip
 from ..utils.sparsefuncs import mean_variance_axis
 from ..utils.validation import check_is_fitted
-from ..utils._param_validation import Interval, StrOptions
 
 __all__ = ["TruncatedSVD"]
diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py
index 6e6ddd20acb8c..0986bc6b3feed 100644
--- a/sklearn/decomposition/tests/test_dict_learning.py
+++ b/sklearn/decomposition/tests/test_dict_learning.py
@@ -1,38 +1,37 @@
-import pytest
+import itertools
 import warnings
+from functools import partial
 
 import numpy as np
-from functools import partial
-import itertools
+import pytest
 
 import sklearn
-
 from sklearn.base import clone
-
+from sklearn.decomposition import (
+    DictionaryLearning,
+    MiniBatchDictionaryLearning,
+    SparseCoder,
+    dict_learning,
+    dict_learning_online,
+    sparse_encode,
+)
+from sklearn.decomposition._dict_learning import _update_dict
 from sklearn.exceptions import ConvergenceWarning
-
 from sklearn.utils import check_array
+from sklearn.utils._testing import (
+    TempMemmap,
+    assert_allclose,
+    assert_array_almost_equal,
+    assert_array_equal,
+    ignore_warnings,
+)
+from sklearn.utils.estimator_checks import (
+    check_transformer_data_not_an_array,
+    check_transformer_general,
+    check_transformers_unfitted,
+)
 from sklearn.utils.parallel import Parallel
-from sklearn.utils._testing import assert_allclose
-from sklearn.utils._testing import assert_array_almost_equal
-from sklearn.utils._testing import assert_array_equal
-from sklearn.utils._testing import ignore_warnings
-from sklearn.utils._testing import TempMemmap
-
-from sklearn.decomposition import DictionaryLearning
-from sklearn.decomposition import MiniBatchDictionaryLearning
-from sklearn.decomposition import SparseCoder
-from sklearn.decomposition import dict_learning
-from sklearn.decomposition import dict_learning_online
-from sklearn.decomposition import sparse_encode
-from sklearn.utils.estimator_checks import check_transformer_data_not_an_array
-from sklearn.utils.estimator_checks import check_transformer_general
-from sklearn.utils.estimator_checks import check_transformers_unfitted
-
-from sklearn.decomposition._dict_learning import _update_dict
-
-
 rng_global = np.random.RandomState(0)
 n_samples, n_features = 10, 8
 X = rng_global.randn(n_samples, n_features)
@@ -397,8 +396,8 @@ def test_dict_learning_online_positivity(positive_code, positive_dict):
 def test_dict_learning_online_verbosity():
     # test verbosity for better coverage
     n_components = 5
-    from io import StringIO
     import sys
+    from io import StringIO
 
     old_stdout = sys.stdout
     try:
diff --git a/sklearn/decomposition/tests/test_factor_analysis.py b/sklearn/decomposition/tests/test_factor_analysis.py
index 4284327f3eeb4..2ff14f8d71722 100644
--- a/sklearn/decomposition/tests/test_factor_analysis.py
+++ b/sklearn/decomposition/tests/test_factor_analysis.py
@@ -7,12 +7,14 @@
 import numpy as np
 import pytest
 
-from sklearn.utils._testing import assert_almost_equal
-from sklearn.utils._testing import assert_array_almost_equal
-from sklearn.exceptions import ConvergenceWarning
 from sklearn.decomposition import FactorAnalysis
-from sklearn.utils._testing import ignore_warnings
 from sklearn.decomposition._factor_analysis import _ortho_rotation
+from sklearn.exceptions import ConvergenceWarning
+from sklearn.utils._testing import (
+    assert_almost_equal,
+    assert_array_almost_equal,
+    ignore_warnings,
+)
 
 # Ignore warnings from switching to more power iterations in randomized_svd
diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py
index 14938b3787a98..6a376b01ecb19 100644
--- a/sklearn/decomposition/tests/test_fastica.py
+++ b/sklearn/decomposition/tests/test_fastica.py
@@ -2,18 +2,17 @@
 Test the fastica algorithm.
 """
 import itertools
-import pytest
-import warnings
 import os
+import warnings
 
 import numpy as np
+import pytest
 from scipy import stats
 
-from sklearn.utils._testing import assert_allclose
-
-from sklearn.decomposition import FastICA, fastica, PCA
+from sklearn.decomposition import PCA, FastICA, fastica
 from sklearn.decomposition._fastica import _gs_decorrelation
 from sklearn.exceptions import ConvergenceWarning
+from sklearn.utils._testing import assert_allclose
 
 
 def center_and_norm(x, axis=-1):
diff --git a/sklearn/decomposition/tests/test_incremental_pca.py b/sklearn/decomposition/tests/test_incremental_pca.py
index d8402dad24c04..6ef500b42026b 100644
--- a/sklearn/decomposition/tests/test_incremental_pca.py
+++ b/sklearn/decomposition/tests/test_incremental_pca.py
@@ -1,17 +1,18 @@
 """Tests for Incremental PCA."""
-import numpy as np
-import pytest
 import warnings
 
-from sklearn.utils._testing import assert_almost_equal
-from sklearn.utils._testing import assert_array_almost_equal
-from sklearn.utils._testing import assert_allclose_dense_sparse
+import numpy as np
+import pytest
 from numpy.testing import assert_array_equal
+from scipy import sparse
 
 from sklearn import datasets
 from sklearn.decomposition import PCA, IncrementalPCA
-
-from scipy import sparse
+from sklearn.utils._testing import (
+    assert_allclose_dense_sparse,
+    assert_almost_equal,
+    assert_array_almost_equal,
+)
 
 iris = datasets.load_iris()
diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py
index 39aa32a3e9694..3c95454749b4a 100644
--- a/sklearn/decomposition/tests/test_kernel_pca.py
+++ b/sklearn/decomposition/tests/test_kernel_pca.py
@@ -1,23 +1,22 @@
-import numpy as np
-import scipy.sparse as sp
-import pytest
 import warnings
 
-from sklearn.utils._testing import (
-    assert_array_almost_equal,
-    assert_array_equal,
-    assert_allclose,
-)
+import numpy as np
+import pytest
+import scipy.sparse as sp
 
+from sklearn.datasets import make_blobs, make_circles
 from sklearn.decomposition import PCA, KernelPCA
-from sklearn.datasets import make_circles
-from sklearn.datasets import make_blobs
 from sklearn.exceptions import NotFittedError
 from sklearn.linear_model import Perceptron
+from sklearn.metrics.pairwise import rbf_kernel
+from sklearn.model_selection import GridSearchCV
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
-from sklearn.model_selection import GridSearchCV
-from sklearn.metrics.pairwise import rbf_kernel
+from sklearn.utils._testing import (
+    assert_allclose,
+    assert_array_almost_equal,
+    assert_array_equal,
+)
 from sklearn.utils.validation import _check_psd_eigenvalues
diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py
index 2b1ed4d91be5e..2cd027f90cdd6 100644
--- a/sklearn/decomposition/tests/test_nmf.py
+++ b/sklearn/decomposition/tests/test_nmf.py
@@ -1,27 +1,26 @@
 import re
 import sys
-from io import StringIO
 import warnings
+from io import StringIO
 
 import numpy as np
+import pytest
 import scipy.sparse as sp
-
 from scipy import linalg
-from sklearn.decomposition import NMF, MiniBatchNMF
-from sklearn.decomposition import non_negative_factorization
-from sklearn.decomposition import _nmf as nmf  # For testing internals
 from scipy.sparse import csc_matrix
 
-import pytest
-
-from sklearn.utils._testing import assert_array_equal
-from sklearn.utils._testing import assert_array_almost_equal
-from sklearn.utils._testing import assert_almost_equal
-from sklearn.utils._testing import assert_allclose
-from sklearn.utils._testing import ignore_warnings
-from sklearn.utils.extmath import squared_norm
 from sklearn.base import clone
+from sklearn.decomposition import NMF, MiniBatchNMF, non_negative_factorization
+from sklearn.decomposition import _nmf as nmf  # For testing internals
 from sklearn.exceptions import ConvergenceWarning
+from sklearn.utils._testing import (
+    assert_allclose,
+    assert_almost_equal,
+    assert_array_almost_equal,
+    assert_array_equal,
+    ignore_warnings,
+)
+from sklearn.utils.extmath import squared_norm
 
 
 @pytest.mark.parametrize(
diff --git a/sklearn/decomposition/tests/test_online_lda.py b/sklearn/decomposition/tests/test_online_lda.py
index 872bd55916fcb..50c812bcb9f14 100644
--- a/sklearn/decomposition/tests/test_online_lda.py
+++ b/sklearn/decomposition/tests/test_online_lda.py
@@ -1,26 +1,25 @@
 import sys
+from io import StringIO
 
 import numpy as np
+import pytest
+from numpy.testing import assert_array_equal
 from scipy.linalg import block_diag
 from scipy.sparse import csr_matrix
 from scipy.special import psi
-from numpy.testing import assert_array_equal
-
-import pytest
 
 from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.decomposition._online_lda_fast import (
     _dirichlet_expectation_1d,
     _dirichlet_expectation_2d,
 )
-
-from sklearn.utils._testing import assert_allclose
-from sklearn.utils._testing import assert_array_almost_equal
-from sklearn.utils._testing import assert_almost_equal
-from sklearn.utils._testing import if_safe_multiprocessing_with_blas
-
 from sklearn.exceptions import NotFittedError
-from io import StringIO
+from sklearn.utils._testing import (
+    assert_allclose,
+    assert_almost_equal,
+    assert_array_almost_equal,
+    if_safe_multiprocessing_with_blas,
+)
 
 
 def _build_sparse_mtx():
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index 5bf893f92fd16..0176ebd0be9e7 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -1,17 +1,15 @@
+import warnings
+
 import numpy as np
+import pytest
 import scipy as sp
 from numpy.testing import assert_array_equal
-import pytest
-import warnings
-
-from sklearn.utils._testing import assert_allclose
-
 from sklearn import datasets
-from sklearn.decomposition import PCA
 from sklearn.datasets import load_iris
-from sklearn.decomposition._pca import _assess_dimension
-from sklearn.decomposition._pca import _infer_dimension
+from sklearn.decomposition import PCA
+from sklearn.decomposition._pca import _assess_dimension, _infer_dimension
+from sklearn.utils._testing import assert_allclose
 
 iris = datasets.load_iris()
 PCA_SOLVERS = ["full", "arpack", "randomized", "auto"]
diff --git a/sklearn/decomposition/tests/test_sparse_pca.py b/sklearn/decomposition/tests/test_sparse_pca.py
index cf237014c6049..4abbbd515aeb9 100644
--- a/sklearn/decomposition/tests/test_sparse_pca.py
+++ b/sklearn/decomposition/tests/test_sparse_pca.py
@@ -2,17 +2,18 @@
 # License: BSD 3 clause
 
 import sys
-import pytest
 
 import numpy as np
+import pytest
 from numpy.testing import assert_array_equal
 
-from sklearn.utils._testing import assert_array_almost_equal
-from sklearn.utils._testing import assert_allclose
-from sklearn.utils._testing import if_safe_multiprocessing_with_blas
-
-from sklearn.decomposition import SparsePCA, MiniBatchSparsePCA, PCA
+from sklearn.decomposition import PCA, MiniBatchSparsePCA, SparsePCA
 from sklearn.utils import check_random_state
+from sklearn.utils._testing import (
+    assert_allclose,
+    assert_array_almost_equal,
+    if_safe_multiprocessing_with_blas,
+)
 
 
 def generate_toy_data(n_components, n_samples, image_size, random_state=None):
diff --git a/sklearn/decomposition/tests/test_truncated_svd.py b/sklearn/decomposition/tests/test_truncated_svd.py
index bd0bde6e08aa7..4edb7d4a11109 100644
--- a/sklearn/decomposition/tests/test_truncated_svd.py
+++ b/sklearn/decomposition/tests/test_truncated_svd.py
@@ -1,13 +1,12 @@
 """Test truncated SVD transformer."""
 
 import numpy as np
-import scipy.sparse as sp
-
 import pytest
+import scipy.sparse as sp
 
-from sklearn.decomposition import TruncatedSVD, PCA
+from sklearn.decomposition import PCA, TruncatedSVD
 from sklearn.utils import check_random_state
-from sklearn.utils._testing import assert_array_less, assert_allclose
+from sklearn.utils._testing import assert_allclose, assert_array_less
 
 SVD_SOLVERS = ["arpack", "randomized"]
diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py
index 275f4ae4d3b30..29146ca857694 100644
--- a/sklearn/discriminant_analysis.py
+++ b/sklearn/discriminant_analysis.py
@@ -10,24 +10,27 @@
 # License: BSD 3-Clause
 
 import warnings
+from numbers import Integral, Real
+
 import numpy as np
 import scipy.linalg
 from scipy import linalg
-from numbers import Real, Integral
 
-from .base import BaseEstimator, TransformerMixin, ClassifierMixin
-from .base import ClassNamePrefixFeaturesOutMixin
-from .base import _fit_context
+from .base import (
+    BaseEstimator,
+    ClassifierMixin,
+    ClassNamePrefixFeaturesOutMixin,
+    TransformerMixin,
+    _fit_context,
+)
+from .covariance import empirical_covariance, ledoit_wolf, shrunk_covariance
 from .linear_model._base import LinearClassifierMixin
-from .covariance import ledoit_wolf, empirical_covariance, shrunk_covariance
-from .utils.multiclass import unique_labels
-from .utils.validation import check_is_fitted
-from .utils._array_api import get_namespace, _expit, device, size
-from .utils.multiclass import check_classification_targets
-from .utils.extmath import softmax
-from .utils._param_validation import StrOptions, Interval, HasMethods
 from .preprocessing import StandardScaler
-
+from .utils._array_api import _expit, device, get_namespace, size
+from .utils._param_validation import HasMethods, Interval, StrOptions
+from .utils.extmath import softmax
+from .utils.multiclass import check_classification_targets, unique_labels
+from .utils.validation import check_is_fitted
 
 __all__ = ["LinearDiscriminantAnalysis", "QuadraticDiscriminantAnalysis"]
diff --git a/sklearn/dummy.py b/sklearn/dummy.py
index 0d8519484d7a5..1db664826f5c9 100644
--- a/sklearn/dummy.py
+++ b/sklearn/dummy.py
@@ -9,18 +9,25 @@
 import numpy as np
import scipy.sparse as sp -from .base import BaseEstimator, ClassifierMixin, RegressorMixin -from .base import MultiOutputMixin -from .base import _fit_context +from .base import ( + BaseEstimator, + ClassifierMixin, + MultiOutputMixin, + RegressorMixin, + _fit_context, +) from .utils import check_random_state -from .utils._param_validation import StrOptions, Interval -from .utils.validation import _num_samples -from .utils.validation import check_array -from .utils.validation import check_consistent_length -from .utils.validation import check_is_fitted, _check_sample_weight +from .utils._param_validation import Interval, StrOptions +from .utils.multiclass import class_distribution from .utils.random import _random_choice_csc from .utils.stats import _weighted_percentile -from .utils.multiclass import class_distribution +from .utils.validation import ( + _check_sample_weight, + _num_samples, + check_array, + check_consistent_length, + check_is_fitted, +) class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator): diff --git a/sklearn/ensemble/__init__.py b/sklearn/ensemble/__init__.py index e892d36a0ce46..f4a3756bdaf1d 100644 --- a/sklearn/ensemble/__init__.py +++ b/sklearn/ensemble/__init__.py @@ -2,27 +2,24 @@ The :mod:`sklearn.ensemble` module includes ensemble-based methods for classification, regression and anomaly detection. """ +from ._bagging import BaggingClassifier, BaggingRegressor from ._base import BaseEnsemble -from ._forest import RandomForestClassifier -from ._forest import RandomForestRegressor -from ._forest import RandomTreesEmbedding -from ._forest import ExtraTreesClassifier -from ._forest import ExtraTreesRegressor -from ._bagging import BaggingClassifier -from ._bagging import BaggingRegressor -from ._iforest import IsolationForest -from ._weight_boosting import AdaBoostClassifier -from ._weight_boosting import AdaBoostRegressor -from ._gb import GradientBoostingClassifier -from ._gb import GradientBoostingRegressor -from ._voting import VotingClassifier -from ._voting import VotingRegressor -from ._stacking import StackingClassifier -from ._stacking import StackingRegressor +from ._forest import ( + ExtraTreesClassifier, + ExtraTreesRegressor, + RandomForestClassifier, + RandomForestRegressor, + RandomTreesEmbedding, +) +from ._gb import GradientBoostingClassifier, GradientBoostingRegressor from ._hist_gradient_boosting.gradient_boosting import ( - HistGradientBoostingRegressor, HistGradientBoostingClassifier, + HistGradientBoostingRegressor, ) +from ._iforest import IsolationForest +from ._stacking import StackingClassifier, StackingRegressor +from ._voting import VotingClassifier, VotingRegressor +from ._weight_boosting import AdaBoostClassifier, AdaBoostRegressor __all__ = [ "BaseEnsemble", diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 0354413fdebfe..117bf470c509f 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -6,28 +6,25 @@ import itertools import numbers -import numpy as np from abc import ABCMeta, abstractmethod +from functools import partial from numbers import Integral from warnings import warn -from functools import partial -from ._base import BaseEnsemble, _partition_estimators -from ..base import ClassifierMixin, RegressorMixin -from ..base import _fit_context -from ..metrics import r2_score, accuracy_score +import numpy as np + +from ..base import ClassifierMixin, RegressorMixin, _fit_context +from ..metrics import accuracy_score, r2_score from ..tree import 
DecisionTreeClassifier, DecisionTreeRegressor -from ..utils import check_random_state, column_or_1d -from ..utils import indices_to_mask +from ..utils import check_random_state, column_or_1d, indices_to_mask +from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions +from ..utils._tags import _safe_tags from ..utils.metaestimators import available_if from ..utils.multiclass import check_classification_targets +from ..utils.parallel import Parallel, delayed from ..utils.random import sample_without_replacement -from ..utils._param_validation import Interval, HasMethods, StrOptions -from ..utils._param_validation import RealNotInt -from ..utils.validation import has_fit_parameter, check_is_fitted, _check_sample_weight -from ..utils._tags import _safe_tags -from ..utils.parallel import delayed, Parallel - +from ..utils.validation import _check_sample_weight, check_is_fitted, has_fit_parameter +from ._base import BaseEnsemble, _partition_estimators __all__ = ["BaggingClassifier", "BaggingRegressor"] diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 3850fa724f11a..3107b4cf9a6c5 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -3,20 +3,15 @@ # Authors: Gilles Louppe # License: BSD 3 clause +import warnings from abc import ABCMeta, abstractmethod from typing import List -import warnings import numpy as np - from joblib import effective_n_jobs -from ..base import clone -from ..base import is_classifier, is_regressor -from ..base import BaseEstimator -from ..base import MetaEstimatorMixin -from ..utils import Bunch, _print_elapsed_time, deprecated -from ..utils import check_random_state +from ..base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier, is_regressor +from ..utils import Bunch, _print_elapsed_time, check_random_state, deprecated from ..utils.metaestimators import _BaseComposition diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index ce3a6f78b241d..df8ecc974dd34 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -40,19 +40,24 @@ class calls the ``fit`` method of each sub-estimator on random samples # License: BSD 3 clause +import threading +from abc import ABCMeta, abstractmethod from numbers import Integral, Real from warnings import catch_warnings, simplefilter, warn -import threading -from abc import ABCMeta, abstractmethod import numpy as np -from scipy.sparse import issparse from scipy.sparse import hstack as sparse_hstack +from scipy.sparse import issparse -from ..base import is_classifier -from ..base import ClassifierMixin, MultiOutputMixin, RegressorMixin, TransformerMixin -from ..base import _fit_context - +from ..base import ( + ClassifierMixin, + MultiOutputMixin, + RegressorMixin, + TransformerMixin, + _fit_context, + is_classifier, +) +from ..exceptions import DataConversionWarning from ..metrics import accuracy_score, r2_score from ..preprocessing import OneHotEncoder from ..tree import ( @@ -62,21 +67,18 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeClassifier, ExtraTreeRegressor, ) -from ..tree._tree import DTYPE, DOUBLE +from ..tree._tree import DOUBLE, DTYPE from ..utils import check_random_state, compute_sample_weight -from ..exceptions import DataConversionWarning -from ._base import BaseEnsemble, _partition_estimators -from ..utils.parallel import delayed, Parallel +from ..utils._param_validation import Interval, RealNotInt, StrOptions from ..utils.multiclass import check_classification_targets, 
type_of_target +from ..utils.parallel import Parallel, delayed from ..utils.validation import ( - check_is_fitted, - _check_sample_weight, _check_feature_names_in, + _check_sample_weight, + _num_samples, + check_is_fitted, ) -from ..utils.validation import _num_samples -from ..utils._param_validation import Interval, StrOptions -from ..utils._param_validation import RealNotInt - +from ._base import BaseEnsemble, _partition_estimators __all__ = [ "RandomForestClassifier", diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 1b924749f52bd..777e1a18d8396 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -20,37 +20,26 @@ # Arnaud Joly, Jacob Schreiber # License: BSD 3 clause -from abc import ABCMeta -from abc import abstractmethod -from numbers import Integral, Real import warnings - -from ._base import BaseEnsemble -from ..base import ClassifierMixin, RegressorMixin -from ..base import is_classifier -from ..base import _fit_context - -from ._gradient_boosting import predict_stages -from ._gradient_boosting import predict_stage -from ._gradient_boosting import _random_sample_mask +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real +from time import time import numpy as np +from scipy.sparse import csc_matrix, csr_matrix, issparse -from scipy.sparse import csc_matrix -from scipy.sparse import csr_matrix -from scipy.sparse import issparse - -from time import time +from ..base import ClassifierMixin, RegressorMixin, _fit_context, is_classifier +from ..exceptions import NotFittedError from ..model_selection import train_test_split from ..tree import DecisionTreeRegressor -from ..tree._tree import DTYPE, DOUBLE -from . import _gb_losses - +from ..tree._tree import DOUBLE, DTYPE from ..utils import check_array, check_random_state, column_or_1d from ..utils._param_validation import HasMethods, Interval, StrOptions -from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.multiclass import check_classification_targets -from ..exceptions import NotFittedError +from ..utils.validation import _check_sample_weight, check_is_fitted +from . import _gb_losses +from ._base import BaseEnsemble +from ._gradient_boosting import _random_sample_mask, predict_stage, predict_stages class VerboseReporter: diff --git a/sklearn/ensemble/_gb_losses.py b/sklearn/ensemble/_gb_losses.py index db2116d9aa2e1..7fb7e4726c325 100644 --- a/sklearn/ensemble/_gb_losses.py +++ b/sklearn/ensemble/_gb_losses.py @@ -2,16 +2,14 @@ decision trees. 
""" -from abc import ABCMeta -from abc import abstractmethod +from abc import ABCMeta, abstractmethod import numpy as np from scipy.special import expit, logsumexp +from ..dummy import DummyClassifier, DummyRegressor from ..tree._tree import TREE_LEAF from ..utils.stats import _weighted_percentile -from ..dummy import DummyClassifier -from ..dummy import DummyRegressor class LossFunction(metaclass=ABCMeta): diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 805a13b2d361b..8786e866d7be3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -9,14 +9,14 @@ import numpy as np -from ...utils import check_random_state, check_array from ...base import BaseEstimator, TransformerMixin -from ...utils.validation import check_is_fitted -from ...utils.fixes import percentile +from ...utils import check_array, check_random_state from ...utils._openmp_helpers import _openmp_effective_n_threads +from ...utils.fixes import percentile +from ...utils.validation import check_is_fitted from ._binning import _map_to_bins -from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF, X_BITSET_INNER_DTYPE from ._bitset import set_bitset_memoryview +from .common import ALMOST_INF, X_BINNED_DTYPE, X_BITSET_INNER_DTYPE, X_DTYPE def _find_binning_thresholds(col_data, max_bins): diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index e44b6428f8f4e..136e8c3b29efe 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -1,13 +1,14 @@ """Fast Gradient Boosting decision trees for classification and regression.""" # Author: Nicolas Hug +import itertools from abc import ABC, abstractmethod from functools import partial -import itertools -from numbers import Real, Integral +from numbers import Integral, Real +from timeit import default_timer as time import numpy as np -from timeit import default_timer as time + from ..._loss.loss import ( _LOSSES, BaseLoss, @@ -17,29 +18,31 @@ HalfPoissonLoss, PinballLoss, ) -from ...base import BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier -from ...base import _fit_context -from ...utils import check_random_state, resample, compute_sample_weight -from ...utils.validation import ( - check_is_fitted, - check_consistent_length, - _check_sample_weight, - _check_monotonic_cst, +from ...base import ( + BaseEstimator, + ClassifierMixin, + RegressorMixin, + _fit_context, + is_classifier, ) -from ...utils._param_validation import Interval, StrOptions -from ...utils._param_validation import RealNotInt -from ...utils._openmp_helpers import _openmp_effective_n_threads -from ...utils.multiclass import check_classification_targets from ...metrics import check_scoring from ...model_selection import train_test_split from ...preprocessing import LabelEncoder +from ...utils import check_random_state, compute_sample_weight, resample +from ...utils._openmp_helpers import _openmp_effective_n_threads +from ...utils._param_validation import Interval, RealNotInt, StrOptions +from ...utils.multiclass import check_classification_targets +from ...utils.validation import ( + _check_monotonic_cst, + _check_sample_weight, + check_consistent_length, + check_is_fitted, +) from ._gradient_boosting import _update_raw_predictions -from .common import Y_DTYPE, X_DTYPE, G_H_DTYPE - from .binning 
import _BinMapper +from .common import G_H_DTYPE, X_DTYPE, Y_DTYPE from .grower import TreeGrower - _LOSSES = _LOSSES.copy() _LOSSES.update( { diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index b8c0c17969e99..4ed6041ecaa30 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -6,22 +6,25 @@ """ # Author: Nicolas Hug -from heapq import heappush, heappop -import numpy as np -from timeit import default_timer as time import numbers +from heapq import heappop, heappush +from timeit import default_timer as time -from .splitting import Splitter +import numpy as np + +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + +from ._bitset import set_raw_bitset_from_binned_bitset +from .common import ( + PREDICTOR_RECORD_DTYPE, + X_BITSET_INNER_DTYPE, + Y_DTYPE, + MonotonicConstraint, +) from .histogram import HistogramBuilder from .predictor import TreePredictor +from .splitting import Splitter from .utils import sum_parallel -from .common import PREDICTOR_RECORD_DTYPE -from .common import X_BITSET_INNER_DTYPE -from .common import Y_DTYPE -from .common import MonotonicConstraint -from ._bitset import set_raw_bitset_from_binned_bitset -from sklearn.utils._openmp_helpers import _openmp_effective_n_threads - EPS = np.finfo(Y_DTYPE).eps # to avoid zero division errors diff --git a/sklearn/ensemble/_hist_gradient_boosting/predictor.py b/sklearn/ensemble/_hist_gradient_boosting/predictor.py index 746fa34753121..600e55e43467f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/predictor.py @@ -5,10 +5,12 @@ import numpy as np +from ._predictor import ( + _compute_partial_dependence, + _predict_from_binned_data, + _predict_from_raw_data, +) from .common import Y_DTYPE -from ._predictor import _predict_from_raw_data -from ._predictor import _predict_from_binned_data -from ._predictor import _compute_partial_dependence class TreePredictor: diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py index 08bfebfcbf6c9..6f9fcd0057141 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -1,15 +1,17 @@ import numpy as np -from numpy.testing import assert_array_equal, assert_allclose import pytest +from numpy.testing import assert_allclose, assert_array_equal from sklearn.ensemble._hist_gradient_boosting.binning import ( _BinMapper, _find_binning_thresholds, _map_to_bins, ) -from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE -from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE -from sklearn.ensemble._hist_gradient_boosting.common import ALMOST_INF +from sklearn.ensemble._hist_gradient_boosting.common import ( + ALMOST_INF, + X_BINNED_DTYPE, + X_DTYPE, +) from sklearn.utils._openmp_helpers import _openmp_effective_n_threads n_threads = _openmp_effective_n_threads() diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_bitset.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_bitset.py index e058781cefcef..c02d66b666f80 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_bitset.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_bitset.py @@ -1,10 +1,10 @@ -import pytest import numpy as np +import pytest from numpy.testing import assert_allclose 
from sklearn.ensemble._hist_gradient_boosting._bitset import ( - set_bitset_memoryview, in_bitset_memoryview, + set_bitset_memoryview, set_raw_bitset_from_binned_bitset, ) from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py index 6bd5b38d5a4ee..bbdcb38ef013a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py @@ -1,13 +1,15 @@ -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score -from sklearn.datasets import make_classification, make_regression import numpy as np import pytest -from sklearn.ensemble import HistGradientBoostingRegressor -from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.datasets import make_classification, make_regression +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split @pytest.mark.parametrize("seed", range(5)) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 94d8960b6e813..4851c8e129203 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1,36 +1,35 @@ +import re import warnings -import re import numpy as np import pytest from numpy.testing import assert_allclose, assert_array_equal + from sklearn._loss.loss import ( AbsoluteError, HalfBinomialLoss, HalfSquaredError, PinballLoss, ) -from sklearn.datasets import make_classification, make_regression -from sklearn.datasets import make_low_rank_matrix -from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler, OneHotEncoder -from sklearn.model_selection import train_test_split, cross_val_score -from sklearn.base import clone, BaseEstimator, TransformerMixin -from sklearn.base import is_regressor -from sklearn.pipeline import make_pipeline -from sklearn.metrics import mean_gamma_deviance, mean_poisson_deviance -from sklearn.dummy import DummyRegressor -from sklearn.exceptions import NotFittedError +from sklearn.base import BaseEstimator, TransformerMixin, clone, is_regressor from sklearn.compose import make_column_transformer - -from sklearn.ensemble import HistGradientBoostingRegressor -from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower +from sklearn.datasets import make_classification, make_low_rank_matrix, make_regression +from sklearn.dummy import DummyRegressor +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower +from sklearn.exceptions import NotFittedError +from sklearn.metrics import mean_gamma_deviance, mean_poisson_deviance +from sklearn.model_selection import cross_val_score, train_test_split 
+from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler, OneHotEncoder from sklearn.utils import shuffle from sklearn.utils._openmp_helpers import _openmp_effective_n_threads - n_threads = _openmp_effective_n_threads() X_classification, y_classification = make_classification(random_state=0) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index f3380fbf2af6d..a55cb871e3c72 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -1,17 +1,18 @@ import numpy as np import pytest +from numpy.testing import assert_allclose, assert_array_equal from pytest import approx -from numpy.testing import assert_array_equal -from numpy.testing import assert_allclose -from sklearn.preprocessing import OneHotEncoder -from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE -from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE -from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE -from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE -from sklearn.ensemble._hist_gradient_boosting.common import X_BITSET_INNER_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import ( + G_H_DTYPE, + X_BINNED_DTYPE, + X_BITSET_INNER_DTYPE, + X_DTYPE, + Y_DTYPE, +) +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower +from sklearn.preprocessing import OneHotEncoder from sklearn.utils._openmp_helpers import _openmp_effective_n_threads n_threads = _openmp_effective_n_threads() diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py index 1d5963d20739b..99f74b0f542ee 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py @@ -1,20 +1,20 @@ import numpy as np import pytest +from numpy.testing import assert_allclose, assert_array_equal -from numpy.testing import assert_allclose -from numpy.testing import assert_array_equal - +from sklearn.ensemble._hist_gradient_boosting.common import ( + G_H_DTYPE, + HISTOGRAM_DTYPE, + X_BINNED_DTYPE, +) from sklearn.ensemble._hist_gradient_boosting.histogram import ( - _build_histogram_naive, _build_histogram, + _build_histogram_naive, _build_histogram_no_hessian, - _build_histogram_root_no_hessian, _build_histogram_root, + _build_histogram_root_no_hessian, _subtract_histograms, ) -from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE -from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE -from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE @pytest.mark.parametrize("build_func", [_build_histogram_naive, _build_histogram]) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py index f11bec3bd77db..7782b5b32eb68 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py @@ -1,18 +1,23 @@ import re + import numpy as np import pytest +from sklearn.ensemble import ( + 
HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) +from sklearn.ensemble._hist_gradient_boosting.common import ( + G_H_DTYPE, + X_BINNED_DTYPE, + MonotonicConstraint, +) from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower -from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE -from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE -from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint +from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder from sklearn.ensemble._hist_gradient_boosting.splitting import ( Splitter, compute_node_value, ) -from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder -from sklearn.ensemble import HistGradientBoostingRegressor -from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils._testing import _convert_container diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py index 856ab180459d2..3c3c9ae81bac2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py @@ -1,25 +1,25 @@ import numpy as np -from numpy.testing import assert_allclose -from sklearn.datasets import make_regression -from sklearn.model_selection import train_test_split -from sklearn.metrics import r2_score import pytest +from numpy.testing import assert_allclose +from sklearn.datasets import make_regression +from sklearn.ensemble._hist_gradient_boosting._bitset import ( + set_bitset_memoryview, + set_raw_bitset_from_binned_bitset, +) from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower -from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor from sklearn.ensemble._hist_gradient_boosting.common import ( + ALMOST_INF, G_H_DTYPE, PREDICTOR_RECORD_DTYPE, - ALMOST_INF, X_BINNED_DTYPE, X_BITSET_INNER_DTYPE, X_DTYPE, ) -from sklearn.ensemble._hist_gradient_boosting._bitset import ( - set_bitset_memoryview, - set_raw_bitset_from_binned_bitset, -) +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower +from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor +from sklearn.metrics import r2_score +from sklearn.model_selection import train_test_split from sklearn.utils._openmp_helpers import _openmp_effective_n_threads n_threads = _openmp_effective_n_threads() diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 255d13bb08456..f862273beadf5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -2,17 +2,19 @@ import pytest from numpy.testing import assert_array_equal -from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE -from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE -from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE -from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint +from sklearn.ensemble._hist_gradient_boosting.common import ( + G_H_DTYPE, + HISTOGRAM_DTYPE, + X_BINNED_DTYPE, + MonotonicConstraint, +) +from 
sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder from sklearn.ensemble._hist_gradient_boosting.splitting import ( Splitter, compute_node_value, ) -from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder -from sklearn.utils._testing import skip_if_32bit from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils._testing import skip_if_32bit n_threads = _openmp_effective_n_threads() diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py index f8d7533ec38bc..03a2720b36127 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py @@ -1,17 +1,15 @@ import numpy as np -from numpy.testing import assert_array_equal -from numpy.testing import assert_allclose - import pytest +from numpy.testing import assert_allclose, assert_array_equal from sklearn.base import clone from sklearn.datasets import make_classification, make_regression - -from sklearn.ensemble import HistGradientBoostingRegressor -from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) from sklearn.metrics import check_scoring - X_classification, y_classification = make_classification(random_state=0) X_regression, y_regression = make_regression(random_state=0) diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 048a1d69395e2..9371d2e4e6c5b 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -3,25 +3,23 @@ # License: BSD 3 clause import numbers +from numbers import Integral, Real +from warnings import warn + import numpy as np from scipy.sparse import issparse -from warnings import warn -from numbers import Integral, Real +from ..base import OutlierMixin, _fit_context from ..tree import ExtraTreeRegressor from ..tree._tree import DTYPE as tree_dtype from ..utils import ( - check_random_state, check_array, + check_random_state, gen_batches, get_chunk_n_rows, ) -from ..utils._param_validation import Interval, StrOptions -from ..utils._param_validation import RealNotInt -from ..utils.validation import check_is_fitted, _num_samples -from ..base import OutlierMixin -from ..base import _fit_context - +from ..utils._param_validation import Interval, RealNotInt, StrOptions +from ..utils.validation import _num_samples, check_is_fitted from ._bagging import BaseBagging __all__ = ["IsolationForest"] diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 5b3486edfeb33..2129e4d9a0134 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -10,35 +10,32 @@ import numpy as np import scipy.sparse as sparse -from ..base import clone -from ..base import ClassifierMixin, RegressorMixin, TransformerMixin -from ..base import is_classifier, is_regressor -from ..base import _fit_context +from ..base import ( + ClassifierMixin, + RegressorMixin, + TransformerMixin, + _fit_context, + clone, + is_classifier, + is_regressor, +) from ..exceptions import NotFittedError -from ..utils._estimator_html_repr import _VisualBlock - -from ._base import _fit_single_estimator -from ._base import _BaseHeterogeneousEnsemble - -from ..linear_model import LogisticRegression -from ..linear_model import RidgeCV - -from ..model_selection import cross_val_predict -from ..model_selection import check_cv - +from 
..linear_model import LogisticRegression, RidgeCV +from ..model_selection import check_cv, cross_val_predict from ..preprocessing import LabelEncoder - from ..utils import Bunch -from ..utils.multiclass import check_classification_targets, type_of_target -from ..utils.metaestimators import available_if -from ..utils.parallel import delayed, Parallel +from ..utils._estimator_html_repr import _VisualBlock from ..utils._param_validation import HasMethods, StrOptions +from ..utils.metaestimators import available_if +from ..utils.multiclass import check_classification_targets, type_of_target +from ..utils.parallel import Parallel, delayed from ..utils.validation import ( _check_feature_names_in, _check_response_method, check_is_fitted, column_or_1d, ) +from ._base import _BaseHeterogeneousEnsemble, _fit_single_estimator def _estimator_has(attr): diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index f8f4d2c4c197f..50670a5a52699 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -18,24 +18,23 @@ import numpy as np -from ..base import ClassifierMixin -from ..base import RegressorMixin -from ..base import TransformerMixin -from ..base import clone -from ..base import _fit_context -from ._base import _fit_single_estimator -from ._base import _BaseHeterogeneousEnsemble +from ..base import ( + ClassifierMixin, + RegressorMixin, + TransformerMixin, + _fit_context, + clone, +) +from ..exceptions import NotFittedError from ..preprocessing import LabelEncoder from ..utils import Bunch +from ..utils._estimator_html_repr import _VisualBlock +from ..utils._param_validation import StrOptions from ..utils.metaestimators import available_if -from ..utils.validation import check_is_fitted -from ..utils.validation import _check_feature_names_in from ..utils.multiclass import check_classification_targets -from ..utils.validation import column_or_1d -from ..utils._param_validation import StrOptions -from ..exceptions import NotFittedError -from ..utils._estimator_html_repr import _VisualBlock -from ..utils.parallel import delayed, Parallel +from ..utils.parallel import Parallel, delayed +from ..utils.validation import _check_feature_names_in, check_is_fitted, column_or_1d +from ._base import _BaseHeterogeneousEnsemble, _fit_single_estimator class _BaseVoting(TransformerMixin, _BaseHeterogeneousEnsemble): diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 569609e6326e5..4beee0f09e6f2 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -23,28 +23,32 @@ # # License: BSD 3 clause +import warnings from abc import ABCMeta, abstractmethod - from numbers import Integral, Real -import numpy as np - -import warnings +import numpy as np from scipy.special import xlogy -from ._base import BaseEnsemble -from ..base import ClassifierMixin, RegressorMixin, is_classifier, is_regressor -from ..base import _fit_context -from ..tree import DecisionTreeClassifier, DecisionTreeRegressor -from ..utils import check_random_state, _safe_indexing -from ..utils.extmath import softmax -from ..utils.extmath import stable_cumsum +from ..base import ( + ClassifierMixin, + RegressorMixin, + _fit_context, + is_classifier, + is_regressor, +) from ..metrics import accuracy_score, r2_score -from ..utils.validation import check_is_fitted -from ..utils.validation import _check_sample_weight -from ..utils.validation import has_fit_parameter -from ..utils.validation import _num_samples +from ..tree import 
DecisionTreeClassifier, DecisionTreeRegressor +from ..utils import _safe_indexing, check_random_state from ..utils._param_validation import HasMethods, Interval, StrOptions +from ..utils.extmath import softmax, stable_cumsum +from ..utils.validation import ( + _check_sample_weight, + _num_samples, + check_is_fitted, + has_fit_parameter, +) +from ._base import BaseEnsemble __all__ = [ "AdaBoostClassifier", diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index f6311e8c459d4..2c1067ccfc248 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -4,35 +4,33 @@ # Author: Gilles Louppe # License: BSD 3 clause -from itertools import product +from itertools import cycle, product -import numpy as np import joblib +import numpy as np import pytest +from scipy.sparse import csc_matrix, csr_matrix from sklearn.base import BaseEstimator - -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal +from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2 from sklearn.dummy import DummyClassifier, DummyRegressor -from sklearn.model_selection import GridSearchCV, ParameterGrid -from sklearn.ensemble import BaggingClassifier, BaggingRegressor -from sklearn.linear_model import Perceptron, LogisticRegression +from sklearn.ensemble import ( + BaggingClassifier, + BaggingRegressor, + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) +from sklearn.feature_selection import SelectKBest +from sklearn.linear_model import LogisticRegression, Perceptron +from sklearn.model_selection import GridSearchCV, ParameterGrid, train_test_split from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor -from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor -from sklearn.svm import SVC, SVR -from sklearn.random_projection import SparseRandomProjection from sklearn.pipeline import make_pipeline -from sklearn.feature_selection import SelectKBest -from sklearn.model_selection import train_test_split -from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.ensemble import HistGradientBoostingRegressor -from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2 -from sklearn.utils import check_random_state from sklearn.preprocessing import FunctionTransformer, scale -from itertools import cycle - -from scipy.sparse import csc_matrix, csr_matrix +from sklearn.random_projection import SparseRandomProjection +from sklearn.svm import SVC, SVR +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal rng = check_random_state(0) diff --git a/sklearn/ensemble/tests/test_base.py b/sklearn/ensemble/tests/test_base.py index fe4b1e33ae7b3..8687d91053a22 100644 --- a/sklearn/ensemble/tests/test_base.py +++ b/sklearn/ensemble/tests/test_base.py @@ -5,19 +5,19 @@ # Authors: Gilles Louppe # License: BSD 3 clause +from collections import OrderedDict + import numpy as np import pytest +from sklearn import ensemble from sklearn.datasets import load_iris +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.ensemble import BaggingClassifier from sklearn.ensemble._base import _set_random_states -from sklearn.linear_model import Perceptron -from sklearn.linear_model import Ridge, LogisticRegression -from collections import OrderedDict -from 
sklearn.discriminant_analysis import LinearDiscriminantAnalysis -from sklearn.pipeline import Pipeline from sklearn.feature_selection import SelectFromModel -from sklearn import ensemble +from sklearn.linear_model import LogisticRegression, Perceptron, Ridge +from sklearn.pipeline import Pipeline def test_base(): diff --git a/sklearn/ensemble/tests/test_common.py b/sklearn/ensemble/tests/test_common.py index 5bafe08881ae9..7e14b34993d6f 100644 --- a/sklearn/ensemble/tests/test_common.py +++ b/sklearn/ensemble/tests/test_common.py @@ -1,21 +1,25 @@ import numpy as np import pytest -from sklearn.base import clone -from sklearn.base import ClassifierMixin -from sklearn.base import is_classifier - -from sklearn.datasets import make_classification -from sklearn.datasets import make_regression -from sklearn.datasets import load_iris, load_diabetes +from sklearn.base import ClassifierMixin, clone, is_classifier +from sklearn.datasets import ( + load_diabetes, + load_iris, + make_classification, + make_regression, +) +from sklearn.ensemble import ( + RandomForestClassifier, + RandomForestRegressor, + StackingClassifier, + StackingRegressor, + VotingClassifier, + VotingRegressor, +) from sklearn.impute import SimpleImputer -from sklearn.linear_model import LogisticRegression, LinearRegression -from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR +from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.pipeline import make_pipeline -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor - -from sklearn.ensemble import StackingClassifier, StackingRegressor -from sklearn.ensemble import VotingClassifier, VotingRegressor +from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR X, y = load_iris(return_X_y=True) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 9ee29f717af88..15d2999b5ef4d 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -8,58 +8,54 @@ # Arnaud Joly # License: BSD 3 clause -import pickle +import itertools import math +import pickle from collections import defaultdict from functools import partial -import itertools -from itertools import combinations -from itertools import product -from typing import Dict, Any - -import numpy as np -from scipy.sparse import csr_matrix -from scipy.sparse import csc_matrix -from scipy.sparse import coo_matrix -from scipy.special import comb +from itertools import combinations, product +from typing import Any, Dict +from unittest.mock import patch import joblib - +import numpy as np import pytest +from scipy.sparse import coo_matrix, csc_matrix, csr_matrix +from scipy.special import comb import sklearn -from sklearn.dummy import DummyRegressor -from sklearn.metrics import mean_poisson_deviance -from sklearn.utils._testing import assert_almost_equal -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import _convert_container -from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import skip_if_no_parallel - -from sklearn.exceptions import NotFittedError - from sklearn import datasets -from sklearn.decomposition import TruncatedSVD from sklearn.datasets import make_classification -from sklearn.ensemble import ExtraTreesClassifier -from sklearn.ensemble import ExtraTreesRegressor -from sklearn.ensemble import RandomForestClassifier -from sklearn.ensemble import RandomForestRegressor -from 
sklearn.ensemble import RandomTreesEmbedding -from sklearn.metrics import explained_variance_score, f1_score -from sklearn.model_selection import train_test_split, cross_val_score -from sklearn.model_selection import GridSearchCV +from sklearn.decomposition import TruncatedSVD +from sklearn.dummy import DummyRegressor +from sklearn.ensemble import ( + ExtraTreesClassifier, + ExtraTreesRegressor, + RandomForestClassifier, + RandomForestRegressor, + RandomTreesEmbedding, +) +from sklearn.exceptions import NotFittedError +from sklearn.metrics import ( + explained_variance_score, + f1_score, + mean_poisson_deviance, + mean_squared_error, +) +from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split from sklearn.svm import LinearSVC +from sklearn.tree._classes import SPARSE_SPLITTERS +from sklearn.utils._testing import ( + _convert_container, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, + skip_if_no_parallel, +) from sklearn.utils.parallel import Parallel from sklearn.utils.validation import check_random_state -from sklearn.metrics import mean_squared_error - -from sklearn.tree._classes import SPARSE_SPLITTERS - -from unittest.mock import patch - # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] y = [-1, -1, -1, 1, 1, 1] diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index ad31b2ed732e9..f46bf9959fa29 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -3,38 +3,34 @@ """ import re import warnings + import numpy as np +import pytest from numpy.testing import assert_allclose - -from scipy.sparse import csr_matrix -from scipy.sparse import csc_matrix -from scipy.sparse import coo_matrix +from scipy.sparse import coo_matrix, csc_matrix, csr_matrix from scipy.special import expit -import pytest - from sklearn import datasets from sklearn.base import clone from sklearn.datasets import make_classification, make_regression -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.ensemble import GradientBoostingRegressor +from sklearn.dummy import DummyClassifier, DummyRegressor +from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor from sklearn.ensemble._gradient_boosting import predict_stages -from sklearn.preprocessing import scale +from sklearn.exceptions import DataConversionWarning, NotFittedError +from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import scale +from sklearn.svm import NuSVR from sklearn.utils import check_random_state, tosequence from sklearn.utils._mocking import NoSampleWeightWrapper -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import skip_if_32bit from sklearn.utils._param_validation import InvalidParameterError -from sklearn.exceptions import DataConversionWarning -from sklearn.exceptions import NotFittedError -from sklearn.dummy import DummyClassifier, DummyRegressor -from sklearn.pipeline import make_pipeline -from sklearn.linear_model import LinearRegression -from sklearn.svm import NuSVR - +from sklearn.utils._testing import ( + assert_array_almost_equal, + assert_array_equal, + skip_if_32bit, +) GRADIENT_BOOSTING_ESTIMATORS = 
[GradientBoostingClassifier, GradientBoostingRegressor] @@ -674,9 +670,8 @@ def test_oob_multilcass_iris(): def test_verbose_output(): # Check verbose=1 does not cause error. - from io import StringIO - import sys + from io import StringIO old_stdout = sys.stdout sys.stdout = StringIO() @@ -706,8 +701,8 @@ def test_verbose_output(): def test_more_verbose_output(): # Check verbose=2 does not cause error. - from io import StringIO import sys + from io import StringIO old_stdout = sys.stdout sys.stdout = StringIO() diff --git a/sklearn/ensemble/tests/test_gradient_boosting_loss_functions.py b/sklearn/ensemble/tests/test_gradient_boosting_loss_functions.py index e710be9504be3..df92c68801da2 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting_loss_functions.py +++ b/sklearn/ensemble/tests/test_gradient_boosting_loss_functions.py @@ -2,22 +2,25 @@ Testing for the gradient boosting loss functions and initial estimators. """ from itertools import product + import numpy as np -from numpy.testing import assert_allclose import pytest +from numpy.testing import assert_allclose from pytest import approx -from sklearn.utils import check_random_state +from sklearn.ensemble._gb_losses import ( + LOSS_FUNCTIONS, + BinomialDeviance, + ExponentialLoss, + HuberLossFunction, + LeastAbsoluteError, + LeastSquaresError, + MultinomialDeviance, + QuantileLossFunction, + RegressionLossFunction, +) from sklearn.metrics import mean_pinball_loss -from sklearn.ensemble._gb_losses import RegressionLossFunction -from sklearn.ensemble._gb_losses import LeastSquaresError -from sklearn.ensemble._gb_losses import LeastAbsoluteError -from sklearn.ensemble._gb_losses import HuberLossFunction -from sklearn.ensemble._gb_losses import QuantileLossFunction -from sklearn.ensemble._gb_losses import BinomialDeviance -from sklearn.ensemble._gb_losses import MultinomialDeviance -from sklearn.ensemble._gb_losses import ExponentialLoss -from sklearn.ensemble._gb_losses import LOSS_FUNCTIONS +from sklearn.utils import check_random_state def test_binomial_deviance(): diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index 7650dd5c14ce4..854ebdb701014 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -6,27 +6,25 @@ # Alexandre Gramfort # License: BSD 3 clause -import pytest import warnings +from unittest.mock import Mock, patch import numpy as np +import pytest +from scipy.sparse import csc_matrix, csr_matrix -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import assert_allclose - -from sklearn.model_selection import ParameterGrid +from sklearn.datasets import load_diabetes, load_iris, make_classification from sklearn.ensemble import IsolationForest from sklearn.ensemble._iforest import _average_path_length -from sklearn.model_selection import train_test_split -from sklearn.datasets import load_diabetes, load_iris, make_classification -from sklearn.utils import check_random_state from sklearn.metrics import roc_auc_score - -from scipy.sparse import csc_matrix, csr_matrix -from unittest.mock import Mock, patch - +from sklearn.model_selection import ParameterGrid, train_test_split +from sklearn.utils import check_random_state +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) # load iris & diabetes dataset iris = 
load_iris() diff --git a/sklearn/ensemble/tests/test_stacking.py b/sklearn/ensemble/tests/test_stacking.py index 2c04171fcd0f4..006b9cdb9e966 100644 --- a/sklearn/ensemble/tests/test_stacking.py +++ b/sklearn/ensemble/tests/test_stacking.py @@ -3,55 +3,47 @@ # Authors: Guillaume Lemaitre # License: BSD 3 clause -import pytest +from unittest.mock import Mock + import numpy as np -from numpy.testing import assert_array_equal +import pytest import scipy.sparse as sparse +from numpy.testing import assert_array_equal -from sklearn.base import BaseEstimator -from sklearn.base import ClassifierMixin -from sklearn.base import RegressorMixin -from sklearn.base import clone - -from sklearn.exceptions import ConvergenceWarning - -from sklearn.datasets import load_iris -from sklearn.datasets import load_diabetes -from sklearn.datasets import load_breast_cancer -from sklearn.datasets import make_regression -from sklearn.datasets import make_classification -from sklearn.datasets import make_multilabel_classification - -from sklearn.dummy import DummyClassifier -from sklearn.dummy import DummyRegressor -from sklearn.linear_model import LogisticRegression -from sklearn.linear_model import LinearRegression -from sklearn.linear_model import Ridge -from sklearn.linear_model import RidgeClassifier -from sklearn.svm import LinearSVC -from sklearn.svm import LinearSVR -from sklearn.svm import SVC -from sklearn.ensemble import RandomForestClassifier -from sklearn.ensemble import RandomForestRegressor +from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, clone +from sklearn.datasets import ( + load_breast_cancer, + load_diabetes, + load_iris, + make_classification, + make_multilabel_classification, + make_regression, +) +from sklearn.dummy import DummyClassifier, DummyRegressor +from sklearn.ensemble import ( + RandomForestClassifier, + RandomForestRegressor, + StackingClassifier, + StackingRegressor, +) +from sklearn.exceptions import ConvergenceWarning, NotFittedError +from sklearn.linear_model import ( + LinearRegression, + LogisticRegression, + Ridge, + RidgeClassifier, +) +from sklearn.model_selection import KFold, StratifiedKFold, train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn.neural_network import MLPClassifier from sklearn.preprocessing import scale - -from sklearn.ensemble import StackingClassifier -from sklearn.ensemble import StackingRegressor - -from sklearn.model_selection import train_test_split -from sklearn.model_selection import StratifiedKFold -from sklearn.model_selection import KFold - +from sklearn.svm import SVC, LinearSVC, LinearSVR from sklearn.utils._mocking import CheckingClassifier -from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_allclose_dense_sparse -from sklearn.utils._testing import ignore_warnings - -from sklearn.exceptions import NotFittedError - -from unittest.mock import Mock +from sklearn.utils._testing import ( + assert_allclose, + assert_allclose_dense_sparse, + ignore_warnings, +) diabetes = load_diabetes() X_diabetes, y_diabetes = diabetes.data, diabetes.target diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py index 56db8b3c7fbf5..52734fc031fde 100644 --- a/sklearn/ensemble/tests/test_voting.py +++ b/sklearn/ensemble/tests/test_voting.py @@ -1,30 +1,34 @@ """Testing for the VotingClassifier and VotingRegressor""" -import pytest import re + import numpy as np +import pytest -from sklearn.utils._testing import assert_almost_equal, 
assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.exceptions import NotFittedError -from sklearn.linear_model import LinearRegression -from sklearn.linear_model import LogisticRegression -from sklearn.naive_bayes import GaussianNB -from sklearn.ensemble import RandomForestClassifier -from sklearn.ensemble import RandomForestRegressor -from sklearn.ensemble import VotingClassifier, VotingRegressor -from sklearn.tree import DecisionTreeClassifier -from sklearn.tree import DecisionTreeRegressor -from sklearn.model_selection import GridSearchCV from sklearn import datasets -from sklearn.model_selection import cross_val_score, train_test_split +from sklearn.base import BaseEstimator, ClassifierMixin, clone from sklearn.datasets import make_multilabel_classification -from sklearn.svm import SVC +from sklearn.dummy import DummyRegressor +from sklearn.ensemble import ( + RandomForestClassifier, + RandomForestRegressor, + VotingClassifier, + VotingRegressor, +) +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split from sklearn.multiclass import OneVsRestClassifier +from sklearn.naive_bayes import GaussianNB from sklearn.neighbors import KNeighborsClassifier -from sklearn.base import BaseEstimator, ClassifierMixin, clone -from sklearn.dummy import DummyRegressor from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) # Load datasets iris = datasets.load_iris() diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index a5b0f7a49ce47..a8e0f06340dc4 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -1,33 +1,27 @@ """Testing for the boost module (sklearn.ensemble.boost).""" -import numpy as np -import pytest import re -from scipy.sparse import csc_matrix -from scipy.sparse import csr_matrix -from scipy.sparse import coo_matrix -from scipy.sparse import dok_matrix -from scipy.sparse import lil_matrix - -from sklearn.utils._testing import assert_array_equal, assert_array_less -from sklearn.utils._testing import assert_array_almost_equal +import numpy as np +import pytest +from scipy.sparse import coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix -from sklearn.base import BaseEstimator -from sklearn.base import clone +from sklearn import datasets +from sklearn.base import BaseEstimator, clone from sklearn.dummy import DummyClassifier, DummyRegressor -from sklearn.linear_model import LinearRegression -from sklearn.model_selection import train_test_split -from sklearn.model_selection import GridSearchCV -from sklearn.ensemble import AdaBoostClassifier -from sklearn.ensemble import AdaBoostRegressor +from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor from sklearn.ensemble._weight_boosting import _samme_proba +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import GridSearchCV, train_test_split from sklearn.svm import SVC, SVR from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.utils import shuffle from sklearn.utils._mocking import NoSampleWeightWrapper -from sklearn import datasets - +from 
sklearn.utils._testing import ( + assert_array_almost_equal, + assert_array_equal, + assert_array_less, +) # Common random state rng = np.random.RandomState(0) diff --git a/sklearn/experimental/enable_halving_search_cv.py b/sklearn/experimental/enable_halving_search_cv.py index f6937b0d14c01..dd399ef35b6f7 100644 --- a/sklearn/experimental/enable_halving_search_cv.py +++ b/sklearn/experimental/enable_halving_search_cv.py @@ -19,13 +19,12 @@ flake8 to ignore the import, which appears as unused. """ +from .. import model_selection from ..model_selection._search_successive_halving import ( - HalvingRandomSearchCV, HalvingGridSearchCV, + HalvingRandomSearchCV, ) -from .. import model_selection - # use settattr to avoid mypy errors when monkeypatching setattr(model_selection, "HalvingRandomSearchCV", HalvingRandomSearchCV) setattr(model_selection, "HalvingGridSearchCV", HalvingGridSearchCV) diff --git a/sklearn/experimental/enable_hist_gradient_boosting.py b/sklearn/experimental/enable_hist_gradient_boosting.py index f0416ac013e96..d287400c7999f 100644 --- a/sklearn/experimental/enable_hist_gradient_boosting.py +++ b/sklearn/experimental/enable_hist_gradient_boosting.py @@ -12,7 +12,6 @@ import warnings - warnings.warn( "Since version 1.0, " "it is not needed to import enable_hist_gradient_boosting anymore. " diff --git a/sklearn/experimental/enable_iterative_imputer.py b/sklearn/experimental/enable_iterative_imputer.py index 9ef9f6a0dbdf0..0b906961ca184 100644 --- a/sklearn/experimental/enable_iterative_imputer.py +++ b/sklearn/experimental/enable_iterative_imputer.py @@ -12,8 +12,8 @@ >>> from sklearn.impute import IterativeImputer """ -from ..impute._iterative import IterativeImputer from .. import impute +from ..impute._iterative import IterativeImputer # use settattr to avoid mypy errors when monkeypatching setattr(impute, "IterativeImputer", IterativeImputer) diff --git a/sklearn/feature_extraction/__init__.py b/sklearn/feature_extraction/__init__.py index a9c1496181b3b..f4db85303f4b6 100644 --- a/sklearn/feature_extraction/__init__.py +++ b/sklearn/feature_extraction/__init__.py @@ -4,10 +4,10 @@ images. """ +from . import text from ._dict_vectorizer import DictVectorizer from ._hash import FeatureHasher -from .image import img_to_graph, grid_to_graph -from . 
import text +from .image import grid_to_graph, img_to_graph __all__ = [ "DictVectorizer", diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index 60e2cb3b7ad84..e32de4be42462 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -3,15 +3,14 @@ # License: BSD 3 clause from array import array -from collections.abc import Mapping, Iterable -from operator import itemgetter +from collections.abc import Iterable, Mapping from numbers import Number +from operator import itemgetter import numpy as np import scipy.sparse as sp -from ..base import BaseEstimator, TransformerMixin -from ..base import _fit_context +from ..base import BaseEstimator, TransformerMixin, _fit_context from ..utils import check_array from ..utils.validation import check_is_fitted diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index e1b5e5f2561fe..e0941ed1dac97 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -1,16 +1,15 @@ # Author: Lars Buitinck # License: BSD 3 clause -from numbers import Integral from itertools import chain +from numbers import Integral import numpy as np import scipy.sparse as sp -from ..base import BaseEstimator, TransformerMixin -from ..base import _fit_context -from ._hashing_fast import transform as _hashing_transform +from ..base import BaseEstimator, TransformerMixin, _fit_context from ..utils._param_validation import Interval, StrOptions +from ._hashing_fast import transform as _hashing_transform def _iteritems(d): diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index beea3e23e0adc..da5d26b76e8ad 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -11,15 +11,14 @@ from itertools import product from numbers import Integral, Number, Real + import numpy as np -from scipy import sparse from numpy.lib.stride_tricks import as_strided +from scipy import sparse -from ..base import BaseEstimator, TransformerMixin -from ..base import _fit_context +from ..base import BaseEstimator, TransformerMixin, _fit_context from ..utils import check_array, check_random_state -from ..utils._param_validation import Hidden, Interval, validate_params -from ..utils._param_validation import RealNotInt +from ..utils._param_validation import Hidden, Interval, RealNotInt, validate_params __all__ = [ "PatchExtractor", diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py index c8b9aaa8b5c8a..7e3c7f259ea03 100644 --- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py +++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py @@ -3,12 +3,11 @@ # License: BSD 3 clause from random import Random -import numpy as np -import scipy.sparse as sp -from numpy.testing import assert_array_equal -from numpy.testing import assert_allclose +import numpy as np import pytest +import scipy.sparse as sp +from numpy.testing import assert_allclose, assert_array_equal from sklearn.feature_extraction import DictVectorizer from sklearn.feature_selection import SelectKBest, chi2 diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index b074620f8c029..945a7cb3ca8f9 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -1,6 +1,6 @@ import 
numpy as np -from numpy.testing import assert_array_equal import pytest +from numpy.testing import assert_array_equal from sklearn.feature_extraction import FeatureHasher from sklearn.feature_extraction._hashing_fast import transform as _hashing_transform diff --git a/sklearn/feature_extraction/tests/test_image.py b/sklearn/feature_extraction/tests/test_image.py index 5a89062e7de19..375652c848db6 100644 --- a/sklearn/feature_extraction/tests/test_image.py +++ b/sklearn/feature_extraction/tests/test_image.py @@ -3,17 +3,17 @@ # License: BSD 3 clause import numpy as np +import pytest from scipy import ndimage from scipy.sparse.csgraph import connected_components -import pytest from sklearn.feature_extraction.image import ( - img_to_graph, - grid_to_graph, - extract_patches_2d, - reconstruct_from_patches_2d, PatchExtractor, _extract_patches, + extract_patches_2d, + grid_to_graph, + img_to_graph, + reconstruct_from_patches_2d, ) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 80a42aaea5af0..fc35053b40251 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1,43 +1,37 @@ -from collections.abc import Mapping +import pickle import re +import warnings +from collections import defaultdict +from collections.abc import Mapping +from functools import partial +from io import StringIO +import numpy as np import pytest -import warnings +from numpy.testing import assert_array_almost_equal, assert_array_equal from scipy import sparse -from sklearn.feature_extraction.text import strip_tags -from sklearn.feature_extraction.text import strip_accents_unicode -from sklearn.feature_extraction.text import strip_accents_ascii - -from sklearn.feature_extraction.text import HashingVectorizer -from sklearn.feature_extraction.text import CountVectorizer -from sklearn.feature_extraction.text import TfidfTransformer -from sklearn.feature_extraction.text import TfidfVectorizer - -from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS - -from sklearn.model_selection import train_test_split -from sklearn.model_selection import cross_val_score -from sklearn.model_selection import GridSearchCV +from sklearn.base import clone +from sklearn.feature_extraction.text import ( + ENGLISH_STOP_WORDS, + CountVectorizer, + HashingVectorizer, + TfidfTransformer, + TfidfVectorizer, + strip_accents_ascii, + strip_accents_unicode, + strip_tags, +) +from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split from sklearn.pipeline import Pipeline from sklearn.svm import LinearSVC - -from sklearn.base import clone - -import numpy as np -from numpy.testing import assert_array_almost_equal -from numpy.testing import assert_array_equal from sklearn.utils import IS_PYPY from sklearn.utils._testing import ( + assert_allclose_dense_sparse, assert_almost_equal, fails_if_pypy, - assert_allclose_dense_sparse, skip_if_32bit, ) -from collections import defaultdict -from functools import partial -import pickle -from io import StringIO JUNK_FOOD_DOCS = ( "the pizza pizza beer copyright", diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 3201e3a0d51bb..4b4b4396d1863 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -12,29 +12,26 @@ """ import array +import re +import unicodedata +import warnings from collections import defaultdict from collections.abc import Mapping from functools import partial from numbers import 
Integral from operator import itemgetter -import re -import unicodedata -import warnings import numpy as np import scipy.sparse as sp -from ..base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin -from ..base import _fit_context +from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin, _fit_context +from ..exceptions import NotFittedError from ..preprocessing import normalize +from ..utils import _IS_32BIT +from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions +from ..utils.validation import FLOAT_DTYPES, check_array, check_is_fitted from ._hash import FeatureHasher from ._stop_words import ENGLISH_STOP_WORDS -from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES -from ..utils import _IS_32BIT -from ..exceptions import NotFittedError -from ..utils._param_validation import StrOptions, Interval, HasMethods -from ..utils._param_validation import RealNotInt - __all__ = [ "HashingVectorizer", diff --git a/sklearn/feature_selection/__init__.py b/sklearn/feature_selection/__init__.py index ce5fbc10ee459..4fbc631155078 100644 --- a/sklearn/feature_selection/__init__.py +++ b/sklearn/feature_selection/__init__.py @@ -4,31 +4,25 @@ recursive feature elimination algorithm. """ -from ._univariate_selection import chi2 -from ._univariate_selection import f_classif -from ._univariate_selection import f_oneway -from ._univariate_selection import f_regression -from ._univariate_selection import r_regression -from ._univariate_selection import SelectPercentile -from ._univariate_selection import SelectKBest -from ._univariate_selection import SelectFpr -from ._univariate_selection import SelectFdr -from ._univariate_selection import SelectFwe -from ._univariate_selection import GenericUnivariateSelect - -from ._variance_threshold import VarianceThreshold - -from ._rfe import RFE -from ._rfe import RFECV - +from ._base import SelectorMixin from ._from_model import SelectFromModel - +from ._mutual_info import mutual_info_classif, mutual_info_regression +from ._rfe import RFE, RFECV from ._sequential import SequentialFeatureSelector - -from ._mutual_info import mutual_info_regression, mutual_info_classif - -from ._base import SelectorMixin - +from ._univariate_selection import ( + GenericUnivariateSelect, + SelectFdr, + SelectFpr, + SelectFwe, + SelectKBest, + SelectPercentile, + chi2, + f_classif, + f_oneway, + f_regression, + r_regression, +) +from ._variance_threshold import VarianceThreshold __all__ = [ "GenericUnivariateSelect", diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index 100af272038ad..9ede37c98c75b 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -8,16 +8,16 @@ from operator import attrgetter import numpy as np -from scipy.sparse import issparse, csc_matrix +from scipy.sparse import csc_matrix, issparse from ..base import TransformerMixin from ..utils import ( + _safe_indexing, check_array, safe_sqr, ) -from ..utils._tags import _safe_tags -from ..utils import _safe_indexing from ..utils._set_output import _get_output_config +from ..utils._tags import _safe_tags from ..utils.validation import _check_feature_names_in, check_is_fitted diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index 47f98d89e8abe..d3a287007bd49 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -2,20 +2,17 @@ # License: BSD 3 clause from copy import deepcopy - -import 
numpy as np from numbers import Integral, Real -from ._base import SelectorMixin -from ._base import _get_feature_importances -from ..base import BaseEstimator, clone, MetaEstimatorMixin -from ..base import _fit_context -from ..utils._tags import _safe_tags -from ..utils.validation import check_is_fitted, check_scalar, _num_features -from ..utils._param_validation import HasMethods, Interval, Options +import numpy as np +from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone from ..exceptions import NotFittedError +from ..utils._param_validation import HasMethods, Interval, Options +from ..utils._tags import _safe_tags from ..utils.metaestimators import available_if +from ..utils.validation import _num_features, check_is_fitted, check_scalar +from ._base import SelectorMixin, _get_feature_importances def _calculate_threshold(estimator, importances, threshold): diff --git a/sklearn/feature_selection/_mutual_info.py b/sklearn/feature_selection/_mutual_info.py index 9cacfc3890784..b3de388c0811a 100644 --- a/sklearn/feature_selection/_mutual_info.py +++ b/sklearn/feature_selection/_mutual_info.py @@ -1,18 +1,19 @@ # Author: Nikolay Mayorov # License: 3-clause BSD -import numpy as np from numbers import Integral + +import numpy as np from scipy.sparse import issparse from scipy.special import digamma from ..metrics.cluster import mutual_info_score -from ..neighbors import NearestNeighbors, KDTree +from ..neighbors import KDTree, NearestNeighbors from ..preprocessing import scale from ..utils import check_random_state -from ..utils.validation import check_array, check_X_y -from ..utils.multiclass import check_classification_targets from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.multiclass import check_classification_targets +from ..utils.validation import check_array, check_X_y def _compute_mi_cc(x, y, n_neighbors): diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 932d66449ae22..11cf083992653 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -6,28 +6,21 @@ """Recursive feature elimination for feature ranking""" -import numpy as np from numbers import Integral -from joblib import effective_n_jobs +import numpy as np +from joblib import effective_n_jobs -from ..utils.metaestimators import available_if -from ..utils.metaestimators import _safe_split -from ..utils._param_validation import HasMethods, Interval -from ..utils._param_validation import RealNotInt -from ..utils._tags import _safe_tags -from ..utils.validation import check_is_fitted -from ..utils.parallel import delayed, Parallel -from ..base import BaseEstimator -from ..base import MetaEstimatorMixin -from ..base import clone -from ..base import is_classifier -from ..base import _fit_context +from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier +from ..metrics import check_scoring from ..model_selection import check_cv from ..model_selection._validation import _score -from ..metrics import check_scoring -from ._base import SelectorMixin -from ._base import _get_feature_importances +from ..utils._param_validation import HasMethods, Interval, RealNotInt +from ..utils._tags import _safe_tags +from ..utils.metaestimators import _safe_split, available_if +from ..utils.parallel import Parallel, delayed +from ..utils.validation import check_is_fitted +from ._base import SelectorMixin, _get_feature_importances def _rfe_single_fit(rfe, estimator, X, y, train, test, 
scorer): diff --git a/sklearn/feature_selection/_sequential.py b/sklearn/feature_selection/_sequential.py index 0fbe91273053b..78a1c86df49de 100644 --- a/sklearn/feature_selection/_sequential.py +++ b/sklearn/feature_selection/_sequential.py @@ -5,15 +5,13 @@ import numpy as np -from ._base import SelectorMixin -from ..base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier -from ..base import _fit_context -from ..utils._param_validation import HasMethods, Interval, StrOptions -from ..utils._param_validation import RealNotInt +from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier +from ..metrics import get_scorer_names +from ..model_selection import check_cv, cross_val_score +from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions from ..utils._tags import _safe_tags from ..utils.validation import check_is_fitted -from ..model_selection import cross_val_score, check_cv -from ..metrics import get_scorer_names +from ._base import SelectorMixin class SequentialFeatureSelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator): diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index f4355c39f88cd..fc1fcbc01a151 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -5,20 +5,19 @@ # License: BSD 3 clause -import numpy as np import warnings - from numbers import Integral, Real + +import numpy as np from scipy import special, stats from scipy.sparse import issparse -from ..base import BaseEstimator -from ..base import _fit_context +from ..base import BaseEstimator, _fit_context from ..preprocessing import LabelBinarizer -from ..utils import as_float_array, check_array, check_X_y, safe_sqr, safe_mask -from ..utils.extmath import safe_sparse_dot, row_norms -from ..utils.validation import check_is_fitted +from ..utils import as_float_array, check_array, check_X_y, safe_mask, safe_sqr from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.extmath import row_norms, safe_sparse_dot +from ..utils.validation import check_is_fitted from ._base import SelectorMixin diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index 073a22c6ad92b..f97c75db1e34b 100644 --- a/sklearn/feature_selection/_variance_threshold.py +++ b/sklearn/feature_selection/_variance_threshold.py @@ -3,12 +3,12 @@ from numbers import Real import numpy as np -from ..base import BaseEstimator -from ..base import _fit_context -from ._base import SelectorMixin + +from ..base import BaseEstimator, _fit_context +from ..utils._param_validation import Interval from ..utils.sparsefuncs import mean_variance_axis, min_max_axis from ..utils.validation import check_is_fitted -from ..utils._param_validation import Interval +from ._base import SelectorMixin class VarianceThreshold(SelectorMixin, BaseEstimator): diff --git a/sklearn/feature_selection/tests/test_base.py b/sklearn/feature_selection/tests/test_base.py index 9869a1c03e677..bf883797ddabd 100644 --- a/sklearn/feature_selection/tests/test_base.py +++ b/sklearn/feature_selection/tests/test_base.py @@ -1,8 +1,7 @@ import numpy as np import pytest -from scipy import sparse as sp - from numpy.testing import assert_array_equal +from scipy import sparse as sp from sklearn.base import BaseEstimator from sklearn.feature_selection._base import SelectorMixin diff --git 
a/sklearn/feature_selection/tests/test_chi2.py b/sklearn/feature_selection/tests/test_chi2.py index d7d830459e455..4fdc652a998a9 100644 --- a/sklearn/feature_selection/tests/test_chi2.py +++ b/sklearn/feature_selection/tests/test_chi2.py @@ -7,13 +7,12 @@ import numpy as np import pytest -from scipy.sparse import coo_matrix, csr_matrix import scipy.stats +from scipy.sparse import coo_matrix, csr_matrix from sklearn.feature_selection import SelectKBest, chi2 from sklearn.feature_selection._univariate_selection import _chisquare -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_array_equal +from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal # Feature 0 is highly informative for class 1; # feature 1 is the same everywhere; diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index ff51243bb1378..b182aca270e06 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -3,35 +3,36 @@ """ import itertools import warnings -import numpy as np -from numpy.testing import assert_allclose -from scipy import stats, sparse +import numpy as np import pytest +from numpy.testing import assert_allclose +from scipy import sparse, stats -from sklearn.utils._testing import assert_almost_equal, _convert_container -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import ignore_warnings -from sklearn.utils import safe_mask - -from sklearn.datasets import make_classification, make_regression, load_iris +from sklearn.datasets import load_iris, make_classification, make_regression from sklearn.feature_selection import ( + GenericUnivariateSelect, + SelectFdr, + SelectFpr, + SelectFwe, + SelectKBest, + SelectPercentile, chi2, f_classif, f_oneway, f_regression, - GenericUnivariateSelect, mutual_info_classif, mutual_info_regression, r_regression, - SelectPercentile, - SelectKBest, - SelectFpr, - SelectFdr, - SelectFwe, ) - +from sklearn.utils import safe_mask +from sklearn.utils._testing import ( + _convert_container, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) ############################################################################## # Test the score functions diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index 7b408201bc7f5..aa802136c2f39 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -1,34 +1,36 @@ import re -import pytest -import numpy as np import warnings from unittest.mock import Mock -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import skip_if_32bit -from sklearn.utils._testing import MinimalClassifier +import numpy as np +import pytest from sklearn import datasets +from sklearn.base import BaseEstimator from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression from sklearn.datasets import make_friedman1 +from sklearn.decomposition import PCA +from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier from sklearn.exceptions import NotFittedError +from sklearn.feature_selection import SelectFromModel 
from sklearn.linear_model import ( - LogisticRegression, - SGDClassifier, - Lasso, - LassoCV, ElasticNet, ElasticNetCV, + Lasso, + LassoCV, + LogisticRegression, + PassiveAggressiveClassifier, + SGDClassifier, ) -from sklearn.svm import LinearSVC -from sklearn.feature_selection import SelectFromModel -from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier -from sklearn.linear_model import PassiveAggressiveClassifier -from sklearn.base import BaseEstimator from sklearn.pipeline import make_pipeline -from sklearn.decomposition import PCA +from sklearn.svm import LinearSVC +from sklearn.utils._testing import ( + MinimalClassifier, + assert_allclose, + assert_array_almost_equal, + assert_array_equal, + skip_if_32bit, +) class NaNTag(BaseEstimator): diff --git a/sklearn/feature_selection/tests/test_mutual_info.py b/sklearn/feature_selection/tests/test_mutual_info.py index f39e4a5738b21..f7b4af0a393f9 100644 --- a/sklearn/feature_selection/tests/test_mutual_info.py +++ b/sklearn/feature_selection/tests/test_mutual_info.py @@ -2,13 +2,13 @@ import pytest from scipy.sparse import csr_matrix +from sklearn.feature_selection import mutual_info_classif, mutual_info_regression +from sklearn.feature_selection._mutual_info import _compute_mi from sklearn.utils import check_random_state from sklearn.utils._testing import ( - assert_array_equal, assert_allclose, + assert_array_equal, ) -from sklearn.feature_selection._mutual_info import _compute_mi -from sklearn.feature_selection import mutual_info_regression, mutual_info_classif def test_compute_mi_dd(): diff --git a/sklearn/feature_selection/tests/test_rfe.py b/sklearn/feature_selection/tests/test_rfe.py index fa7aeea19be6c..0f141f3461d7f 100644 --- a/sklearn/feature_selection/tests/test_rfe.py +++ b/sklearn/feature_selection/tests/test_rfe.py @@ -4,31 +4,26 @@ from operator import attrgetter -import pytest import numpy as np -from numpy.testing import assert_array_almost_equal, assert_array_equal, assert_allclose +import pytest +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal from scipy import sparse from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.cross_decomposition import PLSCanonical, PLSRegression, CCA -from sklearn.feature_selection import RFE, RFECV +from sklearn.compose import TransformedTargetRegressor +from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression from sklearn.datasets import load_iris, make_friedman1 -from sklearn.metrics import zero_one_loss -from sklearn.svm import SVC, SVR, LinearSVR -from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import cross_val_score -from sklearn.model_selection import GroupKFold -from sklearn.compose import TransformedTargetRegressor +from sklearn.feature_selection import RFE, RFECV +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import get_scorer, make_scorer, zero_one_loss +from sklearn.model_selection import GroupKFold, cross_val_score from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler - +from sklearn.svm import SVC, SVR, LinearSVR from sklearn.utils import check_random_state from sklearn.utils._testing import ignore_warnings -from sklearn.metrics import make_scorer -from sklearn.metrics import get_scorer - class MockClassifier: """ @@ -278,8 +273,8 @@ def test_rfecv_mockclassifier(): def test_rfecv_verbose_output(): # Check verbose=1 is producing 
an output. - from io import StringIO import sys + from io import StringIO sys.stdout = StringIO() diff --git a/sklearn/feature_selection/tests/test_sequential.py b/sklearn/feature_selection/tests/test_sequential.py index a1ea1d4677dd4..a515bf22cdda3 100644 --- a/sklearn/feature_selection/tests/test_sequential.py +++ b/sklearn/feature_selection/tests/test_sequential.py @@ -1,17 +1,17 @@ +import numpy as np import pytest import scipy -import numpy as np from numpy.testing import assert_array_equal -from sklearn.preprocessing import StandardScaler -from sklearn.pipeline import make_pipeline +from sklearn.cluster import KMeans +from sklearn.datasets import make_blobs, make_classification, make_regression +from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.feature_selection import SequentialFeatureSelector -from sklearn.datasets import make_regression, make_blobs, make_classification from sklearn.linear_model import LinearRegression -from sklearn.ensemble import HistGradientBoostingRegressor -from sklearn.model_selection import cross_val_score, LeaveOneGroupOut -from sklearn.cluster import KMeans +from sklearn.model_selection import LeaveOneGroupOut, cross_val_score from sklearn.neighbors import KNeighborsClassifier +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler def test_bad_n_features_to_select(): diff --git a/sklearn/feature_selection/tests/test_variance_threshold.py b/sklearn/feature_selection/tests/test_variance_threshold.py index 4bce46556a666..190d016952980 100644 --- a/sklearn/feature_selection/tests/test_variance_threshold.py +++ b/sklearn/feature_selection/tests/test_variance_threshold.py @@ -1,11 +1,9 @@ import numpy as np import pytest - -from sklearn.utils._testing import assert_array_equal - from scipy.sparse import bsr_matrix, csc_matrix, csr_matrix from sklearn.feature_selection import VarianceThreshold +from sklearn.utils._testing import assert_array_equal data = [[0, 1, 2, 3, 4], [0, 2, 2, 3, 5], [1, 1, 2, 4, 0]] diff --git a/sklearn/gaussian_process/__init__.py b/sklearn/gaussian_process/__init__.py index 719208b7951be..bc0d902b45b18 100644 --- a/sklearn/gaussian_process/__init__.py +++ b/sklearn/gaussian_process/__init__.py @@ -8,9 +8,8 @@ based regression and classification. """ -from ._gpr import GaussianProcessRegressor -from ._gpc import GaussianProcessClassifier from . 
import kernels - +from ._gpc import GaussianProcessClassifier +from ._gpr import GaussianProcessRegressor __all__ = ["GaussianProcessRegressor", "GaussianProcessClassifier", "kernels"] diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index 50a8739372972..013815795a853 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -8,20 +8,19 @@ from operator import itemgetter import numpy as np -from scipy.linalg import cholesky, cho_solve, solve import scipy.optimize +from scipy.linalg import cho_solve, cholesky, solve from scipy.special import erf, expit -from ..base import BaseEstimator, ClassifierMixin, clone -from ..base import _fit_context -from .kernels import Kernel, RBF, CompoundKernel, ConstantKernel as C -from ..utils.validation import check_is_fitted +from ..base import BaseEstimator, ClassifierMixin, _fit_context, clone +from ..multiclass import OneVsOneClassifier, OneVsRestClassifier +from ..preprocessing import LabelEncoder from ..utils import check_random_state -from ..utils.optimize import _check_optimize_result from ..utils._param_validation import Interval, StrOptions -from ..preprocessing import LabelEncoder -from ..multiclass import OneVsRestClassifier, OneVsOneClassifier - +from ..utils.optimize import _check_optimize_result +from ..utils.validation import check_is_fitted +from .kernels import RBF, CompoundKernel, Kernel +from .kernels import ConstantKernel as C # Values required for approximating the logistic sigmoid by # error functions. coefs are obtained via: diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index 49fcab40c25f8..90bbe7e446917 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -9,17 +9,16 @@ from operator import itemgetter import numpy as np -from scipy.linalg import cholesky, cho_solve, solve_triangular import scipy.optimize +from scipy.linalg import cho_solve, cholesky, solve_triangular -from ..base import BaseEstimator, RegressorMixin, clone -from ..base import MultiOutputMixin -from ..base import _fit_context -from .kernels import Kernel, RBF, ConstantKernel as C +from ..base import BaseEstimator, MultiOutputMixin, RegressorMixin, _fit_context, clone from ..preprocessing._data import _handle_zeros_in_scale from ..utils import check_random_state -from ..utils.optimize import _check_optimize_result from ..utils._param_validation import Interval, StrOptions +from ..utils.optimize import _check_optimize_result +from .kernels import RBF, Kernel +from .kernels import ConstantKernel as C GPR_CHOLESKY_LOWER = True diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py index 1e0866afb6a4d..95db7b13c33ff 100644 --- a/sklearn/gaussian_process/kernels.py +++ b/sklearn/gaussian_process/kernels.py @@ -19,21 +19,20 @@ # Note: this module is strongly inspired by the kernel module of the george # package. 
+import math +import warnings from abc import ABCMeta, abstractmethod from collections import namedtuple -import math from inspect import signature import numpy as np -from scipy.special import kv, gamma -from scipy.spatial.distance import pdist, cdist, squareform +from scipy.spatial.distance import cdist, pdist, squareform +from scipy.special import gamma, kv -from ..metrics.pairwise import pairwise_kernels from ..base import clone -from ..utils.validation import _num_samples from ..exceptions import ConvergenceWarning - -import warnings +from ..metrics.pairwise import pairwise_kernels +from ..utils.validation import _num_samples def _check_length_scale(X, length_scale): diff --git a/sklearn/gaussian_process/tests/_mini_sequence_kernel.py b/sklearn/gaussian_process/tests/_mini_sequence_kernel.py index ad81890680168..4667329aff9b8 100644 --- a/sklearn/gaussian_process/tests/_mini_sequence_kernel.py +++ b/sklearn/gaussian_process/tests/_mini_sequence_kernel.py @@ -1,8 +1,12 @@ -from sklearn.gaussian_process.kernels import Kernel, Hyperparameter -from sklearn.gaussian_process.kernels import GenericKernelMixin -from sklearn.gaussian_process.kernels import StationaryKernelMixin import numpy as np + from sklearn.base import clone +from sklearn.gaussian_process.kernels import ( + GenericKernelMixin, + Hyperparameter, + Kernel, + StationaryKernelMixin, +) class MiniSeqKernel(GenericKernelMixin, StationaryKernelMixin, Kernel): diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py index aefdb2e8ff0e2..842159f13ac04 100644 --- a/sklearn/gaussian_process/tests/test_gpc.py +++ b/sklearn/gaussian_process/tests/test_gpc.py @@ -4,22 +4,22 @@ # License: BSD 3 clause import warnings -import numpy as np - -from scipy.optimize import approx_fprime +import numpy as np import pytest +from scipy.optimize import approx_fprime +from sklearn.exceptions import ConvergenceWarning from sklearn.gaussian_process import GaussianProcessClassifier from sklearn.gaussian_process.kernels import ( RBF, CompoundKernel, - ConstantKernel as C, WhiteKernel, ) +from sklearn.gaussian_process.kernels import ( + ConstantKernel as C, +) from sklearn.gaussian_process.tests._mini_sequence_kernel import MiniSeqKernel -from sklearn.exceptions import ConvergenceWarning - from sklearn.utils._testing import assert_almost_equal, assert_array_equal diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py index 2de35d4659ce6..d890dc05d9f02 100644 --- a/sklearn/gaussian_process/tests/test_gpr.py +++ b/sklearn/gaussian_process/tests/test_gpr.py @@ -4,29 +4,31 @@ # Modified by: Pete Green # License: BSD 3 clause -import warnings -import sys import re -import numpy as np - -from scipy.optimize import approx_fprime +import sys +import warnings +import numpy as np import pytest +from scipy.optimize import approx_fprime +from sklearn.exceptions import ConvergenceWarning from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels import ( RBF, - ConstantKernel as C, + DotProduct, + ExpSineSquared, WhiteKernel, ) -from sklearn.gaussian_process.kernels import DotProduct, ExpSineSquared +from sklearn.gaussian_process.kernels import ( + ConstantKernel as C, +) from sklearn.gaussian_process.tests._mini_sequence_kernel import MiniSeqKernel -from sklearn.exceptions import ConvergenceWarning from sklearn.utils._testing import ( - assert_array_less, + assert_allclose, assert_almost_equal, assert_array_almost_equal, - 
assert_allclose, + assert_array_less, ) diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py index 56ab9c8b6c2bf..8733f94c94e06 100644 --- a/sklearn/gaussian_process/tests/test_kernels.py +++ b/sklearn/gaussian_process/tests/test_kernels.py @@ -3,40 +3,38 @@ # Author: Jan Hendrik Metzen # License: BSD 3 clause -import pytest -import numpy as np from inspect import signature -from sklearn.gaussian_process.kernels import _approx_fprime +import numpy as np +import pytest -from sklearn.metrics.pairwise import ( - PAIRWISE_KERNEL_FUNCTIONS, - euclidean_distances, - pairwise_kernels, -) +from sklearn.base import clone from sklearn.gaussian_process.kernels import ( RBF, + CompoundKernel, + ConstantKernel, + DotProduct, + Exponentiation, + ExpSineSquared, + KernelOperator, Matern, + PairwiseKernel, RationalQuadratic, - ExpSineSquared, - DotProduct, - ConstantKernel, WhiteKernel, - PairwiseKernel, - KernelOperator, - Exponentiation, - CompoundKernel, + _approx_fprime, +) +from sklearn.metrics.pairwise import ( + PAIRWISE_KERNEL_FUNCTIONS, + euclidean_distances, + pairwise_kernels, ) -from sklearn.base import clone - from sklearn.utils._testing import ( + assert_allclose, assert_almost_equal, - assert_array_equal, assert_array_almost_equal, - assert_allclose, + assert_array_equal, ) - X = np.random.RandomState(0).normal(0, 1, (5, 2)) Y = np.random.RandomState(0).normal(0, 1, (6, 2)) diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 37fc43731514a..9245a107adf4f 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -10,17 +10,13 @@ import numpy.ma as ma from scipy import sparse as sp -from ..base import BaseEstimator, TransformerMixin -from ..base import _fit_context -from ..utils._param_validation import StrOptions, MissingValues +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils import _is_pandas_na, is_scalar_nan +from ..utils._mask import _get_mask +from ..utils._param_validation import MissingValues, StrOptions from ..utils.fixes import _mode from ..utils.sparsefuncs import _get_median -from ..utils.validation import check_is_fitted -from ..utils.validation import FLOAT_DTYPES -from ..utils.validation import _check_feature_names_in -from ..utils._mask import _get_mask -from ..utils import _is_pandas_na -from ..utils import is_scalar_nan +from ..utils.validation import FLOAT_DTYPES, _check_feature_names_in, check_is_fitted def _check_inputs_dtype(X, missing_values): diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index f977e5bc23e6c..a0087a5a10d55 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -1,31 +1,25 @@ -from time import time +import warnings from collections import namedtuple from numbers import Integral, Real -import warnings +from time import time -from scipy import stats import numpy as np +from scipy import stats -from ..base import clone -from ..base import _fit_context +from ..base import _fit_context, clone from ..exceptions import ConvergenceWarning from ..preprocessing import normalize from ..utils import ( + _safe_assign, + _safe_indexing, check_array, check_random_state, is_scalar_nan, - _safe_assign, - _safe_indexing, ) -from ..utils.validation import FLOAT_DTYPES, check_is_fitted -from ..utils.validation import _check_feature_names_in from ..utils._mask import _get_mask from ..utils._param_validation import HasMethods, Interval, StrOptions - -from ._base import _BaseImputer -from ._base import 
SimpleImputer -from ._base import _check_inputs_dtype - +from ..utils.validation import FLOAT_DTYPES, _check_feature_names_in, check_is_fitted +from ._base import SimpleImputer, _BaseImputer, _check_inputs_dtype _ImputerTriplet = namedtuple( "_ImputerTriplet", ["feat_idx", "neighbor_feat_idx", "estimator"] diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index 915f8cbdb3fcb..db0da278b39ef 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -3,19 +3,18 @@ # License: BSD 3 clause from numbers import Integral + import numpy as np -from ._base import _BaseImputer from ..base import _fit_context -from ..utils.validation import FLOAT_DTYPES from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import _NAN_METRICS from ..neighbors._base import _get_weights from ..utils import is_scalar_nan from ..utils._mask import _get_mask -from ..utils.validation import check_is_fitted -from ..utils.validation import _check_feature_names_in from ..utils._param_validation import Hidden, Interval, StrOptions +from ..utils.validation import FLOAT_DTYPES, _check_feature_names_in, check_is_fitted +from ._base import _BaseImputer class KNNImputer(_BaseImputer): diff --git a/sklearn/impute/tests/test_base.py b/sklearn/impute/tests/test_base.py index fedfdebb20a1f..0c1bd83f7ca9e 100644 --- a/sklearn/impute/tests/test_base.py +++ b/sklearn/impute/tests/test_base.py @@ -1,12 +1,10 @@ -import pytest - import numpy as np - -from sklearn.utils._mask import _get_mask -from sklearn.utils._testing import _convert_container, assert_allclose +import pytest from sklearn.impute._base import _BaseImputer from sklearn.impute._iterative import _assign_where +from sklearn.utils._mask import _get_mask +from sklearn.utils._testing import _convert_container, assert_allclose @pytest.fixture diff --git a/sklearn/impute/tests/test_common.py b/sklearn/impute/tests/test_common.py index 00521ca090dc5..aad7eb12a0a92 100644 --- a/sklearn/impute/tests/test_common.py +++ b/sklearn/impute/tests/test_common.py @@ -1,17 +1,14 @@ -import pytest - import numpy as np +import pytest from scipy import sparse -from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_allclose_dense_sparse -from sklearn.utils._testing import assert_array_equal - from sklearn.experimental import enable_iterative_imputer # noqa - -from sklearn.impute import IterativeImputer -from sklearn.impute import KNNImputer -from sklearn.impute import SimpleImputer +from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer +from sklearn.utils._testing import ( + assert_allclose, + assert_allclose_dense_sparse, + assert_array_equal, +) def imputers(): diff --git a/sklearn/impute/tests/test_impute.py b/sklearn/impute/tests/test_impute.py index 24b070d21ef06..936847e55e324 100644 --- a/sklearn/impute/tests/test_impute.py +++ b/sklearn/impute/tests/test_impute.py @@ -1,33 +1,31 @@ -import pytest +import io import warnings import numpy as np +import pytest from scipy import sparse from scipy.stats import kstest -import io - -from sklearn.utils._testing import _convert_container -from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_allclose_dense_sparse -from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal +from sklearn import tree +from sklearn.datasets import load_diabetes +from sklearn.dummy import DummyRegressor +from sklearn.exceptions import ConvergenceWarning # make IterativeImputer 
available from sklearn.experimental import enable_iterative_imputer # noqa - -from sklearn.datasets import load_diabetes -from sklearn.impute import MissingIndicator -from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer -from sklearn.dummy import DummyRegressor -from sklearn.linear_model import BayesianRidge, ARDRegression, RidgeCV -from sklearn.pipeline import Pipeline -from sklearn.pipeline import make_union +from sklearn.impute import IterativeImputer, KNNImputer, MissingIndicator, SimpleImputer +from sklearn.impute._base import _most_frequent +from sklearn.linear_model import ARDRegression, BayesianRidge, RidgeCV from sklearn.model_selection import GridSearchCV -from sklearn import tree +from sklearn.pipeline import Pipeline, make_union from sklearn.random_projection import _sparse_random_matrix -from sklearn.exceptions import ConvergenceWarning -from sklearn.impute._base import _most_frequent +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_allclose_dense_sparse, + assert_array_almost_equal, + assert_array_equal, +) def _assert_array_equal_and_same_dtype(x, y): diff --git a/sklearn/impute/tests/test_knn.py b/sklearn/impute/tests/test_knn.py index 80ee1d0c2b574..141c2ea90dbd9 100644 --- a/sklearn/impute/tests/test_knn.py +++ b/sklearn/impute/tests/test_knn.py @@ -3,8 +3,7 @@ from sklearn import config_context from sklearn.impute import KNNImputer -from sklearn.metrics.pairwise import nan_euclidean_distances -from sklearn.metrics.pairwise import pairwise_distances +from sklearn.metrics.pairwise import nan_euclidean_distances, pairwise_distances from sklearn.neighbors import KNeighborsRegressor from sklearn.utils._testing import assert_allclose diff --git a/sklearn/inspection/__init__.py b/sklearn/inspection/__init__.py index f73ffe8cff26f..f8e08785e8358 100644 --- a/sklearn/inspection/__init__.py +++ b/sklearn/inspection/__init__.py @@ -1,13 +1,11 @@ """The :mod:`sklearn.inspection` module includes tools for model inspection.""" +from ._partial_dependence import partial_dependence from ._permutation_importance import permutation_importance from ._plot.decision_boundary import DecisionBoundaryDisplay - -from ._partial_dependence import partial_dependence from ._plot.partial_dependence import PartialDependenceDisplay - __all__ = [ "partial_dependence", "permutation_importance", diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index e3af7dda1e505..59a9212aff440 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -11,18 +11,23 @@ from scipy import sparse from scipy.stats.mstats import mquantiles -from ._pd_utils import _check_feature_names, _get_feature_index from ..base import is_classifier, is_regressor -from ..utils.extmath import cartesian -from ..utils import check_array -from ..utils import check_matplotlib_support # noqa -from ..utils import _safe_indexing -from ..utils import _safe_assign -from ..utils import _determine_key_type -from ..utils import _get_column_indices -from ..utils.validation import _check_sample_weight -from ..utils.validation import check_is_fitted -from ..utils import Bunch +from ..ensemble import RandomForestRegressor +from ..ensemble._gb import BaseGradientBoosting +from ..ensemble._hist_gradient_boosting.gradient_boosting import ( + BaseHistGradientBoosting, +) +from ..exceptions import NotFittedError +from ..tree import DecisionTreeRegressor +from ..utils import ( + Bunch, + _determine_key_type, + 
_get_column_indices, + _safe_assign, + _safe_indexing, + check_array, + check_matplotlib_support, # noqa +) from ..utils._param_validation import ( HasMethods, Integral, @@ -30,14 +35,9 @@ StrOptions, validate_params, ) -from ..tree import DecisionTreeRegressor -from ..ensemble import RandomForestRegressor -from ..exceptions import NotFittedError -from ..ensemble._gb import BaseGradientBoosting -from ..ensemble._hist_gradient_boosting.gradient_boosting import ( - BaseHistGradientBoosting, -) - +from ..utils.extmath import cartesian +from ..utils.validation import _check_sample_weight, check_is_fitted +from ._pd_utils import _check_feature_names, _get_feature_index __all__ = [ "partial_dependence", diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py index 9330589a04794..f8e1fba2967c5 100644 --- a/sklearn/inspection/_permutation_importance.py +++ b/sklearn/inspection/_permutation_importance.py @@ -1,15 +1,13 @@ """Permutation importance for estimators.""" import numbers + import numpy as np from ..ensemble._bagging import _generate_indices from ..metrics import check_scoring, get_scorer_names from ..metrics._scorer import _check_multimetric_scoring, _MultimetricScorer from ..model_selection._validation import _aggregate_score_dicts -from ..utils import Bunch, _safe_indexing -from ..utils import check_random_state -from ..utils import check_array -from ..utils.parallel import delayed, Parallel +from ..utils import Bunch, _safe_indexing, check_array, check_random_state from ..utils._param_validation import ( HasMethods, Integral, @@ -18,6 +16,7 @@ StrOptions, validate_params, ) +from ..utils.parallel import Parallel, delayed def _weights_scorer(scorer, estimator, X, y, sample_weight): diff --git a/sklearn/inspection/_plot/decision_boundary.py b/sklearn/inspection/_plot/decision_boundary.py index 22b4590d9bc3c..e588edbef7626 100644 --- a/sklearn/inspection/_plot/decision_boundary.py +++ b/sklearn/inspection/_plot/decision_boundary.py @@ -2,14 +2,13 @@ import numpy as np -from ...preprocessing import LabelEncoder -from ...utils import check_matplotlib_support -from ...utils import _safe_indexing from ...base import is_regressor +from ...preprocessing import LabelEncoder +from ...utils import _safe_indexing, check_matplotlib_support from ...utils.validation import ( - check_is_fitted, _is_arraylike_not_scalar, _num_features, + check_is_fitted, ) diff --git a/sklearn/inspection/_plot/partial_dependence.py b/sklearn/inspection/_plot/partial_dependence.py index 48e151cefedbe..46d2c78d78d2e 100644 --- a/sklearn/inspection/_plot/partial_dependence.py +++ b/sklearn/inspection/_plot/partial_dependence.py @@ -6,16 +6,18 @@ from scipy import sparse from scipy.stats.mstats import mquantiles -from .. import partial_dependence -from .._pd_utils import _check_feature_names, _get_feature_index from ...base import is_regressor -from ...utils import Bunch -from ...utils import check_array -from ...utils import check_matplotlib_support # noqa -from ...utils import check_random_state -from ...utils import _safe_indexing -from ...utils.parallel import delayed, Parallel +from ...utils import ( + Bunch, + _safe_indexing, + check_array, + check_matplotlib_support, # noqa + check_random_state, +) from ...utils._encode import _unique +from ...utils.parallel import Parallel, delayed +from .. 
import partial_dependence +from .._pd_utils import _check_feature_names, _get_feature_index class PartialDependenceDisplay: diff --git a/sklearn/inspection/_plot/tests/test_boundary_decision_display.py b/sklearn/inspection/_plot/tests/test_boundary_decision_display.py index 73cfe187d7f6e..47c21e4521c35 100644 --- a/sklearn/inspection/_plot/tests/test_boundary_decision_display.py +++ b/sklearn/inspection/_plot/tests/test_boundary_decision_display.py @@ -1,21 +1,19 @@ import warnings -import pytest import numpy as np +import pytest from numpy.testing import assert_allclose -from sklearn.base import BaseEstimator -from sklearn.base import ClassifierMixin -from sklearn.datasets import make_classification -from sklearn.linear_model import LogisticRegression -from sklearn.datasets import load_iris -from sklearn.datasets import make_multilabel_classification -from sklearn.tree import DecisionTreeRegressor -from sklearn.tree import DecisionTreeClassifier - +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.datasets import ( + load_iris, + make_classification, + make_multilabel_classification, +) from sklearn.inspection import DecisionBoundaryDisplay from sklearn.inspection._plot.decision_boundary import _check_boundary_response_method - +from sklearn.linear_model import LogisticRegression +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor # TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved pytestmark = pytest.mark.filterwarnings( diff --git a/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py b/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py index acda2d001144e..106819b5a25d5 100644 --- a/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py +++ b/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py @@ -1,22 +1,21 @@ import numpy as np -from scipy.stats.mstats import mquantiles - import pytest from numpy.testing import assert_allclose +from scipy.stats.mstats import mquantiles -from sklearn.datasets import load_diabetes -from sklearn.datasets import load_iris -from sklearn.datasets import make_classification, make_regression -from sklearn.ensemble import GradientBoostingRegressor -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.linear_model import LinearRegression -from sklearn.utils._testing import _convert_container from sklearn.compose import make_column_transformer -from sklearn.preprocessing import OneHotEncoder -from sklearn.pipeline import make_pipeline - +from sklearn.datasets import ( + load_diabetes, + load_iris, + make_classification, + make_regression, +) +from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor from sklearn.inspection import PartialDependenceDisplay - +from sklearn.linear_model import LinearRegression +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import OneHotEncoder +from sklearn.utils._testing import _convert_container # TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved pytestmark = pytest.mark.filterwarnings( diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 4e93985f4d02a..0336dc4b827fe 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -7,41 +7,39 @@ import pytest import sklearn +from sklearn.base import BaseEstimator, ClassifierMixin, clone, is_regressor +from sklearn.cluster import KMeans +from sklearn.compose 
import make_column_transformer +from sklearn.datasets import load_iris, make_classification, make_regression +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import ( + GradientBoostingClassifier, + GradientBoostingRegressor, + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, + RandomForestRegressor, +) +from sklearn.exceptions import NotFittedError from sklearn.inspection import partial_dependence from sklearn.inspection._partial_dependence import ( _grid_from_X, _partial_dependence_brute, _partial_dependence_recursion, ) -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.ensemble import GradientBoostingRegressor -from sklearn.ensemble import RandomForestRegressor -from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.ensemble import HistGradientBoostingRegressor -from sklearn.linear_model import LinearRegression -from sklearn.linear_model import LogisticRegression -from sklearn.linear_model import MultiTaskLasso -from sklearn.tree import DecisionTreeRegressor -from sklearn.datasets import load_iris -from sklearn.datasets import make_classification, make_regression -from sklearn.cluster import KMeans -from sklearn.compose import make_column_transformer +from sklearn.linear_model import LinearRegression, LogisticRegression, MultiTaskLasso from sklearn.metrics import r2_score -from sklearn.preprocessing import PolynomialFeatures -from sklearn.preprocessing import StandardScaler -from sklearn.preprocessing import RobustScaler -from sklearn.preprocessing import scale from sklearn.pipeline import make_pipeline -from sklearn.dummy import DummyClassifier -from sklearn.base import BaseEstimator, ClassifierMixin, clone -from sklearn.base import is_regressor -from sklearn.exceptions import NotFittedError -from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_array_equal +from sklearn.preprocessing import ( + PolynomialFeatures, + RobustScaler, + StandardScaler, + scale, +) +from sklearn.tree import DecisionTreeRegressor +from sklearn.tree.tests.test_tree import assert_is_subtree from sklearn.utils import _IS_32BIT +from sklearn.utils._testing import assert_allclose, assert_array_equal from sklearn.utils.validation import check_random_state -from sklearn.tree.tests.test_tree import assert_is_subtree - # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] diff --git a/sklearn/inspection/tests/test_pd_utils.py b/sklearn/inspection/tests/test_pd_utils.py index 5f461ad498f5b..5dea3834a77a7 100644 --- a/sklearn/inspection/tests/test_pd_utils.py +++ b/sklearn/inspection/tests/test_pd_utils.py @@ -1,9 +1,8 @@ import numpy as np import pytest -from sklearn.utils._testing import _convert_container - from sklearn.inspection._pd_utils import _check_feature_names, _get_feature_index +from sklearn.utils._testing import _convert_container @pytest.mark.parametrize( diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py index 307d17188e852..b1a680646afe1 100644 --- a/sklearn/inspection/tests/test_permutation_importance.py +++ b/sklearn/inspection/tests/test_permutation_importance.py @@ -1,31 +1,27 @@ -import pytest import numpy as np - +import pytest from numpy.testing import assert_allclose from sklearn.compose import ColumnTransformer -from sklearn.datasets import load_diabetes -from sklearn.datasets import load_iris -from sklearn.datasets import make_classification -from sklearn.datasets import make_regression 
+from sklearn.datasets import ( + load_diabetes, + load_iris, + make_classification, + make_regression, +) from sklearn.dummy import DummyClassifier -from sklearn.ensemble import RandomForestRegressor -from sklearn.ensemble import RandomForestClassifier -from sklearn.linear_model import LinearRegression -from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.impute import SimpleImputer from sklearn.inspection import permutation_importance -from sklearn.model_selection import train_test_split +from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.metrics import ( get_scorer, mean_squared_error, r2_score, ) +from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import KBinsDiscretizer -from sklearn.preprocessing import OneHotEncoder -from sklearn.preprocessing import StandardScaler -from sklearn.preprocessing import scale +from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, StandardScaler, scale from sklearn.utils import parallel_backend from sklearn.utils._testing import _convert_container diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index a1cf95b95591b..4e5f7f7b0034f 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -3,20 +3,19 @@ # Nelle Varoquaux # License: BSD 3 clause +import math +import warnings +from numbers import Real + import numpy as np from scipy import interpolate from scipy.stats import spearmanr -from numbers import Real -import warnings -import math -from .base import BaseEstimator, TransformerMixin, RegressorMixin -from .base import _fit_context +from ._isotonic import _inplace_contiguous_isotonic_regression, _make_unique +from .base import BaseEstimator, RegressorMixin, TransformerMixin, _fit_context from .utils import check_array, check_consistent_length -from .utils.validation import _check_sample_weight, check_is_fitted from .utils._param_validation import Interval, StrOptions -from ._isotonic import _inplace_contiguous_isotonic_regression, _make_unique - +from .utils.validation import _check_sample_weight, check_is_fitted __all__ = ["check_increasing", "isotonic_regression", "IsotonicRegression"] diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 7f190a2b66823..11e6e91236437 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -8,8 +8,8 @@ # License: BSD 3 clause -from numbers import Integral, Real import warnings +from numbers import Integral, Real import numpy as np import scipy.sparse as sp @@ -20,20 +20,21 @@ except ImportError: # scipy < 1.4 from scipy.fftpack import fft, ifft -from .base import BaseEstimator -from .base import TransformerMixin -from .base import ClassNamePrefixFeaturesOutMixin -from .base import _fit_context -from .utils import check_random_state -from .utils import deprecated +from .base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from .metrics.pairwise import KERNEL_PARAMS, PAIRWISE_KERNEL_FUNCTIONS, pairwise_kernels +from .utils import check_random_state, deprecated +from .utils._param_validation import Interval, StrOptions from .utils.extmath import safe_sparse_dot -from .utils.validation import check_is_fitted -from .utils.validation import _check_feature_names_in -from .metrics.pairwise import pairwise_kernels, KERNEL_PARAMS -from .utils.validation import check_non_negative -from 
.utils._param_validation import Interval -from .utils._param_validation import StrOptions -from .metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS +from .utils.validation import ( + _check_feature_names_in, + check_is_fitted, + check_non_negative, +) class PolynomialCountSketch( diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index a7bfeefaef651..f418c8946510d 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -7,12 +7,11 @@ import numpy as np -from .base import BaseEstimator, RegressorMixin, MultiOutputMixin -from .base import _fit_context -from .utils._param_validation import Interval, StrOptions -from .metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS, pairwise_kernels +from .base import BaseEstimator, MultiOutputMixin, RegressorMixin, _fit_context from .linear_model._ridge import _solve_cholesky_kernel -from .utils.validation import check_is_fitted, _check_sample_weight +from .metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS, pairwise_kernels +from .utils._param_validation import Interval, StrOptions +from .utils.validation import _check_sample_weight, check_is_fitted class KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator): diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index d5a14756c41a9..45c99d4d36df1 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -7,46 +7,44 @@ # complete documentation. from ._base import LinearRegression -from ._bayes import BayesianRidge, ARDRegression -from ._least_angle import ( - Lars, - LassoLars, - lars_path, - lars_path_gram, - LarsCV, - LassoLarsCV, - LassoLarsIC, -) +from ._bayes import ARDRegression, BayesianRidge from ._coordinate_descent import ( - Lasso, ElasticNet, - LassoCV, ElasticNetCV, - lasso_path, - enet_path, - MultiTaskLasso, + Lasso, + LassoCV, MultiTaskElasticNet, MultiTaskElasticNetCV, + MultiTaskLasso, MultiTaskLassoCV, + enet_path, + lasso_path, ) -from ._glm import PoissonRegressor, GammaRegressor, TweedieRegressor +from ._glm import GammaRegressor, PoissonRegressor, TweedieRegressor from ._huber import HuberRegressor -from ._sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber -from ._stochastic_gradient import SGDClassifier, SGDRegressor, SGDOneClassSVM -from ._ridge import Ridge, RidgeCV, RidgeClassifier, RidgeClassifierCV, ridge_regression +from ._least_angle import ( + Lars, + LarsCV, + LassoLars, + LassoLarsCV, + LassoLarsIC, + lars_path, + lars_path_gram, +) from ._logistic import LogisticRegression, LogisticRegressionCV from ._omp import ( - orthogonal_mp, - orthogonal_mp_gram, OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV, + orthogonal_mp, + orthogonal_mp_gram, ) -from ._passive_aggressive import PassiveAggressiveClassifier -from ._passive_aggressive import PassiveAggressiveRegressor +from ._passive_aggressive import PassiveAggressiveClassifier, PassiveAggressiveRegressor from ._perceptron import Perceptron - from ._quantile import QuantileRegressor from ._ransac import RANSACRegressor +from ._ridge import Ridge, RidgeClassifier, RidgeClassifierCV, RidgeCV, ridge_regression +from ._sgd_fast import Hinge, Huber, Log, ModifiedHuber, SquaredLoss +from ._stochastic_gradient import SGDClassifier, SGDOneClassSVM, SGDRegressor from ._theil_sen import TheilSenRegressor __all__ = [ diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 92c067c850225..249c13da179c0 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -14,33 +14,37 @@ # Maria 
Telenczuk # License: BSD 3 clause -from abc import ABCMeta, abstractmethod import numbers import warnings +from abc import ABCMeta, abstractmethod +from numbers import Integral import numpy as np import scipy.sparse as sp -from scipy import linalg -from scipy import optimize -from scipy import sparse +from scipy import linalg, optimize, sparse from scipy.sparse.linalg import lsqr from scipy.special import expit -from numbers import Integral -from ..base import BaseEstimator, ClassifierMixin, RegressorMixin, MultiOutputMixin -from ..base import _fit_context +from ..base import ( + BaseEstimator, + ClassifierMixin, + MultiOutputMixin, + RegressorMixin, + _fit_context, +) from ..preprocessing._data import _is_constant_feature -from ..utils import check_array -from ..utils.validation import FLOAT_DTYPES -from ..utils import check_random_state -from ..utils.extmath import safe_sparse_dot -from ..utils.extmath import _incremental_mean_and_var -from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale +from ..utils import check_array, check_random_state from ..utils._array_api import get_namespace -from ..utils._seq_dataset import ArrayDataset32, CSRDataset32 -from ..utils._seq_dataset import ArrayDataset64, CSRDataset64 -from ..utils.validation import check_is_fitted, _check_sample_weight -from ..utils.parallel import delayed, Parallel +from ..utils._seq_dataset import ( + ArrayDataset32, + ArrayDataset64, + CSRDataset32, + CSRDataset64, +) +from ..utils.extmath import _incremental_mean_and_var, safe_sparse_dot +from ..utils.parallel import Parallel, delayed +from ..utils.sparsefuncs import inplace_column_scale, mean_variance_axis +from ..utils.validation import FLOAT_DTYPES, _check_sample_weight, check_is_fitted # TODO: bayesian_ridge_regression and bayesian_regression_ard # should be squashed into its respective objects. 
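Every hunk in this pass applies the same mechanical rule: imports are regrouped as standard library, then third-party, then first-party, each group sorted, and repeated imports from one module are merged into a single statement. A minimal sketch of the target layout, assuming the standard isort/black conventions; the snippet below is illustrative only and not part of the patch, though the symbols are real sklearn names taken from the hunks:

    # Group 1: standard-library imports, sorted alphabetically.
    import warnings
    from numbers import Integral, Real

    # Group 2: third-party imports; multiple names imported from the
    # same module (linalg, sparse) are merged onto one line.
    import numpy as np
    from scipy import linalg, sparse

    # Group 3: first-party imports. When a merged line would exceed
    # black's 88-character limit, it is wrapped into the parenthesized,
    # trailing-comma form seen throughout the hunks above and below.
    from sklearn.base import BaseEstimator, RegressorMixin
    from sklearn.datasets import (
        load_diabetes,
        load_iris,
        make_classification,
        make_regression,
    )

The hunks only move, merge, or wrap import lines, so this pass should change no runtime behaviour and leave every module's public API untouched.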
diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index 37dc3b81511f5..7b64e91f18c17 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -8,16 +8,16 @@ import warnings from math import log from numbers import Integral, Real + import numpy as np from scipy import linalg +from scipy.linalg import pinvh -from ._base import LinearModel, _preprocess_data, _rescale_data -from ..base import RegressorMixin -from ..base import _fit_context +from ..base import RegressorMixin, _fit_context +from ..utils._param_validation import Hidden, Interval, StrOptions from ..utils.extmath import fast_logdet -from scipy.linalg import pinvh from ..utils.validation import _check_sample_weight -from ..utils._param_validation import Interval, Hidden, StrOptions +from ._base import LinearModel, _preprocess_data, _rescale_data # TODO(1.5) Remove diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 829c0ab6149f1..c7caeab2090fe 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -5,36 +5,34 @@ # # License: BSD 3 clause +import numbers import sys import warnings -import numbers from abc import ABC, abstractmethod from functools import partial from numbers import Integral, Real import numpy as np -from scipy import sparse from joblib import effective_n_jobs +from scipy import sparse -from ._base import LinearModel, _pre_fit -from ..base import RegressorMixin, MultiOutputMixin -from ..base import _fit_context -from ._base import _preprocess_data +from ..base import MultiOutputMixin, RegressorMixin, _fit_context +from ..model_selection import check_cv from ..utils import check_array, check_scalar -from ..utils.validation import check_random_state from ..utils._param_validation import Interval, StrOptions -from ..model_selection import check_cv from ..utils.extmath import safe_sparse_dot +from ..utils.parallel import Parallel, delayed from ..utils.validation import ( _check_sample_weight, check_consistent_length, check_is_fitted, + check_random_state, column_or_1d, ) -from ..utils.parallel import delayed, Parallel # mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast' from . 
import _cd_fast as cd_fast # type: ignore +from ._base import LinearModel, _pre_fit, _preprocess_data def _set_order(X, y, order="C"): diff --git a/sklearn/linear_model/_glm/__init__.py b/sklearn/linear_model/_glm/__init__.py index fea9c4d4cf6ba..1b82bbd77bcf9 100644 --- a/sklearn/linear_model/_glm/__init__.py +++ b/sklearn/linear_model/_glm/__init__.py @@ -1,10 +1,10 @@ # License: BSD 3 clause from .glm import ( - _GeneralizedLinearRegressor, - PoissonRegressor, GammaRegressor, + PoissonRegressor, TweedieRegressor, + _GeneralizedLinearRegressor, ) __all__ = [ diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index b1bc460f24dff..3dc0bbdc66bff 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -11,7 +11,6 @@ import numpy as np import scipy.optimize -from ._newton_solver import NewtonCholeskySolver, NewtonSolver from ..._loss.loss import ( HalfGammaLoss, HalfPoissonLoss, @@ -19,14 +18,14 @@ HalfTweedieLoss, HalfTweedieLossIdentity, ) -from ...base import BaseEstimator, RegressorMixin -from ...base import _fit_context +from ...base import BaseEstimator, RegressorMixin, _fit_context from ...utils import check_array from ...utils._openmp_helpers import _openmp_effective_n_threads from ...utils._param_validation import Hidden, Interval, StrOptions from ...utils.optimize import _check_optimize_result from ...utils.validation import _check_sample_weight, check_is_fitted from .._linear_loss import LinearModelLoss +from ._newton_solver import NewtonCholeskySolver, NewtonSolver class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index c92ef5f99ca8a..5256a5f370272 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -2,22 +2,22 @@ # # License: BSD 3 clause -from functools import partial import itertools import warnings +from functools import partial import numpy as np -from numpy.testing import assert_allclose import pytest import scipy +from numpy.testing import assert_allclose from scipy import linalg from scipy.optimize import minimize, root -from sklearn.base import clone from sklearn._loss import HalfBinomialLoss, HalfPoissonLoss, HalfTweedieLoss from sklearn._loss.link import IdentityLink, LogLink - +from sklearn.base import clone from sklearn.datasets import make_low_rank_matrix, make_regression +from sklearn.exceptions import ConvergenceWarning from sklearn.linear_model import ( GammaRegressor, PoissonRegressor, @@ -27,11 +27,9 @@ from sklearn.linear_model._glm import _GeneralizedLinearRegressor from sklearn.linear_model._glm._newton_solver import NewtonCholeskySolver from sklearn.linear_model._linear_loss import LinearModelLoss -from sklearn.exceptions import ConvergenceWarning from sklearn.metrics import d2_tweedie_score, mean_poisson_deviance from sklearn.model_selection import train_test_split - SOLVERS = ["lbfgs", "newton-cholesky"] diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index def2ae273d5c4..554f693061116 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -2,18 +2,17 @@ # License: BSD 3 clause from numbers import Integral, Real -import numpy as np +import numpy as np from scipy import optimize -from ..base import BaseEstimator, RegressorMixin -from ..base import _fit_context -from ._base import LinearModel +from ..base import BaseEstimator, RegressorMixin, _fit_context from 
..utils import axis0_safe_slice from ..utils._param_validation import Interval -from ..utils.validation import _check_sample_weight from ..utils.extmath import safe_sparse_dot from ..utils.optimize import _check_optimize_result +from ..utils.validation import _check_sample_weight +from ._base import LinearModel def _huber_loss_and_gradient(w, X, y, epsilon, alpha, sample_weight=None): diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index e6c653eb80bb3..439ba636d159f 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -8,27 +8,24 @@ # # License: BSD 3 clause -from math import log import sys import warnings - +from math import log from numbers import Integral, Real + import numpy as np -from scipy import linalg, interpolate +from scipy import interpolate, linalg from scipy.linalg.lapack import get_lapack_funcs -from ._base import LinearModel, LinearRegression -from ._base import _deprecate_normalize, _preprocess_data -from ..base import RegressorMixin, MultiOutputMixin -from ..base import _fit_context +from ..base import MultiOutputMixin, RegressorMixin, _fit_context +from ..exceptions import ConvergenceWarning +from ..model_selection import check_cv # mypy error: Module 'sklearn.utils' has no attribute 'arrayfuncs' -from ..utils import arrayfuncs, as_float_array # type: ignore -from ..utils import check_random_state +from ..utils import arrayfuncs, as_float_array, check_random_state # type: ignore from ..utils._param_validation import Hidden, Interval, StrOptions -from ..model_selection import check_cv -from ..exceptions import ConvergenceWarning -from ..utils.parallel import delayed, Parallel +from ..utils.parallel import Parallel, delayed +from ._base import LinearModel, LinearRegression, _deprecate_normalize, _preprocess_data SOLVE_TRIANGULAR_ARGS = {"check_finite": False} diff --git a/sklearn/linear_model/_linear_loss.py b/sklearn/linear_model/_linear_loss.py index f70d78fb42871..92a203abc87ab 100644 --- a/sklearn/linear_model/_linear_loss.py +++ b/sklearn/linear_model/_linear_loss.py @@ -3,6 +3,7 @@ """ import numpy as np from scipy import sparse + from ..utils.extmath import squared_norm diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 30a0f40a0f2fd..6bdc4b7368ef0 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -11,35 +11,37 @@ # Arthur Mensch