diff --git a/doc/related_projects.rst b/doc/related_projects.rst
index fb02ea8beaf0d..9b2cd9fe74956 100644
--- a/doc/related_projects.rst
+++ b/doc/related_projects.rst
@@ -151,8 +151,8 @@ and tasks.
 - `nolearn <https://github.com/dnouri/nolearn>`_ A number of wrappers and
   abstractions around existing neural network libraries
 
-- `Keras <https://keras.io>`_ High-level API for
-  TensorFlow with a scikit-learn inspired API.
+- `keras <https://keras.io>`_ Deep Learning library capable of
+  running on top of either TensorFlow or Theano.
 
 - `lasagne <https://github.com/Lasagne/Lasagne>`_ A lightweight library to
   build and train neural networks in Theano.
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index a78cbe69b746d..9b36e310b51ac 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -386,6 +386,10 @@ Changelog
   supporting sparse matrix and raise the appropriate error message.
   :pr:`19879` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+- |Efficiency| Changed ``algorithm`` argument for :class:`cluster.KMeans` in
+  :class:`preprocessing.KBinsDiscretizer` from ``auto`` to ``full``.
+  :pr:`19290` by :user:`Gleb Levitskiy `.
+
 :mod:`sklearn.tree`
 ...................
 
diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
index 2f2da882652c0..35bde87838dec 100644
--- a/sklearn/compose/_column_transformer.py
+++ b/sklearn/compose/_column_transformer.py
@@ -19,6 +19,7 @@
 from ..utils import Bunch
 from ..utils import _safe_indexing
 from ..utils import _get_column_indices
+from ..utils import _determine_key_type
 from ..utils.metaestimators import _BaseComposition
 from ..utils.validation import check_array, check_is_fitted
 from ..utils.validation import _deprecate_positional_args
@@ -327,6 +328,12 @@ def _validate_remainder(self, X):
                 "'passthrough', or estimator. '%s' was passed instead" %
                 self.remainder)
 
+        # Make it possible to check for reordered named columns on transform
+        self._has_str_cols = any(_determine_key_type(cols) == 'str'
+                                 for cols in self._columns)
+        if hasattr(X, 'columns'):
+            self._df_columns = X.columns
+
         self._n_features = X.shape[1]
         cols = []
         for columns in self._columns:
@@ -362,12 +369,12 @@ def get_feature_names(self):
             if trans == 'drop' or _is_empty_column_selection(column):
                 continue
             if trans == 'passthrough':
-                if self._feature_names_in is not None:
+                if hasattr(self, '_df_columns'):
                     if ((not isinstance(column, slice)) and
                             all(isinstance(col, str) for col in column)):
                         feature_names.extend(column)
                     else:
-                        feature_names.extend(self._feature_names_in[column])
+                        feature_names.extend(self._df_columns[column])
                 else:
                     indices = np.arange(self._n_features)
                     feature_names.extend(['x%d' % i for i in indices[column]])
@@ -463,7 +470,7 @@ def _fit_transform(self, X, y, func, fitted=False):
                     message_clsname='ColumnTransformer',
                     message=self._log_message(name, idx, len(transformers)))
                 for idx, (name, trans, column, weight) in enumerate(
-                        transformers, 1))
+                        self._iter(fitted=fitted, replace_strings=True), 1))
         except ValueError as e:
             if "Expected 2D array, got 1D array instead" in str(e):
                 raise ValueError(_ERR_MSG_1DCOLUMN) from e
@@ -629,9 +636,9 @@ def _sk_visual_block_(self):
             transformers = self.transformers
         elif hasattr(self, "_remainder"):
             remainder_columns = self._remainder[2]
-            if self._feature_names_in is not None:
+            if hasattr(self, '_df_columns'):
                 remainder_columns = (
-                    self._feature_names_in[remainder_columns].tolist()
+                    self._df_columns[remainder_columns].tolist()
                 )
             transformers = chain(self.transformers,
                                  [('remainder', self.remainder,
diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index eb0a73919021a..191d1830fdf6f 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -32,8 +32,7 @@ def _assess_dimension(spectrum, rank, n_samples):
     """Compute the log-likelihood of a rank ``rank`` dataset.
 
     The dataset is assumed to be embedded in gaussian noise of shape(n,
-    dimf) having spectrum ``spectrum``. This implements the method of
-    T. P. Minka.
+    dimf) having spectrum ``spectrum``.
 
     Parameters
     ----------
@@ -51,11 +50,10 @@ def _assess_dimension(spectrum, rank, n_samples):
     ll : float
         The log-likelihood.
 
-    References
-    ----------
+    Notes
+    -----
     This implements the method of `Thomas P. Minka:
-    Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604
-    `_
+    Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604`
     """
 
     n_features = spectrum.shape[0]
@@ -274,30 +272,26 @@
 
     References
     ----------
-    For n_components == 'mle', this class uses the method from:
-    `Minka, T. P.. "Automatic choice of dimensionality for PCA".
-    In NIPS, pp. 598-604 `_
+    For n_components == 'mle', this class uses the method of *Minka, T. P.
+    "Automatic choice of dimensionality for PCA". In NIPS, pp. 598-604*
 
     Implements the probabilistic PCA model from:
-    `Tipping, M. E., and Bishop, C. M. (1999). "Probabilistic principal
+    Tipping, M. E., and Bishop, C. M. (1999). "Probabilistic principal
     component analysis". Journal of the Royal Statistical Society:
     Series B (Statistical Methodology), 61(3), 611-622.
-    <http://www.miketipping.com/papers/met-mppca.pdf>`_ via the score and score_samples methods.
+    See http://www.miketipping.com/papers/met-mppca.pdf
 
     For svd_solver == 'arpack', refer to `scipy.sparse.linalg.svds`.
 
     For svd_solver == 'randomized', see:
-    `Halko, N., Martinsson, P. G., and Tropp, J. A. (2011).
+    *Halko, N., Martinsson, P. G., and Tropp, J. A. (2011).
     "Finding structure with randomness: Probabilistic algorithms for
     constructing approximate matrix decompositions".
-    SIAM review, 53(2), 217-288.
-    `_
-    and also
-    `Martinsson, P. G., Rokhlin, V., and Tygert, M. (2011).
+    SIAM review, 53(2), 217-288.* and also
+    *Martinsson, P. G., Rokhlin, V., and Tygert, M. (2011).
     "A randomized algorithm for the decomposition of matrices".
-    Applied and Computational Harmonic Analysis, 30(1), 47-68
-    `_.
+    Applied and Computational Harmonic Analysis, 30(1), 47-68.*
 
     Examples
     --------
diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py
index 22fa236f3314e..4747c4adc4945 100644
--- a/sklearn/preprocessing/_discretization.py
+++ b/sklearn/preprocessing/_discretization.py
@@ -205,7 +205,8 @@ def fit(self, X, y=None):
                 init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5
 
                 # 1D k-means procedure
-                km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
+                km = KMeans(n_clusters=n_bins[jj], init=init,
+                            n_init=1, algorithm='full')
                 centers = km.fit(column[:, None]).cluster_centers_[:, 0]
                 # Must sort, centers may be unsorted even with sorted init
                 centers.sort()
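
Note for reviewers (not part of the patch): the final hunk pins the internal 1-D
k-means step of ``KBinsDiscretizer`` to Lloyd's algorithm. Below is a minimal,
self-contained sketch of that step against scikit-learn's public ``KMeans`` API;
the sample data and ``n_bins`` value are invented for illustration, and the
variable names mirror ``_discretization.py``::

    # Sketch only -- reproduces the 1-D k-means binning step as it
    # behaves after this change; data and n_bins are illustrative.
    import numpy as np
    from sklearn.cluster import KMeans

    column = np.array([0.0, 0.5, 1.0, 4.0, 4.5, 5.0, 9.0, 9.5, 10.0])
    n_bins = 3
    col_min, col_max = column.min(), column.max()

    # Deterministic init: midpoints of the equal-width bin edges.
    uniform_edges = np.linspace(col_min, col_max, n_bins + 1)
    init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5

    # algorithm='full' requests plain Lloyd iterations instead of
    # leaving the choice to the 'auto' heuristic.
    km = KMeans(n_clusters=n_bins, init=init, n_init=1, algorithm='full')
    centers = km.fit(column[:, None]).cluster_centers_[:, 0]
    centers.sort()  # centers may be unsorted even with a sorted init

    # Bin edges sit halfway between adjacent centers.
    bin_edges = (centers[1:] + centers[:-1]) * 0.5
    bin_edges = np.r_[col_min, bin_edges, col_max]
    print(bin_edges)  # [ 0.   2.5  7.  10. ]

With a fixed ``init`` and ``n_init=1``, requesting ``algorithm='full'`` bypasses
the ``'auto'`` heuristic, which can otherwise dispatch to Elkan's variant; for
1-D data Elkan's extra bounds bookkeeping brings no benefit, which is consistent
with the |Efficiency| tag on the changelog entry above.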