diff --git a/maint_tools/test_docstrings.py b/maint_tools/test_docstrings.py index afa328f68ba8f..c089b4e6612fb 100644 --- a/maint_tools/test_docstrings.py +++ b/maint_tools/test_docstrings.py @@ -9,7 +9,6 @@ # List of modules ignored when checking for numpydoc validation. DOCSTRING_IGNORE_LIST = [ - "IterativeImputer", "KNNImputer", "LabelPropagation", "LabelSpreading", diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index d95e78ccd36d3..321c1f537520d 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -36,7 +36,7 @@ class IterativeImputer(_BaseImputer): This estimator is still **experimental** for now: the predictions and the API might change without any deprecation cycle. To use it, - you need to explicitly import ``enable_iterative_imputer``:: + you need to explicitly import `enable_iterative_imputer`:: >>> # explicitly require this experimental feature >>> from sklearn.experimental import enable_iterative_imputer # noqa @@ -47,20 +47,20 @@ class IterativeImputer(_BaseImputer): ---------- estimator : estimator object, default=BayesianRidge() The estimator to use at each step of the round-robin imputation. - If ``sample_posterior`` is True, the estimator must support - ``return_std`` in its ``predict`` method. + If `sample_posterior=True`, the estimator must support + `return_std` in its `predict` method. - missing_values : int, np.nan, default=np.nan + missing_values : int or np.nan, default=np.nan The placeholder for the missing values. All occurrences of `missing_values` will be imputed. For pandas' dataframes with nullable integer dtypes with missing values, `missing_values` should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. - sample_posterior : boolean, default=False + sample_posterior : bool, default=False Whether to sample from the (Gaussian) predictive posterior of the fitted estimator for each imputation. Estimator must support - ``return_std`` in its ``predict`` method if set to ``True``. Set to - ``True`` if using ``IterativeImputer`` for multiple imputations. + `return_std` in its `predict` method if set to `True`. Set to + `True` if using `IterativeImputer` for multiple imputations. max_iter : int, default=10 Maximum number of imputation rounds to perform before returning the @@ -68,7 +68,7 @@ class IterativeImputer(_BaseImputer): imputation of each feature with missing values. The stopping criterion is met once `max(abs(X_t - X_{t-1}))/max(abs(X[known_vals])) < tol`, where `X_t` is `X` at iteration `t`. Note that early stopping is only - applied if ``sample_posterior=False``. + applied if `sample_posterior=False`. tol : float, default=1e-3 Tolerance of the stopping condition. @@ -81,45 +81,41 @@ class IterativeImputer(_BaseImputer): imputation process, the neighbor features are not necessarily nearest, but are drawn with probability proportional to correlation for each imputed target feature. Can provide significant speed-up when the - number of features is huge. If ``None``, all features will be used. + number of features is huge. If `None`, all features will be used. - initial_strategy : str, default='mean' + initial_strategy : {'mean', 'median', 'most_frequent', 'constant'}, \ + default='mean' Which strategy to use to initialize the missing values. Same as the - ``strategy`` parameter in :class:`~sklearn.impute.SimpleImputer` - Valid values: {"mean", "median", "most_frequent", or "constant"}. + `strategy` parameter in :class:`~sklearn.impute.SimpleImputer`. 
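For reference, the multiple-imputation workflow that the `sample_posterior` documentation above alludes to can be sketched as follows (an illustrative snippet with arbitrary data and seeds; the default `BayesianRidge` estimator already supports the required `return_std`):

    >>> import numpy as np
    >>> from sklearn.experimental import enable_iterative_imputer  # noqa
    >>> from sklearn.impute import IterativeImputer
    >>> X = [[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]
    >>> # one stochastic imputation per seed; pool the results downstream
    >>> imputations = [
    ...     IterativeImputer(sample_posterior=True, random_state=s).fit_transform(X)
    ...     for s in range(3)
    ... ]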
-    imputation_order : str, default='ascending'
+    imputation_order : {'ascending', 'descending', 'roman', 'arabic', \
+            'random'}, default='ascending'
         The order in which the features will be imputed. Possible values:
 
-        "ascending"
-            From features with fewest missing values to most.
-        "descending"
-            From features with most missing values to fewest.
-        "roman"
-            Left to right.
-        "arabic"
-            Right to left.
-        "random"
-            A random order for each round.
-
-    skip_complete : boolean, default=False
-        If ``True`` then features with missing values during ``transform``
-        which did not have any missing values during ``fit`` will be imputed
-        with the initial imputation method only. Set to ``True`` if you have
-        many features with no missing values at both ``fit`` and ``transform``
-        time to save compute.
+        - `'ascending'`: From features with fewest missing values to most.
+        - `'descending'`: From features with most missing values to fewest.
+        - `'roman'`: Left to right.
+        - `'arabic'`: Right to left.
+        - `'random'`: A random order for each round.
+
+    skip_complete : bool, default=False
+        If `True` then features with missing values during :meth:`transform`
+        which did not have any missing values during :meth:`fit` will be
+        imputed with the initial imputation method only. Set to `True` if you
+        have many features with no missing values at both :meth:`fit` and
+        :meth:`transform` time to save compute.
 
     min_value : float or array-like of shape (n_features,), default=-np.inf
-        Minimum possible imputed value. Broadcast to shape (n_features,) if
-        scalar. If array-like, expects shape (n_features,), one min value for
+        Minimum possible imputed value. Broadcast to shape `(n_features,)` if
+        scalar. If array-like, expects shape `(n_features,)`, one min value for
         each feature. The default is `-np.inf`.
 
         .. versionchanged:: 0.23
            Added support for array-like.
 
     max_value : float or array-like of shape (n_features,), default=np.inf
-        Maximum possible imputed value. Broadcast to shape (n_features,) if
-        scalar. If array-like, expects shape (n_features,), one max value for
+        Maximum possible imputed value. Broadcast to shape `(n_features,)` if
+        scalar. If array-like, expects shape `(n_features,)`, one max value for
         each feature. The default is `np.inf`.
 
         .. versionchanged:: 0.23
@@ -132,13 +128,13 @@ class IterativeImputer(_BaseImputer):
 
     random_state : int, RandomState instance or None, default=None
         The seed of the pseudo random number generator to use. Randomizes
-        selection of estimator features if n_nearest_features is not None, the
-        ``imputation_order`` if ``random``, and the sampling from posterior if
-        ``sample_posterior`` is True. Use an integer for determinism.
+        selection of estimator features if `n_nearest_features` is not `None`,
+        the `imputation_order` if `random`, and the sampling from posterior if
+        `sample_posterior=True`. Use an integer for determinism.
         See :term:`the Glossary <random_state>`.
 
-    add_indicator : boolean, default=False
-        If True, a :class:`MissingIndicator` transform will stack onto output
+    add_indicator : bool, default=False
+        If `True`, a :class:`MissingIndicator` transform will stack onto output
         of the imputer's transform. This allows a predictive estimator to
         account for missingness despite imputation. If a feature has no
         missing values at fit/train time, the feature won't appear on
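The `min_value`/`max_value` broadcasting described above can be illustrated as follows (a sketch with arbitrary data; a scalar bound applies to every feature, an array-like gives one bound per feature):

    >>> import numpy as np
    >>> from sklearn.experimental import enable_iterative_imputer  # noqa
    >>> from sklearn.impute import IterativeImputer
    >>> imp = IterativeImputer(imputation_order='random', min_value=0,
    ...                        max_value=[10.0, 5.0, 9.0], random_state=0)
    >>> Xt = imp.fit_transform([[1, 2, np.nan], [4, np.nan, 6], [np.nan, 5, 9]])
    >>> # imputed values are clipped to the requested bounds
    >>> bool((Xt >= 0).all() and (Xt <= [10.0, 5.0, 9.0]).all())
    True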
@@ -151,16 +147,16 @@ class IterativeImputer(_BaseImputer):
         Imputer used to initialize the missing values.
 
     imputation_sequence_ : list of tuples
-        Each tuple has ``(feat_idx, neighbor_feat_idx, estimator)``, where
-        ``feat_idx`` is the current feature to be imputed,
-        ``neighbor_feat_idx`` is the array of other features used to impute the
-        current feature, and ``estimator`` is the trained estimator used for
-        the imputation. Length is ``self.n_features_with_missing_ *
-        self.n_iter_``.
+        Each tuple has `(feat_idx, neighbor_feat_idx, estimator)`, where
+        `feat_idx` is the current feature to be imputed,
+        `neighbor_feat_idx` is the array of other features used to impute the
+        current feature, and `estimator` is the trained estimator used for
+        the imputation. Length is `self.n_features_with_missing_ *
+        self.n_iter_`.
 
     n_iter_ : int
         Number of iteration rounds that occurred. Will be less than
-        ``self.max_iter`` if early stopping criterion was reached.
+        `self.max_iter` if early stopping criterion was reached.
 
     n_features_in_ : int
         Number of features seen during :term:`fit`.
@@ -178,7 +174,7 @@ class IterativeImputer(_BaseImputer):
 
     indicator_ : :class:`~sklearn.impute.MissingIndicator`
         Indicator used to add binary indicators for missing values.
-        ``None`` if add_indicator is False.
+        `None` if `add_indicator=False`.
 
     random_state_ : RandomState instance
         RandomState instance that is generated either from a seed, the random
@@ -188,28 +184,14 @@ class IterativeImputer(_BaseImputer):
     --------
     SimpleImputer : Univariate imputation of missing values.
 
-    Examples
-    --------
-    >>> import numpy as np
-    >>> from sklearn.experimental import enable_iterative_imputer
-    >>> from sklearn.impute import IterativeImputer
-    >>> imp_mean = IterativeImputer(random_state=0)
-    >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
-    IterativeImputer(random_state=0)
-    >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
-    >>> imp_mean.transform(X)
-    array([[ 6.9584...,  2.       ,  3.        ],
-           [ 4.       ,  2.6000...,  6.        ],
-           [10.       ,  4.9999...,  9.        ]])
-
     Notes
     -----
     To support imputation in inductive mode we store each feature's estimator
-    during the ``fit`` phase, and predict without refitting (in order) during
-    the ``transform`` phase.
+    during the :meth:`fit` phase, and predict without refitting (in order)
+    during the :meth:`transform` phase.
 
-    Features which contain all missing values at ``fit`` are discarded upon
-    ``transform``.
+    Features which contain all missing values at :meth:`fit` are discarded upon
+    :meth:`transform`.
 
     References
     ----------
@@ -222,6 +204,20 @@ class IterativeImputer(_BaseImputer):
        Multivariate Data Suitable for use with an Electronic Computer".
        Journal of the Royal Statistical Society 22(2): 302-306.
        <https://www.jstor.org/stable/2984099>`_
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.experimental import enable_iterative_imputer
+    >>> from sklearn.impute import IterativeImputer
+    >>> imp_mean = IterativeImputer(random_state=0)
+    >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
+    IterativeImputer(random_state=0)
+    >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
+    >>> imp_mean.transform(X)
+    array([[ 6.9584...,  2.       ,  3.        ],
+           [ 4.       ,  2.6000...,  6.        ],
+           [10.       ,  4.9999...,  9.        ]])
     """
 
     def __init__(
@@ -269,8 +265,8 @@ def _impute_one_feature(
         """Impute a single feature from the others provided.
 
         This function predicts the missing values of one of the features using
-        the current estimates of all the other features.
The `estimator` must + support `return_std=True` in its `predict` method for this function to work. Parameters @@ -285,12 +281,12 @@ def _impute_one_feature( Index of the feature currently being imputed. neighbor_feat_idx : ndarray - Indices of the features to be used in imputing ``feat_idx``. + Indices of the features to be used in imputing `feat_idx`. estimator : object The estimator to use at this step of the round-robin imputation. - If ``sample_posterior`` is True, the estimator must support - ``return_std`` in its ``predict`` method. + If `sample_posterior=True`, the estimator must support + `return_std` in its `predict` method. If None, it will be cloned from self._estimator. fit_mode : boolean, default=True @@ -299,11 +295,11 @@ def _impute_one_feature( Returns ------- X_filled : ndarray - Input data with ``X_filled[missing_row_mask, feat_idx]`` updated. + Input data with `X_filled[missing_row_mask, feat_idx]` updated. estimator : estimator with sklearn API The fitted estimator used to impute - ``X_filled[missing_row_mask, feat_idx]``. + `X_filled[missing_row_mask, feat_idx]`. """ if estimator is None and fit_mode is False: raise ValueError( @@ -360,29 +356,29 @@ def _impute_one_feature( return X_filled, estimator def _get_neighbor_feat_idx(self, n_features, feat_idx, abs_corr_mat): - """Get a list of other features to predict ``feat_idx``. + """Get a list of other features to predict `feat_idx`. - If self.n_nearest_features is less than or equal to the total + If `self.n_nearest_features` is less than or equal to the total number of features, then use a probability proportional to the absolute - correlation between ``feat_idx`` and each other feature to randomly + correlation between `feat_idx` and each other feature to randomly choose a subsample of the other features (without replacement). Parameters ---------- n_features : int - Number of features in ``X``. + Number of features in `X`. feat_idx : int Index of the feature currently being imputed. abs_corr_mat : ndarray, shape (n_features, n_features) - Absolute correlation matrix of ``X``. The diagonal has been zeroed + Absolute correlation matrix of `X`. The diagonal has been zeroed out and each feature has been normalized to sum to 1. Can be None. Returns ------- neighbor_feat_idx : array-like - The features to use to impute ``feat_idx``. + The features to use to impute `feat_idx`. """ if self.n_nearest_features is not None and self.n_nearest_features < n_features: p = abs_corr_mat[:, feat_idx] @@ -407,8 +403,8 @@ def _get_ordered_idx(self, mask_missing_values): Parameters ---------- mask_missing_values : array-like, shape (n_samples, n_features) - Input data's missing indicator matrix, where "n_samples" is the - number of samples and "n_features" is the number of features. + Input data's missing indicator matrix, where `n_samples` is the + number of samples and `n_features` is the number of features. Returns ------- @@ -451,13 +447,13 @@ def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): Input data with the most recent imputations. tolerance : float, default=1e-6 - ``abs_corr_mat`` can have nans, which will be replaced - with ``tolerance``. + `abs_corr_mat` can have nans, which will be replaced + with `tolerance`. Returns ------- abs_corr_mat : ndarray, shape (n_features, n_features) - Absolute correlation matrix of ``X`` at the beginning of the + Absolute correlation matrix of `X` at the beginning of the current round. 
The diagonal has been zeroed out and each feature's absolute correlations with all others have been normalized to sum to 1. @@ -481,33 +477,33 @@ def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): return abs_corr_mat def _initial_imputation(self, X, in_fit=False): - """Perform initial imputation for input X. + """Perform initial imputation for input `X`. Parameters ---------- X : ndarray, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. in_fit : bool, default=False - Whether function is called in fit. + Whether function is called in :meth:`fit`. Returns ------- Xt : ndarray, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. X_filled : ndarray, shape (n_samples, n_features) Input data with the most recent imputations. mask_missing_values : ndarray, shape (n_samples, n_features) - Input data's missing indicator matrix, where "n_samples" is the - number of samples and "n_features" is the number of features. + Input data's missing indicator matrix, where `n_samples` is the + number of samples and `n_features` is the number of features. X_missing_mask : ndarray, shape (n_samples, n_features) Input data's mask matrix indicating missing datapoints, where - "n_samples" is the number of samples and "n_features" is the + `n_samples` is the number of samples and `n_features` is the number of features. """ if is_scalar_nan(self.missing_values): @@ -544,20 +540,23 @@ def _initial_imputation(self, X, in_fit=False): @staticmethod def _validate_limit(limit, limit_type, n_features): - """Validate the limits (min/max) of the feature values - Converts scalar min/max limits to vectors of shape (n_features,) + """Validate the limits (min/max) of the feature values. + + Converts scalar min/max limits to vectors of shape `(n_features,)`. Parameters ---------- limit: scalar or array-like - The user-specified limit (i.e, min_value or max_value) - limit_type: string, "max" or "min" - n_features: Number of features in the dataset + The user-specified limit (i.e, min_value or max_value). + limit_type: {'max', 'min'} + Type of limit to validate. + n_features: int + Number of features in the dataset. Returns ------- limit: ndarray, shape(n_features,) - Array of limits, one for each feature + Array of limits, one for each feature. """ limit_bound = np.inf if limit_type == "max" else -np.inf limit = limit_bound if limit is None else limit @@ -573,15 +572,16 @@ def _validate_limit(limit, limit_type, n_features): return limit def fit_transform(self, X, y=None): - """Fits the imputer on X and return the transformed X. + """Fit the imputer on `X` and return the transformed `X`. Parameters ---------- X : array-like, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. - y : ignored. + y : Ignored + Not used, present for API consistency by convention. Returns ------- @@ -704,10 +704,10 @@ def fit_transform(self, X, y=None): return super()._concatenate_indicator(Xt, X_indicator) def transform(self, X): - """Imputes all missing values in X. + """Impute all missing values in `X`. 
@@ -704,10 +704,10 @@ def fit_transform(self, X, y=None):
         return super()._concatenate_indicator(Xt, X_indicator)
 
     def transform(self, X):
-        """Imputes all missing values in X.
+        """Impute all missing values in `X`.
 
-        Note that this is stochastic, and that if random_state is not fixed,
-        repeated calls, or permuted input, will yield different results.
+        Note that this is stochastic, and that if `random_state` is not fixed,
+        repeated calls or permuted input will yield different results.
 
         Parameters
         ----------
@@ -756,20 +756,21 @@ def transform(self, X):
         return super()._concatenate_indicator(Xt, X_indicator)
 
     def fit(self, X, y=None):
-        """Fits the imputer on X and return self.
+        """Fit the imputer on `X` and return self.
 
         Parameters
         ----------
         X : array-like, shape (n_samples, n_features)
-            Input data, where "n_samples" is the number of samples and
-            "n_features" is the number of features.
+            Input data, where `n_samples` is the number of samples and
+            `n_features` is the number of features.
 
-        y : ignored
+        y : Ignored
+            Not used, present for API consistency by convention.
 
         Returns
         -------
         self : object
-            Returns self.
+            Fitted estimator.
         """
         self.fit_transform(X)
         return self
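Finally, the inductive contract described in these docstrings (fit once, then impute without refitting) implies that `transform` is reproducible on a fitted imputer once `random_state` is fixed; a minimal sketch with arbitrary data:

    >>> import numpy as np
    >>> from sklearn.experimental import enable_iterative_imputer  # noqa
    >>> from sklearn.impute import IterativeImputer
    >>> imp = IterativeImputer(random_state=0)
    >>> _ = imp.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
    >>> X_new = [[np.nan, 2, 3]]
    >>> # same fitted state, same input -> identical imputations
    >>> bool(np.allclose(imp.transform(X_new), imp.transform(X_new)))
    True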