diff --git a/examples/missing_values.py b/examples/missing_values.py
index 8a0895f9a589f..53503c67dac01 100644
--- a/examples/missing_values.py
+++ b/examples/missing_values.py
@@ -1,11 +1,12 @@
 """
-======================================================
+====================================================
 Imputing missing values before building an estimator
-======================================================
+====================================================
 
-This example shows that imputing the missing values can give better results
-than discarding the samples containing any missing value.
-Imputing does not always improve the predictions, so please check via cross-validation.
+This example shows that imputing the missing values can give
+better results than discarding the samples containing any missing value.
+Imputing does not always improve the predictions, so please check via
+cross-validation; sometimes dropping incomplete rows or encoding
+missingness with a marker value works better.
 
 Missing values can be replaced by the mean, the median or the most frequent
@@ -13,38 +14,54 @@
 The median is a more robust estimator for data with high magnitude variables
 which could dominate results (otherwise known as a 'long tail').
 
-Script output::
+Another option is the MICE imputer. This uses round-robin linear regression,
+treating every variable as an output in turn. The simple version implemented
+assumes Gaussian output variables. If your output variables are obviously
+non-Gaussian, consider transforming them to improve performance.
 
-  Score with the entire dataset = 0.56
-  Score without the samples containing missing values = 0.48
-  Score after imputation of the missing values = 0.55
+Script output::
+
+  MSE with the entire dataset = 3354.15
+  MSE without the samples containing missing values = 2968.98
+  MSE after mean imputation of the missing values = 3507.77
+  MSE after MICE imputation of the missing values = 3340.39
+
+In this case, imputing helps the regressor match the original score.
+
+Note that MICE will not always be better than, e.g., simple mean imputation.
+To see an example of this, swap in ``boston`` for ``diabetes``.
 
-In this case, imputing helps the classifier get close to the original score.
- """ import numpy as np +from sklearn.datasets import load_diabetes from sklearn.datasets import load_boston from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import Pipeline from sklearn.preprocessing import Imputer +from sklearn.preprocessing import MICEImputer from sklearn.model_selection import cross_val_score rng = np.random.RandomState(0) -dataset = load_boston() +dataset_name = 'diabetes' # 'boston' for another examples +if dataset_name == 'boston': + dataset = load_boston() +elif dataset_name == 'diabetes': + dataset = load_diabetes() X_full, y_full = dataset.data, dataset.target n_samples = X_full.shape[0] n_features = X_full.shape[1] # Estimate the score on the entire dataset, with no missing values estimator = RandomForestRegressor(random_state=0, n_estimators=100) -score = cross_val_score(estimator, X_full, y_full).mean() -print("Score with the entire dataset = %.2f" % score) +score = cross_val_score(estimator, X_full, y_full, + scoring='neg_mean_squared_error').mean() * -1 +print("MSE with the entire dataset = %.2f" % score) # Add missing values in 75% of the lines missing_rate = 0.75 -n_missing_samples = np.floor(n_samples * missing_rate) +n_missing_samples = int(np.floor(n_samples * missing_rate)) missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples, dtype=np.bool), np.ones(n_missing_samples, @@ -56,10 +73,11 @@ X_filtered = X_full[~missing_samples, :] y_filtered = y_full[~missing_samples] estimator = RandomForestRegressor(random_state=0, n_estimators=100) -score = cross_val_score(estimator, X_filtered, y_filtered).mean() -print("Score without the samples containing missing values = %.2f" % score) +score = cross_val_score(estimator, X_filtered, y_filtered, + scoring='neg_mean_squared_error').mean() * -1 +print("MSE without the samples containing missing values = %.2f" % score) -# Estimate the score after imputation of the missing values +# Estimate the score after imputation (mean strategy) of the missing values X_missing = X_full.copy() X_missing[np.where(missing_samples)[0], missing_features] = 0 y_missing = y_full.copy() @@ -68,5 +86,14 @@ axis=0)), ("forest", RandomForestRegressor(random_state=0, n_estimators=100))]) -score = cross_val_score(estimator, X_missing, y_missing).mean() -print("Score after imputation of the missing values = %.2f" % score) +score = cross_val_score(estimator, X_missing, y_missing, + scoring='neg_mean_squared_error').mean() * -1 +print("MSE after mean imputation of the missing values = %.2f" % score) + +# Estimate the score after imputation (MICE strategy) of the missing values +estimator = Pipeline([("imputer", MICEImputer(missing_values=0)), + ("forest", RandomForestRegressor(random_state=0, + n_estimators=100))]) +score = cross_val_score(estimator, X_missing, y_missing, + scoring='neg_mean_squared_error').mean() * -1 +print("MSE after MICE imputation of the missing values = %.2f" % score) \ No newline at end of file diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 84d42e7177a0a..f6c7bf8ba9f95 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -449,7 +449,7 @@ def fit(self, X, y, sample_weight=None): self.constant_ = np.reshape(self.constant_, (1, -1)) return self - def predict(self, X): + def predict(self, X, return_std=False): """ Perform classification on test vectors X. @@ -459,18 +459,29 @@ def predict(self, X): Input vectors, where n_samples is the number of samples and n_features is the number of features. 
diff --git a/sklearn/dummy.py b/sklearn/dummy.py
index 84d42e7177a0a..f6c7bf8ba9f95 100644
--- a/sklearn/dummy.py
+++ b/sklearn/dummy.py
@@ -449,7 +449,7 @@ def fit(self, X, y, sample_weight=None):
             self.constant_ = np.reshape(self.constant_, (1, -1))
         return self
 
-    def predict(self, X):
+    def predict(self, X, return_std=False):
         """
         Perform classification on test vectors X.
 
@@ -459,18 +459,29 @@ def predict(self, X):
             Input vectors, where n_samples is the number of samples
             and n_features is the number of features.
 
+        return_std : boolean, optional
+            Whether to return the standard deviation of posterior prediction.
+
         Returns
         -------
         y : array, shape = [n_samples] or [n_samples, n_outputs]
             Predicted target values for X.
+
+        y_std : array, shape = [n_samples] or [n_samples, n_outputs]
+            Standard deviation of predictive distribution of query points.
         """
         check_is_fitted(self, "constant_")
         X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
         n_samples = X.shape[0]
 
         y = np.ones((n_samples, 1)) * self.constant_
+        # the prediction is a constant, so its predictive std is zero
+        y_std = np.zeros((n_samples, 1))
 
         if self.n_outputs_ == 1 and not self.output_2d_:
             y = np.ravel(y)
+            y_std = np.ravel(y_std)
 
-        return y
+        if return_std:
+            return y, y_std
+        else:
+            return y
diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py
index cabbd469c10d4..6e3280e083dad 100644
--- a/sklearn/preprocessing/__init__.py
+++ b/sklearn/preprocessing/__init__.py
@@ -29,6 +29,7 @@
 from .label import MultiLabelBinarizer
 
 from .imputation import Imputer
+from .mice import MICEImputer
 
 
 __all__ = [
@@ -38,6 +39,7 @@
     'KernelCenterer',
     'LabelBinarizer',
     'LabelEncoder',
+    'MICEImputer',
     'MultiLabelBinarizer',
     'MinMaxScaler',
     'MaxAbsScaler',
diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
index e414e98f424df..2636375520489 100644
--- a/sklearn/preprocessing/imputation.py
+++ b/sklearn/preprocessing/imputation.py
@@ -166,6 +166,10 @@ def fit(self, X, y=None):
                                                 self.missing_values,
                                                 self.axis)
 
+        # cache the indexes of columns with at least one observed value;
+        # transform() and MICEImputer rely on these (statistics_ is only
+        # computed at fit time when axis == 0)
+        if self.axis == 0:
+            invalid_mask = np.isnan(self.statistics_)
+            valid_mask = np.logical_not(invalid_mask)
+            self._valid_statistics_indexes = np.where(valid_mask)[0]
+
         return self
 
     def _sparse_fit(self, X, strategy, missing_values, axis):
@@ -339,14 +343,13 @@ def transform(self, X):
         invalid_mask = np.isnan(statistics)
         valid_mask = np.logical_not(invalid_mask)
         valid_statistics = statistics[valid_mask]
-        valid_statistics_indexes = np.where(valid_mask)[0]
         missing = np.arange(X.shape[not self.axis])[invalid_mask]
 
         if self.axis == 0 and invalid_mask.any():
             if self.verbose:
                 warnings.warn("Deleting features without "
                               "observed values: %s" % missing)
-            X = X[:, valid_statistics_indexes]
+            X = X[:, self._valid_statistics_indexes]
         elif self.axis == 1 and invalid_mask.any():
             raise ValueError("Some rows only contain "
                              "missing values: %s" % missing)
diff --git a/sklearn/preprocessing/mice.py b/sklearn/preprocessing/mice.py
new file mode 100644
index 0000000000000..953f9ae399b7e
--- /dev/null
+++ b/sklearn/preprocessing/mice.py
@@ -0,0 +1,365 @@
+# Authors: Sergey Feldman
+# License: BSD 3 clause
+
+from copy import deepcopy
+from time import time
+
+import numpy as np
+
+from .imputation import _get_mask, Imputer
+from ..base import BaseEstimator, TransformerMixin
+from ..dummy import DummyRegressor
+from ..preprocessing import normalize
+from ..utils import check_array
+from ..utils.validation import check_is_fitted
+
+__all__ = [
+    'MICEImputer',
+]
+
+
+class MICEImputer(BaseEstimator, TransformerMixin):
+    """
+    Basic implementation of the MICE package from R.
+    This version assumes all of the columns are Gaussian.
+
+    Parameters
+    ----------
+    missing_values : integer or "NaN", optional (default="NaN")
+        The placeholder for the missing values. All occurrences of
+        missing_values will be imputed. For missing values encoded as
+        np.nan, use the string value "NaN".
+
+    imputation_order : str, optional (default="monotone")
+        The order in which the columns will be imputed.
+        "monotone" - From columns with fewest missing values to most.
+        "revmonotone" - From columns with most missing values to least.
+        "roman" - Left to right.
+        "arabic" - Right to left.
+        "random" - A random order for each round.
+
+    n_imputations : int, optional (default=100)
+        Number of MICE rounds to perform, the results of which will be
+        used in the final average.
+
+    n_burn_in : int, optional (default=10)
+        Number of initial MICE rounds to perform, the results of which
+        will not be returned.
+
+    n_nearest_columns : int, optional (default=np.infty)
+        Number of other columns to use to estimate the missing values of
+        the current column. Can provide significant speed-up
+        when the number of columns is huge.
+
+    initial_fill_method : str, optional (default="mean")
+        Valid values: {"mean", "median", "most_frequent"}.
+        Uses sklearn.preprocessing.Imputer.
+
+    min_value : float, optional (default=None)
+        Minimum possible imputed value.
+
+    max_value : float, optional (default=None)
+        Maximum possible imputed value.
+
+    verbose : boolean, optional (default=False)
+        Whether to provide updates on progress.
+
+    Notes
+    -----
+    - Columns which only contain missing values at `fit` are discarded upon
+      `transform`.
+
+    References
+    ----------
+
+    .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice:
+        Multivariate Imputation by Chained Equations in R". Journal of
+        Statistical Software 45: 1-67.
+        <https://www.jstatsoft.org/article/view/v045i03>`_
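+
+    Examples
+    --------
+    A minimal usage sketch on toy data (the imputed values depend on the
+    random draws, so the completed matrix is not shown)::
+
+        import numpy as np
+        from sklearn.preprocessing import MICEImputer
+
+        X = np.array([[1.0, 2.0], [3.0, np.nan], [5.0, 6.0]])
+        imputer = MICEImputer(n_imputations=20, n_burn_in=5)
+        X_completed = imputer.fit_transform(X)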
+    """
+
+    def __init__(
+            self,
+            missing_values='NaN',
+            imputation_order='monotone',
+            n_imputations=100,
+            n_burn_in=10,
+            n_nearest_columns=np.infty,
+            initial_fill_method="mean",
+            min_value=None,
+            max_value=None,
+            verbose=False):
+        from ..linear_model import BayesianRidge  # avoiding circular import
+        self.model = BayesianRidge()
+        self.missing_values = missing_values
+        self.imputation_order = imputation_order
+        self.n_imputations = n_imputations
+        self.n_burn_in = n_burn_in
+        self.n_nearest_columns = n_nearest_columns
+        self.initial_fill_method = initial_fill_method
+        self.min_value = min_value
+        self.max_value = max_value
+        self.verbose = verbose
+        self.trained_model_triplets = []
+
+    def _fill_in_one_column(self,
+                            X_filled,
+                            mask_missing_values,
+                            this_column,
+                            other_columns,
+                            model=None,
+                            min_std=1e-6):
+        """Impute one column, training a model for it if none is given.
+
+        Missing entries are drawn from the approximate predictive
+        posterior: a normal distribution with the model's predicted
+        mean and standard deviation.
+        """
+        # if nothing is missing, just return the default
+        if mask_missing_values[:, this_column].sum() == 0:
+            return X_filled, model
+        missing_row_mask = mask_missing_values[:, this_column]
+        if model is None:
+            X_train = X_filled[:, other_columns][~missing_row_mask]
+            y_train = X_filled[:, this_column][~missing_row_mask]
+            if np.std(y_train) > 0:
+                model = deepcopy(self.model)
+                model.fit(X_train, y_train)
+            else:
+                # constant column: fall back to a dummy model whose
+                # predictive std is zero
+                model = DummyRegressor()
+                model.fit(X_train, y_train)
+        X_test = X_filled[:, other_columns][missing_row_mask]
+        mus, sigmas = model.predict(X_test, return_std=True)
+        sigmas = np.maximum(sigmas, min_std)
+        imputed_values = np.random.normal(mus, sigmas)
+        imputed_values = self._clip(imputed_values)
+        X_filled[missing_row_mask, this_column] = imputed_values
+        return X_filled, model
+
+    def _get_other_columns(self,
+                           n_features,
+                           this_column,
+                           abs_correlation_matrix):
+        """
+        Get a list of other columns to predict this_column.
+
+        If self.n_nearest_columns is less than the total number of
+        columns, then use a probability proportional to the absolute
+        correlation between this_column and each other column to randomly
+        choose a subsample of the other columns (without replacement).
+
+        Parameters
+        ----------
+        n_features : integer
+            Number of columns in X.
+
+        this_column : integer
+            Index of the column currently being imputed.
+
+        abs_correlation_matrix : array-like, shape (n_features, n_features)
+            Absolute correlation matrix of X at the beginning of the current
+            round. The diagonal has been zeroed out and each column has been
+            normalized to sum to 1.
+        """
+        ordered_columns = np.arange(n_features)
+        if self.n_nearest_columns <= n_features - 1:
+            p = abs_correlation_matrix[:, this_column]
+            other_columns = np.random.choice(ordered_columns,
+                                             self.n_nearest_columns,
+                                             replace=False,
+                                             p=p)
+        else:
+            other_columns = np.concatenate([ordered_columns[:this_column],
+                                            ordered_columns[this_column + 1:]])
+        return other_columns
+
+    def _clip(self, X):
+        """
+        Clip values to fall within min/max constraints.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Input data, where "n_samples" is the number of samples and
+            "n_features" is the number of features.
+        """
+        X = np.asarray(X)
+        if self.min_value is not None:
+            X[X < self.min_value] = self.min_value
+        if self.max_value is not None:
+            X[X > self.max_value] = self.max_value
+        return X
+
+    def _get_ordered_indices(self, mask_missing_values):
+        """
+        Decide what order we will update the columns.
+
+        As a homage to the MICE R package, we offer four main options for
+        ordering the updates, and use a random order if anything else is
+        specified.
+
+        Also, this function filters out columns which have no missing values.
+
+        Parameters
+        ----------
+        mask_missing_values : array-like, shape (n_samples, n_features)
+            Input data's missing indicator matrix, where "n_samples" is the
+            number of samples and "n_features" is the number of features.
+ """ + n_samples, n_features = mask_missing_values.shape + fraction_of_missing_values = mask_missing_values.mean(axis=0) + all_features_indices = np.arange(n_features) + if self.imputation_order == 'roman': + ordered_indices = all_features_indices + elif self.imputation_order == 'arabic': + ordered_indices = all_features_indices[::-1] + elif self.imputation_order == 'monotone': + ordered_indices = np.argsort(fraction_of_missing_values)[::-1] + elif self.imputation_order == 'revmonotone': + ordered_indices = np.argsort(fraction_of_missing_values) + else: + ordered_indices = np.arange(n_features) + np.random.shuffle(ordered_indices) + + # filter out indices for which we have no missing values + valid_features = all_features_indices[fraction_of_missing_values > 0] + valid_features = set(valid_features) + ordered_indices = [i for i in ordered_indices if i in valid_features] + return ordered_indices + + def _get_abs_correlation_matrix(self, X_filled, eps=1e-6): + # at each stage all but one of the features is used as input + n_features = X_filled.shape[1] + if self.n_nearest_columns > n_features - 1: + return None + abs_correlation_matrix = np.abs(np.corrcoef(X_filled.T)) + # np.corrcoef is not defined for constant columns + abs_correlation_matrix[np.isnan(abs_correlation_matrix)] = eps + np.fill_diagonal(abs_correlation_matrix, 0) + abs_correlation_matrix = normalize(abs_correlation_matrix, norm='l1', + axis=0) + return abs_correlation_matrix + + def fit_transform(self, X, y=None): + """ + Fit the imputer on X and return the transformed X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + """ + X = check_array(X, dtype=np.float64, force_all_finite=False) + X = np.asarray(X, order="F") + mask_missing_values = _get_mask(X, self.missing_values) + + # initial imputation + self.initial_imputer_ = Imputer(missing_values=self.missing_values, + strategy=self.initial_fill_method, + axis=0) + X_filled = self.initial_imputer_.fit_transform(X) + self._val_inds = self.initial_imputer_._valid_statistics_indexes + X = X[:, self._val_inds] + mask_missing_values = mask_missing_values[:, self._val_inds] + + # perform imputations + n_samples, n_features = X_filled.shape + total_rounds = self.n_burn_in + self.n_imputations + results_list = [] + if self.verbose: + print("[MICE] Completing matrix with shape %s" % (X.shape,)) + start_t = time() + mice_msg = '[MICE] Ending imputation round ' + for m in range(total_rounds): + # order in which to impute + ordered_indices = self._get_ordered_indices(mask_missing_values) + + # abs_correlation matrix is used to choose a subset of other + # features to impute from f + abs_corr_mat = self._get_abs_correlation_matrix(X_filled) + + # Fill in each column in the order of ordered_indices + for this_column in ordered_indices: + other_columns = self._get_other_columns(n_features, + this_column, + abs_corr_mat) + X_filled, model = self._fill_in_one_column(X_filled, + mask_missing_values, + this_column, + other_columns) + model_triplet = (this_column, other_columns, model) + self.trained_model_triplets.append(model_triplet) + + if m >= self.n_burn_in: + results_list.append(X_filled[mask_missing_values]) + if self.verbose: + print(mice_msg + 'round %d/%d, elapsed time %0.2f' + % (m + 1, total_rounds, time() - start_t)) + + if len(results_list) > 0: + X[mask_missing_values] = np.array(results_list).mean(axis=0) + else: + X[mask_missing_values] = 
+
+    def transform(self, X):
+        """
+        Impute all missing values in X.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            The input data to complete.
+        """
+        check_is_fitted(self, 'initial_imputer_')
+        X = check_array(X, dtype=np.float64, force_all_finite=False)
+        X = np.asarray(X, order="F")
+        mask_missing_values = _get_mask(X, self.missing_values)
+
+        # initial imputation
+        X_filled = self.initial_imputer_.transform(X)
+        X = X[:, self._val_inds]
+        mask_missing_values = mask_missing_values[:, self._val_inds]
+
+        # perform imputations by replaying the models trained during fit
+        n_samples, n_features = X_filled.shape
+        total_rounds = self.n_burn_in + self.n_imputations
+        if total_rounds > 0:
+            total_iterations = len(self.trained_model_triplets)
+            # integer division so the modulo check below is exact; one model
+            # is trained per incomplete column per round
+            imputations_per_round = total_iterations // total_rounds
+            round_index = 0
+        results_list = []
+        if self.verbose:
+            print("[MICE] Completing matrix with shape %s" % (X.shape,))
+            start_t = time()
+            mice_msg = '[MICE] Ending imputation round '
+        for i, model_triplet in enumerate(self.trained_model_triplets):
+            this_column, other_columns, model = model_triplet
+            X_filled, _ = self._fill_in_one_column(X_filled,
+                                                   mask_missing_values,
+                                                   this_column,
+                                                   other_columns,
+                                                   model)
+            if (i + 1) % imputations_per_round == 0:
+                round_index += 1
+                # rounds 1..n_burn_in are burn-in; collect the rest,
+                # matching fit_transform
+                if round_index > self.n_burn_in:
+                    results_list.append(X_filled[mask_missing_values])
+                    if self.verbose:
+                        print(mice_msg + '%d/%d, elapsed time %0.2f'
+                              % (round_index, total_rounds, time() - start_t))
+
+        if total_rounds > 0 and len(results_list) > 0:
+            X[mask_missing_values] = np.array(results_list).mean(axis=0)
+        else:
+            X[mask_missing_values] = X_filled[mask_missing_values]
+
+        return X
+
+    def fit(self, X, y=None):
+        """
+        Fit the imputer on X and return self.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Input data, where "n_samples" is the number of samples and
+            "n_features" is the number of features.
+        """
+        self.fit_transform(X)
+        return self
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index cb23e0ba8a315..dd574d619b421 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -85,7 +85,7 @@ def _yield_non_meta_checks(name, Estimator):
     # cross-decomposition's "transform" returns X and Y
     yield check_pipeline_consistency
 
-    if name not in ['Imputer']:
+    if name not in ['Imputer', 'MICEImputer']:
         # Test that all estimators check their input for NaN's and infs
         yield check_estimators_nan_inf
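For reference, the column subsampling enabled by ``n_nearest_columns`` draws
neighbor columns with probability proportional to absolute correlation, as
``_get_other_columns`` documents. A standalone sketch of that sampling step
on toy data (not the estimator's internals)::

    import numpy as np
    from sklearn.preprocessing import normalize

    rng = np.random.RandomState(0)
    X_filled = rng.rand(50, 5)
    abs_corr = np.abs(np.corrcoef(X_filled.T))
    np.fill_diagonal(abs_corr, 0)            # a column never predicts itself
    abs_corr = normalize(abs_corr, norm='l1', axis=0)  # columns sum to 1
    this_column = 3
    neighbors = rng.choice(np.arange(5), 2, replace=False,
                           p=abs_corr[:, this_column])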