diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst
index c886119e908c1..e13b6850d50eb 100644
--- a/doc/developers/contributing.rst
+++ b/doc/developers/contributing.rst
@@ -434,8 +434,9 @@ You can check for common programming errors with the following tools:
   must not produce new errors in your pull request. Using `# type: ignore`
   annotation can be a workaround for a few cases that are not supported by
   mypy, in particular,
-  - when importing C or Cython modules
-  - on properties with decorators
+
+  - when importing C or Cython modules
+  - on properties with decorators
 
 Bonus points for contributions that include a performance analysis with
 a benchmark script and profiling output (please report on the mailing
diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py
index 5186cf0ba3bac..2ba7dc05d16b6 100644
--- a/examples/impute/plot_missing_values.py
+++ b/examples/impute/plot_missing_values.py
@@ -5,122 +5,255 @@
 Missing values can be replaced by the mean, the median or the most frequent
 value using the basic :class:`sklearn.impute.SimpleImputer`.
 
-The median is a more robust estimator for data with high magnitude variables
-which could dominate results (otherwise known as a 'long tail').
-With ``KNNImputer``, missing values can be imputed using the weighted
-or unweighted mean of the desired number of nearest neighbors.
+In this example we will investigate different imputation techniques:
 
-Another option is the :class:`sklearn.impute.IterativeImputer`. This uses
-round-robin linear regression, treating every variable as an output in
-turn. The version implemented assumes Gaussian (output) variables. If your
-features are obviously non-Normal, consider transforming them to look more
-Normal so as to potentially improve performance.
+- imputation by the constant value 0
+- imputation by the mean value of each feature combined with a missingness
+  indicator auxiliary variable
+- k nearest neighbor imputation
+- iterative imputation
+
+We will use two datasets: the Diabetes dataset, which consists of 10 feature
+variables collected from diabetes patients with the aim of predicting disease
+progression, and the California Housing dataset, for which the target is the
+median house value for California districts.
+
+As neither of these datasets has missing values, we will remove some values
+to create new versions with artificially missing data. The performance of
+:class:`~sklearn.ensemble.RandomForestRegressor` on the full original dataset
+is then compared with its performance on the altered datasets, with the
+artificially missing values imputed using different techniques.
 
-In addition of using an imputing method, we can also keep an indication of the
-missing information using :func:`sklearn.impute.MissingIndicator` which might
-carry some information.
 """
 print(__doc__)
 
+# Authors: Maria Telenczuk
+# License: BSD 3 clause
+
+###############################################################################
+# Download the data and make missing values sets
+################################################
+#
+# First we download the two datasets. The Diabetes dataset is shipped with
+# scikit-learn. It has 442 entries, each with 10 features. The California
+# Housing dataset is much larger, with 20640 entries and 8 features, and needs
+# to be downloaded. We will only use the first 400 entries for the sake of
+# speeding up the calculations, but feel free to use the whole dataset.
+#
+
 import numpy as np
-import matplotlib.pyplot as plt
 
-# To use the experimental IterativeImputer, we need to explicitly ask for it:
-from sklearn.experimental import enable_iterative_imputer  # noqa
+from sklearn.datasets import fetch_california_housing
 from sklearn.datasets import load_diabetes
-from sklearn.datasets import load_boston
+
+
+rng = np.random.RandomState(42)
+
+X_diabetes, y_diabetes = load_diabetes(return_X_y=True)
+X_california, y_california = fetch_california_housing(return_X_y=True)
+X_california = X_california[:400]
+y_california = y_california[:400]
+
+
+def add_missing_values(X_full, y_full):
+    n_samples, n_features = X_full.shape
+
+    # Add missing values in 75% of the rows
+    missing_rate = 0.75
+    n_missing_samples = int(n_samples * missing_rate)
+
+    missing_samples = np.zeros(n_samples, dtype=bool)
+    missing_samples[: n_missing_samples] = True
+
+    rng.shuffle(missing_samples)
+    missing_features = rng.randint(0, n_features, n_missing_samples)
+    X_missing = X_full.copy()
+    X_missing[missing_samples, missing_features] = np.nan
+    y_missing = y_full.copy()
+
+    return X_missing, y_missing
+
+
+X_miss_california, y_miss_california = add_missing_values(
+    X_california, y_california)
+
+X_miss_diabetes, y_miss_diabetes = add_missing_values(
+    X_diabetes, y_diabetes)
+
+
+###############################################################################
+# Impute the missing data and score
+# #################################
+# Now we will write a function to score the results on the differently
+# imputed data. Let's look at each imputer separately:
+#
+
+rng = np.random.RandomState(0)
+
 from sklearn.ensemble import RandomForestRegressor
-from sklearn.pipeline import make_pipeline, make_union
-from sklearn.impute import (
-    SimpleImputer, KNNImputer, IterativeImputer, MissingIndicator)
+
+# To use the experimental IterativeImputer, we need to explicitly ask for it:
+from sklearn.experimental import enable_iterative_imputer  # noqa
+from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
 from sklearn.model_selection import cross_val_score
+from sklearn.pipeline import make_pipeline
 
-rng = np.random.RandomState(0)
 
 N_SPLITS = 5
-REGRESSOR = RandomForestRegressor(random_state=0)
+regressor = RandomForestRegressor(random_state=0)
+
+###############################################################################
+# Missing information
+# -------------------
+# In addition to imputing the missing values, the imputers have an
+# `add_indicator` parameter that marks the values that were missing, which
+# might carry some information.
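+#
+# A minimal sketch of what ``add_indicator=True`` does on a toy array (this
+# snippet is illustrative only and is not used in the rest of the example)::
+#
+#     import numpy as np
+#     from sklearn.impute import SimpleImputer
+#     X = np.array([[1, np.nan], [3, 4]])
+#     SimpleImputer(strategy="mean", add_indicator=True).fit_transform(X)
+#     # array([[1., 4., 1.],
+#     #        [3., 4., 0.]])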
+#
 
 
 def get_scores_for_imputer(imputer, X_missing, y_missing):
-    estimator = make_pipeline(
-        make_union(imputer, MissingIndicator(missing_values=0)),
-        REGRESSOR)
+    estimator = make_pipeline(imputer, regressor)
     impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                     scoring='neg_mean_squared_error',
                                     cv=N_SPLITS)
     return impute_scores
 
 
-def get_results(dataset):
-    X_full, y_full = dataset.data, dataset.target
-    n_samples = X_full.shape[0]
-    n_features = X_full.shape[1]
+x_labels = ['Full data',
+            'Zero imputation',
+            'KNN imputation',
+            'Mean imputation',
+            'Iterative imputation']
+
+mses_california = np.zeros(5)
+stds_california = np.zeros(5)
+mses_diabetes = np.zeros(5)
+stds_diabetes = np.zeros(5)
+
+###############################################################################
+# Estimate the score
+# ------------------
+# First, we want to estimate the score on the original data:
+#
 
-    # Estimate the score on the entire dataset, with no missing values
-    full_scores = cross_val_score(REGRESSOR, X_full, y_full,
+
+def get_full_score(X_full, y_full):
+    full_scores = cross_val_score(regressor, X_full, y_full,
                                   scoring='neg_mean_squared_error',
                                   cv=N_SPLITS)
+    return full_scores.mean(), full_scores.std()
 
-    # Add missing values in 75% of the lines
-    missing_rate = 0.75
-    n_missing_samples = int(np.floor(n_samples * missing_rate))
-    missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
-                                          dtype=np.bool),
-                                 np.ones(n_missing_samples,
-                                         dtype=np.bool)))
-    rng.shuffle(missing_samples)
-    missing_features = rng.randint(0, n_features, n_missing_samples)
-    X_missing = X_full.copy()
-    X_missing[np.where(missing_samples)[0], missing_features] = 0
-    y_missing = y_full.copy()
 
-    # Estimate the score after replacing missing values by 0
-    imputer = SimpleImputer(missing_values=0,
-                            strategy='constant',
-                            fill_value=0)
+mses_california[0], stds_california[0] = get_full_score(X_california,
+                                                        y_california)
+mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes)
+
+
+###############################################################################
+# Replace missing values by 0
+# ---------------------------
+#
+# Now we will estimate the score on the data where the missing values are
+# replaced by 0:
+#
+
+
+def get_impute_zero_score(X_missing, y_missing):
+    imputer = SimpleImputer(missing_values=np.nan, add_indicator=True,
+                            strategy='constant', fill_value=0)
     zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
+    return zero_impute_scores.mean(), zero_impute_scores.std()
 
-    # Estimate the score after imputation (mean strategy) of the missing values
-    imputer = SimpleImputer(missing_values=0, strategy="mean")
-    mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
 
-    # Estimate the score after kNN-imputation of the missing values
-    imputer = KNNImputer(missing_values=0)
+mses_california[1], stds_california[1] = get_impute_zero_score(
+    X_miss_california, y_miss_california)
+mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(X_miss_diabetes,
+                                                           y_miss_diabetes)
+
+
+###############################################################################
+# kNN-imputation of the missing values
+# ------------------------------------
+#
+# :class:`sklearn.impute.KNNImputer` imputes missing values using the weighted
+# or unweighted mean of the desired number of nearest neighbors.
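+#
+# For instance, with ``n_neighbors=2`` each missing entry is filled with the
+# mean of the values of its two nearest neighbors that have that feature
+# observed (a toy sketch adapted from the
+# :class:`~sklearn.impute.KNNImputer` docstring, not used below)::
+#
+#     import numpy as np
+#     from sklearn.impute import KNNImputer
+#     X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
+#     KNNImputer(n_neighbors=2).fit_transform(X)
+#     # array([[1. , 2. , 4. ],
+#     #        [3. , 4. , 3. ],
+#     #        [5.5, 6. , 5. ],
+#     #        [8. , 8. , 7. ]])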
+
+def get_impute_knn_score(X_missing, y_missing):
+    imputer = KNNImputer(missing_values=np.nan, add_indicator=True)
     knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
+    return knn_impute_scores.mean(), knn_impute_scores.std()
 
-    # Estimate the score after iterative imputation of the missing values
-    imputer = IterativeImputer(missing_values=0,
-                               random_state=0,
-                               n_nearest_features=5,
+
+mses_california[2], stds_california[2] = get_impute_knn_score(
+    X_miss_california, y_miss_california)
+mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score(X_miss_diabetes,
+                                                          y_miss_diabetes)
+
+
+###############################################################################
+# Impute missing values with mean
+# -------------------------------
+#
+# Here we replace the missing values by the mean of each feature:
+#
+
+def get_impute_mean(X_missing, y_missing):
+    imputer = SimpleImputer(missing_values=np.nan, strategy="mean",
+                            add_indicator=True)
+    mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
+    return mean_impute_scores.mean(), mean_impute_scores.std()
+
+
+mses_california[3], stds_california[3] = get_impute_mean(X_miss_california,
+                                                         y_miss_california)
+mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes,
+                                                     y_miss_diabetes)
+
+
+###############################################################################
+# Iterative imputation of the missing values
+# ------------------------------------------
+#
+# Another option is the :class:`sklearn.impute.IterativeImputer`. This uses
+# round-robin linear regression, modeling each feature with missing values as
+# a function of the other features, in turn.
+# The version implemented assumes Gaussian (output) variables. If your
+# features are obviously non-normal, consider transforming them to look more
+# normal to potentially improve performance.
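+#
+# A minimal sketch of its basic usage on a toy array (illustrative only,
+# independent of the rest of this example)::
+#
+#     import numpy as np
+#     from sklearn.experimental import enable_iterative_imputer  # noqa
+#     from sklearn.impute import IterativeImputer
+#     X = [[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]
+#     IterativeImputer(max_iter=10, random_state=0).fit_transform(X)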
+#
+
+def get_impute_iterative(X_missing, y_missing):
+    imputer = IterativeImputer(missing_values=np.nan, add_indicator=True,
+                               random_state=0, n_nearest_features=5,
                                sample_posterior=True)
     iterative_impute_scores = get_scores_for_imputer(imputer,
                                                      X_missing,
                                                      y_missing)
+    return iterative_impute_scores.mean(), iterative_impute_scores.std()
 
-    return ((full_scores.mean(), full_scores.std()),
-            (zero_impute_scores.mean(), zero_impute_scores.std()),
-            (mean_impute_scores.mean(), mean_impute_scores.std()),
-            (knn_impute_scores.mean(), knn_impute_scores.std()),
-            (iterative_impute_scores.mean(), iterative_impute_scores.std()))
 
+mses_california[4], stds_california[4] = get_impute_iterative(
+    X_miss_california, y_miss_california)
+mses_diabetes[4], stds_diabetes[4] = get_impute_iterative(X_miss_diabetes,
+                                                          y_miss_diabetes)
 
-results_diabetes = np.array(get_results(load_diabetes()))
-mses_diabetes = results_diabetes[:, 0] * -1
-stds_diabetes = results_diabetes[:, 1]
+mses_diabetes = mses_diabetes * -1
+mses_california = mses_california * -1
+
+###############################################################################
+# Plot the results
+# ################
+#
+# Finally we are going to visualize the scores:
+#
+
+import matplotlib.pyplot as plt
 
-results_boston = np.array(get_results(load_boston()))
-mses_boston = results_boston[:, 0] * -1
-stds_boston = results_boston[:, 1]
 
 n_bars = len(mses_diabetes)
 xval = np.arange(n_bars)
 
-x_labels = ['Full data',
-            'Zero imputation',
-            'Mean Imputation',
-            'KNN Imputation',
-            'Iterative Imputation']
 colors = ['r', 'g', 'b', 'orange', 'black']
 
 # plot diabetes results
@@ -138,16 +271,20 @@ def get_results(dataset):
 ax1.invert_yaxis()
 ax1.set_yticklabels(x_labels)
 
-# plot boston results
+# plot california dataset results
 ax2 = plt.subplot(122)
 for j in xval:
-    ax2.barh(j, mses_boston[j], xerr=stds_boston[j],
+    ax2.barh(j, mses_california[j], xerr=stds_california[j],
              color=colors[j], alpha=0.6, align='center')
-ax2.set_title('Imputation Techniques with Boston Data')
+ax2.set_title('Imputation Techniques with California Data')
 ax2.set_yticks(xval)
 ax2.set_xlabel('MSE')
 ax2.invert_yaxis()
 ax2.set_yticklabels([''] * n_bars)
 
 plt.show()
+
+# You can also try different techniques. For instance, the median is a more
+# robust estimator for data with high magnitude variables which could dominate
+# results (otherwise known as a 'long tail').
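+#
+# A median-based variant of the mean imputer used above would look like this
+# (a sketch, not evaluated in this example)::
+#
+#     SimpleImputer(missing_values=np.nan, strategy="median",
+#                   add_indicator=True)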