scikit-learn · thomasjpfan · Apr 28, 2020 · Feb 21, 2020 · Feb 21, 2020 · Feb 21, 2020
diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst
@@ -434,8 +434,9 @@ You can check for common programming errors with the following tools:
   must not produce new errors in your pull request. Using `# type: ignore`
   annotation can be a workaround for a few cases that are not supported by
   mypy, in particular,
-   - when importing C or Cython modules
-   - on properties with decorators
+
+  - when importing C or Cython modules
+  - on properties with decorators
 
 Bonus points for contributions that include a performance analysis with
 a benchmark script and profiling output (please report on the mailing

diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py
@@ -5,122 +5,255 @@
 
 Missing values can be replaced by the mean, the median or the most frequent
 value using the basic :class:`sklearn.impute.SimpleImputer`.
-The median is a more robust estimator for data with high magnitude variables
-which could dominate results (otherwise known as a 'long tail').
 
-With ``KNNImputer``, missing values can be imputed using the weighted
-or unweighted mean of the desired number of nearest neighbors.
+In this example we will investigate different imputation techniques:
 
-Another option is the :class:`sklearn.impute.IterativeImputer`. This uses
-round-robin linear regression, treating every variable as an output in
-turn. The version implemented assumes Gaussian (output) variables. If your
-features are obviously non-Normal, consider transforming them to look more
-Normal so as to potentially improve performance.
+- imputation by the constant value 0
+- imputation by the mean value of each feature combined with a missing-ness
+  indicator auxiliary variable
+- k nearest neighbor imputation
+- iterative imputation
+
+We will use two datasets: the Diabetes dataset, which consists of 10 feature
+variables collected from diabetes patients with the aim of predicting disease
+progression, and the California Housing dataset, for which the target is the
+median house value for California districts.
+
+As neither of these datasets has missing values, we will remove some
+values to create new versions with artificially missing data. The performance
+of
+:class:`~sklearn.ensemble.RandomForestRegressor` on the full original dataset
+is then compared to the performance on the altered datasets with the
+artificially missing values imputed using different techniques.
 
-In addition of using an imputing method, we can also keep an indication of the
-missing information using :func:`sklearn.impute.MissingIndicator` which might
-carry some information.
 """
 print(__doc__)
 
+# Authors: Maria Telenczuk  <https://github.com/maikia>
+# License: BSD 3 clause
+
+###############################################################################
+# Download the data and make missing values sets
+################################################
+#
+# First we download the two datasets. The Diabetes dataset is shipped with
+# scikit-learn. It has 442 entries, each with 10 features. The California
+# Housing dataset is much larger, with 20640 entries and 8 features, and needs
+# to be downloaded. We will only use its first 400 entries to speed up the
+# calculations, but feel free to use the whole dataset.
+#
+
 import numpy as np
-import matplotlib.pyplot as plt
 
-# To use the experimental IterativeImputer, we need to explicitly ask for it:
-from sklearn.experimental import enable_iterative_imputer  # noqa
+from sklearn.datasets import fetch_california_housing
 from sklearn.datasets import load_diabetes
-from sklearn.datasets import load_boston
+
+
+rng = np.random.RandomState(42)
+
+X_diabetes, y_diabetes = load_diabetes(return_X_y=True)
+X_california, y_california = fetch_california_housing(return_X_y=True)
+X_california = X_california[:400]
+y_california = y_california[:400]
+
+
+def add_missing_values(X_full, y_full):
+    """Return copies of (X_full, y_full) with NaN in 75% of the rows of X."""
+    n_samples, n_features = X_full.shape
+    # Blank out one randomly drawn feature in each of 75% of the rows
+    missing_rate = 0.75
+    n_missing_samples = int(n_samples * missing_rate)
+    # dtype=bool is portable: the np.bool alias is absent from NumPy 1.24-1.26
+    missing_samples = np.zeros(n_samples, dtype=bool)
+    missing_samples[: n_missing_samples] = True
+    # Rows and features are both drawn from the module-level ``rng``
+    rng.shuffle(missing_samples)
+    missing_features = rng.randint(0, n_features, n_missing_samples)
+    X_missing = X_full.copy()
+    X_missing[missing_samples, missing_features] = np.nan
+    y_missing = y_full.copy()
+
+    return X_missing, y_missing
+
+
+X_miss_california, y_miss_california = add_missing_values(
+    X_california, y_california)
+
+X_miss_diabetes, y_miss_diabetes = add_missing_values(
+    X_diabetes, y_diabetes)
+
+
+###############################################################################
+# Impute the missing data and score
+# #################################
+# Now we will write a function which will score the results on the differently
+# imputed data. Let's look at each imputer separately:
+#
+
+rng = np.random.RandomState(0)
+
 from sklearn.ensemble import RandomForestRegressor
-from sklearn.pipeline import make_pipeline, make_union
-from sklearn.impute import (
-    SimpleImputer, KNNImputer, IterativeImputer, MissingIndicator)
+
+# To use the experimental IterativeImputer, we need to explicitly ask for it:
+from sklearn.experimental import enable_iterative_imputer  # noqa
+from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
 from sklearn.model_selection import cross_val_score
+from sklearn.pipeline import make_pipeline
 
-rng = np.random.RandomState(0)
 
 N_SPLITS = 5
-REGRESSOR = RandomForestRegressor(random_state=0)
+regressor = RandomForestRegressor(random_state=0)
+
+###############################################################################
+# Missing information
+# -------------------
+# In addition to imputing the missing values, the imputers have an
+# `add_indicator` parameter that marks the values that were missing, which
+# might carry some information.
+#
 
 
 def get_scores_for_imputer(imputer, X_missing, y_missing):
-    estimator = make_pipeline(
-        make_union(imputer, MissingIndicator(missing_values=0)),
-        REGRESSOR)
+    estimator = make_pipeline(imputer, regressor)
     impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                     scoring='neg_mean_squared_error',
                                     cv=N_SPLITS)
     return impute_scores
 
 
-def get_results(dataset):
-    X_full, y_full = dataset.data, dataset.target
-    n_samples = X_full.shape[0]
-    n_features = X_full.shape[1]
+x_labels = ['Full data',
+            'Zero imputation',
+            'KNN Imputation',   # slot 2 receives the KNN scores
+            'Mean Imputation',  # slot 3 receives the mean-imputation scores
+            'Iterative Imputation']
+
+mses_california = np.zeros(5)
+stds_california = np.zeros(5)
+mses_diabetes = np.zeros(5)
+stds_diabetes = np.zeros(5)
+
+###############################################################################
+# Estimate the score
+# ------------------
+# First, we want to estimate the score on the original data:
+#
 
-    # Estimate the score on the entire dataset, with no missing values
-    full_scores = cross_val_score(REGRESSOR, X_full, y_full,
+
+def get_full_score(X_full, y_full):
+    full_scores = cross_val_score(regressor, X_full, y_full,
                                   scoring='neg_mean_squared_error',
                                   cv=N_SPLITS)
+    return full_scores.mean(), full_scores.std()
 
-    # Add missing values in 75% of the lines
-    missing_rate = 0.75
-    n_missing_samples = int(np.floor(n_samples * missing_rate))
-    missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
-                                          dtype=np.bool),
-                                 np.ones(n_missing_samples,
-                                         dtype=np.bool)))
-    rng.shuffle(missing_samples)
-    missing_features = rng.randint(0, n_features, n_missing_samples)
-    X_missing = X_full.copy()
-    X_missing[np.where(missing_samples)[0], missing_features] = 0
-    y_missing = y_full.copy()
 
-    # Estimate the score after replacing missing values by 0
-    imputer = SimpleImputer(missing_values=0,
-                            strategy='constant',
-                            fill_value=0)
+mses_california[0], stds_california[0] = get_full_score(X_california,
+                                                        y_california)
+mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes)
+
+
+###############################################################################
+# Replace missing values by 0
+# ---------------------------
+#
+# Now we will estimate the score on the data where the missing values are
+# replaced by 0:
+#
+
+
+def get_impute_zero_score(X_missing, y_missing):
+
+    imputer = SimpleImputer(missing_values=np.nan, add_indicator=True,
+                            strategy='constant', fill_value=0)
     zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
+    return zero_impute_scores.mean(), zero_impute_scores.std()
 
-    # Estimate the score after imputation (mean strategy) of the missing values
-    imputer = SimpleImputer(missing_values=0, strategy="mean")
-    mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
 
-    # Estimate the score after kNN-imputation of the missing values
-    imputer = KNNImputer(missing_values=0)
+mses_california[1], stds_california[1] = get_impute_zero_score(
+    X_miss_california, y_miss_california)
+mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(X_miss_diabetes,
+                                                           y_miss_diabetes)
+
+
+###############################################################################
+# kNN-imputation of the missing values
+# ------------------------------------
+#
+# :class:`sklearn.impute.KNNImputer` imputes missing values using the weighted
+# or unweighted mean of the desired number of nearest neighbors.
+
+def get_impute_knn_score(X_missing, y_missing):
+    imputer = KNNImputer(missing_values=np.nan, add_indicator=True)
     knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
+    return knn_impute_scores.mean(), knn_impute_scores.std()
 
-    # Estimate the score after iterative imputation of the missing values
-    imputer = IterativeImputer(missing_values=0,
-                               random_state=0,
-                               n_nearest_features=5,
+
+mses_california[2], stds_california[2] = get_impute_knn_score(
+    X_miss_california, y_miss_california)
+mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score(X_miss_diabetes,
+                                                          y_miss_diabetes)
+
+
+###############################################################################
+# Impute missing values with mean
+# -------------------------------
+#
+
+def get_impute_mean(X_missing, y_missing):
+    imputer = SimpleImputer(missing_values=np.nan, strategy="mean",
+                            add_indicator=True)
+    mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
+    return mean_impute_scores.mean(), mean_impute_scores.std()
+
+
+mses_california[3], stds_california[3] = get_impute_mean(X_miss_california,
+                                                         y_miss_california)
+mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes,
+                                                     y_miss_diabetes)
+
+
+###############################################################################
+# Iterative imputation of the missing values
+# ------------------------------------------
+#
+# Another option is the :class:`sklearn.impute.IterativeImputer`. This uses
+# round-robin linear regression, modeling each feature with missing values as a
+# function of other features, in turn.
+# The version implemented assumes Gaussian (output) variables. If your features
+# are obviously non-normal, consider transforming them to look more normal
+# to potentially improve performance.
+#
+
+def get_impute_iterative(X_missing, y_missing):
+    imputer = IterativeImputer(missing_values=np.nan, add_indicator=True,
+                               random_state=0, n_nearest_features=5,
                                sample_posterior=True)
     iterative_impute_scores = get_scores_for_imputer(imputer,
                                                      X_missing,
                                                      y_missing)
+    return iterative_impute_scores.mean(), iterative_impute_scores.std()
 
-    return ((full_scores.mean(), full_scores.std()),
-            (zero_impute_scores.mean(), zero_impute_scores.std()),
-            (mean_impute_scores.mean(), mean_impute_scores.std()),
-            (knn_impute_scores.mean(), knn_impute_scores.std()),
-            (iterative_impute_scores.mean(), iterative_impute_scores.std()))
 
+mses_california[4], stds_california[4] = get_impute_iterative(
+    X_miss_california, y_miss_california)
+mses_diabetes[4], stds_diabetes[4] = get_impute_iterative(X_miss_diabetes,
+                                                          y_miss_diabetes)
 
-results_diabetes = np.array(get_results(load_diabetes()))
-mses_diabetes = results_diabetes[:, 0] * -1
-stds_diabetes = results_diabetes[:, 1]
+mses_diabetes = mses_diabetes * -1
+mses_california = mses_california * -1
+
+###############################################################################
+# Plot the results
+# ################
+#
+# Finally we are going to visualize the score:
+#
+
+import matplotlib.pyplot as plt
 
-results_boston = np.array(get_results(load_boston()))
-mses_boston = results_boston[:, 0] * -1
-stds_boston = results_boston[:, 1]
 
 n_bars = len(mses_diabetes)
 xval = np.arange(n_bars)
 
-x_labels = ['Full data',
-            'Zero imputation',
-            'Mean Imputation',
-            'KNN Imputation',
-            'Iterative Imputation']
 colors = ['r', 'g', 'b', 'orange', 'black']
 
 # plot diabetes results
@@ -138,16 +271,20 @@ def get_results(dataset):
 ax1.invert_yaxis()
 ax1.set_yticklabels(x_labels)
 
-# plot boston results
+# plot california dataset results
 ax2 = plt.subplot(122)
 for j in xval:
-    ax2.barh(j, mses_boston[j], xerr=stds_boston[j],
+    ax2.barh(j, mses_california[j], xerr=stds_california[j],
              color=colors[j], alpha=0.6, align='center')
 
-ax2.set_title('Imputation Techniques with Boston Data')
+ax2.set_title('Imputation Techniques with California Data')
 ax2.set_yticks(xval)
 ax2.set_xlabel('MSE')
 ax2.invert_yaxis()
 ax2.set_yticklabels([''] * n_bars)
 
 plt.show()
+
+# You can also try different techniques. For instance, the median is a more
+# robust estimator for data with high magnitude variables which could dominate
+# results (otherwise known as a 'long tail').