Missing values can be replaced by the mean, the median or the most frequent
value using the basic :class:`sklearn.impute.SimpleImputer`.
- The median is a more robust estimator for data with high magnitude variables
- which could dominate results (otherwise known as a 'long tail').

- With ``KNNImputer``, missing values can be imputed using the weighted
- or unweighted mean of the desired number of nearest neighbors.
+ In this example we will investigate different imputation techniques:

- Another option is the :class:`sklearn.impute.IterativeImputer`. This uses
- round-robin linear regression, treating every variable as an output in
- turn. The version implemented assumes Gaussian (output) variables. If your
- features are obviously non-Normal, consider transforming them to look more
- Normal so as to potentially improve performance.
+ - imputation by the constant value 0
+ - imputation by the mean value of each feature combined with a missing-ness
+   indicator auxiliary variable
+ - k nearest neighbor imputation
+ - iterative imputation
+
+ We will use two datasets: the Diabetes dataset, which consists of 10 feature
+ variables collected from diabetes patients with the aim of predicting disease
+ progression, and the California Housing dataset, for which the target is the
+ median house value for California districts.
+
+ As neither of these datasets has missing values, we will remove some values
+ to create new versions with artificially missing data. The performance of
+ :class:`~sklearn.ensemble.RandomForestRegressor` on the full original dataset
+ is then compared to the performance on the altered datasets with the
+ artificially missing values imputed using different techniques.

- In addition of using an imputing method, we can also keep an indication of the
- missing information using :func:`sklearn.impute.MissingIndicator` which might
- carry some information.

"""
print(__doc__)

+ # Authors: Maria Telenczuk <https://github.com/maikia>
+ # License: BSD 3 clause
+
+ ###############################################################################
+ # Download the data and make missing values sets
+ ################################################
+ #
+ # First we download the two datasets. The Diabetes dataset is shipped with
+ # scikit-learn. It has 442 entries, each with 10 features. The California
+ # Housing dataset is much larger, with 20640 entries and 8 features, and needs
+ # to be downloaded. We will only use the first 400 entries, for the sake of
+ # speeding up the calculations, but feel free to use the whole dataset.
+ #
+
import numpy as np
- import matplotlib.pyplot as plt

- # To use the experimental IterativeImputer, we need to explicitly ask for it:
- from sklearn.experimental import enable_iterative_imputer  # noqa
+ from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_diabetes
- from sklearn.datasets import load_boston
+
+
+ rng = np.random.RandomState(42)
+
+ X_diabetes, y_diabetes = load_diabetes(return_X_y=True)
+ X_california, y_california = fetch_california_housing(return_X_y=True)
+ X_california = X_california[:400]
+ y_california = y_california[:400]
+
+
+ def add_missing_values(X_full, y_full):
+     n_samples, n_features = X_full.shape
+
+     # Add missing values in 75% of the rows
+     missing_rate = 0.75
+     n_missing_samples = int(n_samples * missing_rate)
+
+     missing_samples = np.zeros(n_samples, dtype=bool)
+     missing_samples[:n_missing_samples] = True
+
+     rng.shuffle(missing_samples)
+     missing_features = rng.randint(0, n_features, n_missing_samples)
+     X_missing = X_full.copy()
+     X_missing[missing_samples, missing_features] = np.nan
+     y_missing = y_full.copy()
+
+     return X_missing, y_missing
+
+
+ X_miss_california, y_miss_california = add_missing_values(
+     X_california, y_california)
+
+ X_miss_diabetes, y_miss_diabetes = add_missing_values(
+     X_diabetes, y_diabetes)
+
+
+ ###############################################################################
+ # Impute the missing data and score
+ # #################################
+ # Now we will write a function which will score the results on the differently
+ # imputed data. Let's look at each imputer separately:
+ #
+
+ rng = np.random.RandomState(0)
+
from sklearn.ensemble import RandomForestRegressor
- from sklearn.pipeline import make_pipeline, make_union
- from sklearn.impute import (
-     SimpleImputer, KNNImputer, IterativeImputer, MissingIndicator)
+
+ # To use the experimental IterativeImputer, we need to explicitly ask for it:
+ from sklearn.experimental import enable_iterative_imputer  # noqa
+ from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.model_selection import cross_val_score
+ from sklearn.pipeline import make_pipeline

- rng = np.random.RandomState(0)

N_SPLITS = 5
- REGRESSOR = RandomForestRegressor(random_state=0)
+ regressor = RandomForestRegressor(random_state=0)
+
+ ###############################################################################
+ # Missing information
+ # -------------------
+ # In addition to imputing the missing values, the imputers have an
+ # `add_indicator` parameter that marks the values that were missing, which
+ # might carry some information.
+ #
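+
+ # As a minimal editorial sketch (toy array, not part of the benchmark below),
+ # `add_indicator=True` appends one binary column per feature that contained
+ # missing values:
+ X_demo = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, np.nan]])
+ print(SimpleImputer(strategy="mean", add_indicator=True).fit_transform(X_demo))
+ # The first two columns hold the mean-imputed features; the last two are
+ # binary flags marking where the NaNs were.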


def get_scores_for_imputer(imputer, X_missing, y_missing):
-     estimator = make_pipeline(
-         make_union(imputer, MissingIndicator(missing_values=0)),
-         REGRESSOR)
+     estimator = make_pipeline(imputer, regressor)
    impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                    scoring='neg_mean_squared_error',
                                    cv=N_SPLITS)
    return impute_scores


- def get_results(dataset):
-     X_full, y_full = dataset.data, dataset.target
-     n_samples = X_full.shape[0]
-     n_features = X_full.shape[1]
+ x_labels = ['Full data',
+             'Zero imputation',
+             'Mean Imputation',
+             'KNN Imputation',
+             'Iterative Imputation']
+
+ mses_california = np.zeros(5)
+ stds_california = np.zeros(5)
+ mses_diabetes = np.zeros(5)
+ stds_diabetes = np.zeros(5)
+
+ ###############################################################################
+ # Estimate the score
+ # ------------------
+ # First, we want to estimate the score on the original data:
+ #

-     # Estimate the score on the entire dataset, with no missing values
-     full_scores = cross_val_score(REGRESSOR, X_full, y_full,
+
+ def get_full_score(X_full, y_full):
+     full_scores = cross_val_score(regressor, X_full, y_full,
                                  scoring='neg_mean_squared_error',
                                  cv=N_SPLITS)
+     return full_scores.mean(), full_scores.std()

-     # Add missing values in 75% of the lines
-     missing_rate = 0.75
-     n_missing_samples = int(np.floor(n_samples * missing_rate))
-     missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
-                                           dtype=np.bool),
-                                  np.ones(n_missing_samples,
-                                          dtype=np.bool)))
-     rng.shuffle(missing_samples)
-     missing_features = rng.randint(0, n_features, n_missing_samples)
-     X_missing = X_full.copy()
-     X_missing[np.where(missing_samples)[0], missing_features] = 0
-     y_missing = y_full.copy()

-     # Estimate the score after replacing missing values by 0
-     imputer = SimpleImputer(missing_values=0,
-                             strategy='constant',
-                             fill_value=0)
+ mses_california[0], stds_california[0] = get_full_score(X_california,
+                                                         y_california)
+ mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes)
+
+
+ ###############################################################################
+ # Replace missing values by 0
+ # ---------------------------
+ #
+ # Now we will estimate the score on the data where the missing values are
+ # replaced by 0:
+ #
+
+
+ def get_impute_zero_score(X_missing, y_missing):
+
+     imputer = SimpleImputer(missing_values=np.nan, add_indicator=True,
+                             strategy='constant', fill_value=0)
    zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
+     return zero_impute_scores.mean(), zero_impute_scores.std()

-     # Estimate the score after imputation (mean strategy) of the missing values
-     imputer = SimpleImputer(missing_values=0, strategy="mean")
-     mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)

-     # Estimate the score after kNN-imputation of the missing values
-     imputer = KNNImputer(missing_values=0)
+ mses_california[1], stds_california[1] = get_impute_zero_score(
+     X_miss_california, y_miss_california)
+ mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(X_miss_diabetes,
+                                                            y_miss_diabetes)
+
+
+ ###############################################################################
+ # kNN-imputation of the missing values
+ # ------------------------------------
+ #
+ # :class:`sklearn.impute.KNNImputer` imputes missing values using the weighted
+ # or unweighted mean of the desired number of nearest neighbors.
+
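+ # As a small editorial illustration (toy array in the spirit of the class
+ # docstring, not part of the benchmark), with ``n_neighbors=2`` each NaN is
+ # filled with the mean of that feature in the two nearest samples:
+ X_demo = np.array([[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]])
+ print(KNNImputer(n_neighbors=2).fit_transform(X_demo))
+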
+ def get_impute_knn_score(X_missing, y_missing):
+     imputer = KNNImputer(missing_values=np.nan, add_indicator=True)
    knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
+     return knn_impute_scores.mean(), knn_impute_scores.std()

-     # Estimate the score after iterative imputation of the missing values
-     imputer = IterativeImputer(missing_values=0,
-                                random_state=0,
-                                n_nearest_features=5,
+
+ mses_california[2], stds_california[2] = get_impute_knn_score(
+     X_miss_california, y_miss_california)
+ mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score(X_miss_diabetes,
+                                                           y_miss_diabetes)
+
+
+ ###############################################################################
+ # Impute missing values with mean
+ # -------------------------------
+ #
+
+ def get_impute_mean(X_missing, y_missing):
+     imputer = SimpleImputer(missing_values=np.nan, strategy="mean",
+                             add_indicator=True)
+     mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
+     return mean_impute_scores.mean(), mean_impute_scores.std()
+
+
+ mses_california[3], stds_california[3] = get_impute_mean(X_miss_california,
+                                                          y_miss_california)
+ mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes,
+                                                      y_miss_diabetes)
+
+
+ ###############################################################################
+ # Iterative imputation of the missing values
+ # ------------------------------------------
+ #
+ # Another option is the :class:`sklearn.impute.IterativeImputer`. This uses
+ # round-robin linear regression, modeling each feature with missing values as
+ # a function of the other features, in turn.
+ # The version implemented assumes Gaussian (output) variables. If your
+ # features are obviously non-normal, consider transforming them to look more
+ # normal to potentially improve performance.
+ #
+
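+ # A minimal editorial sketch (toy values, default settings) of the idea: the
+ # second feature is roughly twice the first, and the round-robin regressions
+ # recover that relationship when filling in the NaNs.
+ X_demo = np.array([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]])
+ print(IterativeImputer(random_state=0).fit_transform(X_demo))
+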
+ def get_impute_iterative(X_missing, y_missing):
+     imputer = IterativeImputer(missing_values=np.nan, add_indicator=True,
+                                random_state=0, n_nearest_features=5,
                               sample_posterior=True)
    iterative_impute_scores = get_scores_for_imputer(imputer,
                                                     X_missing,
                                                     y_missing)
+     return iterative_impute_scores.mean(), iterative_impute_scores.std()

-     return ((full_scores.mean(), full_scores.std()),
-             (zero_impute_scores.mean(), zero_impute_scores.std()),
-             (mean_impute_scores.mean(), mean_impute_scores.std()),
-             (knn_impute_scores.mean(), knn_impute_scores.std()),
-             (iterative_impute_scores.mean(), iterative_impute_scores.std()))

+ mses_california[4], stds_california[4] = get_impute_iterative(
+     X_miss_california, y_miss_california)
+ mses_diabetes[4], stds_diabetes[4] = get_impute_iterative(X_miss_diabetes,
+                                                           y_miss_diabetes)

- results_diabetes = np.array(get_results(load_diabetes()))
- mses_diabetes = results_diabetes[:, 0] * -1
- stds_diabetes = results_diabetes[:, 1]
+ mses_diabetes = mses_diabetes * -1
+ mses_california = mses_california * -1
+
+ ###############################################################################
+ # Plot the results
+ # ################
+ #
+ # Finally we are going to visualize the score:
+ #
+
+ import matplotlib.pyplot as plt

- results_boston = np.array(get_results(load_boston()))
- mses_boston = results_boston[:, 0] * -1
- stds_boston = results_boston[:, 1]

n_bars = len(mses_diabetes)
xval = np.arange(n_bars)

- x_labels = ['Full data',
-             'Zero imputation',
-             'Mean Imputation',
-             'KNN Imputation',
-             'Iterative Imputation']
colors = ['r', 'g', 'b', 'orange', 'black']

# plot diabetes results
@@ -138,16 +271,20 @@ def get_results(dataset):
ax1.invert_yaxis()
ax1.set_yticklabels(x_labels)

- # plot boston results
+ # plot california dataset results
ax2 = plt.subplot(122)
for j in xval:
-     ax2.barh(j, mses_boston[j], xerr=stds_boston[j],
+     ax2.barh(j, mses_california[j], xerr=stds_california[j],
             color=colors[j], alpha=0.6, align='center')

- ax2.set_title('Imputation Techniques with Boston Data')
+ ax2.set_title('Imputation Techniques with California Data')
ax2.set_yticks(xval)
ax2.set_xlabel('MSE')
ax2.invert_yaxis()
ax2.set_yticklabels([''] * n_bars)

plt.show()
+
+ # You can also try different techniques. For instance, the median is a more
+ # robust estimator for data with high-magnitude variables, which could
+ # dominate results (otherwise known as a 'long tail').
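+ #
+ # As a hedged sketch (editorial addition, not part of the benchmark above),
+ # median imputation can be scored with the same helpers defined earlier;
+ # the scores are negated MSEs, so we flip the sign for display:
+ median_imputer = SimpleImputer(missing_values=np.nan, strategy="median",
+                                add_indicator=True)
+ median_scores = get_scores_for_imputer(median_imputer, X_miss_diabetes,
+                                        y_miss_diabetes)
+ print("Median imputation MSE on Diabetes: %.4f" % -median_scores.mean())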