[WIP] Basic version of MICE Imputation by sergeyf · Pull Request #8465 · scikit-learn/scikit-learn · GitHub

Closed
wants to merge 447 commits
Commits (447)
e48a48d
[MRG+1] DOC adding info about circleci build artifacts (#7855)
dalmia Nov 29, 2016
86cee5c
BUG: for several datasets, ``download_if_missing`` keyword was ignore…
rgommers Nov 29, 2016
2bee348
[MRG+1] DOC adding a warning on the relation between C and alpha (#7860)
dalmia Nov 29, 2016
4105ea7
Fix tests on numpy master (#7946)
lesteve Nov 29, 2016
187450a
[MRG+2] Fix K Means init center bug (#7872)
jkarno Nov 30, 2016
a8effcc
[MRG+1] Add new regression metric - Mean Squared Log Error (#7655)
Nov 30, 2016
44e7488
[MRG + 1] DOC refer to code elements in nested CV example description…
jnothman Nov 30, 2016
599b186
DOC: add bug fix for ``download_if_missing`` behavior to whatsnew. (#…
rgommers Dec 1, 2016
4b4255e
[MRG] Mention keras can run on top of TensorFlow (#7957)
nixtish Dec 1, 2016
7b55d0a
[MRG+2] Adding return_std options for models in linear_model/bayes.py…
sergeyf Dec 1, 2016
371b024
Added 1/2 factor to SSE alpha term (#7962)
FERRIA Dec 2, 2016
815aac5
Harmonized README, added link. (#7965)
habi Dec 2, 2016
596e0d0
added random_state=0 to many instances (#7968)
chenhe95 Dec 2, 2016
ae6b284
[MRG+1] Fix estimators to work if sample_weight parameter is pandas S…
kathyxchen Dec 3, 2016
76c65ee
[MRG+1] Fix confusion matrix example code (#7971)
rashchedrin Dec 3, 2016
0f3af24
Fix version comparison for the numpy 1.12 beta (#7902)
willduan Dec 5, 2016
bf8231f
MAINT remove superflous repo unshallowing in flake8_diff.sh
lesteve Dec 5, 2016
a7f25aa
Adding Columbia logo to sponsors listing (#7964)
amueller Dec 5, 2016
8148aca
DOC Fix typo in plot_unveil_tree_structure (#7988)
bradysalz Dec 6, 2016
7345a6f
[MRG+1] Added override of fit_transform to LabelBinarizer (#7670)
kgilliam125 Dec 6, 2016
7817683
docs(MLPClassifier): add multi-label support in fit docstring and rem…
alexandercbooth Dec 6, 2016
3a0df7f
[MRG + 1] ENH Do not materialise CV splits when unnecessary (#7941)
raghavrv Dec 7, 2016
0ac4bb4
CI report which doc files were likely affected (#7938)
jnothman Dec 7, 2016
49ecb97
[MRG + 1] FIX bug where passing numpy array for weights raises error …
vincentpham1991 Dec 8, 2016
9efc0fd
[MRG+1] BUG: adding check for ipython notebook (#7924)
dalmia Dec 8, 2016
940224a
fixed error in documentation (#8014)
vincentpham1991 Dec 8, 2016
c76d2e4
[MRG + 1] DOC comment on measures in classification_report (#7897)
jnothman Dec 8, 2016
30b9cfa
FIX raise AttributeError in SVC.coef_ for proper duck-typing (#8009)
amueller Dec 9, 2016
cbb5ae0
Revert "CI report which doc files were likely affected (#7938)"
amueller Dec 9, 2016
80a8f13
MAINT use sphinx 1.4 to build the doc
lesteve Dec 9, 2016
eb25bf3
[MRG+1] Housekeeping Deprecations for v0.19 (#7927)
amueller Dec 9, 2016
7e3edf9
CI full doc build only for examples; flag to force quick build (#7950)
jnothman Dec 10, 2016
f123812
CI report which doc files were likely affected (#8032)
jnothman Dec 10, 2016
1865071
DOC fix copy-paste error (#8037)
ohld Dec 11, 2016
172853d
TST Ensure that attributes ending _ are not set in __init__ (#7464)
lesteve Dec 12, 2016
27fa08e
[MRG + 1] Fix failure on numpy master (#8011)
aashil Dec 12, 2016
6ff493e
[MRG+1] Add multiplicative-update solver in NMF, with all beta-diverg…
TomDLT Dec 12, 2016
c2f2bbf
FIX .format arguments were in the wrong order
lesteve Dec 13, 2016
4e124de
left-over deprecation of 1d X (#8045)
amueller Dec 13, 2016
06396ef
[MRG + 1] CI some improvements to the flake8 CI (#8036)
jnothman Dec 13, 2016
b825e84
[MRG] Set min_impurity_split in gradient boosting models (#8007)
sebp Dec 13, 2016
8d7cd88
Use 1.0 not 1 in error message regarding float value
jnothman Dec 12, 2016
5b9010a
DOC add CI details and commands to contributor guide (#8024)
alexandercbooth Dec 15, 2016
f6e93d5
DOC Update LOF.fit_predict() (#8059)
Don86 Dec 15, 2016
0a8c90e
TST fix test case which should ensure empty row (#8056)
jnothman Dec 15, 2016
e180ce6
[MRG+2] ENH add n_jobs to make_union through kwargs (#8031)
alexandercbooth Dec 15, 2016
edd17d2
DOC adding note regarding bessel correction in PCA (#7843)
dalmia Dec 15, 2016
2474f55
Fix plot_svm_margin example plots (#8051)
Dec 16, 2016
208d1fd
DOC fix broken link in carousel
lesteve Dec 19, 2016
2ee48be
[MRG + 1] Reformat the version info and cite us labels in the user-gu…
aashil Dec 19, 2016
7ef8687
[MRG + 1] Fix reference in fetch_kddcup99 (#8071)
b-carter Dec 19, 2016
f6d95d4
[MRG + 1] Issue#8062: JoblibException thrown when passing "fit_params…
xor Dec 19, 2016
0d94be1
[MRG + 1] Fix perplexity method by adding _unnormalized_transform met…
garyForeman Dec 20, 2016
1686565
[MRG+1] allow callable kernels in cross-validation (#8005)
amueller Dec 20, 2016
4fcfe90
DOC Fix doc for CountVectorizer class. (#8085)
aashil Dec 20, 2016
537d022
DOC clarify logisticregression n_jobs param (#8083)
rasbt Dec 20, 2016
e7e5958
CI fix bug in getting changed docs when no sklearn/ files modified
jnothman Dec 20, 2016
4aca8b1
DOC Document _changed.html in contrib docs
jnothman Dec 20, 2016
8b97271
DOC Restructure the version info in the docs to fit in two lines. (#8…
aashil Dec 20, 2016
8c18348
FIX check_array's accept_sparse param now takes true/false/str/list, …
jkarno Dec 20, 2016
8ad37df
DOC Fix output shape in doc for OrthogonalMatchingPursuit (#8091)
weijianzz Dec 20, 2016
1c7be1c
[MRG + 2] Allow f_regression to accept a sparse matrix with centering…
acadiansith Dec 20, 2016
621c308
DOC Improve benchmark on NMF (#5779)
TomDLT Dec 20, 2016
6fc51cc
CI limit diff to commit range in flake8_diff.sh (#8097)
jnothman Dec 21, 2016
28248a6
DOC: Fix the documentation of scoring LogisticCV (#8099)
GaelVaroquaux Dec 22, 2016
b6c2f80
[MRG+1] Corrected sign error in QuantileLossFunction (#6429)
AlexisMignon Dec 22, 2016
ec91436
[MRG+1] Return list instead of 3d array for MultiOutputClassifier.pre…
pjbull Dec 22, 2016
2d72037
[MRG + 1] Add changelog entry for MSLE implemented in #7655. (#8104)
Dec 23, 2016
f93a824
DOC fix link in what's new
jnothman Dec 23, 2016
9b2c315
DOC Note how ariddell/lda differs from sckit-learn's LDA (#5553)
ariddell Dec 27, 2016
e75dce9
COSMIT PEP257
jnothman Dec 27, 2016
223c8c6
[MRG + 1] MAINT Move heapify_up/heapify_down into PriorityHeap as cla…
nelson-liu Dec 27, 2016
f2e5c1d
DOC Fix help link on about page (#8119)
kluangkote Dec 27, 2016
050fd83
[MRG+2] FIX IsolationForest(max_features=0.8).predict(X) fails input …
IshankGulati Dec 27, 2016
a9e03a6
DOC Fix indentation errors and username links (#8121)
kluangkote Dec 27, 2016
3edad83
[MRG] MAINT Python 3.6 fixes (#8123)
ogrisel Dec 27, 2016
4a90032
[MRG+3] Fused types for MultiTaskElasticNet (#8061)
tguillemot Dec 28, 2016
92cfc05
DOC add sklearn-crfsuite to related projects (#7878)
kmike Dec 28, 2016
1efb1e3
[MRG+1] Catch cases for different class size in MLPClassifier with wa…
vincentpham1991 Dec 29, 2016
6b267c0
FIX Split data using _safe_split in _permutaion_test_score (#5697)
Dec 29, 2016
3c37ecb
DOC Fix typo in FAQ (#8132)
kluangkote Dec 29, 2016
bb21e03
[MRG] update copyright years for 2017 (#8138)
nelson-liu Jan 1, 2017
3d6c012
[MRG+1] Fix "cite us" link in sidebar (#8142)
naoyak Jan 2, 2017
406a629
[MRG+1] Add DBSCAN support for additional metric params (#8139)
naoyak Jan 2, 2017
2fa1b0e
[MRG+1] fowlkes_mallows_score: more unit tests (Fixes #8101) (#8140)
devanshdalal Jan 3, 2017
92b9892
DOC: updating GridSearchCV's n_jobs parameter (#8106)
accraze Jan 4, 2017
2edc335
[MRG+1] Deprecate ridge_alpha param on SparsePCA.transform() (#8137)
naoyak Jan 4, 2017
380d92d
FIX sphinx gallery rendering of plot_digits_pipe example
ogrisel Jan 4, 2017
167a2b1
[MRG+1] DOC: complete list of online learners (#8152)
GaelVaroquaux Jan 4, 2017
61560fd
[MRG+2] Avoid failure in first iteration of RANSAC regression (#7914)
mthorrell Jan 5, 2017
62fd734
[MRG] FIX Avoid default mutable argument in constructor of Agglomerat…
glemaitre Jan 5, 2017
544abb2
[MRG + 1] add partial_fit to multioutput module (#8054)
yupbank Jan 5, 2017
d31585a
[MRG + 1] Add fowlkess-mallows and other supervised cluster metrics t…
raghavrv Jan 6, 2017
28fbfc8
Fix Ridge floating point instability (#8154)
lesteve Jan 7, 2017
eedc223
DOC Fix link (#8171)
mrbeann Jan 7, 2017
47b03e3
[MRG + 1] Fix the cross_val_predict function for method='predict_prob…
dalmia Jan 7, 2017
0f6fd76
fixing typo in cs_mse_path_ deprecation (#8176)
perimosocordiae Jan 8, 2017
9c562a9
Clarify error message for min_samples_split. (#8167)
mikebenfield Jan 8, 2017
6fc3983
Upgrade html documentation to jQuery v3.1.1 (#8145)
naoyak Jan 9, 2017
a75a0d1
removed stray space in '__main__ ' (#8203)
BasilBeirouti Jan 15, 2017
904fcb2
DOC additional fixes to 20 newsgroups to prevent TypeError (#8204)
BasilBeirouti Jan 15, 2017
76d1494
DOC add missing parentheses in TfidfTrasnformer docstring
jnothman Jan 16, 2017
5aadcb4
TRAVIS fix flake8_diff.sh check_files (#8208)
lesteve Jan 16, 2017
c43f5a7
[MRG+1] Fixes #8198 - error in datasets.make_moons (#8199)
levy5674 Jan 17, 2017
1319f9b
[MRG + 2] [MAINT] Update to Sphinx-Gallery 0.1.7 (#7986)
Titan-C Jan 17, 2017
c14c717
[MRG+1] Add prominent mention of Laplacian Eigenmaps (#8155)
samsontmr Jan 18, 2017
0414302
MNT/BLD Use GitHub's merge refs to test PRs on CircleCI (#8211)
jakirkham Jan 18, 2017
6868707
FIX Ensure coef_ is an ndarray when fitting LassoLars (#8160)
perimosocordiae Jan 18, 2017
4506bcd
[MRG+3] FIX Memory leak in MAE; Use safe_realloc; Acquire GIL only wh…
raghavrv Jan 18, 2017
b982dde
Call sorted on lfw folder path contents (#7648)
campustrampus Jan 19, 2017
4b1287e
FIX Issue #8173 - pass n_neighbors in MI computation (#8181)
glemaitre Jan 19, 2017
b831a49
TST/FIX Add check for estimator: parameters not modified by `fit` (#7…
kiote Jan 20, 2017
4642af2
[MRG] #8218: in FAQ, link deep learning question to GPU question (#8220)
vincentpham1991 Jan 20, 2017
d3b73e0
CI remove obsolete comment
jnothman Jan 22, 2017
568c998
ENH warn in classification_report when target_names doesn't equal lab…
Jan 24, 2017
921abba
[MRG] Fix aesthetic example roc crossval (#8232)
glemaitre Jan 25, 2017
6bfe0a6
Test sphinx extensions doctests only on Circle. (#8228)
lesteve Jan 25, 2017
738ddcb
TST Change rstrip() to truncation in test function (#8237)
pganssle Jan 26, 2017
280591f
DOC Fixing a bug where entropy included labeled items (#8150)
mdezube Jan 28, 2017
778cdbb
Incorrect number of samples in One Hot Encoder example (#8255)
davidrobles Feb 1, 2017
1a253f1
[MRG] make the ransac example slightly more terse, improve range of p…
amueller Feb 1, 2017
31c4d18
Cosmetic changes to rigde path example (#8260)
rishikksh20 Feb 1, 2017
1d1b360
DOC structure for related projects (#8257)
jnothman Feb 1, 2017
1d71a59
docs: related_projects.rst: fixes xgboost link (#8270)
manu-chroma Feb 2, 2017
c828ef1
MAINT add Python 3.6 classifier in setup.py
lesteve Feb 2, 2017
57275ff
TST: added test that sample_weight can be a list (#8261)
dalmia Feb 3, 2017
5b9b101
[MRG] Remove DeprecationWarnings in examples due to using floats inst…
dalmia Feb 3, 2017
bc15dc6
[MRG] loss function plot y-label slightly confusing (#8283)
Akshay0724 Feb 6, 2017
1913443
DOC more explicit guidelines for WIP (#8299)
jnothman Feb 6, 2017
c7fe965
[MRG+1] Fix bench_rcv1_logreg_convergence.py by adding get_max_square…
Feb 7, 2017
0299764
[MRG+1] Refactor birch-documentation (#8298)
MechCoder Feb 7, 2017
69a4a59
[MRG] Diabetes example with GridSearchCV (#8268)
rishikksh20 Feb 7, 2017
d3f7b30
DOC add missing release date
jnothman Feb 7, 2017
b96c0d8
[MRG+1] Enable codecov for coverage report (#8311)
rishikksh20 Feb 8, 2017
049f4e3
Added Zopa testimonial (#8309)
vlasisva Feb 8, 2017
0e70e6a
DOC: Remove superfluous assignment in tutorial. issue #8285 (#8314)
seanpwilliams Feb 8, 2017
a85943c
[MRG+1] Remove the MLComp text categorization example (#8264)
rth Feb 8, 2017
5ecf187
FIX Add a missing space to an exception message in resample function …
chkoar Feb 9, 2017
4be5dbc
[MRG+1] Accept keyword parameters to hyperparameter search fit method…
Feb 9, 2017
42d58e4
[MRG+1] Add classes_ parameter to hyperparameter CV classes (#8295)
Feb 10, 2017
aa44d7c
Add sample_weight parameter to cohen_kappa_score (#8335)
vpoughon Feb 10, 2017
4fd2459
Remove redefinition of k_fold in model_selection.rst (#8330)
asishm Feb 11, 2017
133b305
spelling mistake (#8341)
anshbansal Feb 12, 2017
0ad838e
DOC Updated documentation for scoring parameter (#8346)
vivekk0903 Feb 13, 2017
a526c3c
[MRG+2] ENH: used SelectorMixin in BaseRandomizedLinearModel (#8263)
dalmia Feb 13, 2017
68099a2
[MRG+3] ENH Caching Pipeline by memoizing transformer (#7990)
glemaitre Feb 13, 2017
84c8c14
DOC: added explanation for LARS (#8310)
dalmia Feb 13, 2017
6266bba
DOC add example regarding feature scaling (#7912)
tylerlanigan Feb 13, 2017
215edc7
[MRG+1] Fix description of l1_ratio for MultiTaskElasticNet (#8343)
tguillemot Feb 13, 2017
aba9cdf
Fix tests on numpy master (#8355)
lesteve Feb 15, 2017
ae1965c
Change "observations" to "features" in description of LassoLarsCV (#8…
Feb 15, 2017
7be0c9e
TRAVIS revert flake8 version to 2.5.1
lesteve Feb 16, 2017
4e70bfa
DOC add missing bugfix to what's new
jnothman Feb 16, 2017
05ef8ab
FIX/MAINT: update my mail etc (#8375)
dengemann Feb 16, 2017
3a4d1d6
[MRG+1] Fix ug in BaseSearchCV.inverse_transform (#8348)
Akshay0724 Feb 17, 2017
3116a79
[MRG+1] add docs that C can receive array in RandomizedLogisticRegre…
pianomania Feb 18, 2017
fc39a57
fix typo (#8390)
Neurrone Feb 18, 2017
03336ce
DOC updated IRC url to working one (#8383)
i-am-xhy Feb 19, 2017
571f438
Explain the meaning of X_m in modules/tree doc. (#8398)
aashil Feb 19, 2017
11fdaf8
[MRG] Add the meaning of MRG and MRG+1 in the PR in docs. (#8406)
aashil Feb 20, 2017
9e8ff47
[MRG] Make tests runnable with pytest without error (#8246)
lesteve Feb 20, 2017
674284f
plot iso-f1 curves in plot_precision_recall (#8378)
SACHIN-13 Feb 20, 2017
bd2ea4c
Ignore py.test generated .cache folder
ogrisel Feb 20, 2017
e2103af
[MRG+1] FIX AdaBoost ZeroDivisionError in proba #7501 (#8371)
dokato Feb 20, 2017
645026a
[MRG+1] Fix pickling bug due to multiple inheritance & __getstate__ …
HolgerPeters Feb 20, 2017
4633d67
[MRG+1] Fix message formatting in exception (#8319)
MMeketon Feb 21, 2017
53609e4
DOC Modify plot_gpc_iris.py for matplotlib v2 (#8385)
rishikksh20 Feb 21, 2017
7a47f20
DOC svm kernel functions docs: rbf equation fixed (#8356) (#8420)
dokato Feb 21, 2017
b91ec72
[MRG+2] Fixed assumption fit attribute means object is estimator. (#8…
drkatnz Feb 21, 2017
93a5013
[MRG] FIX lasso/elasticnet example did not add noise to simulated dat…
NelleV Feb 22, 2017
15e8ec9
Travis add coverage to Python 3 build and oldest version build (#8435)
lesteve Feb 22, 2017
f10ac95
[MRG] Remove unnecessary backticks around parameter name in docstring…
tzano Feb 22, 2017
59bd153
[MRG+1] Refactoring plot_iris svm example. (#8279)
lemonlaug Feb 23, 2017
c22a73e
[MRG] Fix Parameters in tutorials (#8345)
anshbansal Feb 23, 2017
b7a5752
[MRG+1] Fixes incorrect output when input is precomputed sparse matri…
Akshay0724 Feb 23, 2017
341fc34
DOC fix MultiTaskElasticNet doc (#8442)
tzano Feb 23, 2017
79e645d
Travis: tweak test_script.sh (#8444)
lesteve Feb 23, 2017
cfe35c4
[MRG+1] Add note about the size of default random forest model #6276 …
Morikko Feb 23, 2017
36b5354
[MRG] Add MAE formula in the regression criteria docs. (#8402)
aashil Feb 24, 2017
dc0f201
DOC describe scikit-learn-contrib in related projects and contributin…
jnothman Feb 24, 2017
223e9a6
DOC Fix default value in RandomizedLasso (#8455)
Feb 26, 2017
fad531d
[MRG+1] FIX/DOC Improve documentation regarding non-determinitic tree…
glemaitre Feb 26, 2017
41ee20a
Correct default value of reg_covar in gaussian_mixture. (#8462)
tguillemot Feb 27, 2017
e987092
initial commit
sergeyf Feb 27, 2017
980961d
init bug fix
sergeyf Feb 27, 2017
513b4fa
fixing pep8 errors
sergeyf Feb 27, 2017
7ad467d
more pep8 fixes
sergeyf Feb 27, 2017
4ee5785
fixing build failures
sergeyf Feb 27, 2017
f5611b4
fixing error for _statistics in Imputer
sergeyf Feb 27, 2017
283b569
fixing failed test by skipping MICEImputer
sergeyf Feb 28, 2017
b6a4d9f
fixing circular import issue. Questionable style?
sergeyf Feb 28, 2017
ca85386
one flake left
sergeyf Feb 28, 2017
0a89f88
initial commit
sergeyf Feb 27, 2017
5bb3eab
init bug fix
sergeyf Feb 27, 2017
e70241e
fixing pep8 errors
sergeyf Feb 27, 2017
1f3e2fa
fixing build failures
sergeyf Feb 27, 2017
713c9f3
addressing a few comments, and removing updates to plot ols
sergeyf Feb 28, 2017
9ac7f01
initial commit
sergeyf Feb 27, 2017
99414e7
init bug fix
sergeyf Feb 27, 2017
83d8e26
fixing pep8 errors
sergeyf Feb 27, 2017
eb98371
fixing build failures
sergeyf Feb 27, 2017
869fb6a
fixing error for _statistics in Imputer
sergeyf Feb 27, 2017
3982e57
fixing failed test by skipping MICEImputer
sergeyf Feb 28, 2017
ff729ac
fixing circular import issue. Questionable style?
sergeyf Feb 28, 2017
ecaea48
one flake left
sergeyf Feb 28, 2017
948c2cb
init bug fix
sergeyf Feb 27, 2017
9387fad
fixing pep8 errors
sergeyf Feb 27, 2017
e128c48
fixing error for _statistics in Imputer
sergeyf Feb 27, 2017
6db8702
fixing failed test by skipping MICEImputer
sergeyf Feb 28, 2017
71b862e
one flake left
sergeyf Feb 28, 2017
023f93c
addressing a few comments, and removing updates to plot ols
sergeyf Feb 28, 2017
300c3b3
typo
sergeyf Feb 28, 2017
5cf5681
mu
sergeyf Feb 27, 2017
8d32148
mu
sergeyf Feb 28, 2017
981fc56
mu
sergeyf Feb 27, 2017
cb019de
init bug fix
sergeyf Feb 27, 2017
b653c68
fixing pep8 errors
sergeyf Feb 27, 2017
c7f4341
fixing build failures
sergeyf Feb 27, 2017
a6c66aa
mu
sergeyf Feb 28, 2017
c1e5fad
Save predictions in diabetes_y_pred (#8241)
davidrobles Feb 27, 2017
b29d15e
initial commit
sergeyf Feb 27, 2017
b2096cc
init bug fix
sergeyf Feb 27, 2017
0428aca
fixing pep8 errors
sergeyf Feb 27, 2017
305fd95
fixing build failures
sergeyf Feb 27, 2017
5ba7e5d
mu
sergeyf Feb 27, 2017
b5d4595
mu
sergeyf Feb 28, 2017
8cf3498
mu
sergeyf Feb 28, 2017
efe22d9
mu
sergeyf Feb 27, 2017
ede98d8
init bug fix
sergeyf Feb 27, 2017
850e011
fixing pep8 errors
sergeyf Feb 27, 2017
cd6c344
fixing build failures
sergeyf Feb 27, 2017
8279731
initial commit
sergeyf Feb 27, 2017
64edea3
init bug fix
sergeyf Feb 27, 2017
68546e4
fixing pep8 errors
sergeyf Feb 27, 2017
ae11e3c
mu
sergeyf Feb 27, 2017
ab616f9
mu
sergeyf Feb 28, 2017
7f585fd
mu
sergeyf Feb 28, 2017
9f8c65f
mu
sergeyf Feb 28, 2017
2cefa90
mu
sergeyf Feb 27, 2017
afcef3c
mu
sergeyf Feb 28, 2017
fd16ac4
mu
sergeyf Feb 28, 2017
7d2256f
mu
sergeyf Feb 27, 2017
4c75257
init bug fix
sergeyf Feb 27, 2017
b224348
fixing pep8 errors
sergeyf Feb 27, 2017
d6cdd5b
fixing build failures
sergeyf Feb 27, 2017
9bafe7e
mu
sergeyf Feb 28, 2017
8471e0f
mu
sergeyf Feb 28, 2017
b4fbcf3
mu
sergeyf Feb 28, 2017
040e140
mu
sergeyf Feb 28, 2017
c8cb82a
Merge branch 'mice' of https://github.com/sergeyf/scikit-learn into mice
sergeyf Feb 28, 2017
2 changes: 1 addition & 1 deletion examples/linear_model/plot_ols.py
@@ -67,4 +67,4 @@
plt.xticks(())
plt.yticks(())

plt.show()
plt.show()
67 changes: 47 additions & 20 deletions examples/missing_values.py
@@ -1,50 +1,67 @@
"""
======================================================
====================================================
Imputing missing values before building an estimator
======================================================
====================================================

This example shows that imputing the missing values can give better results
than discarding the samples containing any missing value.
Imputing does not always improve the predictions, so please check via cross-validation.
This example shows that imputing the missing values can give
better results than discarding the samples containing any missing value.
Imputing does not always improve the predictions,
so please check via cross-validation.
Sometimes dropping rows or using marker values is more effective.

Missing values can be replaced by the mean, the median or the most frequent
value using the ``strategy`` hyper-parameter.
The median is a more robust estimator for data with high magnitude variables
which could dominate results (otherwise known as a 'long tail').

Script output::
Another option is the MICE imputer. This uses round-robin linear regression,
treating every variable as an output in turn. The simple version implemented
assumes Gaussian output variables. If your output variables are obviously
non-Gaussian, consider transforming them to improve performance.

Score with the entire dataset = 0.56
Score without the samples containing missing values = 0.48
Score after imputation of the missing values = 0.55
Script output:

MSE with the entire dataset = 3354.15
MSE without the samples containing missing values = 2968.98
MSE after mean imputation of the missing values = 3507.77
MSE after MICE imputation of the missing values = 3340.39
Member: Do you have a nice usecase where this value would be more demonstrative of the advantage of MICE?

Contributor Author (sergeyf): Here the MSE is better than "MSE with the entire dataset", and better than "MSE after mean imputation of the missing values". Were you hoping for a more dramatic improvement?


In this case, imputing helps the classifier match the original score.

Note that MICE will not always be better than, e.g., simple mean imputation.
To see an example of this, swap in ``boston`` for ``diabetes``.

In this case, imputing helps the classifier get close to the original score.

"""
import numpy as np

from sklearn.datasets import load_diabetes
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MICEImputer
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)

dataset = load_boston()
dataset_name = 'diabetes' # 'boston' for another example
if dataset_name == 'boston':
dataset = load_boston()
elif dataset_name == 'diabetes':
dataset = load_diabetes()
X_full, y_full = dataset.data, dataset.target
n_samples = X_full.shape[0]
n_features = X_full.shape[1]

# Estimate the score on the entire dataset, with no missing values
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_full, y_full).mean()
print("Score with the entire dataset = %.2f" % score)
score = cross_val_score(estimator, X_full, y_full,
scoring='neg_mean_squared_error').mean() * -1
print("MSE with the entire dataset = %.2f" % score)

# Add missing values in 75% of the lines
missing_rate = 0.75
n_missing_samples = np.floor(n_samples * missing_rate)
n_missing_samples = int(np.floor(n_samples * missing_rate))
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
dtype=np.bool),
np.ones(n_missing_samples,
@@ -56,10 +73,11 @@
X_filtered = X_full[~missing_samples, :]
y_filtered = y_full[~missing_samples]
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_filtered, y_filtered).mean()
print("Score without the samples containing missing values = %.2f" % score)
score = cross_val_score(estimator, X_filtered, y_filtered,
scoring='neg_mean_squared_error').mean() * -1
print("MSE without the samples containing missing values = %.2f" % score)

# Estimate the score after imputation of the missing values
# Estimate the score after imputation (mean strategy) of the missing values
X_missing = X_full.copy()
X_missing[np.where(missing_samples)[0], missing_features] = 0
y_missing = y_full.copy()
@@ -68,5 +86,14 @@
axis=0)),
("forest", RandomForestRegressor(random_state=0,
n_estimators=100))])
score = cross_val_score(estimator, X_missing, y_missing).mean()
print("Score after imputation of the missing values = %.2f" % score)
score = cross_val_score(estimator, X_missing, y_missing,
scoring='neg_mean_squared_error').mean() * -1
print("MSE after mean imputation of the missing values = %.2f" % score)

# Estimate the score after imputation (MICE strategy) of the missing values
estimator = Pipeline([("imputer", MICEImputer(missing_values=0)),
("forest", RandomForestRegressor(random_state=0,
n_estimators=100))])
score = cross_val_score(estimator, X_missing, y_missing,
scoring='neg_mean_squared_error').mean() * -1
print("MSE after MICE imputation of the missing values = %.2f" % score)
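The example above describes MICE as round-robin regression: each feature with missing entries is treated as a regression target in turn, predicted from the other features, and the cycle is repeated until the imputations settle. A minimal sketch of that idea follows — an illustrative re-implementation with a plain `LinearRegression`, not the PR's `MICEImputer` API (which additionally models predictive uncertainty):

```python
import numpy as np
from sklearn.linear_model import LinearRegression


def mice_round_robin(X, n_iter=5):
    """Impute np.nan entries by cycling a linear regression over features.

    Illustrative only: a deterministic round-robin pass, without the
    posterior sampling a full MICE implementation would add.
    """
    X = np.asarray(X, dtype=float).copy()
    missing = np.isnan(X)

    # Initialize with column-mean imputation.
    col_means = np.nanmean(X, axis=0)
    for j in range(X.shape[1]):
        X[missing[:, j], j] = col_means[j]

    # Round-robin: refit one feature at a time against all the others.
    for _ in range(n_iter):
        for j in range(X.shape[1]):
            rows = missing[:, j]
            if not rows.any():
                continue
            other = np.delete(np.arange(X.shape[1]), j)
            model = LinearRegression().fit(X[~rows][:, other], X[~rows, j])
            X[rows, j] = model.predict(X[rows][:, other])
    return X
```

Observed entries are never overwritten; only the originally missing cells are re-estimated on each pass.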
15 changes: 13 additions & 2 deletions sklearn/dummy.py
@@ -449,7 +449,7 @@ def fit(self, X, y, sample_weight=None):
self.constant_ = np.reshape(self.constant_, (1, -1))
return self

def predict(self, X):
def predict(self, X, return_std=False):
"""
Perform classification on test vectors X.

@@ -459,18 +459,29 @@ def predict(self, X):
Input vectors, where n_samples is the number of samples
and n_features is the number of features.

return_std : boolean, optional
Whether to return the standard deviation of posterior prediction.

Returns
-------
y : array, shape = [n_samples] or [n_samples, n_outputs]
Predicted target values for X.

y_std : array, shape = [n_samples] or [n_samples, n_outputs]
Standard deviation of predictive distribution of query points.
"""
check_is_fitted(self, "constant_")
X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
n_samples = X.shape[0]

y = np.ones((n_samples, 1)) * self.constant_
y_std = np.zeros((n_samples, 1))

if self.n_outputs_ == 1 and not self.output_2d_:
y = np.ravel(y)
y_std = np.ravel(y_std)

return y
if return_std:
return y, y_std
else:
return y
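The `dummy.py` hunk above gives `DummyRegressor.predict` a `return_std` flag, matching the `(mean, std)` interface added to the Bayesian linear models in an earlier commit; MICE relies on that interface to draw imputations from each predictive distribution rather than plugging in the mean. A toy sketch of the consuming side (the constant predictor here is a hypothetical stand-in for any estimator supporting `return_std=True`):

```python
import numpy as np

rng = np.random.RandomState(0)


def predict_constant(X, constant=3.5, return_std=False):
    """Toy stand-in for DummyRegressor.predict: constant mean,
    zero predictive spread."""
    y = np.full(X.shape[0], constant)
    if return_std:
        return y, np.zeros(X.shape[0])
    return y


X_query = np.zeros((4, 2))
y_mean, y_std = predict_constant(X_query, return_std=True)
# A MICE-style imputer samples from N(mean, std), so predictive
# uncertainty propagates into the imputed values:
draws = rng.normal(y_mean, y_std)
```

With zero std the draws collapse onto the mean, which is why a constant predictor is the degenerate (but interface-complete) case.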
4 changes: 3 additions & 1 deletion sklearn/preprocessing/__init__.py
@@ -29,6 +29,7 @@
from .label import MultiLabelBinarizer

from .imputation import Imputer
from .mice import MICEImputer


__all__ = [
@@ -38,6 +39,7 @@
'KernelCenterer',
'LabelBinarizer',
'LabelEncoder',
'MICEImputer',
'MultiLabelBinarizer',
'MinMaxScaler',
'MaxAbsScaler',
@@ -54,4 +56,4 @@
'maxabs_scale',
'minmax_scale',
'label_binarize',
]
]
9 changes: 6 additions & 3 deletions sklearn/preprocessing/imputation.py
@@ -166,6 +166,10 @@ def fit(self, X, y=None):
self.missing_values,
self.axis)

invalid_mask = np.isnan(self.statistics_)
valid_mask = np.logical_not(invalid_mask)
self._valid_statistics_indexes = np.where(valid_mask)[0]

return self

def _sparse_fit(self, X, strategy, missing_values, axis):
@@ -339,14 +343,13 @@ def transform(self, X):
invalid_mask = np.isnan(statistics)
valid_mask = np.logical_not(invalid_mask)
valid_statistics = statistics[valid_mask]
valid_statistics_indexes = np.where(valid_mask)[0]
missing = np.arange(X.shape[not self.axis])[invalid_mask]

if self.axis == 0 and invalid_mask.any():
if self.verbose:
warnings.warn("Deleting features without "
"observed values: %s" % missing)
X = X[:, valid_statistics_indexes]
X = X[:, self._valid_statistics_indexes]
elif self.axis == 1 and invalid_mask.any():
raise ValueError("Some rows only contain "
"missing values: %s" % missing)
@@ -374,4 +377,4 @@ def transform(self, X):

X[coordinates] = values

return X
return X
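The `imputation.py` hunks move the valid-column bookkeeping into `fit`: columns whose statistic is NaN (i.e. no observed values at all) are recorded once as `_valid_statistics_indexes`, and `transform` drops them instead of recomputing the mask. A standalone sketch of that mean-imputation logic (assumed behavior for illustration, not the exact scikit-learn code):

```python
import warnings

import numpy as np


def fit_mean_imputer(X):
    """Compute per-column means and remember which columns have
    any observed value (all-NaN columns yield a NaN statistic)."""
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RuntimeWarning)  # all-NaN columns
        statistics = np.nanmean(X, axis=0)
    valid_idx = np.where(~np.isnan(statistics))[0]
    return statistics, valid_idx


def transform_mean_imputer(X, statistics, valid_idx):
    """Drop features with no fitted statistic, then fill NaNs
    with the per-column means recorded at fit time."""
    X = X[:, valid_idx].copy()
    stats = statistics[valid_idx]
    mask = np.isnan(X)
    X[mask] = np.take(stats, np.where(mask)[1])
    return X
```

Caching the valid indexes at `fit` time is what lets `transform` stay consistent when a later batch happens to contain values in a column that was entirely missing during fitting.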