From 0e415aa3c4d8af80bdd6d5249ae20ff7a6d264c8 Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Sun, 24 Jun 2018 10:28:19 -0700
Subject: [PATCH 01/26] first commit

---
 sklearn/impute.py | 70 ++++++++++++++++++++++-------------------------
 1 file changed, 33 insertions(+), 37 deletions(-)

diff --git a/sklearn/impute.py b/sklearn/impute.py
index f9fb156103fff..841b70517c19f 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -452,17 +452,19 @@ class ChainedImputer(BaseEstimator, TransformerMixin):
         "random"
             A random order for each round.
 
-    n_imputations : int, optional (default=100)
-        Number of chained imputation rounds to perform, the results of which
-        will be used in the final average.
-
     n_burn_in : int, optional (default=10)
         Number of initial imputation rounds to perform, the results of which
        will not be returned.
 
     predictor : estimator object, default=BayesianRidge()
         The predictor to use at each step of the round-robin imputation.
-        It must support ``return_std`` in its ``predict`` method.
+        It must support ``return_std`` in its ``predict`` method if
+        the ``sample_after_predict`` option is set to ``True`` below.
+
+    sample_after_predict : boolean, default=False
+        Whether to sample from the predictive posterior of the fitted
+        predictor for each imputation. Set to ``True`` if using
+        ``ChainedImputer`` to have the same functionality as MICE.
 
     n_nearest_features : int, optional (default=None)
         Number of other features to use to estimate the missing values of
@@ -533,9 +535,9 @@ class ChainedImputer(BaseEstimator, TransformerMixin):
     def __init__(self,
                  missing_values=np.nan,
                  imputation_order='ascending',
-                 n_imputations=100,
                  n_burn_in=10,
                  predictor=None,
+                 sample_after_predict=False,
                  n_nearest_features=None,
                  initial_strategy="mean",
                  min_value=None,
@@ -545,9 +547,9 @@ def __init__(self,
 
         self.missing_values = missing_values
         self.imputation_order = imputation_order
-        self.n_imputations = n_imputations
         self.n_burn_in = n_burn_in
         self.predictor = predictor
+        self.sample_after_predict = sample_after_predict
         self.n_nearest_features = n_nearest_features
         self.initial_strategy = initial_strategy
         self.min_value = min_value
@@ -624,12 +626,15 @@ def _impute_one_feature(self,
         # get posterior samples
         X_test = safe_indexing(X_filled[:, neighbor_feat_idx],
                                missing_row_mask)
-        mus, sigmas = predictor.predict(X_test, return_std=True)
-        good_sigmas = sigmas > 0
-        imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype)
-        imputed_values[~good_sigmas] = mus[~good_sigmas]
-        imputed_values[good_sigmas] = self.random_state_.normal(
-            loc=mus[good_sigmas], scale=sigmas[good_sigmas])
+        if self.sample_after_predict:
+            mus, sigmas = predictor.predict(X_test, return_std=True)
+            good_sigmas = sigmas > 0
+            imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype)
+            imputed_values[~good_sigmas] = mus[~good_sigmas]
+            imputed_values[good_sigmas] = self.random_state_.normal(
+                loc=mus[good_sigmas], scale=sigmas[good_sigmas])
+        else:
+            imputed_values = predictor.predict(X_test)
 
         # clip the values
         imputed_values = np.clip(imputed_values,
@@ -836,10 +841,10 @@ def fit_transform(self, X, y=None):
         self.initial_imputer_ = None
         X, X_filled, mask_missing_values = self._initial_imputation(X)
 
-        # edge case: in case the user specifies 0 for n_imputations,
-        # then there is no need to do burn in and the result should be
-        # just the initial imputation (before clipping)
-        if self.n_imputations < 1:
+        # edge case: in case the user specifies 0 for n_burn_in,
+        # then there is no need to do chained imputation and the result should
+        # be just the initial imputation (before clipping)
+        if self.n_burn_in < 1:
             return X_filled
 
         X_filled = np.clip(X_filled, self._min_value, self._max_value)
@@ -853,15 +858,13 @@
         abs_corr_mat = self._get_abs_corr_mat(X_filled)
 
         # impute data
-        n_rounds = self.n_burn_in + self.n_imputations
         n_samples, n_features = X_filled.shape
-        Xt = np.zeros((n_samples, n_features), dtype=X.dtype)
         self.imputation_sequence_ = []
         if self.verbose > 0:
             print("[ChainedImputer] Completing matrix with shape %s"
                   % (X.shape,))
         start_t = time()
-        for i_rnd in range(n_rounds):
+        for i_rnd in range(self.n_burn_in):
             if self.imputation_order == 'random':
                 ordered_idx = self._get_ordered_idx(mask_missing_values)
@@ -877,14 +880,12 @@
                                                              predictor)
                 self.imputation_sequence_.append(predictor_triplet)
 
-            if i_rnd >= self.n_burn_in:
-                Xt += X_filled
             if self.verbose > 0:
                 print('[ChainedImputer] Ending imputation round '
                       '%d/%d, elapsed time %0.2f'
-                      % (i_rnd + 1, n_rounds, time() - start_t))
+                      % (i_rnd + 1, self.n_burn_in, time() - start_t))
 
-        Xt /= self.n_imputations
+        Xt = X_filled
         Xt[~mask_missing_values] = X[~mask_missing_values]
         return Xt
@@ -908,19 +909,16 @@ def transform(self, X):
 
         X, X_filled, mask_missing_values = self._initial_imputation(X)
 
-        # edge case: in case the user specifies 0 for n_imputations,
-        # then there is no need to do burn in and the result should be
-        # just the initial imputation (before clipping)
-        if self.n_imputations < 1:
+        # edge case: in case the user specifies 0 for n_burn_in,
+        # then there is no need to do chained imputation and the result should
+        # be just the initial imputation (before clipping)
+        if self.n_burn_in < 1:
             return X_filled
 
         X_filled = np.clip(X_filled, self._min_value, self._max_value)
 
-        n_rounds = self.n_burn_in + self.n_imputations
-        n_imputations = len(self.imputation_sequence_)
-        imputations_per_round = n_imputations // n_rounds
+        imps_per_round = len(self.imputation_sequence_) // self.n_burn_in
         i_rnd = 0
-        Xt = np.zeros(X.shape, dtype=X.dtype)
         if self.verbose > 0:
             print("[ChainedImputer] Completing matrix with shape %s"
                   % (X.shape,))
         start_t = time()
         for it, predictor_triplet in enumerate(self.imputation_sequence_):
@@ -934,16 +932,14 @@
                 predictor=predictor_triplet.predictor,
                 fit_mode=False
             )
-            if not (it + 1) % imputations_per_round:
-                if i_rnd >= self.n_burn_in:
-                    Xt += X_filled
+            if not (it + 1) % imps_per_round:
                 if self.verbose > 1:
                     print('[ChainedImputer] Ending imputation round '
                           '%d/%d, elapsed time %0.2f'
-                          % (i_rnd + 1, n_rounds, time() - start_t))
+                          % (i_rnd + 1, self.n_burn_in, time() - start_t))
                 i_rnd += 1
 
-        Xt /= self.n_imputations
+        Xt = X_filled
        Xt[~mask_missing_values] = X[~mask_missing_values]
        return Xt
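The posterior-sampling step that this patch gates behind ``sample_after_predict``
can be illustrated outside the diff. The following is a minimal sketch, not part
of the patch; it assumes scikit-learn's ``BayesianRidge``, whose
``predict(X, return_std=True)`` returns posterior means and standard deviations,
and the array names are illustrative only::

    import numpy as np
    from sklearn.linear_model import BayesianRidge

    rng = np.random.RandomState(0)
    X_train = rng.rand(50, 3)
    y_train = X_train @ np.array([1.0, 2.0, 3.0]) + 0.01 * rng.randn(50)

    predictor = BayesianRidge().fit(X_train, y_train)
    mus, sigmas = predictor.predict(rng.rand(5, 3), return_std=True)

    # With sample_after_predict=True each imputed value is a draw from the
    # predictive posterior; with False it is just the posterior mean.
    imputed_values = rng.normal(loc=mus, scale=sigmas)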
From 5445012cd60c32856269dd75e4f969f234143207 Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Sun, 24 Jun 2018 19:38:48 -0700
Subject: [PATCH 02/26] complete functional overhaul

---
 doc/modules/impute.rst       |  8 ++++----
 sklearn/impute.py            | 24 ++++++++++++------------
 sklearn/tests/test_impute.py | 33 +++++++++++++--------------------
 3 files changed, 29 insertions(+), 36 deletions(-)

diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index 0f9089c981782..c2390b8b69fd4 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -105,12 +105,12 @@ rounds.
 Here is an example snippet::
 
     >>> import numpy as np
     >>> from sklearn.impute import ChainedImputer
-    >>> imp = ChainedImputer(n_imputations=10, random_state=0)
+    >>> imp = ChainedImputer(random_state=0)
     >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]])
     ChainedImputer(imputation_order='ascending', initial_strategy='mean',
-           max_value=None, min_value=None, missing_values=nan, n_burn_in=10,
-           n_imputations=10, n_nearest_features=None, predictor=None,
-           random_state=0, verbose=False)
+           max_value=None, min_value=None, missing_values=nan, n_iter=10,
+           n_nearest_features=None, predictor=None, random_state=0,
+           sample_after_predict=False, verbose=False)
     >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]
     >>> print(np.round(imp.transform(X_test)))
     [[ 1.  2.]

diff --git a/sklearn/impute.py b/sklearn/impute.py
index 841b70517c19f..972cd93ea3805 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -452,9 +452,9 @@ class ChainedImputer(BaseEstimator, TransformerMixin):
         "random"
             A random order for each round.
 
-    n_burn_in : int, optional (default=10)
-        Number of initial imputation rounds to perform, the results of which
-        will not be returned.
+    n_iter : int, optional (default=10)
+        Number of imputation rounds to perform before returning the final
+        imputations.
 
     predictor : estimator object, default=BayesianRidge()
         The predictor to use at each step of the round-robin imputation.
@@ -535,7 +535,7 @@ class ChainedImputer(BaseEstimator, TransformerMixin):
     def __init__(self,
                  missing_values=np.nan,
                  imputation_order='ascending',
-                 n_burn_in=10,
+                 n_iter=10,
                  predictor=None,
                  sample_after_predict=False,
                  n_nearest_features=None,
@@ -547,7 +547,7 @@ def __init__(self,
 
         self.missing_values = missing_values
         self.imputation_order = imputation_order
-        self.n_burn_in = n_burn_in
+        self.n_iter = n_iter
         self.predictor = predictor
         self.sample_after_predict = sample_after_predict
         self.n_nearest_features = n_nearest_features
@@ -844,7 +844,7 @@ def fit_transform(self, X, y=None):
         # edge case: in case the user specifies 0 for n_burn_in,
         # then there is no need to do chained imputation and the result should
         # be just the initial imputation (before clipping)
-        if self.n_burn_in < 1:
+        if self.n_iter < 1:
             return X_filled
 
         X_filled = np.clip(X_filled, self._min_value, self._max_value)
@@ -864,7 +864,7 @@
             print("[ChainedImputer] Completing matrix with shape %s"
                   % (X.shape,))
         start_t = time()
-        for i_rnd in range(self.n_burn_in):
+        for i_rnd in range(self.n_iter):
             if self.imputation_order == 'random':
                 ordered_idx = self._get_ordered_idx(mask_missing_values)
@@ -883,7 +883,7 @@
             if self.verbose > 0:
                 print('[ChainedImputer] Ending imputation round '
                       '%d/%d, elapsed time %0.2f'
-                      % (i_rnd + 1, self.n_burn_in, time() - start_t))
+                      % (i_rnd + 1, self.n_iter, time() - start_t))
 
         Xt = X_filled
         Xt[~mask_missing_values] = X[~mask_missing_values]
@@ -912,12 +912,12 @@ def transform(self, X):
         # edge case: in case the user specifies 0 for n_burn_in,
         # then there is no need to do chained imputation and the result should
         # be just the initial imputation (before clipping)
-        if self.n_burn_in < 1:
+        if self.n_iter < 1:
             return X_filled
 
         X_filled = np.clip(X_filled, self._min_value, self._max_value)
 
-        imps_per_round = len(self.imputation_sequence_) // self.n_burn_in
+        imputations_per_round = len(self.imputation_sequence_) // self.n_iter
         i_rnd = 0
         if self.verbose > 0:
             print("[ChainedImputer] Completing matrix with shape %s"
                   % (X.shape,))
         start_t = time()
         for it, predictor_triplet in enumerate(self.imputation_sequence_):
             X_filled, _ = self._impute_one_feature(
                 X_filled,
                 mask_missing_values,
                 predictor_triplet.feat_idx,
                 predictor_triplet.neighbor_feat_idx,
                 predictor=predictor_triplet.predictor,
                 fit_mode=False
             )
-            if not (it + 1) % imps_per_round:
+            if not (it + 1) % imputations_per_round:
                 if self.verbose > 1:
                     print('[ChainedImputer] Ending imputation round '
                           '%d/%d, elapsed time %0.2f'
-                          % (i_rnd + 1, self.n_burn_in, time() - start_t))
+                          % (i_rnd + 1, self.n_iter, time() - start_t))
                 i_rnd += 1
 
         Xt = X_filled

diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py
index f5c42f7443487..36b7a22c6ec2c 100644
--- a/sklearn/tests/test_impute.py
+++ b/sklearn/tests/test_impute.py
@@ -496,8 +496,7 @@ def test_chained_imputer_rank_one():
     X_missing = X.copy()
     X_missing[nan_mask] = np.nan
 
-    imputer = ChainedImputer(n_imputations=5,
-                             n_burn_in=5,
+    imputer = ChainedImputer(n_iter=5,
                              verbose=True,
                              random_state=rng)
     X_filled = imputer.fit_transform(X_missing)
@@ -516,8 +515,7 @@ def test_chained_imputer_imputation_order(imputation_order):
     X[:, 0] = 1  # this column should not be discarded by ChainedImputer
 
     imputer = ChainedImputer(missing_values=0,
-                             n_imputations=1,
-                             n_burn_in=1,
+                             n_iter=1,
                              n_nearest_features=5,
                              min_value=0,
                              max_value=1,
@@ -550,8 +548,7 @@ def test_chained_imputer_predictors(predictor):
     X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
 
     imputer = ChainedImputer(missing_values=0,
-                             n_imputations=1,
-                             n_burn_in=1,
+                             n_iter=1,
                              predictor=predictor,
                              random_state=rng)
     imputer.fit_transform(X)
@@ -574,8 +571,7 @@ def test_chained_imputer_clip():
                              random_state=rng).toarray()
 
     imputer = ChainedImputer(missing_values=0,
-                             n_imputations=1,
-                             n_burn_in=1,
+                             n_iter=1,
                              min_value=0.1,
                              max_value=0.2,
                              random_state=rng)
@@ -601,8 +597,7 @@ def test_chained_imputer_missing_at_transform(strategy):
     X_test[0, 0] = 0  # definitely missing value in 0th column
 
     imputer = ChainedImputer(missing_values=0,
-                             n_imputations=1,
-                             n_burn_in=1,
+                             n_iter=1,
                              initial_strategy=strategy,
                              random_state=rng).fit(X_train)
     initial_imputer = SimpleImputer(missing_values=0,
@@ -622,8 +617,8 @@ def test_chained_imputer_transform_stochasticity():
                              random_state=rng).toarray()
 
     imputer = ChainedImputer(missing_values=0,
-                             n_imputations=1,
-                             n_burn_in=1,
+                             n_iter=1,
+                             sample_after_predict=True,
                              random_state=rng)
     imputer.fit(X)
@@ -638,8 +633,8 @@ def test_chained_imputer_no_missing():
     rng = np.random.RandomState(0)
     X = rng.rand(100, 100)
     X[:, 0] = np.nan
-    m1 = ChainedImputer(n_imputations=10, random_state=rng)
-    m2 = ChainedImputer(n_imputations=10, random_state=rng)
+    m1 = ChainedImputer(n_iter=10, random_state=rng)
+    m2 = ChainedImputer(n_iter=10, random_state=rng)
     pred1 = m1.fit(X).transform(X)
     pred2 = m2.fit_transform(X)
     # should exclude the first column entirely
@@ -670,12 +665,11 @@ def test_chained_imputer_transform_recovery(rank):
     X_test_filled = X_filled[n:]
     X_test = X_missing[n:]
 
-    imputer = ChainedImputer(n_imputations=10,
-                             n_burn_in=10,
+    imputer = ChainedImputer(n_iter=10,
                              verbose=True,
                              random_state=rng).fit(X_train)
     X_test_est = imputer.transform(X_test)
-    assert_allclose(X_test_filled, X_test_est, rtol=1e-5, atol=0.1)
+    assert_allclose(X_test_filled, X_test_est, atol=0.1)
 
 
 def test_chained_imputer_additive_matrix():
@@ -699,9 +693,8 @@ def test_chained_imputer_additive_matrix():
     X_test_filled = X_filled[n:]
     X_test = X_missing[n:]
 
-    imputer = ChainedImputer(n_imputations=25,
-                             n_burn_in=10,
+    imputer = ChainedImputer(n_iter=10,
                              verbose=True,
                              random_state=rng).fit(X_train)
     X_test_est = imputer.transform(X_test)
-    assert_allclose(X_test_filled, X_test_est, atol=0.01)
+    assert_allclose(X_test_filled, X_test_est, atol=0.1)
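After this patch the two round counts (``n_burn_in`` and ``n_imputations``)
collapse into a single ``n_iter``, and only the result of the final round is
returned. A sketch of the resulting call pattern, with hypothetical data and
assuming the in-tree ``ChainedImputer`` as of this commit::

    import numpy as np
    from sklearn.impute import ChainedImputer

    X = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, np.nan]])

    # Previously ChainedImputer(n_burn_in=10, n_imputations=100) averaged the
    # last 100 rounds; now n_iter rounds are run and the final round is kept.
    imputer = ChainedImputer(n_iter=10, random_state=0)
    X_filled = imputer.fit_transform(X)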
From 3a2d5100cf37ca130d2d26f357313299210439a9 Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Sun, 24 Jun 2018 19:56:48 -0700
Subject: [PATCH 03/26] fixing some documentation

---
 sklearn/impute.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/impute.py b/sklearn/impute.py
index 972cd93ea3805..e8e99778a931b 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -500,15 +500,15 @@ class ChainedImputer(BaseEstimator, TransformerMixin):
 
     Attributes
     ----------
-    initial_imputer_ : object of class :class:`sklearn.preprocessing.Imputer`'
-        The imputer used to initialize the missing values.
+    initial_imputer_ : :class:`sklearn.impute.SimpleImputer`'
+        Imputer used to initialize the missing values.
 
     imputation_sequence_ : list of tuples
         Each tuple has ``(feat_idx, neighbor_feat_idx, predictor)``, where
         ``feat_idx`` is the current feature to be imputed,
         ``neighbor_feat_idx`` is the array of other features used to impute the
         current feature, and ``predictor`` is the trained predictor used for
-        the imputation.
+        the imputation. Length is ``n_features_with_missing * n_iter``.
 
     Notes
     -----

From bf58d922d92e3878b9465636a93126c9166b2699 Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Sun, 24 Jun 2018 21:51:06 -0700
Subject: [PATCH 04/26] fixing broken test

---
 sklearn/tests/test_impute.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py
index 36b7a22c6ec2c..3190b13f9405d 100644
--- a/sklearn/tests/test_impute.py
+++ b/sklearn/tests/test_impute.py
@@ -515,7 +515,7 @@ def test_chained_imputer_imputation_order(imputation_order):
     X[:, 0] = 1  # this column should not be discarded by ChainedImputer
 
     imputer = ChainedImputer(missing_values=0,
-                             n_iter=1,
+                             n_iter=2,
                              n_nearest_features=5,
                              min_value=0,
                              max_value=1,

From 45f228cbf0a87a6cc56a8ea31f875511d05e51ab Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Mon, 25 Jun 2018 07:39:40 -0700
Subject: [PATCH 05/26] addressing review comments + stef's suggestions for
 documentation

---
 doc/modules/impute.rst       | 29 ++++++++++++++-----------
 sklearn/impute.py            | 42 ++++++++++++++++++++++------------
 sklearn/tests/test_impute.py |  2 +-
 3 files changed, 46 insertions(+), 27 deletions(-)

diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index c2390b8b69fd4..8754627055b8a 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -93,15 +93,17 @@
 Multivariate feature imputation
 ===============================
 
-A more sophisticated approach is to use the :class:`ChainedImputer` class, which
-implements the imputation technique from MICE (Multivariate Imputation by
-Chained Equations). MICE models each feature with missing values as a function of
-other features, and uses that estimate for imputation. It does so in a round-robin
-fashion: at each step, a feature column is designated as output `y` and the other
-feature columns are treated as inputs `X`. A regressor is fit on `(X, y)` for known `y`.
-Then, the regressor is used to predict the unknown values of `y`. This is repeated
-for each feature in a chained fashion, and then is done for a number of imputation
-rounds. Here is an example snippet::
+A more sophisticated approach is to use the :class:`ChainedImputer` class, models
+each feature with missing values as a function of other features, and uses that
+estimate for imputation. It does so in an iterated round-robin fashion: at each step,
+a feature column is designated as output `y` and the other feature columns are treated
+as inputs `X`. A regressor is fit on `(X, y)` for known `y`. Then, the regressor is
+used to predict the unknown values of `y`. This is repeated for each feature in a
+chained fashion, and then is done for a number of imputation rounds. The results
+of the final imputation round are returned. Our implementation was inspired by the
+R MICE package (Multivariate Imputation by Chained Equations), but differs from
+it in setting single imputation to default instead of multiple imputation. This
+is discussed further below. Here is an example snippet::
 
     >>> import numpy as np
     >>> from sklearn.impute import ChainedImputer
@@ -135,9 +137,12 @@
 analysis results (e.g. held-out validation error) allow the data scientist to
 obtain understanding of the uncertainty inherent in the missing values. The above
 practice is called multiple imputation. As implemented, the :class:`ChainedImputer`
 class generates a single (averaged) imputation for each missing value because this
-is the most common use case for machine learning applications. However, it can also be used
-for multiple imputations by applying it repeatedly to the same dataset with different
-random seeds with the ``n_imputations`` parameter set to 1.
+is the most common use case for machine learning applications. However, it can also
+be used for multiple imputations by applying it repeatedly to the same dataset with
+different random seeds.
+
+It is still an open problem as to how useful single versus multiple imputation is in
+the context of prediction and classification.
 
 Note that a call to the ``transform`` method of :class:`ChainedImputer` is not
 allowed to change the number of samples. Therefore multiple imputations cannot be

diff --git a/sklearn/impute.py b/sklearn/impute.py
index e8e99778a931b..c89e0bc1a9006 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -426,11 +426,11 @@ class ChainedImputer(BaseEstimator, TransformerMixin):
     """Chained imputer transformer to impute missing values.
 
-    Basic implementation of chained imputer from MICE (Multivariate
-    Imputations by Chained Equations) package from R. This version assumes all
-    of the features are Gaussian.
+    Basic implementation of chained mutual regressions to find replacement
+    values in multivariate missing data. This version assumes all features
+    are Gaussian.
 
-    Read more in the :ref:`User Guide `.
+    Read more in the :ref:`User Guide `.
 
     Parameters
     ----------
@@ -454,17 +454,20 @@ class ChainedImputer(BaseEstimator, TransformerMixin):
 
     n_iter : int, optional (default=10)
         Number of imputation rounds to perform before returning the final
-        imputations.
+        imputations computed during the final round. A round is a single
+        imputation of each feature with missing values.
 
     predictor : estimator object, default=BayesianRidge()
         The predictor to use at each step of the round-robin imputation.
         It must support ``return_std`` in its ``predict`` method if
-        ``sample_after_predict`` option is set to ``True`` below.
+        ``predict_posterior`` option is set to ``True`` below.
 
-    sample_after_predict : boolean, default=False
-        Whether to sample from the predictive posterior of the fitted
-        predictor for each imputation. Set to ``True`` if using
-        ``ChainedImputer`` to have the same functionality as MICE.
+    predict_posterior : boolean, default=False
+        Whether to sample from the (Gaussian) predictive posterior of the
+        fitted predictor for each imputation. Predictor must support
+        ``return_std`` in its ``predict`` method if set to ``True``. Set to
+        ``True`` if using ``ChainedImputer`` to have the same functionality as
+        MICE.
 
     n_nearest_features : int, optional (default=None)
         Number of other features to use to estimate the missing values of
@@ -500,7 +503,7 @@ class ChainedImputer(BaseEstimator, TransformerMixin):
 
     Attributes
     ----------
-    initial_imputer_ : :class:`sklearn.impute.SimpleImputer`'
+    initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer`
         Imputer used to initialize the missing values.
 
     imputation_sequence_ : list of tuples
@@ -512,6 +515,12 @@ class ChainedImputer(BaseEstimator, TransformerMixin):
 
     Notes
     -----
+    This implementation was inspired by the R MICE package (Multivariate
+    Imputation by Chained Equations), but differs from it in setting single
+    imputation to default instead of multiple imputation. However, multiple
+    imputation is supported with multiple instances of the imputer with
+    different random seeds run in parallel.
+
     The R version of MICE does not have inductive functionality, i.e. first
     fitting on ``X_train`` and then transforming any ``X_test`` without
     additional fitting. We do this by storing each feature's predictor during
@@ -530,6 +539,11 @@ class ChainedImputer(BaseEstimator, TransformerMixin):
         Multivariate Imputation by Chained Equations in R". Journal of
         Statistical Software 45: 1-67.
         `_
+
+    .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in
+        Multivariate Data Suitable for use with an Electronic Computer".
+        Journal of the Royal Statistical Society 22(2): 302-306.
+        `_
     """
 
     def __init__(self,
                  missing_values=np.nan,
                  imputation_order='ascending',
                  n_iter=10,
                  predictor=None,
-                 sample_after_predict=False,
+                 predict_posterior=False,
                  n_nearest_features=None,
                  initial_strategy="mean",
                  min_value=None,
 
         self.missing_values = missing_values
         self.imputation_order = imputation_order
         self.n_iter = n_iter
         self.predictor = predictor
-        self.sample_after_predict = sample_after_predict
+        self.predict_posterior = predict_posterior
         self.n_nearest_features = n_nearest_features
         self.initial_strategy = initial_strategy
         self.min_value = min_value
 
         # get posterior samples
         X_test = safe_indexing(X_filled[:, neighbor_feat_idx],
                                missing_row_mask)
-        if self.sample_after_predict:
+        if self.predict_posterior:
             mus, sigmas = predictor.predict(X_test, return_std=True)
             good_sigmas = sigmas > 0
             imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype)

diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py
index 3190b13f9405d..2fbaa2987379e 100644
--- a/sklearn/tests/test_impute.py
+++ b/sklearn/tests/test_impute.py
@@ -618,7 +618,7 @@ def test_chained_imputer_transform_stochasticity():
 
     imputer = ChainedImputer(missing_values=0,
                              n_iter=1,
-                             sample_after_predict=True,
+                             predict_posterior=True,
                              random_state=rng)
     imputer.fit(X)

From b231b059d6cf0a594b262c06d2699ce9728ba0ee Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Mon, 25 Jun 2018 09:05:22 -0700
Subject: [PATCH 06/26] fixing docs

---
 doc/modules/impute.rst | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index 8754627055b8a..a324b5d7bb983 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -93,17 +93,18 @@ string values or pandas categoricals when using the ``'most_frequent'`` or
 Multivariate feature imputation
 ===============================
 
-A more sophisticated approach is to use the :class:`ChainedImputer` class, models
-each feature with missing values as a function of other features, and uses that
-estimate for imputation. It does so in an iterated round-robin fashion: at each step,
-a feature column is designated as output `y` and the other feature columns are treated
-as inputs `X`. A regressor is fit on `(X, y)` for known `y`. Then, the regressor is
-used to predict the unknown values of `y`. This is repeated for each feature in a
-chained fashion, and then is done for a number of imputation rounds. The results
-of the final imputation round are returned. Our implementation was inspired by the
-R MICE package (Multivariate Imputation by Chained Equations), but differs from
-it in setting single imputation to default instead of multiple imputation. This
-is discussed further below. Here is an example snippet::
+A more sophisticated approach is to use the :class:`ChainedImputer` class,
+which models each feature with missing values as a function of other features,
+and uses that estimate for imputation. It does so in an iterated round-robin
+fashion: at each step, a feature column is designated as output `y` and the
+other feature columns are treated as inputs `X`. A regressor is fit on `(X, y)`
+for known `y`. Then, the regressor is used to predict the missing values of `y`.
+This is repeated for each feature in a chained fashion, and then is done for a
+number of imputation rounds. The results of the final imputation round are
+returned. Our implementation was inspired by the R MICE package (Multivariate
+Imputation by Chained Equations), but differs from it in setting single imputation
+to default instead of multiple imputation. This is discussed further below.
+Here is an example snippet::
 
     >>> import numpy as np
     >>> from sklearn.impute import ChainedImputer
@@ -111,8 +112,8 @@ is discussed further below. Here is an example snippet::
     >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]])
     ChainedImputer(imputation_order='ascending', initial_strategy='mean',
            max_value=None, min_value=None, missing_values=nan, n_iter=10,
-           n_nearest_features=None, predictor=None, random_state=0,
-           sample_after_predict=False, verbose=False)
+           n_nearest_features=None, predict_posterior=False, predictor=None,
+           random_state=0, verbose=False)
     >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]
     >>> print(np.round(imp.transform(X_test)))
     [[ 1.  2.]

From ecfac5ca9522fa42558d30d34b531c7fbdf2109c Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Mon, 25 Jun 2018 09:17:18 -0700
Subject: [PATCH 07/26] more update to impute.rst

---
 doc/modules/impute.rst | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index a324b5d7bb983..6f811d9b22553 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -136,13 +136,17 @@
 Each of these 10 imputations is then put through the subsequent analysis pipeline
 (e.g. feature engineering, clustering, regression, classification). The 10 final
 analysis results (e.g. held-out validation error) allow the data scientist to
 obtain understanding of the uncertainty inherent in the missing values. The above
-practice is called multiple imputation. As implemented, the :class:`ChainedImputer`
-class generates a single (averaged) imputation for each missing value because this
-is the most common use case for machine learning applications. However, it can also
-be used for multiple imputations by applying it repeatedly to the same dataset with
-different random seeds.
+practice is called multiple imputation.
 
-It is still an open problem as to how useful single versus multiple imputation is in
+As implemented, the :class:`ChainedImputer` class generates a single imputation
+for each missing value because this is the most common use case for machine learning
+applications. However, it can also be used for multiple imputations by applying it
+repeatedly to the same dataset with different random seeds.
+
+See Chapter 4 of "Statistical Analysis with Missing Data" by Little and Rubin for
+more discussion on multiple vs. single imputations.
+
+It is still an open problem as to how useful single vs. multiple imputation is in
 the context of prediction and classification.
 
 Note that a call to the ``transform`` method of :class:`ChainedImputer` is not
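The multiple-imputation recipe discussed in the patch above can be sketched in a
few lines. This is illustrative only; it assumes the ``ChainedImputer`` API at
this point in the series, where ``predict_posterior=True`` makes each fit
stochastic::

    import numpy as np
    from sklearn.impute import ChainedImputer

    X = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, np.nan]])

    # m imputations from m differently-seeded imputers; downstream analyses
    # would then be run once per imputed dataset.
    m = 10
    imputations = [
        ChainedImputer(predict_posterior=True, random_state=seed).fit_transform(X)
        for seed in range(m)
    ]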
From 5b2b1811145205d93114d123134d1c90fffc2c56 Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Mon, 25 Jun 2018 18:23:35 -0700
Subject: [PATCH 08/26] name change again, new default ridgeCV

---
 doc/modules/classes.rst           |   2 +-
 doc/modules/impute.rst            |  20 ++---
 doc/whats_new/v0.20.rst           |   2 +-
 examples/plot_missing_values.py   |  18 ++---
 sklearn/impute.py                 |  59 ++++++++------
 sklearn/tests/test_impute.py      | 130 +++++++++++++++---------------
 sklearn/utils/estimator_checks.py |   2 +-
 7 files changed, 123 insertions(+), 110 deletions(-)

diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index df46579f7fd82..e1cf479fbd122 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -653,7 +653,7 @@ Kernels:
    :template: class.rst
 
    impute.SimpleImputer
-   impute.ChainedImputer
+   impute.IterativeImputer
 
 .. _kernel_approximation_ref:

diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index 6f811d9b22553..aa084bd5f2685 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -24,7 +24,7 @@
 One type of imputation algorithm is univariate, which imputes values in the
 i-th feature dimension using only non-missing values in that feature dimension
 (e.g. :class:`impute.SimpleImputer`). By contrast, multivariate imputation
 algorithms use the entire set of available feature dimensions to estimate the
-missing values (e.g. :class:`impute.ChainedImputer`).
+missing values (e.g. :class:`impute.IterativeImputer`).
 
 .. _single_imputer:
@@ -87,19 +87,19 @@ string values or pandas categoricals when using the ``'most_frequent'`` or
     ['a' 'y']
     ['b' 'y']]
 
-.. _chained_imputer:
+.. _iterative_imputer:
 
 Multivariate feature imputation
 ===============================
 
-A more sophisticated approach is to use the :class:`ChainedImputer` class,
+A more sophisticated approach is to use the :class:`IterativeImputer` class,
 which models each feature with missing values as a function of other features,
 and uses that estimate for imputation. It does so in an iterated round-robin
 fashion: at each step, a feature column is designated as output `y` and the
 other feature columns are treated as inputs `X`. A regressor is fit on `(X, y)`
 for known `y`. Then, the regressor is used to predict the missing values of `y`.
-This is repeated for each feature in a chained fashion, and then is done for a
+This is repeated for each feature in an iterative fashion, and then is done for a
 number of imputation rounds. The results of the final imputation round are
 returned. Our implementation was inspired by the R MICE package (Multivariate
 Imputation by Chained Equations), but differs from it in setting single imputation
 to default instead of multiple imputation. This is discussed further below.
 Here is an example snippet::
 
     >>> import numpy as np
-    >>> from sklearn.impute import ChainedImputer
-    >>> imp = ChainedImputer(random_state=0)
+    >>> from sklearn.impute import IterativeImputer
+    >>> imp = IterativeImputer(random_state=0)
     >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]])
-    ChainedImputer(imputation_order='ascending', initial_strategy='mean',
+    IterativeImputer(imputation_order='ascending', initial_strategy='mean',
            max_value=None, min_value=None, missing_values=nan, n_iter=10,
            n_nearest_features=None, predict_posterior=False, predictor=None,
            random_state=0, verbose=False)
     >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]
     >>> print(np.round(imp.transform(X_test)))
     [[ 1.  2.]
      [ 6.  4.]
      [13.  6.]]
 
-Both :class:`SimpleImputer` and :class:`ChainedImputer` can be used in a Pipeline
+Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline
 as a way to build a composite estimator that supports imputation.
 See :ref:`sphx_glr_auto_examples_plot_missing_values.py`.
 
 practice is called multiple imputation.
 
-As implemented, the :class:`ChainedImputer` class generates a single imputation
+As implemented, the :class:`IterativeImputer` class generates a single imputation
 for each missing value because this is the most common use case for machine learning
 applications. However, it can also be used for multiple imputations by applying it
 repeatedly to the same dataset with different random seeds.
 
 It is still an open problem as to how useful single vs. multiple imputation is in
 the context of prediction and classification.
 
-Note that a call to the ``transform`` method of :class:`ChainedImputer` is not
+Note that a call to the ``transform`` method of :class:`IterativeImputer` is not
 allowed to change the number of samples. Therefore multiple imputations cannot be
 achieved by a single call to ``transform``.

diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
index 7383b17cf1e6b..bc4ba16716145 100644
--- a/doc/whats_new/v0.20.rst
+++ b/doc/whats_new/v0.20.rst
@@ -123,7 +123,7 @@ Preprocessing
   back to the original space via an inverse transform. :issue:`9041` by
   `Andreas Müller`_ and :user:`Guillaume Lemaitre `.
 
-- Added :class:`impute.ChainedImputer`, which is a strategy for imputing missing
+- Added :class:`impute.IterativeImputer`, which is a strategy for imputing missing
   values by modeling each feature with missing values as a function of
   other features in a round-robin fashion. :issue:`8478` by
   :user:`Sergey Feldman `.

diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py
index d238a16592edb..96f07233ea0e3 100644
--- a/examples/plot_missing_values.py
+++ b/examples/plot_missing_values.py
@@ -8,7 +8,7 @@
 The median is a more robust estimator for data with high magnitude variables
 which could dominate results (otherwise known as a 'long tail').
 
-Another option is the ``ChainedImputer``. This uses round-robin linear
+Another option is the ``IterativeImputer``. This uses round-robin linear
 regression, treating every variable as an output in turn. The version
 implemented assumes Gaussian (output) variables. If your features are obviously
 non-Normal, consider transforming them to look more Normal so as to improve
@@ -22,7 +22,7 @@
 from sklearn.datasets import load_boston
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.pipeline import Pipeline
-from sklearn.impute import SimpleImputer, ChainedImputer
+from sklearn.impute import SimpleImputer, IterativeImputer
 from sklearn.model_selection import cross_val_score
 
 rng = np.random.RandomState(0)
@@ -67,18 +67,18 @@ def get_results(dataset):
     mean_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                          scoring='neg_mean_squared_error')
 
-    # Estimate the score after chained imputation of the missing values
-    estimator = Pipeline([("imputer", ChainedImputer(missing_values=0,
-                                                     random_state=0)),
+    # Estimate the score after iterative imputation of the missing values
+    estimator = Pipeline([("imputer", IterativeImputer(missing_values=0,
+                                                       random_state=0)),
                           ("forest", RandomForestRegressor(random_state=0,
                                                            n_estimators=100))])
-    chained_impute_scores = cross_val_score(estimator, X_missing, y_missing,
-                                            scoring='neg_mean_squared_error')
+    iterative_impute_scores = cross_val_score(estimator, X_missing, y_missing,
+                                              scoring='neg_mean_squared_error')
 
     return ((full_scores.mean(), full_scores.std()),
             (zero_impute_scores.mean(), zero_impute_scores.std()),
             (mean_impute_scores.mean(), mean_impute_scores.std()),
-            (chained_impute_scores.mean(), chained_impute_scores.std()))
+            (iterative_impute_scores.mean(), iterative_impute_scores.std()))
 
 
 results_diabetes = np.array(get_results(load_diabetes()))
@@ -95,7 +95,7 @@ def get_results(dataset):
 x_labels = ['Full data',
             'Zero imputation',
             'Mean Imputation',
-            'Chained Imputation']
+            'Iterative Imputation']
 colors = ['r', 'g', 'b', 'orange']
 
 # plot diabetes results

diff --git a/sklearn/impute.py b/sklearn/impute.py
index c89e0bc1a9006..9d6ee3d539a2d 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -36,7 +36,7 @@
 __all__ = [
     'SimpleImputer',
-    'ChainedImputer',
+    'IterativeImputer',
 ]
@@ -423,14 +423,14 @@ def transform(self, X):
         return X
 
 
-class ChainedImputer(BaseEstimator, TransformerMixin):
-    """Chained imputer transformer to impute missing values.
+class IterativeImputer(BaseEstimator, TransformerMixin):
+    """Iterative imputer transformer to impute missing values.
 
-    Basic implementation of chained mutual regressions to find replacement
+    Basic implementation of iterative mutual regressions to find replacement
     values in multivariate missing data. This version assumes all features
     are Gaussian.
 
-    Read more in the :ref:`User Guide `.
+    Read more in the :ref:`User Guide `.
 
     Parameters
     ----------
 
-    predictor : estimator object, default=BayesianRidge()
+    predictor : estimator object, default=BayesianRidge() or RidgeCV()
         The predictor to use at each step of the round-robin imputation.
         It must support ``return_std`` in its ``predict`` method if
-        ``predict_posterior`` option is set to ``True`` below.
+        ``sample_posterior`` option is set to ``True`` below. If
+        ``sample_posterior=True`` the default predictor will be
+        ``BayesianRidge()`` and ``RidgeCV`` otherwise.
 
-    predict_posterior : boolean, default=False
+    sample_posterior : boolean, default=False
         Whether to sample from the (Gaussian) predictive posterior of the
         fitted predictor for each imputation. Predictor must support
         ``return_std`` in its ``predict`` method if set to ``True``. Set to
-        ``True`` if using ``ChainedImputer`` to have the same functionality
-        as MICE.
+        ``True`` if using ``IterativeImputer`` to have the same functionality
+        as MICE.
 
     def __init__(self,
                  missing_values=np.nan,
                  imputation_order='ascending',
                  n_iter=10,
                  predictor=None,
-                 predict_posterior=False,
+                 sample_posterior=False,
                  n_nearest_features=None,
                  initial_strategy="mean",
                  min_value=None,
 
         self.missing_values = missing_values
         self.imputation_order = imputation_order
         self.n_iter = n_iter
         self.predictor = predictor
-        self.predict_posterior = predict_posterior
+        self.sample_posterior = sample_posterior
         self.n_nearest_features = n_nearest_features
         self.initial_strategy = initial_strategy
         self.min_value = min_value
 
         # get posterior samples
         X_test = safe_indexing(X_filled[:, neighbor_feat_idx],
                                missing_row_mask)
-        if self.predict_posterior:
+        if self.sample_posterior:
             mus, sigmas = predictor.predict(X_test, return_std=True)
             good_sigmas = sigmas > 0
             imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype)
@@ -846,8 +846,13 @@ def fit_transform(self, X, y=None):
                                  check_random_state(self.random_state))
 
         if self.predictor is None:
-            from .linear_model import BayesianRidge
-            self._predictor = BayesianRidge()
+            if self.sample_posterior:
+                from .linear_model import BayesianRidge
+                self._predictor = BayesianRidge()
+            else:
+                from .linear_model import RidgeCV
+                # including a very small alpha to approximate OLS
+                self._predictor = RidgeCV(alphas=np.array([1e-5, 0.1, 1, 10]))
         else:
             self._predictor = clone(self.predictor)
 
         self.initial_imputer_ = None
         X, X_filled, mask_missing_values = self._initial_imputation(X)
 
         # edge case: in case the user specifies 0 for n_burn_in,
-        # then there is no need to do chained imputation and the result should
-        # be just the initial imputation (before clipping)
+        # then there is no need to do further imputation and the result should
+        # be just the initial imputation (before clipping)
         if self.n_iter < 1:
             return X_filled
 
-        X_filled = np.clip(X_filled, self._min_value, self._max_value)
+        # clip only the initial filled-in values
+        X_filled[mask_missing_values] = np.clip(X_filled[mask_missing_values],
+                                                self._min_value,
+                                                self._max_value)
 
         # order in which to impute
         # note this is probably too slow for large feature data (d > 100000)
 
         n_samples, n_features = X_filled.shape
         self.imputation_sequence_ = []
         if self.verbose > 0:
-            print("[ChainedImputer] Completing matrix with shape %s"
+            print("[IterativeImputer] Completing matrix with shape %s"
                   % (X.shape,))
         start_t = time()
         for i_rnd in range(self.n_iter):
 
                                                              predictor)
                 self.imputation_sequence_.append(predictor_triplet)
 
             if self.verbose > 0:
-                print('[ChainedImputer] Ending imputation round '
+                print('[IterativeImputer] Ending imputation round '
                       '%d/%d, elapsed time %0.2f'
                       % (i_rnd + 1, self.n_iter, time() - start_t))
 
         Xt = X_filled
         Xt[~mask_missing_values] = X[~mask_missing_values]
         return Xt
 
         X, X_filled, mask_missing_values = self._initial_imputation(X)
 
         # edge case: in case the user specifies 0 for n_burn_in,
-        # then there is no need to do chained imputation and the result should
-        # be just the initial imputation (before clipping)
+        # then there is no need to do further imputation and the result should
+        # be just the initial imputation (before clipping)
         if self.n_iter < 1:
             return X_filled
 
-        X_filled = np.clip(X_filled, self._min_value, self._max_value)
+        # clip only the initial filled-in values
+        X_filled[mask_missing_values] = np.clip(X_filled[mask_missing_values],
+                                                self._min_value,
+                                                self._max_value)
 
         imputations_per_round = len(self.imputation_sequence_) // self.n_iter
         i_rnd = 0
         if self.verbose > 0:
-            print("[ChainedImputer] Completing matrix with shape %s"
+            print("[IterativeImputer] Completing matrix with shape %s"
                   % (X.shape,))
         start_t = time()
         for it, predictor_triplet in enumerate(self.imputation_sequence_):
             if not (it + 1) % imputations_per_round:
                 if self.verbose > 1:
-                    print('[ChainedImputer] Ending imputation round '
+                    print('[IterativeImputer] Ending imputation round '
                           '%d/%d, elapsed time %0.2f'
                           % (i_rnd + 1, self.n_iter, time() - start_t))
                 i_rnd += 1

diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py
index 2fbaa2987379e..d7147045cdc7c 100644
--- a/sklearn/tests/test_impute.py
+++ b/sklearn/tests/test_impute.py
@@ -13,9 +13,9 @@
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_false
 
-from sklearn.impute import SimpleImputer, ChainedImputer
+from sklearn.impute import SimpleImputer, IterativeImputer
 from sklearn.dummy import DummyRegressor
-from sklearn.linear_model import BayesianRidge, ARDRegression
+from sklearn.linear_model import BayesianRidge, ARDRegression, RidgeCV
 from sklearn.pipeline import Pipeline
 from sklearn.model_selection import GridSearchCV
 from sklearn import tree
@@ -72,8 +72,8 @@ def test_imputation_shape():
         X_imputed = imputer.fit_transform(X)
         assert X_imputed.shape == (10, 2)
 
-        chained_imputer = ChainedImputer(initial_strategy=strategy)
-        X_imputed = chained_imputer.fit_transform(X)
+        iterative_imputer = IterativeImputer(initial_strategy=strategy)
+        X_imputed = iterative_imputer.fit_transform(X)
         assert X_imputed.shape == (10, 2)
@@ -486,42 +486,25 @@ def test_imputation_copy():
     # made, even if copy=False.
 
 
-def test_chained_imputer_rank_one():
-    rng = np.random.RandomState(0)
-    d = 100
-    A = rng.rand(d, 1)
-    B = rng.rand(1, d)
-    X = np.dot(A, B)
-    nan_mask = rng.rand(d, d) < 0.5
-    X_missing = X.copy()
-    X_missing[nan_mask] = np.nan
-
-    imputer = ChainedImputer(n_iter=5,
-                             verbose=True,
-                             random_state=rng)
-    X_filled = imputer.fit_transform(X_missing)
-    assert_allclose(X_filled, X, atol=0.001)
-
-
 @pytest.mark.parametrize(
     "imputation_order",
     ['random', 'roman', 'ascending', 'descending', 'arabic']
 )
-def test_chained_imputer_imputation_order(imputation_order):
+def test_iterative_imputer_imputation_order(imputation_order):
     rng = np.random.RandomState(0)
     n = 100
     d = 10
     X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
-    X[:, 0] = 1  # this column should not be discarded by ChainedImputer
-
-    imputer = ChainedImputer(missing_values=0,
-                             n_iter=2,
-                             n_nearest_features=5,
-                             min_value=0,
-                             max_value=1,
-                             verbose=False,
-                             imputation_order=imputation_order,
-                             random_state=rng)
+    X[:, 0] = 1  # this column should not be discarded by IterativeImputer
+
+    imputer = IterativeImputer(missing_values=0,
+                               n_iter=2,
+                               n_nearest_features=5,
+                               min_value=0,
+                               max_value=1,
+                               verbose=False,
+                               imputation_order=imputation_order,
+                               random_state=rng)
     imputer.fit_transform(X)
     ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_]
     if imputation_order == 'roman':
 
 @pytest.mark.parametrize(
     "predictor",
-    [DummyRegressor(), BayesianRidge(), ARDRegression()]
+    [DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV()]
 )
-def test_chained_imputer_predictors(predictor):
+def test_iterative_imputer_predictors(predictor):
     rng = np.random.RandomState(0)
     n = 100
     d = 10
     X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
 
-    imputer = ChainedImputer(missing_values=0,
-                             n_iter=1,
-                             predictor=predictor,
-                             random_state=rng)
+    imputer = IterativeImputer(missing_values=0,
+                               n_iter=1,
+                               predictor=predictor,
+                               random_state=rng)
     imputer.fit_transform(X)
 
     # check that types are correct for predictors
     hashes = []
     assert len(set(hashes)) == len(hashes)
 
 
-def test_chained_imputer_clip():
+def test_iterative_imputer_clip():
     rng = np.random.RandomState(0)
     n = 100
     d = 10
     X = sparse_random_matrix(n, d, density=0.10,
                              random_state=rng).toarray()
 
-    imputer = ChainedImputer(missing_values=0,
-                             n_iter=1,
-                             min_value=0.1,
-                             max_value=0.2,
-                             random_state=rng)
+    imputer = IterativeImputer(missing_values=0,
+                               n_iter=1,
+                               min_value=0.1,
+                               max_value=0.2,
+                               random_state=rng)
 
     Xt = imputer.fit_transform(X)
     assert_allclose(np.min(Xt[X == 0]), 0.1)
 
 @pytest.mark.parametrize(
     "strategy",
     ["mean", "median", "most_frequent"]
 )
-def test_chained_imputer_missing_at_transform(strategy):
+def test_iterative_imputer_missing_at_transform(strategy):
     rng = np.random.RandomState(0)
     n = 100
     d = 10
 
     X_train[:, 0] = 1  # definitely no missing values in 0th column
     X_test[0, 0] = 0  # definitely missing value in 0th column
 
-    imputer = ChainedImputer(missing_values=0,
-                             n_iter=1,
-                             initial_strategy=strategy,
-                             random_state=rng).fit(X_train)
+    imputer = IterativeImputer(missing_values=0,
+                               n_iter=1,
+                               initial_strategy=strategy,
+                               random_state=rng).fit(X_train)
     initial_imputer = SimpleImputer(missing_values=0,
                                     strategy=strategy).fit(X_train)
 
 def test_chained_imputer_missing_at_transform(strategy):
                     initial_imputer.transform(X_test)[:, 0])
 
 
-def test_chained_imputer_transform_stochasticity():
+def test_iterative_imputer_transform_stochasticity():
     rng = np.random.RandomState(0)
     n = 100
     d = 10
     X = sparse_random_matrix(n, d, density=0.10,
                              random_state=rng).toarray()
 
-    imputer = ChainedImputer(missing_values=0,
-                             n_iter=1,
-                             predict_posterior=True,
-                             random_state=rng)
+    imputer = IterativeImputer(missing_values=0,
+                               n_iter=1,
+                               sample_posterior=True,
+                               random_state=rng)
     imputer.fit(X)
 
     X_fitted_1 = imputer.transform(X)
     assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2))
 
 
-def test_chained_imputer_no_missing():
+def test_iterative_imputer_no_missing():
     rng = np.random.RandomState(0)
     X = rng.rand(100, 100)
     X[:, 0] = np.nan
-    m1 = ChainedImputer(n_iter=10, random_state=rng)
-    m2 = ChainedImputer(n_iter=10, random_state=rng)
+    m1 = IterativeImputer(n_iter=10, random_state=rng)
+    m2 = IterativeImputer(n_iter=10, random_state=rng)
     pred1 = m1.fit(X).transform(X)
     pred2 = m2.fit_transform(X)
     # should exclude the first column entirely
     assert_allclose(pred1, pred2)
 
 
+def test_iterative_imputer_rank_one():
+    rng = np.random.RandomState(0)
+    d = 100
+    A = rng.rand(d, 1)
+    B = rng.rand(1, d)
+    X = np.dot(A, B)
+    nan_mask = rng.rand(d, d) < 0.5
+    X_missing = X.copy()
+    X_missing[nan_mask] = np.nan
+
+    imputer = IterativeImputer(n_iter=5,
+                               verbose=True,
+                               random_state=rng)
+    X_filled = imputer.fit_transform(X_missing)
+    assert_allclose(X_filled, X, atol=0.01)
+
+
 @pytest.mark.parametrize(
     "rank",
     [3, 5]
 )
-def test_chained_imputer_transform_recovery(rank):
+def test_iterative_imputer_transform_recovery(rank):
     rng = np.random.RandomState(0)
     n = 100
     d = 100
 
     X_test_filled = X_filled[n:]
     X_test = X_missing[n:]
 
-    imputer = ChainedImputer(n_iter=10,
-                             verbose=True,
-                             random_state=rng).fit(X_train)
+    imputer = IterativeImputer(n_iter=10,
+                               verbose=True,
+                               random_state=rng).fit(X_train)
     X_test_est = imputer.transform(X_test)
     assert_allclose(X_test_filled, X_test_est, atol=0.1)
 
 
-def test_chained_imputer_additive_matrix():
+def test_iterative_imputer_additive_matrix():
     rng = np.random.RandomState(0)
     n = 100
     d = 10
 
     X_test_filled = X_filled[n:]
     X_test = X_missing[n:]
 
-    imputer = ChainedImputer(n_iter=10,
-                             verbose=True,
-                             random_state=rng).fit(X_train)
+    imputer = IterativeImputer(n_iter=10,
+                               verbose=True,
+                               random_state=rng).fit(X_train)
     X_test_est = imputer.transform(X_test)
     assert_allclose(X_test_filled, X_test_est, atol=0.1)

diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 8a1b33f4d92d3..0840b37e312ef 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -77,7 +77,7 @@
                   'RANSACRegressor', 'RadiusNeighborsRegressor',
                   'RandomForestRegressor', 'Ridge', 'RidgeCV']
 
-ALLOW_NAN = ['Imputer', 'SimpleImputer', 'ChainedImputer',
+ALLOW_NAN = ['Imputer', 'SimpleImputer', 'IterativeImputer',
              'MaxAbsScaler', 'MinMaxScaler', 'StandardScaler',
             'PowerTransformer', 'QuantileTransformer']
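The new default-predictor rule introduced in this patch can be summarized in a
few lines. This is a sketch mirroring the logic added to ``fit_transform``
above, not a public API; the helper name is hypothetical::

    import numpy as np
    from sklearn.linear_model import BayesianRidge, RidgeCV

    def default_predictor(sample_posterior):
        if sample_posterior:
            # posterior sampling requires predict(..., return_std=True)
            return BayesianRidge()
        # a very small alpha is included to approximate OLS
        return RidgeCV(alphas=np.array([1e-5, 0.1, 1, 10]))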
From cbbc8b053d4188686f6808f86643311aaf74ef15 Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Mon, 25 Jun 2018 18:25:38 -0700
Subject: [PATCH 09/26] small bug

---
 doc/modules/impute.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index aa084bd5f2685..b354db146e6a0 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -111,9 +111,9 @@ Here is an example snippet::
     >>> imp = IterativeImputer(random_state=0)
     >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]])
     IterativeImputer(imputation_order='ascending', initial_strategy='mean',
-           max_value=None, min_value=None, missing_values=nan, n_iter=10,
-           n_nearest_features=None, predict_posterior=False, predictor=None,
-           random_state=0, verbose=False)
+           max_value=None, min_value=None, missing_values=nan, n_iter=10,
+           n_nearest_features=None, predictor=None, random_state=0,
+           sample_posterior=False, verbose=False)
     >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]
     >>> print(np.round(imp.transform(X_test)))
     [[ 1.  2.]

From 0c9f64fdb981038402703764bf3d9b5950920a23 Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Mon, 25 Jun 2018 18:26:58 -0700
Subject: [PATCH 10/26] output changed in impute.rst because of RidgeCV

---
 doc/modules/impute.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index b354db146e6a0..3d6468f95497b 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -117,8 +117,8 @@ Here is an example snippet::
     >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]
     >>> print(np.round(imp.transform(X_test)))
     [[ 1.  2.]
-     [ 6.  4.]
-     [13.  6.]]
+     [ 6.  3.]
+     [24.  6.]]

From c0c6a4c035c93a222ede908b7981d54fcb915b87 Mon Sep 17 00:00:00 2001
From: Sergey Feldman
Date: Tue, 26 Jun 2018 07:54:48 -0700
Subject: [PATCH 11/26] Update impute.rst a smidge.

---
 doc/modules/impute.rst | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index 3d6468f95497b..7db5386fc4261 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -141,13 +141,15 @@
 As implemented, the :class:`IterativeImputer` class generates a single imputation
 for each missing value because this is the most common use case for machine learning
 applications. However, it can also be used for multiple imputations by applying it
-repeatedly to the same dataset with different random seeds.
+repeatedly to the same dataset with different random seeds when
+``sample_posterior=True``.
 
 See Chapter 4 of "Statistical Analysis with Missing Data" by Little and Rubin for
 more discussion on multiple vs. single imputations.
 
 It is still an open problem as to how useful single vs. multiple imputation is in
-the context of prediction and classification.
+the context of prediction and classification when the user is not interested in
+measuring uncertainty due to missing values.
 
 Note that a call to the ``transform`` method of :class:`IterativeImputer` is not
 allowed to change the number of samples. Therefore multiple imputations cannot be

From c4ccdfc29d99d6cfdf03bf6304ec89901dcf1b56 Mon Sep 17 00:00:00 2001
From: Sergey Feldman
Date: Tue, 26 Jun 2018 07:59:02 -0700
Subject: [PATCH 12/26] Remove pre-clipping

---
 sklearn/impute.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/sklearn/impute.py b/sklearn/impute.py
index 9d6ee3d539a2d..80bfda09b685d 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -862,17 +862,12 @@ def fit_transform(self, X, y=None):
         self.initial_imputer_ = None
         X, X_filled, mask_missing_values = self._initial_imputation(X)
 
-        # edge case: in case the user specifies 0 for n_burn_in,
+        # edge case: in case the user specifies 0 for n_iter,
         # then there is no need to do further imputation and the result should
         # be just the initial imputation (before clipping)
         if self.n_iter < 1:
             return X_filled
 
-        # clip only the initial filled-in values
-        X_filled[mask_missing_values] = np.clip(X_filled[mask_missing_values],
-                                                self._min_value,
-                                                self._max_value)
-
         # order in which to impute
         # note this is probably too slow for large feature data (d > 100000)
         # and a better way would be good.
@@ -928,17 +923,12 @@ def transform(self, X):
 
         X, X_filled, mask_missing_values = self._initial_imputation(X)
 
-        # edge case: in case the user specifies 0 for n_burn_in,
+        # edge case: in case the user specifies 0 for n_iter,
         # then there is no need to do further imputation and the result should
         # be just the initial imputation (before clipping)
         if self.n_iter < 1:
             return X_filled
 
-        # clip only the initial filled-in values
-        X_filled[mask_missing_values] = np.clip(X_filled[mask_missing_values],
-                                                self._min_value,
-                                                self._max_value)
-
         imputations_per_round = len(self.imputation_sequence_) // self.n_iter
         i_rnd = 0
         if self.verbose > 0:

From 3c882e75a4428515368bc451c1d190e6a23a23b5 Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Tue, 26 Jun 2018 10:30:21 -0700
Subject: [PATCH 13/26] addressing comments

---
 doc/modules/impute.rst          | 31 +++++++++++++++--------
 examples/plot_missing_values.py |  4 +--
 sklearn/impute.py               | 45 ++++++++++++++++++---------------
 sklearn/tests/test_impute.py    | 40 ++++++++++++++++++++++++-----
 4 files changed, 80 insertions(+), 40 deletions(-)

diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index 7db5386fc4261..151eefca8258b 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -99,16 +99,15 @@
 and uses that estimate for imputation. It does so in an iterated round-robin
 fashion: at each step, a feature column is designated as output `y` and the
 other feature columns are treated as inputs `X`. A regressor is fit on `(X, y)`
 for known `y`. Then, the regressor is used to predict the missing values of `y`.
-This is repeated for each feature in an iterative fashion, and then is done for a
-number of imputation rounds. The results of the final imputation round are
-returned. Our implementation was inspired by the R MICE package (Multivariate
-Imputation by Chained Equations), but differs from it in setting single imputation
-to default instead of multiple imputation. This is discussed further below.
-Here is an example snippet::
+This is done for each feature in an iterative fashion, and then is repeated for
+`n_iter` imputation rounds. The results of the final imputation round are
+returned. Our implementation was inspired by the R MICE package (Multivariate
+Imputation by Chained Equations) [1], but differs from it in setting single
+imputation to default instead of multiple imputation. This is discussed further below.
 
     >>> import numpy as np
     >>> from sklearn.impute import IterativeImputer
-    >>> imp = IterativeImputer(random_state=0)
+    >>> imp = IterativeImputer(n_iter=10, random_state=0)
     >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]])
     IterativeImputer(imputation_order='ascending', initial_strategy='mean',
              max_value=None, min_value=None, missing_values=nan, n_iter=10,
@@ -131,9 +130,9 @@ Multiple vs. Single Imputation
 ==============================
 
 In the statistics community, it is common practice to perform multiple imputations,
-generating, for example, 10 separate imputations for a single feature matrix.
-Each of these 10 imputations is then put through the subsequent analysis pipeline
-(e.g. feature engineering, clustering, regression, classification). The 10 final
+generating, for example, `m` separate imputations for a single feature matrix.
+Each of these `m` imputations is then put through the subsequent analysis pipeline
+(e.g. feature engineering, clustering, regression, classification). The `m` final
 analysis results (e.g. held-out validation error) allow the data scientist to
 obtain understanding of the uncertainty inherent in the missing values. The above
 practice is called multiple imputation.
@@ -144,8 +143,8 @@ applications. However, it can also be used for multiple imputations by applying
 repeatedly to the same dataset with different random seeds when
 ``sample_posterior=True``.
 
-See Chapter 4 of "Statistical Analysis with Missing Data" by Little and Rubin for
-more discussion on multiple vs. single imputations.
+See Chapter 4 of "Statistical Analysis with Missing Data" [2] for more discussion
+on multiple vs. single imputations.
 
 It is still an open problem as to how useful single vs. multiple imputation is in
 the context of prediction and classification when the user is not interested in
@@ -154,3 +153,13 @@ measuring uncertainty due to missing values.
 Note that a call to the ``transform`` method of :class:`IterativeImputer` is
 not allowed to change the number of samples. Therefore multiple imputations cannot be
 achieved by a single call to ``transform``.
+
+.. _references:
+
+References
+==========
+[1] Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: Multivariate Imputation
+    by Chained Equations in R". Journal of Statistical Software 45: 1-67.
+
+[2] Roderick J A Little and Donald B Rubin (1986). "Statistical Analysis with Missing
+    Data". John Wiley & Sons, Inc., New York, NY, USA.

diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py
index 96f07233ea0e3..c9a95f2db66be 100644
--- a/examples/plot_missing_values.py
+++ b/examples/plot_missing_values.py
@@ -4,11 +4,11 @@
 ====================================================
 
 Missing values can be replaced by the mean, the median or the most frequent
-value using the basic ``SimpleImputer``.
+value using the basic :class:`SimpleImputer`.
 
 The median is a more robust estimator for data with high magnitude variables
 which could dominate results (otherwise known as a 'long tail').
 
-Another option is the ``IterativeImputer``. This uses round-robin linear
+Another option is the :class:`IterativeImputer`. This uses round-robin linear
 regression, treating every variable as an output in turn. The version
 implemented assumes Gaussian (output) variables.
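Because of that Gaussian assumption, heavy-tailed features can be imputed on a
transformed scale and mapped back afterwards. A minimal sketch, assuming log-scale
features are appropriate (the log/exp round trip is this note's choice, not part
of the example script)::

    import numpy as np
    from sklearn.impute import IterativeImputer

    rng = np.random.RandomState(0)
    X = rng.lognormal(size=(100, 3))    # heavy-tailed, clearly non-Normal
    X[rng.rand(100, 3) < 0.1] = np.nan  # knock out ~10% of the entries
    X_log = np.log(X)                   # roughly Gaussian; NaNs stay NaN
    X_imputed = np.exp(IterativeImputer(random_state=0).fit_transform(X_log))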
 If your features are obviously non-Normal, consider transforming them to look more Normal so as to improve

diff --git a/sklearn/impute.py b/sklearn/impute.py
index 80bfda09b685d..43012d1f5f431 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -469,14 +469,17 @@ class IterativeImputer(BaseEstimator, TransformerMixin):
         fitted predictor for each imputation. Predictor must support
         ``return_std`` in its ``predict`` method if set to ``True``. Set to
         ``True`` if using ``IterativeImputer`` to have the same functionality
-        as MICE.
+        as MICE (Multivariate Imputation by Chained Equations).
 
     n_nearest_features : int, optional (default=None)
         Number of other features to use to estimate the missing values of
         the each feature column. Nearness between features is measured using
         the absolute correlation coefficient between each feature pair (after
-        initial imputation). Can provide significant speed-up when the number
-        of features is huge. If ``None``, all features will be used.
+        initial imputation). To ensure coverage of features throughout the
+        imputation process, the neighbor features are not necessarily nearest,
+        but are drawn with probability proportional to correlation for each
+        imputed target feature. Can provide significant speed-up when the
+        number of features is huge. If ``None``, all features will be used.
 
     initial_strategy : str, optional (default="mean")
         Which strategy to use to initialize the missing values. Same as the
@@ -523,11 +526,9 @@ class IterativeImputer(BaseEstimator, TransformerMixin):
     imputation is supported with multiple instances of the imputer with
     different random seeds run in parallel.
 
-    The R version of MICE does not have inductive functionality, i.e. first
-    fitting on ``X_train`` and then transforming any ``X_test`` without
-    additional fitting. We do this by storing each feature's predictor during
-    the round-robin ``fit`` phase, and predicting without refitting (in order)
-    during the ``transform`` phase.
+    To support imputation in inductive mode we store each feature's predictor
+    during the ``fit`` phase, and predict without refitting (in order) during
+    the ``transform`` phase.
 
     Features which contain all missing values at ``fit`` are discarded upon
     ``transform``.
@@ -603,7 +604,8 @@ def _impute_one_feature(self,
         predictor : object
             The predictor to use at this step of the round-robin imputation.
-            It must support ``return_std`` in its ``predict`` method.
+            If ``sample_posterior`` is True, the predictor must support
+            ``return_std`` in its ``predict`` method.
             If None, it will be cloned from self._predictor.
         fit_mode : boolean, default=True
@@ -856,17 +858,20 @@ def fit_transform(self, X, y=None):
         else:
             self._predictor = clone(self.predictor)
 
+        if hasattr(self._predictor, 'random_state'):
+            self._predictor.random_state = self.random_state_
+
         self._min_value = np.nan if self.min_value is None else self.min_value
         self._max_value = np.nan if self.max_value is None else self.max_value
 
         self.initial_imputer_ = None
-        X, X_filled, mask_missing_values = self._initial_imputation(X)
+        X, Xt, mask_missing_values = self._initial_imputation(X)
 
         # edge case: in case the user specifies 0 for n_iter,
         # then there is no need to do further imputation and the result should
         # be just the initial imputation (before clipping)
         if self.n_iter < 1:
-            return X_filled
+            return Xt
 
         # order in which to impute
         # note this is probably too slow for large feature data (d > 100000)
         # and a better way would be good.
         # see: https://goo.gl/KyCNwj and subsequent comments
         ordered_idx = self._get_ordered_idx(mask_missing_values)
 
-        abs_corr_mat = self._get_abs_corr_mat(X_filled)
+        abs_corr_mat = self._get_abs_corr_mat(Xt)
 
         # impute data
-        n_samples, n_features = X_filled.shape
+        n_samples, n_features = Xt.shape
         self.imputation_sequence_ = []
         if self.verbose > 0:
             print("[IterativeImputer] Completing matrix with shape %s"
@@ -891,8 +896,8 @@ def fit_transform(self, X, y=None):
             neighbor_feat_idx = self._get_neighbor_feat_idx(n_features,
                                                             feat_idx,
                                                             abs_corr_mat)
-            X_filled, predictor = self._impute_one_feature(
-                X_filled, mask_missing_values, feat_idx, neighbor_feat_idx,
+            Xt, predictor = self._impute_one_feature(
+                Xt, mask_missing_values, feat_idx, neighbor_feat_idx,
                 predictor=None, fit_mode=True)
             predictor_triplet = ImputerTriplet(feat_idx,
                                                neighbor_feat_idx,
@@ -904,7 +909,6 @@ def fit_transform(self, X, y=None):
                       '%d/%d, elapsed time %0.2f'
                       % (i_rnd + 1, self.n_iter, time() - start_t))
 
-        Xt = X_filled
         Xt[~mask_missing_values] = X[~mask_missing_values]
         return Xt
 
@@ -926,13 +930,13 @@ def transform(self, X):
         """
         check_is_fitted(self, 'initial_imputer_')
 
-        X, X_filled, mask_missing_values = self._initial_imputation(X)
+        X, Xt, mask_missing_values = self._initial_imputation(X)
 
         # edge case: in case the user specifies 0 for n_iter,
         # then there is no need to do further imputation and the result should
         # be just the initial imputation (before clipping)
         if self.n_iter < 1:
-            return X_filled
+            return Xt
 
         imputations_per_round = len(self.imputation_sequence_) // self.n_iter
         i_rnd = 0
         if self.verbose > 0:
             print("[IterativeImputer] Completing matrix with shape %s"
                   % (X.shape,))
         start_t = time()
         for it, predictor_triplet in enumerate(self.imputation_sequence_):
-            X_filled, _ = self._impute_one_feature(
-                X_filled,
+            Xt, _ = self._impute_one_feature(
+                Xt,
                 mask_missing_values,
                 predictor_triplet.feat_idx,
                 predictor_triplet.neighbor_feat_idx,
@@ -956,7 +960,6 @@ def transform(self, X):
                           % (i_rnd + 1, self.n_iter, time() - start_t))
                 i_rnd += 1
 
-        Xt = X_filled
         Xt[~mask_missing_values] = X[~mask_missing_values]
         return Xt

diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py
index d7147045cdc7c..0baa37e60cf12 100644
--- a/sklearn/tests/test_impute.py
+++ b/sklearn/tests/test_impute.py
@@ -593,16 +593,18 @@ def test_iterative_imputer_missing_at_transform(strategy):
 
 def test_iterative_imputer_transform_stochasticity():
-    rng = np.random.RandomState(0)
+    rng1 = np.random.RandomState(0)
+    rng2 = np.random.RandomState(1)
     n = 100
     d = 10
     X = sparse_random_matrix(n, d, density=0.10,
-                             random_state=rng).toarray()
+                             random_state=rng1).toarray()
 
+    # when sample_posterior=True, two transforms shouldn't be equal
     imputer = IterativeImputer(missing_values=0,
                                n_iter=1,
                                sample_posterior=True,
-                               random_state=rng)
+                               random_state=rng1)
     imputer.fit(X)
 
     X_fitted_1 = imputer.transform(X)
@@ -611,6 +613,32 @@ def test_iterative_imputer_transform_stochasticity():
     # sufficient to assert that the means are not the same
     assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2))
 
+    # when sample_posterior=False, and n_nearest_features=None
+    # and imputation_order is not random
+    # the two transforms should be identical even if rng are different
+    imputer1 = IterativeImputer(missing_values=0,
+                                n_iter=1,
+                                sample_posterior=False,
+                                n_nearest_features=None,
+                                imputation_order='ascending',
+                                random_state=rng1)
+
+    imputer2 = IterativeImputer(missing_values=0,
+                                n_iter=1,
+                                sample_posterior=False,
+                                n_nearest_features=None,
+                                imputation_order='ascending',
+                                random_state=rng2)
+    imputer1.fit(X)
+    imputer2.fit(X)
+
+    X_fitted_1a = imputer1.transform(X)
+    X_fitted_1b = imputer1.transform(X)
+    X_fitted_2 = imputer2.transform(X)
+
+    assert np.all(X_fitted_1a == X_fitted_1b)
+    assert np.all(X_fitted_1a == X_fitted_2)
 
 def test_iterative_imputer_no_missing():
     rng = np.random.RandomState(0)
@@ -637,7 +665,7 @@ def test_iterative_imputer_rank_one():
     X_missing[nan_mask] = np.nan
 
     imputer = IterativeImputer(n_iter=5,
-                               verbose=True,
+                               verbose=1,
                                random_state=rng)
     X_filled = imputer.fit_transform(X_missing)
     assert_allclose(X_filled, X, atol=0.01)
@@ -666,7 +694,7 @@ def test_iterative_imputer_transform_recovery(rank):
     X_test = X_missing[n:]
 
     imputer = IterativeImputer(n_iter=10,
-                               verbose=True,
+                               verbose=1,
                                random_state=rng).fit(X_train)
     X_test_est = imputer.transform(X_test)
     assert_allclose(X_test_filled, X_test_est, atol=0.1)
@@ -694,7 +722,7 @@ def test_iterative_imputer_additive_matrix():
     X_test = X_missing[n:]
 
     imputer = IterativeImputer(n_iter=10,
-                               verbose=True,
+                               verbose=2,
                                random_state=rng).fit(X_train)
     X_test_est = imputer.transform(X_test)
     assert_allclose(X_test_filled, X_test_est, atol=0.1)

From 8afdd2f0ccc45f7159d179f3da6375cbbd2a5be9 Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Tue, 26 Jun 2018 10:33:20 -0700
Subject: [PATCH 14/26] forgotten doc change

---
 sklearn/impute.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/impute.py b/sklearn/impute.py
index 43012d1f5f431..af0658d98bc5a 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -459,8 +459,8 @@ class IterativeImputer(BaseEstimator, TransformerMixin):
     predictor : estimator object, default=BayesianRidge() or RidgeCV()
         The predictor to use at each step of the round-robin imputation.
-        It must support ``return_std`` in its ``predict`` method if
-        ``sample_posterior`` option is set to ``True`` below. If
+        If ``sample_posterior`` is True, the predictor must support
+        ``return_std`` in its ``predict`` method. Also, if
         ``sample_posterior=True`` the default predictor will be
         ``BayesianRidge()`` and ``RidgeCV`` otherwise.
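The predictor contract documented above can be exercised directly. A hedged
sketch of swapping in different predictors, assuming the ``predictor`` and
``sample_posterior`` parameters as documented in this series (the toy data and
``n_estimators`` value are illustrative)::

    import numpy as np
    from sklearn.impute import IterativeImputer
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import BayesianRidge

    X = [[1, 2], [np.nan, 3], [7, np.nan], [4, 5]]
    # any regressor should do when sample_posterior=False ...
    imp = IterativeImputer(
        predictor=RandomForestRegressor(n_estimators=10, random_state=0))
    print(imp.fit_transform(X))
    # ... but sample_posterior=True needs predict(..., return_std=True),
    # which BayesianRidge provides
    imp = IterativeImputer(predictor=BayesianRidge(), sample_posterior=True,
                           random_state=0)
    print(imp.fit_transform(X))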
From b61fc1f9d19010f8eaa105211d3777bee2b8ce6a Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Tue, 26 Jun 2018 16:32:49 -0700
Subject: [PATCH 15/26] typo

---
 sklearn/impute.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/impute.py b/sklearn/impute.py
index af0658d98bc5a..c1829c0fc1e3d 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -473,7 +473,7 @@ class IterativeImputer(BaseEstimator, TransformerMixin):
 
     n_nearest_features : int, optional (default=None)
         Number of other features to use to estimate the missing values of
-        the each feature column. Nearness between features is measured using
+        each feature column. Nearness between features is measured using
        the absolute correlation coefficient between each feature pair (after
         initial imputation). To ensure coverage of features throughout the
         imputation process, the neighbor features are not necessarily nearest,

From 482e9a5cf81542045549d5498a7b6b49172c02cd Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Fri, 29 Jun 2018 08:42:16 -0700
Subject: [PATCH 16/26] addressing comments

---
 doc/modules/impute.rst       | 22 +++++++++++-----------
 sklearn/impute.py            | 34 ++++++++++++++++++++++------------
 sklearn/tests/test_impute.py |  8 ++++++--
 3 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index 151eefca8258b..8d1ba71a6a5f6 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -101,9 +101,7 @@ other feature columns are treated as inputs `X`. A regressor is fit on `(X, y)`
 for known `y`. Then, the regressor is used to predict the missing values of `y`.
 This is done for each feature in an iterative fashion, and then is repeated for
 `n_iter` imputation rounds. The results of the final imputation round are
-returned. Our implementation was inspired by the R MICE package (Multivariate
-Imputation by Chained Equations) [1], but differs from it in setting single
-imputation to default instead of multiple imputation. This is discussed further below.
+returned.
 
     >>> import numpy as np
     >>> from sklearn.impute import IterativeImputer
@@ -133,14 +131,16 @@ In the statistics community, it is common practice to perform multiple imputatio
 generating, for example, `m` separate imputations for a single feature matrix.
 Each of these `m` imputations is then put through the subsequent analysis pipeline
 (e.g. feature engineering, clustering, regression, classification). The `m` final
-analysis results (e.g. held-out validation error) allow the data scientist to
-obtain understanding of the uncertainty inherent in the missing values. The above
-practice is called multiple imputation.
-
-As implemented, the :class:`IterativeImputer` class generates a single imputation
-for each missing value because this is the most common use case for machine learning
-applications. However, it can also be used for multiple imputations by applying it
-repeatedly to the same dataset with different random seeds when
+analysis results (e.g. held-out validation errors) allow the data scientist to
+obtain understanding of how analytic results may differ as a consequence
+of the inherent uncertainty caused by the missing values. The above practice
+is called multiple imputation.
+
+Our implementation of :class:`IterativeImputer` was inspired by the R MICE
+package (Multivariate Imputation by Chained Equations) [1], but differs from
+it in setting single imputation to default instead of multiple imputation.
+However, :class:`IterativeImputer` can also be used for multiple imputations
+by applying it repeatedly to the same dataset with different random seeds when
 ``sample_posterior=True``.
 
 See Chapter 4 of "Statistical Analysis with Missing Data" [2] for more discussion

diff --git a/sklearn/impute.py b/sklearn/impute.py
index c1829c0fc1e3d..5e214e7cc88d0 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -141,6 +141,10 @@ class SimpleImputer(BaseEstimator, TransformerMixin):
     statistics_ : array of shape (n_features,)
         The imputation fill value for each feature.
 
+    See also
+    --------
+    IterativeImputer: Multivariate imputation of missing values.
+
     Notes
     -----
     Columns which only contained missing values at `fit` are discarded upon
@@ -424,11 +428,10 @@ def transform(self, X):
 
 class IterativeImputer(BaseEstimator, TransformerMixin):
-    """Iterative imputer transformer to impute missing values.
+    """Multivariate imputer that estimates each features from all the others.
 
-    Basic implementation of iterative mutual regressions to find replacement
-    values in multivariate missing data. This version assumes all features
-    are Gaussian.
+    A strategy for imputing missing values by modeling each feature with
+    missing values as a function of other features in a round-robin fashion.
 
     Read more in the :ref:`User Guide `.
 
@@ -457,7 +460,7 @@ class IterativeImputer(BaseEstimator, TransformerMixin):
         imputations computed during the final round. A round is a single
         imputation of each feature with missing values.
 
-    predictor : estimator object, default=BayesianRidge() or RidgeCV()
+    predictor : estimator object, default=RidgeCV() or BayesianRidge()
         The predictor to use at each step of the round-robin imputation.
         If ``sample_posterior`` is True, the predictor must support
         ``return_std`` in its ``predict`` method. Also, if
@@ -468,8 +471,7 @@ class IterativeImputer(BaseEstimator, TransformerMixin):
         Whether to sample from the (Gaussian) predictive posterior of the
         fitted predictor for each imputation. Predictor must support
         ``return_std`` in its ``predict`` method if set to ``True``. Set to
-        ``True`` if using ``IterativeImputer`` to have the same functionality
-        as MICE (Multivariate Imputation by Chained Equations).
+        ``True`` if using ``IterativeImputer`` for multiple imputations.
 
     n_nearest_features : int, optional (default=None)
         Number of other features to use to estimate the missing values of
@@ -516,15 +518,22 @@ class IterativeImputer(BaseEstimator, TransformerMixin):
         ``feat_idx`` is the current feature to be imputed,
         ``neighbor_feat_idx`` is the array of other features used to impute the
         current feature, and ``predictor`` is the trained predictor used for
-        the imputation. Length is ``n_features_with_missing * n_iter``.
+        the imputation. Length is ``self.n_features_with_missing_ * n_iter``.
+
+    n_features_with_missing_ : int
+        Number of features with missing values.
+
+    See also
+    --------
+    SimpleImputer: Univariate imputation of missing values.
 
     Notes
     -----
     This implementation was inspired by the R MICE package (Multivariate
-    Imputation by Chained Equations), but differs from it in setting single
-    imputation to default instead of multiple imputation. However, multiple
-    imputation is supported with multiple instances of the imputer with
-    different random seeds run in parallel.
+    Imputation by Chained Equations), but differs from it by returning a single
+    imputation instead of multiple imputations. However, multiple imputation is
+    supported with multiple instances of the imputer with different random
+    seeds run in parallel.
 
     To support imputation in inductive mode we store each feature's predictor
     during the ``fit`` phase, and predict without refitting (in order) during
     the ``transform`` phase.
@@ -878,6 +887,7 @@ def fit_transform(self, X, y=None):
         # and a better way would be good.
         # see: https://goo.gl/KyCNwj and subsequent comments
         ordered_idx = self._get_ordered_idx(mask_missing_values)
+        self.n_features_with_missing_ = len(ordered_idx)
 
         abs_corr_mat = self._get_abs_corr_mat(Xt)
 
diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py
index 0baa37e60cf12..c0d2e9ca6884a 100644
--- a/sklearn/tests/test_impute.py
+++ b/sklearn/tests/test_impute.py
@@ -497,8 +497,9 @@ def test_iterative_imputer_imputation_order(imputation_order):
     X = sparse_random_matrix(n, d, density=0.10,
                              random_state=rng).toarray()
     X[:, 0] = 1  # this column should not be discarded by IterativeImputer
 
+    n_iter = 2
     imputer = IterativeImputer(missing_values=0,
-                               n_iter=2,
+                               n_iter=n_iter,
                                n_nearest_features=5,
                                min_value=0,
                                max_value=1,
@@ -507,6 +508,9 @@ def test_iterative_imputer_imputation_order(imputation_order):
                                random_state=rng)
     imputer.fit_transform(X)
     ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_]
+
+    assert len(ordered_idx) // n_iter == imputer.n_features_with_missing_
+
     if imputation_order == 'roman':
         assert np.all(ordered_idx[:d-1] == np.arange(1, d))
     elif imputation_order == 'arabic':
@@ -516,7 +520,7 @@ def test_iterative_imputer_imputation_order(imputation_order):
         ordered_idx_round_2 = ordered_idx[d-1:]
         assert ordered_idx_round_1 != ordered_idx_round_2
     elif 'ending' in imputation_order:
-        assert len(ordered_idx) == 2 * (d - 1)
+        assert len(ordered_idx) == n_iter * (d - 1)

From f6a7f45a0a101cc93f82b282824819cb68ba678f Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Mon, 2 Jul 2018 08:28:17 -0700
Subject: [PATCH 17/26] fixing docs from review

---
 doc/modules/impute.rst |  3 +--
 sklearn/impute.py      | 22 +++++++++++-----------
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index 8d1ba71a6a5f6..f8f3e2731953a 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -131,7 +131,7 @@ In the statistics community, it is common practice to perform multiple imputatio
 generating, for example, `m` separate imputations for a single feature matrix.
 Each of these `m` imputations is then put through the subsequent analysis pipeline
 (e.g. feature engineering, clustering, regression, classification). The `m` final
-analysis results (e.g. held-out validation errors) allow the data scientist to 
+analysis results (e.g. held-out validation errors) allow the data scientist to
 obtain understanding of how analytic results may differ as a consequence
 of the inherent uncertainty caused by the missing values. The above practice
 is called multiple imputation.
@@ -154,7 +154,6 @@ Note that a call to the ``transform`` method of :class:`IterativeImputer` is
 not allowed to change the number of samples. Therefore multiple imputations cannot be
 achieved by a single call to ``transform``.
 
-.. _references:
 
 References
 ==========

diff --git a/sklearn/impute.py b/sklearn/impute.py
index 5e214e7cc88d0..36731106b2c29 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -143,7 +143,7 @@ class SimpleImputer(BaseEstimator, TransformerMixin):
 
     See also
     --------
-    IterativeImputer: Multivariate imputation of missing values.
+    IterativeImputer : Multivariate imputation of missing values.
 
     Notes
     -----
@@ -428,7 +428,7 @@ def transform(self, X):
 
 class IterativeImputer(BaseEstimator, TransformerMixin):
-    """Multivariate imputer that estimates each features from all the others.
+    """Multivariate imputer that estimates each feature from all the others.
 
     A strategy for imputing missing values by modeling each feature with
     missing values as a function of other features in a round-robin fashion.
@@ -456,9 +456,9 @@ class IterativeImputer(BaseEstimator, TransformerMixin):
         A random order for each round.
 
     n_iter : int, optional (default=10)
-        Number of imputation rounds to perform before returning the final
-        imputations computed during the final round. A round is a single
-        imputation of each feature with missing values.
+        Number of imputation rounds to perform before returning the imputations
+        computed during the final round. A round is a single imputation of each
+        feature with missing values.
 
     predictor : estimator object, default=RidgeCV() or BayesianRidge()
         The predictor to use at each step of the round-robin imputation.
@@ -502,11 +502,11 @@ class IterativeImputer(BaseEstimator, TransformerMixin):
         or 2.
 
     random_state : int, RandomState instance or None, optional (default=None)
-        The seed of the pseudo random number generator to use when shuffling
-        the data. If int, random_state is the seed used by the random number
-        generator; If RandomState instance, random_state is the random number
-        generator; If None, the random number generator is the RandomState
-        instance used by ``np.random``.
+        The seed of the pseudo random number generator to use. Randomizes
+        selection of predictor features if n_nearest_features is not None, the
+        ``imputation_order`` if ``random``, and the sampling from posterior if
+        ``sample_posterior`` is True. Use an integer for determinism.
+        See :term:`the Glossary `.
 
     Attributes
     ----------
@@ -525,7 +525,7 @@ class IterativeImputer(BaseEstimator, TransformerMixin):
 
     See also
     --------
-    SimpleImputer: Univariate imputation of missing values.
+    SimpleImputer : Univariate imputation of missing values.
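The determinism promised by the ``random_state`` documentation above can be
checked directly. A small sketch, mirroring the stochasticity test earlier in
this series (the toy matrix is illustrative)::

    import numpy as np
    from sklearn.impute import IterativeImputer

    X = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, np.nan], [4.0, 5.0]])
    # with the defaults (sample_posterior=False, ascending order, all
    # features used) nothing is sampled, so different seeds should agree
    a = IterativeImputer(random_state=0).fit_transform(X)
    b = IterativeImputer(random_state=1).fit_transform(X)
    print(np.allclose(a, b))  # True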
     Notes
     -----

From 8bd5e2c27727ff6b76f39eca2c59dc4a698a2303 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 16 Jul 2018 17:22:33 +0200
Subject: [PATCH 18/26] TST update tests with renaming

---
 sklearn/tests/test_impute.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py
index f8bae3ce60165..3c09499dcbdce 100644
--- a/sklearn/tests/test_impute.py
+++ b/sklearn/tests/test_impute.py
@@ -733,7 +733,7 @@ def test_iterative_imputer_additive_matrix():
 
 @pytest.mark.parametrize("imputer_constructor",
-                         [SimpleImputer, ChainedImputer])
+                         [SimpleImputer, IterativeImputer])
 @pytest.mark.parametrize(
     "imputer_missing_values, missing_value, err_msg",
     [("NaN", np.nan, "Input contains NaN"),

From 65f43a7c9e78a48ec1d60accbd7a79e48378913e Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 16 Jul 2018 22:27:19 +0200
Subject: [PATCH 19/26] iter

---
 doc/modules/impute.rst          | 42 ++++++++++++++++-----------------
 examples/plot_missing_values.py |  2 +-
 sklearn/impute.py               | 21 ++++++++---------
 sklearn/tests/test_impute.py    |  8 +++++++
 4 files changed, 40 insertions(+), 33 deletions(-)

diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index f8f3e2731953a..d069a0ec0c217 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -96,12 +96,12 @@ Multivariate feature imputation
 A more sophisticated approach is to use the :class:`IterativeImputer` class,
 which models each feature with missing values as a function of other features,
 and uses that estimate for imputation. It does so in an iterated round-robin
-fashion: at each step, a feature column is designated as output `y` and the
-other feature columns are treated as inputs `X`. A regressor is fit on `(X, y)`
-for known `y`. Then, the regressor is used to predict the missing values of `y`.
-This is done for each feature in an iterative fashion, and then is repeated for
-`n_iter` imputation rounds. The results of the final imputation round are
-returned.
+fashion: at each step, a feature column is designated as output ``y`` and the
+other feature columns are treated as inputs ``X``. A regressor is fit on ``(X,
+y)`` for known ``y``. Then, the regressor is used to predict the missing values
+of ``y``. This is done for each feature in an iterative fashion, and then is
+repeated for ``n_iter`` imputation rounds. The results of the final imputation
+round are returned.
 
     >>> import numpy as np
     >>> from sklearn.impute import IterativeImputer
@@ -128,23 +128,21 @@ Multiple vs. Single Imputation
 ==============================
 
 In the statistics community, it is common practice to perform multiple imputations,
-generating, for example, `m` separate imputations for a single feature matrix.
-Each of these `m` imputations is then put through the subsequent analysis pipeline
-(e.g. feature engineering, clustering, regression, classification). The `m` final
+generating, for example, ``m`` separate imputations for a single feature matrix.
+Each of these ``m`` imputations is then put through the subsequent analysis pipeline
+(e.g. feature engineering, clustering, regression, classification). The ``m`` final
 analysis results (e.g. held-out validation errors) allow the data scientist to
 obtain understanding of how analytic results may differ as a consequence
 of the inherent uncertainty caused by the missing values. The above practice
 is called multiple imputation.
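A hedged sketch of that recipe — ``m`` completed matrices from ``m`` seeds,
anticipating the ``sample_posterior=True`` usage described just below (``m``
and the toy matrix are illustrative)::

    import numpy as np
    from sklearn.impute import IterativeImputer

    X = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, np.nan], [4.0, 5.0]])
    m = 5
    imputations = [
        IterativeImputer(sample_posterior=True,
                         random_state=seed).fit_transform(X)
        for seed in range(m)
    ]
    # the spread across the m completed matrices reflects the uncertainty
    # due to the missing entries
    print(np.std([X_m[1, 0] for X_m in imputations]))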
 Our implementation of :class:`IterativeImputer` was inspired by the R MICE
-package (Multivariate Imputation by Chained Equations) [1], but differs from
-it in setting single imputation to default instead of multiple imputation.
-However, :class:`IterativeImputer` can also be used for multiple imputations
-by applying it repeatedly to the same dataset with different random seeds when
-``sample_posterior=True``.
-
-See Chapter 4 of "Statistical Analysis with Missing Data" [2] for more discussion
-on multiple vs. single imputations.
+package (Multivariate Imputation by Chained Equations) [1]_, but differs from
+it by returning a single imputation instead of multiple imputations. However,
+:class:`IterativeImputer` can also be used for multiple imputations by applying
+it repeatedly to the same dataset with different random seeds when
+``sample_posterior=True``. See [2]_, chapter 4 for more discussion on multiple
+vs. single imputations.
 
 It is still an open problem as to how useful single vs. multiple imputation is in
 the context of prediction and classification when the user is not interested in
@@ -157,8 +155,10 @@ achieved by a single call to ``transform``.
 
 References
 ==========
-[1] Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: Multivariate Imputation
-    by Chained Equations in R". Journal of Statistical Software 45: 1-67.
+.. [1] Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: Multivariate
+   Imputation by Chained Equations in R". Journal of Statistical Software 45:
+   1-67.
 
-[2] Roderick J A Little and Donald B Rubin (1986). "Statistical Analysis with Missing
-    Data". John Wiley & Sons, Inc., New York, NY, USA.
+.. [2] Roderick J A Little and Donald B Rubin (1986). "Statistical Analysis
+   with Missing Data". John Wiley & Sons, Inc., New York, NY, USA.

diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py
index c9a95f2db66be..ce0584ee40a09 100644
--- a/examples/plot_missing_values.py
+++ b/examples/plot_missing_values.py
@@ -95,7 +95,7 @@ def get_results(dataset):
 x_labels = ['Full data',
             'Zero imputation',
             'Mean Imputation',
-            'Iterative Imputation']
+            'Multivariate Imputation']
 colors = ['r', 'g', 'b', 'orange']
 
 # plot diabetes results

diff --git a/sklearn/impute.py b/sklearn/impute.py
index c80ad1d88e115..d5b09a357f688 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -540,9 +540,9 @@ class IterativeImputer(BaseEstimator, TransformerMixin):
     -----
     This implementation was inspired by the R MICE package (Multivariate
     Imputation by Chained Equations), but differs from it by returning a single
-    imputation instead of multiple imputations. However, multiple imputation is
-    supported with multiple instances of the imputer with different random
-    seeds run in parallel.
+    imputation instead of multiple imputations. However, multiple imputation
+    can be achieved with multiple instances of the imputer with different
+    random seeds run in parallel.
 
     To support imputation in inductive mode we store each feature's predictor
     during the ``fit`` phase, and predict without refitting (in order) during
     the ``transform`` phase.
@@ -866,6 +866,11 @@ def fit_transform(self, X, y=None):
         self.random_state_ = getattr(self, "random_state_",
                                      check_random_state(self.random_state))
 
+        if self.n_iter < 0:
+            raise ValueError(
+                "'n_iter' should be a positive integer. Got {} instead."
+                .format(self.n_iter))
+
         if self.predictor is None:
             if self.sample_posterior:
                 from .linear_model import BayesianRidge
@@ -886,10 +891,7 @@ def fit_transform(self, X, y=None):
         self.initial_imputer_ = None
         X, Xt, mask_missing_values = self._initial_imputation(X)
 
-        # edge case: in case the user specifies 0 for n_iter,
-        # then there is no need to do further imputation and the result should
-        # be just the initial imputation (before clipping)
-        if self.n_iter < 1:
+        if self.n_iter == 0:
             return Xt
 
         # order in which to impute
@@ -952,10 +954,7 @@ def transform(self, X):
 
         X, Xt, mask_missing_values = self._initial_imputation(X)
 
-        # edge case: in case the user specifies 0 for n_iter,
-        # then there is no need to do further imputation and the result should
-        # be just the initial imputation (before clipping)
-        if self.n_iter < 1:
+        if self.n_iter == 0:
             return Xt
 
         imputations_per_round = len(self.imputation_sequence_) // self.n_iter

diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py
index 3c09499dcbdce..9884d991fb1ab 100644
--- a/sklearn/tests/test_impute.py
+++ b/sklearn/tests/test_impute.py
@@ -732,6 +732,14 @@ def test_iterative_imputer_additive_matrix():
     assert_allclose(X_test_filled, X_test_est, atol=0.01)
 
+def test_iterative_imputer_error_param():
+    rng = np.random.RandomState(42)
+    X = rng.randn(100, 2)
+    imputer = IterativeImputer(n_iter=-1)
+    with pytest.raises(ValueError, match='should be a positive integer'):
+        imputer.fit_transform(X)
+
+
 @pytest.mark.parametrize("imputer_constructor",
                          [SimpleImputer, IterativeImputer])
 @pytest.mark.parametrize(

From fac9839bf7726628b0515d72b83e411d14171d8e Mon Sep 17 00:00:00 2001
From: Sergey Feldman
Date: Wed, 22 Aug 2018 11:35:34 -0700
Subject: [PATCH 20/26] Missing rename

---
 examples/plot_missing_values.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py
index 6308535371b90..f66588c7fdb89 100644
--- a/examples/plot_missing_values.py
+++ b/examples/plot_missing_values.py
@@ -8,7 +8,7 @@
 The median is a more robust estimator for data with high magnitude variables
 which could dominate results (otherwise known as a 'long tail').
 
-Another option is the :class:`sklearn.impute.ChainedImputer`. This uses
+Another option is the :class:`sklearn.impute.IterativeImputer`. This uses
 round-robin linear regression, treating every variable as an output in
 turn. The version implemented assumes Gaussian (output) variables. If your
 features are obviously non-Normal, consider transforming them to look more

From 0ba454cd5c2fb52d274b861462ec90dce4c81920 Mon Sep 17 00:00:00 2001
From: Sergey Feldman
Date: Mon, 3 Sep 2018 20:43:32 +0000
Subject: [PATCH 21/26] Undoing changes to 0.20 rst

Impute not part of 0.20!
---
 doc/whats_new/v0.20.rst | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
index b28c5771d664f..2ed336b782174 100644
--- a/doc/whats_new/v0.20.rst
+++ b/doc/whats_new/v0.20.rst
@@ -155,14 +155,6 @@ Support for Python 3.3 has been officially dropped.
   :class:`cluster.AgglomerativeClustering`. :issue:`9875` by
   :user:`Kumar Ashutosh `.
 
-- Added :class:`MissingIndicator` which generates a binary indicator for
-  missing values. :issue:`8075` by :user:`Maniteja Nandana ` and
-  :user:`Guillaume Lemaitre `.
-
-- Added :class:`impute.IterativeImputer`, which is a strategy for imputing missing
-  values by modeling each feature with missing values as a function of
-  other features in a round-robin fashion. :issue:`8478` by
-  :user:`Sergey Feldman `.
 
 :mod:`sklearn.compose`
 ......................

From 3744445000b61ef0fd6d0394bcba64fdbd9f3ea1 Mon Sep 17 00:00:00 2001
From: Sergey Feldman
Date: Mon, 3 Sep 2018 20:45:05 +0000
Subject: [PATCH 22/26] Updating 0.21 changes to IterativeImputer

---
 doc/whats_new/v0.21.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 2c010e5b1be59..2159e39dc126d 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -43,7 +43,7 @@ Support for Python 3.4 and below has been officially dropped.
 :mod:`sklearn.impute`
 .....................
 
-- |MajorFeature| Added :class:`impute.ChainedImputer`, which is a strategy for
+- |MajorFeature| Added :class:`impute.IterativeImputer`, which is a strategy for
   imputing missing values by modeling each feature with missing values as a
   function of other features in a round-robin fashion. :issue:`8478` by
   :user:`Sergey Feldman `.

From 5d6865d5a3a0ab244651f9fd14fd47be000855b9 Mon Sep 17 00:00:00 2001
From: Sergey Feldman
Date: Mon, 3 Sep 2018 20:45:51 +0000
Subject: [PATCH 23/26] Update v0.20 rst to not have any changes

---
 doc/whats_new/v0.20.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
index 2ed336b782174..402b7c178c8dd 100644
--- a/doc/whats_new/v0.20.rst
+++ b/doc/whats_new/v0.20.rst
@@ -155,7 +155,6 @@ Support for Python 3.3 has been officially dropped.
   :class:`cluster.AgglomerativeClustering`. :issue:`9875` by
   :user:`Kumar Ashutosh `.
 
-
 :mod:`sklearn.compose`
 ......................

From 24bb77bdb0197ca0a3f26a5b18839aaf7d4be780 Mon Sep 17 00:00:00 2001
From: Sergey Feldman
Date: Tue, 4 Sep 2018 09:29:02 +0000
Subject: [PATCH 24/26] Fix plot_missing_values.py

---
 examples/plot_missing_values.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py
index b3547dcc7bf2a..43d7ddfc497f3 100644
--- a/examples/plot_missing_values.py
+++ b/examples/plot_missing_values.py
@@ -73,13 +73,13 @@ def get_results(dataset):
                                          scoring='neg_mean_squared_error',
                                          cv=5)
 
-    # Estimate the score after chained imputation of the missing values
+    # Estimate the score after iterative imputation of the missing values
     estimator = make_pipeline(
         make_union(IterativeImputer(missing_values=0, random_state=0),
                    MissingIndicator(missing_values=0)),
         RandomForestRegressor(random_state=0, n_estimators=100))
-    chained_impute_scores = cross_val_score(estimator, X_missing, y_missing,
-                                            scoring='neg_mean_squared_error')
+    iterative_impute_scores = cross_val_score(estimator, X_missing, y_missing,
+                                              scoring='neg_mean_squared_error')
 
     return ((full_scores.mean(), full_scores.std()),
             (zero_impute_scores.mean(), zero_impute_scores.std()),

From 12a3456fd9f832dc9d4a2b4bdd4092a463ee3fea Mon Sep 17 00:00:00 2001
From: Sergey Feldman
Date: Fri, 14 Sep 2018 10:31:47 +0300
Subject: [PATCH 25/26] Addressing review comments

---
 sklearn/impute.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/sklearn/impute.py b/sklearn/impute.py
index f22a89eeee67f..3ffca98db06a2 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -462,7 +462,8 @@ class IterativeImputer(BaseEstimator, TransformerMixin):
         If ``sample_posterior`` is True, the predictor must support
         ``return_std`` in its ``predict`` method. Also, if
         ``sample_posterior=True`` the default predictor will be
-        ``BayesianRidge()`` and ``RidgeCV`` otherwise.
+        ``:class:sklearn.linear_model:BayesianRidge()`` and
+        ``:class:sklearn.linear_model.RidgeCV()`` otherwise.
 
     sample_posterior : boolean, default=False
         Whether to sample from the (Gaussian) predictive posterior of the
@@ -527,12 +528,6 @@ class IterativeImputer(BaseEstimator, TransformerMixin):
 
     Notes
     -----
-    This implementation was inspired by the R MICE package (Multivariate
-    Imputation by Chained Equations), but differs from it by returning a single
-    imputation instead of multiple imputations. However, multiple imputation
-    can be achieved with multiple instances of the imputer with different
-    random seeds run in parallel.
-
     To support imputation in inductive mode we store each feature's predictor
     during the ``fit`` phase, and predict without refitting (in order) during
     the ``transform`` phase.
@@ -539,8 +534,9 @@ class IterativeImputer(BaseEstimator, TransformerMixin):
     Features which contain all missing values at ``fit`` are discarded upon
     ``transform``.
 
-    Features with missing values in transform which did not have any missing
-    values in fit will be imputed with the initial imputation method only.
+    Features with missing values during ``transform`` which did not have any
+    missing values during ``fit`` will be imputed with the initial imputation
+    method only.
 
     References
     ----------

From 222b269d35618061f4001ffc22715b928dc0ec2c Mon Sep 17 00:00:00 2001
From: Sergey Feldman
Date: Sat, 15 Sep 2018 19:57:22 +0300
Subject: [PATCH 26/26] Fixed :class: syntax

---
 sklearn/impute.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/impute.py b/sklearn/impute.py
index 3ffca98db06a2..3035040c1179a 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -462,8 +462,8 @@ class IterativeImputer(BaseEstimator, TransformerMixin):
         If ``sample_posterior`` is True, the predictor must support
         ``return_std`` in its ``predict`` method. Also, if
         ``sample_posterior=True`` the default predictor will be
-        ``:class:sklearn.linear_model:BayesianRidge()`` and
-        ``:class:sklearn.linear_model.RidgeCV()`` otherwise.
+        :class:`sklearn.linear_model.BayesianRidge` and
+        :class:`sklearn.linear_model.RidgeCV` otherwise.
 
     sample_posterior : boolean, default=False
         Whether to sample from the (Gaussian) predictive posterior of the
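With the series complete, :class:`IterativeImputer` is inductive: predictors
fitted during ``fit`` are replayed, without refitting, by ``transform``, as the
Notes above describe. A closing sketch of that workflow (shapes and missingness
rates are arbitrary choices of this note)::

    import numpy as np
    from sklearn.impute import IterativeImputer

    rng = np.random.RandomState(0)
    X_train = rng.randn(200, 5)
    X_train[rng.rand(200, 5) < 0.2] = np.nan
    X_test = rng.randn(50, 5)
    X_test[rng.rand(50, 5) < 0.2] = np.nan

    imp = IterativeImputer(n_iter=10, random_state=0).fit(X_train)
    X_test_filled = imp.transform(X_test)  # reuses the stored predictors
    print(np.isnan(X_test_filled).any())   # False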