From cdc71830d7f9aa2126420db0b2ced09b66e48d29 Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Mon, 17 Sep 2018 20:43:28 +0300
Subject: [PATCH 01/33] first commit

---
 doc/whats_new/v0.21.rst                       |  9 +++
 examples/impute/README.txt                    |  6 ++
 .../plot_iterative_imputer_as_missforest.py   | 60 +++++++++++++++++++
 examples/{ => impute}/plot_missing_values.py  |  0
 4 files changed, 75 insertions(+)
 create mode 100644 examples/impute/README.txt
 create mode 100644 examples/impute/plot_iterative_imputer_as_missforest.py
 rename examples/{ => impute}/plot_missing_values.py (100%)

diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 2159e39dc126d..3c5d4a0e80848 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -40,6 +40,14 @@ Support for Python 3.4 and below has been officially dropped.
 - An entry goes here
 - An entry goes here
 
+:mod:`sklearn.cluster`
+......................
+
+- |MajorFeature| A new clustering algorithm: :class:`cluster.OPTICS`: an
+  algorithm related to :class:`cluster.DBSCAN`, that has hyperparameters easier
+  to set and that scales better, by :user:`Shane ` and
+  :user:`Adrin Jalali `.
+
 :mod:`sklearn.impute`
 .....................
 
@@ -48,6 +56,7 @@ Support for Python 3.4 and below has been officially dropped.
   function of other features in a round-robin fashion. :issue:`8478` by
   :user:`Sergey Feldman `.
 
+
 Multiple modules
 ................
 
diff --git a/examples/impute/README.txt b/examples/impute/README.txt
new file mode 100644
index 0000000000000..87b3631bf329b
--- /dev/null
+++ b/examples/impute/README.txt
@@ -0,0 +1,6 @@
+.. _impute_examples:
+
+Missing Value Imputation
+------------------------
+
+Examples concerning the :mod:`sklearn.impute` module.
\ No newline at end of file
diff --git a/examples/impute/plot_iterative_imputer_as_missforest.py b/examples/impute/plot_iterative_imputer_as_missforest.py
new file mode 100644
index 0000000000000..3bce2cc2ed919
--- /dev/null
+++ b/examples/impute/plot_iterative_imputer_as_missforest.py
@@ -0,0 +1,60 @@
+"""
+=============================================================
+Replicating Functionality of missForest with IterativeImputer
+=============================================================
+
+There are many well-established imputation packages in the R data science
+ecosystem: Amelia, mi, mice, missForest, and others.
+
+missForest is popular, and turns out to be a particular instance of a class of
+sequential imputation algorithms that can all be implemented with the
+:class:`sklearn.impute.IterativeImputer` class, which is a strategy for
+imputing missing values by modeling each feature with missing values as a
+function of other features in a round-robin fashion. In the case of missForest,
+the function is a Random Forest.
+
+In this example we will demonstrate how to use
+:class:`sklearn.impute.IterativeImputer` to replicate the functionality of
+missForest.
+""" + +import numpy as np + +from sklearn.datasets import load_diabetes +from sklearn.ensemble import RandomForestRegressor +from sklearn.impute import IterativeImputer + +rng = np.random.RandomState(0) + +# Load data +dataset = load_diabetes() +X_full, y_full = dataset.data, dataset.target +n_samples = X_full.shape[0] +n_features = X_full.shape[1] + +# Add missing values in 75% of the lines +missing_rate = 0.75 +n_missing_samples = int(np.floor(n_samples * missing_rate)) +missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples, + dtype=np.bool), + np.ones(n_missing_samples, + dtype=np.bool))) +rng.shuffle(missing_samples) +missing_features = rng.randint(0, n_features, n_missing_samples) + +X_missing = X_full.copy() +X_missing[np.where(missing_samples)[0], missing_features] = np.nan +y_missing = y_full.copy() + +# Random Forest predictor with default values according to missForest docs +predictor = RandomForestRegressor(n_estimators=100, max_features='sqrt') +imputer = IterativeImputer(n_iter=10, predictor=predictor) + +# Impute missing values with IterativeImputer as missForest +X_imputed = imputer.fit_transform(X_missing) + +# Compute RMSE of the imputed values +imp_missing_vals = X_imputed[np.where(missing_samples)[0], missing_features] +true_missing_vals = X_full[np.where(missing_samples)[0], missing_features] +rmse = np.sqrt(np.mean((true_missing_vals - imp_missing_vals)**2)) +print('RMSE of IterativeImputer as missForest on the Diabetes Data:', rmse) \ No newline at end of file diff --git a/examples/plot_missing_values.py b/examples/impute/plot_missing_values.py similarity index 100% rename from examples/plot_missing_values.py rename to examples/impute/plot_missing_values.py From 493a27a94ee4045c29d5737b41629637d18b874b Mon Sep 17 00:00:00 2001 From: sergeyf Date: Mon, 17 Sep 2018 20:48:03 +0300 Subject: [PATCH 02/33] undoing space --- doc/whats_new/v0.21.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 3c5d4a0e80848..54549e629331c 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -56,7 +56,6 @@ Support for Python 3.4 and below has been officially dropped. function of other features in a round-robin fashion. :issue:`8478` by :user:`Sergey Feldman `. - Multiple modules ................ 
From 75ad2da409d6afe8b6067e1fb8892aa93c410434 Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Mon, 17 Sep 2018 21:01:19 +0300
Subject: [PATCH 03/33] newline

---
 examples/impute/plot_iterative_imputer_as_missforest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/impute/plot_iterative_imputer_as_missforest.py b/examples/impute/plot_iterative_imputer_as_missforest.py
index 3bce2cc2ed919..54ba44bdd58fc 100644
--- a/examples/impute/plot_iterative_imputer_as_missforest.py
+++ b/examples/impute/plot_iterative_imputer_as_missforest.py
@@ -57,4 +57,4 @@
 imp_missing_vals = X_imputed[np.where(missing_samples)[0], missing_features]
 true_missing_vals = X_full[np.where(missing_samples)[0], missing_features]
 rmse = np.sqrt(np.mean((true_missing_vals - imp_missing_vals)**2))
-print('RMSE of IterativeImputer as missForest on the Diabetes Data:', rmse)
\ No newline at end of file
+print('RMSE of IterativeImputer as missForest on the Diabetes Data:', rmse)

From 1465f774f08cad3dd8002563a939a7500d54d183 Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Mon, 17 Sep 2018 21:15:30 +0300
Subject: [PATCH 04/33] fixing bug in plot_missing_values and adding a bit
 more performance

---
 examples/impute/plot_missing_values.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py
index 43d7ddfc497f3..4b0e0debceb8f 100644
--- a/examples/impute/plot_missing_values.py
+++ b/examples/impute/plot_missing_values.py
@@ -40,7 +40,8 @@ def get_results(dataset):
     # Estimate the score on the entire dataset, with no missing values
     estimator = RandomForestRegressor(random_state=0, n_estimators=100)
     full_scores = cross_val_score(estimator, X_full, y_full,
-                                  scoring='neg_mean_squared_error', cv=5)
+                                  scoring='neg_mean_squared_error',
+                                  cv=5)
 
     # Add missing values in 75% of the lines
     missing_rate = 0.75
@@ -75,11 +76,14 @@ def get_results(dataset):
 
     # Estimate the score after iterative imputation of the missing values
     estimator = make_pipeline(
-        make_union(IterativeImputer(missing_values=0, random_state=0),
+        make_union(IterativeImputer(missing_values=0,
+                                    random_state=0,
+                                    n_nearest_features=5),
                    MissingIndicator(missing_values=0)),
         RandomForestRegressor(random_state=0, n_estimators=100))
     iterative_impute_scores = cross_val_score(estimator, X_missing, y_missing,
-                                              scoring='neg_mean_squared_error')
+                                              scoring='neg_mean_squared_error',
+                                              cv=5)
 
     return ((full_scores.mean(), full_scores.std()),
             (zero_impute_scores.mean(), zero_impute_scores.std()),

From ea84910211d632817f86e842ab5ad058da05211e Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Mon, 17 Sep 2018 21:22:49 +0300
Subject: [PATCH 05/33] slight clarification

---
 examples/impute/plot_missing_values.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py
index 4b0e0debceb8f..b835b054a31e6 100644
--- a/examples/impute/plot_missing_values.py
+++ b/examples/impute/plot_missing_values.py
@@ -12,7 +12,7 @@
 round-robin linear regression, treating every variable as an output in turn.
 The version implemented assumes Gaussian (output) variables. If your features
 are obviously non-Normal, consider transforming them to look more
-Normal so as to improve performance.
+Normal so as to potentially improve performance.
 
 In addition to using an imputing method, we can also keep an indication of the
 missing information using :func:`sklearn.impute.MissingIndicator` which might
 carry some information.

From 753f60d0d6f75e72f71fc048c34f8ef327cac9a5 Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Tue, 18 Sep 2018 14:27:30 +0300
Subject: [PATCH 06/33] another example

---
 .../plot_iterative_imputer_as_missforest.py    |   1 +
 ...t_iterative_imputer_variants_comparison.py  | 130 ++++++++++++++++++
 examples/impute/plot_missing_values.py         |   1 +
 3 files changed, 132 insertions(+)
 create mode 100644 examples/impute/plot_iterative_imputer_variants_comparison.py

diff --git a/examples/impute/plot_iterative_imputer_as_missforest.py b/examples/impute/plot_iterative_imputer_as_missforest.py
index 54ba44bdd58fc..ac1762d42f68d 100644
--- a/examples/impute/plot_iterative_imputer_as_missforest.py
+++ b/examples/impute/plot_iterative_imputer_as_missforest.py
@@ -17,6 +17,7 @@
 :class:`sklearn.impute.IterativeImputer` to replicate the functionality of
 missForest.
 """
+print(__doc__)
 
 import numpy as np
 
diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py
new file mode 100644
index 0000000000000..1aecdef75b5fd
--- /dev/null
+++ b/examples/impute/plot_iterative_imputer_variants_comparison.py
@@ -0,0 +1,130 @@
+"""
+=========================================================
+Imputing missing values with variants of IterativeImputer
+=========================================================
+
+The :class:`sklearn.impute.IterativeImputer` class is very flexible - it can be
+used with a variety of predictors to do round-robin regression, treating every
+variable as an output in turn.
+
+In this example we compare some predictors for the purpose of missing feature
+imputation with `IterativeImputer`::
+
+    RidgeCV: default
+    HuberRegressor: robust linear regression to reduce the impact of outliers
+    DecisionTreeRegressor: non-linear regression
+    RandomForestRegressor: equivalent to missForest in R
+    KNeighborsRegressor: comparable to other KNN imputation approaches
+
+The goal is to compare different predictors to see which one is best for
+the `IterativeImputer` when using a ``RandomForestRegressor`` estimator on the
+Boston dataset.
+
+For the Boston dataset, the ``HuberRegressor`` produces results that are on
+average superior to even having the full dataset. We also see that using other
+predictors results in an imputer that is worse than using ``SimpleImputer``
+with the ``mean`` strategy.
+"""
+print(__doc__)
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from sklearn.datasets import load_boston
+from sklearn.impute import SimpleImputer, IterativeImputer
+from sklearn.linear_model import RidgeCV, HuberRegressor
+from sklearn.tree import DecisionTreeRegressor
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.neighbors import KNeighborsRegressor
+from sklearn.pipeline import make_pipeline
+from sklearn.model_selection import cross_val_score
+
+rng = np.random.RandomState(0)
+
+X_full, y_full = load_boston(return_X_y=True)
+n_samples = X_full.shape[0]
+n_features = X_full.shape[1]
+
+# Estimate the score on the entire dataset, with no missing values
+rf_estimator = RandomForestRegressor(random_state=0, n_estimators=100)
+full_scores = cross_val_score(rf_estimator, X_full, y_full,
+                              scoring='neg_mean_squared_error',
+                              cv=5)
+mses_boston = [-full_scores.mean()]
+stds_boston = [full_scores.std()]
+
+# Add missing values in 75% of the lines
+missing_rate = 0.75
+n_missing_samples = int(np.floor(n_samples * missing_rate))
+missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
+                                      dtype=np.bool),
+                             np.ones(n_missing_samples,
+                                     dtype=np.bool)))
+rng.shuffle(missing_samples)
+missing_features = rng.randint(0, n_features, n_missing_samples)
+X_missing = X_full.copy()
+X_missing[np.where(missing_samples)[0], missing_features] = np.nan
+y_missing = y_full.copy()
+
+# Estimate the score after imputation (mean strategy) of the missing values
+for strategy in ['mean', 'median']:
+    estimator = make_pipeline(
+        SimpleImputer(missing_values=np.nan, strategy=strategy),
+        rf_estimator
+    )
+    mean_impute_scores = cross_val_score(estimator, X_missing, y_missing,
+                                         scoring='neg_mean_squared_error',
+                                         cv=5)
+    mses_boston.append(-mean_impute_scores.mean())
+    stds_boston.append(mean_impute_scores.std())
+
+# Estimate the score after iterative imputation of the missing values
+# with different predictors
+predictors = [
+    RidgeCV(alphas=(1e-7, 0.01, 0.1, 1.0, 10.0)),
+    HuberRegressor(),
+    DecisionTreeRegressor(random_state=0, max_features='sqrt'),
+    RandomForestRegressor(random_state=0,
+                          n_estimators=100,
+                          max_features='sqrt'),
+    KNeighborsRegressor(n_neighbors=15)
+]
+
+for predictor in predictors:
+    estimator = make_pipeline(
+        IterativeImputer(random_state=0, predictor=predictor),
+        rf_estimator
+    )
+    pred_scores = cross_val_score(estimator, X_missing, y_missing,
+                                  scoring='neg_mean_squared_error',
+                                  cv=5)
+    mses_boston.append(-pred_scores.mean())
+    stds_boston.append(pred_scores.std())
+
+
+n_bars = len(mses_boston)
+xval = np.arange(n_bars)
+
+x_labels = ['Full Data',
+            'SimpleImputer w/ Mean Strategy',
+            'SimpleImputer w/ Median Strategy',
+            'IterativeImputer w/ RidgeCV',
+            'IterativeImputer w/ HuberRegressor',
+            'IterativeImputer w/ DecisionTreeRegressor',
+            'IterativeImputer w/ RandomForestRegressor',
+            'IterativeImputer w/ KNeighborsRegressor']
+
+# plot boston results
+fig, ax = plt.subplots(figsize=(10, 6))
+for i, j in enumerate(xval):
+    color = 'C' + str(i + 1)
+    ax.barh(j, mses_boston[j], xerr=stds_boston[j],
+            color=color, alpha=0.6, align='center')
+
+ax.set_title('Boston Data Regression MSE With Different Imputation Methods')
+ax.set_xlim(left=0, right=45)
+ax.set_xlabel('MSE')
+ax.set_yticks(xval)
+ax.invert_yaxis()
+ax.set_yticklabels(x_labels)
+plt.show()
diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py
index b835b054a31e6..00e6b7ad3ba1d 100644
--- a/examples/impute/plot_missing_values.py
+++ b/examples/impute/plot_missing_values.py
@@ -18,6 +18,7 @@ missing information using :func:`sklearn.impute.MissingIndicator` which might
 carry some information.
 """
+print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt

From 2ec04010bc352eaf37a1da87a7bdaf500e888151 Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Tue, 18 Sep 2018 14:52:29 +0300
Subject: [PATCH 07/33] fixing tests

---
 .../plot_iterative_imputer_variants_comparison.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py
index 1aecdef75b5fd..f0571851b8314 100644
--- a/examples/impute/plot_iterative_imputer_variants_comparison.py
+++ b/examples/impute/plot_iterative_imputer_variants_comparison.py
@@ -101,10 +101,7 @@
     mses_boston.append(-pred_scores.mean())
     stds_boston.append(pred_scores.std())
 
-
-n_bars = len(mses_boston)
-xval = np.arange(n_bars)
-
+# Plot the results
 x_labels = ['Full Data',
             'SimpleImputer w/ Mean Strategy',
             'SimpleImputer w/ Median Strategy',
@@ -116,15 +113,12 @@
 
 # plot boston results
 fig, ax = plt.subplots(figsize=(10, 6))
-for i, j in enumerate(xval):
-    color = 'C' + str(i + 1)
-    ax.barh(j, mses_boston[j], xerr=stds_boston[j],
-            color=color, alpha=0.6, align='center')
+for i, j in enumerate(np.arange(len(mses_boston))):
+    ax.barh(j, mses_boston[j], xerr=stds_boston[j], alpha=0.6, align='center')
 
 ax.set_title('Boston Data Regression MSE With Different Imputation Methods')
-ax.set_xlim(left=0, right=45)
 ax.set_xlabel('MSE')
-ax.set_yticks(xval)
+ax.set_yticks(np.arange(len(mses_boston)))
 ax.invert_yaxis()
 ax.set_yticklabels(x_labels)
 plt.show()

From 39fb7f758802b9d18d6fac5f134f17c2e4eacadd Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Wed, 26 Sep 2018 08:41:43 -0700
Subject: [PATCH 08/33] modularizing plot_missing_values

---
 examples/impute/plot_missing_values.py | 57 +++++++++++++-------------
 1 file changed, 29 insertions(+), 28 deletions(-)

diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py
index 00e6b7ad3ba1d..06d1fbc13559d 100644
--- a/examples/impute/plot_missing_values.py
+++ b/examples/impute/plot_missing_values.py
@@ -32,6 +32,19 @@
 
 rng = np.random.RandomState(0)
 
+CV_SPLIT_NUM = 5
+REGRESSOR = RandomForestRegressor(random_state=0, n_estimators=100)
+
+
+def get_scores_for_imputer(imputer, X_missing, y_missing):
+    estimator = make_pipeline(
+        make_union(imputer, MissingIndicator(missing_values=0)),
+        REGRESSOR)
+    impute_scores = cross_val_score(estimator, X_missing, y_missing,
+                                    scoring='neg_mean_squared_error',
+                                    cv=CV_SPLIT_NUM)
+    return impute_scores
+
 
 def get_results(dataset):
     X_full, y_full = dataset.data, dataset.target
     n_samples = X_full.shape[0]
     n_features = X_full.shape[1]
 
     # Estimate the score on the entire dataset, with no missing values
-    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
-    full_scores = cross_val_score(estimator, X_full, y_full,
+    full_scores = cross_val_score(REGRESSOR, X_full, y_full,
                                   scoring='neg_mean_squared_error',
-                                  cv=5)
+                                  cv=CV_SPLIT_NUM)
 
     # Add missing values in 75% of the lines
     missing_rate = 0.75
     n_missing_samples = int(np.floor(n_samples * missing_rate))
     missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                           dtype=np.bool),
                                  np.ones(n_missing_samples,
                                          dtype=np.bool)))
     rng.shuffle(missing_samples)
     missing_features = rng.randint(0, n_features, n_missing_samples)
-
-    # Estimate the score after replacing missing values by 0
     X_missing = X_full.copy()
     X_missing[np.where(missing_samples)[0], missing_features] = 0
     y_missing = y_full.copy()
-    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
-    zero_impute_scores = cross_val_score(estimator, X_missing, y_missing,
-                                         scoring='neg_mean_squared_error',
-                                         cv=5)
+
+    # Estimate the score after replacing missing values by 0
+    imputer = SimpleImputer(missing_values=0,
+                            strategy='constant',
+                            fill_value=0)
+    zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
 
     # Estimate the score after imputation (mean strategy) of the missing values
-    X_missing = X_full.copy()
-    X_missing[np.where(missing_samples)[0], missing_features] = 0
-    y_missing = y_full.copy()
-    estimator = make_pipeline(
-        make_union(SimpleImputer(missing_values=0, strategy="mean"),
-                   MissingIndicator(missing_values=0)),
-        RandomForestRegressor(random_state=0, n_estimators=100))
-    mean_impute_scores = cross_val_score(estimator, X_missing, y_missing,
-                                         scoring='neg_mean_squared_error',
-                                         cv=5)
+    imputer = SimpleImputer(missing_values=0, strategy="mean")
+    mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
 
     # Estimate the score after iterative imputation of the missing values
-    estimator = make_pipeline(
-        make_union(IterativeImputer(missing_values=0,
-                                    random_state=0,
-                                    n_nearest_features=5),
-                   MissingIndicator(missing_values=0)),
-        RandomForestRegressor(random_state=0, n_estimators=100))
-    iterative_impute_scores = cross_val_score(estimator, X_missing, y_missing,
-                                              scoring='neg_mean_squared_error',
-                                              cv=5)
+    imputer = IterativeImputer(missing_values=0,
+                               random_state=0,
+                               n_nearest_features=5)
+    iterative_impute_scores = get_scores_for_imputer(imputer,
+                                                     X_missing,
+                                                     y_missing)
 
     return ((full_scores.mean(), full_scores.std()),
             (zero_impute_scores.mean(), zero_impute_scores.std()),

From de2c307b97c728afcb91209b3367d065fe475215 Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Wed, 26 Sep 2018 11:10:46 -0700
Subject: [PATCH 09/33] fixing cut off plot

---
 .../plot_iterative_imputer_as_missforest.py    | 61 -------------------
 ...t_iterative_imputer_variants_comparison.py  |  7 ++-
 2 files changed, 6 insertions(+), 62 deletions(-)
 delete mode 100644 examples/impute/plot_iterative_imputer_as_missforest.py

diff --git a/examples/impute/plot_iterative_imputer_as_missforest.py b/examples/impute/plot_iterative_imputer_as_missforest.py
deleted file mode 100644
index ac1762d42f68d..0000000000000
--- a/examples/impute/plot_iterative_imputer_as_missforest.py
+++ /dev/null
@@ -1,61 +0,0 @@
-""" -print(__doc__) - -import numpy as np - -from sklearn.datasets import load_diabetes -from sklearn.ensemble import RandomForestRegressor -from sklearn.impute import IterativeImputer - -rng = np.random.RandomState(0) - -# Load data -dataset = load_diabetes() -X_full, y_full = dataset.data, dataset.target -n_samples = X_full.shape[0] -n_features = X_full.shape[1] - -# Add missing values in 75% of the lines -missing_rate = 0.75 -n_missing_samples = int(np.floor(n_samples * missing_rate)) -missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples, - dtype=np.bool), - np.ones(n_missing_samples, - dtype=np.bool))) -rng.shuffle(missing_samples) -missing_features = rng.randint(0, n_features, n_missing_samples) - -X_missing = X_full.copy() -X_missing[np.where(missing_samples)[0], missing_features] = np.nan -y_missing = y_full.copy() - -# Random Forest predictor with default values according to missForest docs -predictor = RandomForestRegressor(n_estimators=100, max_features='sqrt') -imputer = IterativeImputer(n_iter=10, predictor=predictor) - -# Impute missing values with IterativeImputer as missForest -X_imputed = imputer.fit_transform(X_missing) - -# Compute RMSE of the imputed values -imp_missing_vals = X_imputed[np.where(missing_samples)[0], missing_features] -true_missing_vals = X_full[np.where(missing_samples)[0], missing_features] -rmse = np.sqrt(np.mean((true_missing_vals - imp_missing_vals)**2)) -print('RMSE of IterativeImputer as missForest on the Diabetes Data:', rmse) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index f0571851b8314..c9a686256a1f3 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -16,6 +16,9 @@ RandomForestRegressor: equivalent to missForest in R KNeighborsRegressor: comparable to other KNN imputation approaches +Of particular interest is the ability of ``IterativeImputer`` to mimic the +behavior of missForest, a popular imputation package for R. + The goal is to compare different predictors to see which one is best for the `IterativeImputer` when using a ``RandomForestRegressor`` estimator on the Boston dataset. 
@@ -84,6 +87,7 @@
     RidgeCV(alphas=(1e-7, 0.01, 0.1, 1.0, 10.0)),
     HuberRegressor(),
     DecisionTreeRegressor(random_state=0, max_features='sqrt'),
+    # Random Forest predictor with default values set as in missForest docs
     RandomForestRegressor(random_state=0,
                           n_estimators=100,
                           max_features='sqrt'),
     KNeighborsRegressor(n_neighbors=15)
 ]
@@ -112,7 +116,7 @@
             'IterativeImputer w/ KNeighborsRegressor']
 
 # plot boston results
-fig, ax = plt.subplots(figsize=(10, 6))
+fig, ax = plt.subplots(figsize=(14, 6))
 for i, j in enumerate(np.arange(len(mses_boston))):
     ax.barh(j, mses_boston[j], xerr=stds_boston[j], alpha=0.6, align='center')
 
@@ -121,4 +125,5 @@
 ax.set_yticks(np.arange(len(mses_boston)))
 ax.invert_yaxis()
 ax.set_yticklabels(x_labels)
+plt.tight_layout(pad=1)
 plt.show()

From c135653ea52fbf77b7a2d5aeb34d5e326212242b Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Wed, 26 Sep 2018 11:23:27 -0700
Subject: [PATCH 10/33] updating narrative docs

---
 doc/modules/impute.rst | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index 8bb3ad8bf940b..82cbeb10a6177 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -121,6 +121,18 @@ Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipel
 as a way to build a composite estimator that supports imputation.
 See :ref:`sphx_glr_auto_examples_plot_missing_values.py`.
 
+Flexibility of IterativeImputer
+===============================
+
+There are many well-established imputation packages in the R data science
+ecosystem: Amelia, mi, mice, missForest, etc. missForest is popular, and turns
+out to be a particular instance of a family of sequential imputation algorithms
+that can all be implemented with :class:`IterativeImputer` by passing in different
+regressors to be used for predicting missing feature values. In the case of missForest,
+this regressor is a Random Forest.
+See :ref:`sphx_glr_auto_examples_plot_iterative_imputer_variants_comparison.py`.
+
+
 .. _multiple_imputation:
 
 Multiple vs. Single Imputation
 ==============================

From a8768c44c1358531c528a968bbc906c0cbecbf6f Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Wed, 26 Sep 2018 16:19:51 -0700
Subject: [PATCH 11/33] default for verbose should be 0

---
 sklearn/impute.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/impute.py b/sklearn/impute.py
index 3035040c1179a..81e712e96c331 100644
--- a/sklearn/impute.py
+++ b/sklearn/impute.py
@@ -561,7 +561,7 @@ def __init__(self,
                  initial_strategy="mean",
                  min_value=None,
                  max_value=None,
-                 verbose=False,
+                 verbose=0,
                  random_state=None):
 
         self.missing_values = missing_values

From 4368c31970ab182aaf32e5e0b4f0b35254cd3c47 Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Wed, 26 Sep 2018 22:53:28 -0700
Subject: [PATCH 12/33] fixing doc error

---
 doc/modules/impute.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index 82cbeb10a6177..0c0cc805d061a 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -110,7 +110,7 @@ round are returned.
     IterativeImputer(imputation_order='ascending', initial_strategy='mean',
                      max_value=None, min_value=None, missing_values=nan,
                      n_iter=10, n_nearest_features=None, predictor=None,
-                     random_state=0, sample_posterior=False, verbose=False)
+                     random_state=0, sample_posterior=False, verbose=0)
     >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]
     >>> print(np.round(imp.transform(X_test)))
     [[ 1.  2.]
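
--- 8< --- [editor's aside, not part of the original patch series] -----------

The narrative-docs change above says that different regressors can be passed
in to reproduce missForest-style behavior. Here is a hedged usage sketch of
that idea, written against the API as it is named in this patch series (the
``predictor`` keyword here follows these patches and may be named differently
in released versions of scikit-learn)::

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.impute import IterativeImputer

    X = np.array([[1.0, 2.0],
                  [3.0, 6.0],
                  [4.0, 8.0],
                  [np.nan, 3.0],
                  [7.0, np.nan]])

    # missForest-style imputation: model each feature with a random forest
    imputer = IterativeImputer(predictor=RandomForestRegressor(n_estimators=100),
                               random_state=0)
    print(imputer.fit_transform(X))

--- >8 ------------------------------------------------------------------------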
From c5595b84ae9779ece41b7472073097aaa5e8c7ff Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Mon, 15 Oct 2018 18:11:31 -0700
Subject: [PATCH 13/33] addressing some comments

---
 ...t_iterative_imputer_variants_comparison.py | 68 +++++++++----------
 examples/impute/plot_missing_values.py        |  6 +++---
 2 files changed, 36 insertions(+), 38 deletions(-)

diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py
index c9a686256a1f3..39afa60b12232 100644
--- a/examples/impute/plot_iterative_imputer_variants_comparison.py
+++ b/examples/impute/plot_iterative_imputer_variants_comparison.py
@@ -23,10 +23,8 @@
 the `IterativeImputer` when using a ``RandomForestRegressor`` estimator on the
 Boston dataset.
 
-For the Boston dataset, the ``HuberRegressor`` produces results that are on
-average superior to even having the full dataset. We also see that using other
-predictors results in an imputer that is worse than using ``SimpleImputer``
-with the ``mean`` strategy.
+For the Boston dataset and this particular pattern of missing values we see
+that ``RandomForestRegressor`` produces the best results.
 """
 print(__doc__)
 
@@ -42,6 +40,8 @@
 from sklearn.pipeline import make_pipeline
 from sklearn.model_selection import cross_val_score
 
+N_SPLITS = 5
+
 rng = np.random.RandomState(0)
 
 X_full, y_full = load_boston(return_X_y=True)
 n_samples = X_full.shape[0]
 n_features = X_full.shape[1]
 
 # Estimate the score on the entire dataset, with no missing values
+mses = np.zeros((8, N_SPLITS))
 rf_estimator = RandomForestRegressor(random_state=0, n_estimators=100)
-full_scores = cross_val_score(rf_estimator, X_full, y_full,
-                              scoring='neg_mean_squared_error',
-                              cv=5)
-mses_boston = [-full_scores.mean()]
-stds_boston = [full_scores.std()]
+mses[0, :] = cross_val_score(rf_estimator, X_full, y_full,
+                             scoring='neg_mean_squared_error',
+                             cv=N_SPLITS)
 
-# Add missing values in 75% of the lines
-missing_rate = 0.75
-n_missing_samples = int(np.floor(n_samples * missing_rate))
-missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
-                                      dtype=np.bool),
-                             np.ones(n_missing_samples,
-                                     dtype=np.bool)))
-rng.shuffle(missing_samples)
-missing_features = rng.randint(0, n_features, n_missing_samples)
+
+# Add a single missing value in 75% of the lines
 X_missing = X_full.copy()
-X_missing[np.where(missing_samples)[0], missing_features] = np.nan
 y_missing = y_full.copy()
+missing_rate = 0.75
+n_missing_samples = int(np.floor(n_samples * missing_rate))
+missing_samples = rng.choice(n_samples, n_missing_samples, replace=False)
+missing_features = rng.choice(n_features, n_missing_samples, replace=True)
+X_missing[missing_samples, missing_features] = np.nan
 
 # Estimate the score after imputation (mean strategy) of the missing values
-for strategy in ['mean', 'median']:
+for i, strategy in enumerate(['mean', 'median']):
     estimator = make_pipeline(
         SimpleImputer(missing_values=np.nan, strategy=strategy),
         rf_estimator
     )
-    mean_impute_scores = cross_val_score(estimator, X_missing, y_missing,
-                                         scoring='neg_mean_squared_error',
-                                         cv=5)
-    mses_boston.append(-mean_impute_scores.mean())
-    stds_boston.append(mean_impute_scores.std())
+    mses[i + 1, :] = cross_val_score(estimator, X_missing, y_missing,
+                                     scoring='neg_mean_squared_error',
+                                     cv=N_SPLITS)
 
 # Estimate the score after iterative imputation of the missing values
 # with different predictors
 predictors = [
     RidgeCV(alphas=(1e-7, 0.01, 0.1, 1.0, 10.0)),
     HuberRegressor(),
     DecisionTreeRegressor(random_state=0, max_features='sqrt'),
     # Random Forest predictor with default values set as in missForest docs
     RandomForestRegressor(random_state=0,
                           n_estimators=100,
                           max_features='sqrt'),
     KNeighborsRegressor(n_neighbors=15)
 ]
 
-for predictor in predictors:
+for i, predictor in enumerate(predictors):
     estimator = make_pipeline(
         IterativeImputer(random_state=0, predictor=predictor),
         rf_estimator
     )
-    pred_scores = cross_val_score(estimator, X_missing, y_missing,
-                                  scoring='neg_mean_squared_error',
-                                  cv=5)
-    mses_boston.append(-pred_scores.mean())
-    stds_boston.append(pred_scores.std())
+    mses[i + 3, :] = cross_val_score(estimator, X_missing, y_missing,
+                                     scoring='neg_mean_squared_error',
+                                     cv=N_SPLITS)
 
 # Plot the results
 x_labels = ['Full Data',
             'SimpleImputer w/ Mean Strategy',
             'SimpleImputer w/ Median Strategy',
             'IterativeImputer w/ RidgeCV',
             'IterativeImputer w/ HuberRegressor',
             'IterativeImputer w/ DecisionTreeRegressor',
             'IterativeImputer w/ RandomForestRegressor',
             'IterativeImputer w/ KNeighborsRegressor']
 
 # plot boston results
 fig, ax = plt.subplots(figsize=(14, 6))
-for i, j in enumerate(np.arange(len(mses_boston))):
-    ax.barh(j, mses_boston[j], xerr=stds_boston[j], alpha=0.6, align='center')
+for i, j in enumerate(np.arange(mses.shape[0])):
+    ax.barh(
+        j,
+        -np.mean(mses[j, :]),
+        xerr=np.std(mses[j, :]),
+        alpha=0.6,
+        align='center'
+    )
 
 ax.set_title('Boston Data Regression MSE With Different Imputation Methods')
-ax.set_xlabel('MSE')
+ax.set_xlabel('MSE (smaller is better)')
-ax.set_yticks(np.arange(len(mses_boston)))
+ax.set_yticks(np.arange(mses.shape[0]))
 ax.invert_yaxis()
 ax.set_yticklabels(x_labels)
+plt.tight_layout(pad=1)
 plt.show()
diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py
index 06d1fbc13559d..897b66aad246c 100644
--- a/examples/impute/plot_missing_values.py
+++ b/examples/impute/plot_missing_values.py
@@ -32,7 +32,7 @@
 
 rng = np.random.RandomState(0)
 
-CV_SPLIT_NUM = 5
+N_SPLITS = 5
 REGRESSOR = RandomForestRegressor(random_state=0, n_estimators=100)
@@ -42,7 +42,7 @@ def get_scores_for_imputer(imputer, X_missing, y_missing):
         REGRESSOR)
     impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                     scoring='neg_mean_squared_error',
-                                    cv=CV_SPLIT_NUM)
+                                    cv=N_SPLITS)
     return impute_scores
@@ -54,7 +54,7 @@ def get_results(dataset):
     # Estimate the score on the entire dataset, with no missing values
     full_scores = cross_val_score(REGRESSOR, X_full, y_full,
                                   scoring='neg_mean_squared_error',
-                                  cv=CV_SPLIT_NUM)
+                                  cv=N_SPLITS)

From e0e90be826993087ce32ef774561fa0527e14dc0 Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Wed, 17 Oct 2018 08:46:59 -0700
Subject: [PATCH 14/33] making example more interesting

---
 doc/modules/impute.rst                        |  4 +-
 ...t_iterative_imputer_variants_comparison.py | 58 +++++++++----------
 2 files changed, 29 insertions(+), 33 deletions(-)

diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index 0c0cc805d061a..bcc79484def79 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -122,7 +122,7 @@
 
 Flexibility of IterativeImputer
-===============================
+-------------------------------
 
@@ -136,7 +136,7 @@
 .. _multiple_imputation:
 
 Multiple vs. Single Imputation
-==============================
+------------------------------
diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 39afa60b12232..75acb7642f4c5 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -10,32 +10,33 @@ In this example we compare some predictors for the purpose of missing feature imputation with `IterativeImputer`:: - RidgeCV: default - HuberRegressor: robust linear regression to reduce the impact of outliers + RidgeCV: regularized linear regression DecisionTreeRegressor: non-linear regression - RandomForestRegressor: equivalent to missForest in R KNeighborsRegressor: comparable to other KNN imputation approaches + ExtraTreesRegressor: similar to missForest in R Of particular interest is the ability of ``IterativeImputer`` to mimic the -behavior of missForest, a popular imputation package for R. +behavior of missForest, a popular imputation package for R. In this example, +we have chosen to use ``ExtraTreesRegressor`` instead of +``RandomForestRegressor`` (as in missForest) due to its increased speed. The goal is to compare different predictors to see which one is best for -the `IterativeImputer` when using a ``RandomForestRegressor`` estimator on the -Boston dataset. +the `IterativeImputer` when using a ``BayesianRidge`` estimator on the +California housing dataset. -For the Boston dataset and this particular pattern of missing values we see -that ``RandomForestRegressor`` and produces the best results. +For this particular pattern of missing values we see that +``ExtraTreesRegressor`` gives the best results. """ print(__doc__) import numpy as np import matplotlib.pyplot as plt -from sklearn.datasets import load_boston +from sklearn.datasets import fetch_california_housing from sklearn.impute import SimpleImputer, IterativeImputer -from sklearn.linear_model import RidgeCV, HuberRegressor +from sklearn.linear_model import RidgeCV, BayesianRidge from sklearn.tree import DecisionTreeRegressor -from sklearn.ensemble import RandomForestRegressor +from sklearn.ensemble import ExtraTreesRegressor from sklearn.neighbors import KNeighborsRegressor from sklearn.pipeline import make_pipeline from sklearn.model_selection import cross_val_score @@ -44,23 +45,22 @@ rng = np.random.RandomState(0) -X_full, y_full = load_boston(return_X_y=True) +X_full, y_full = fetch_california_housing(return_X_y=True) n_samples = X_full.shape[0] n_features = X_full.shape[1] # Estimate the score on the entire dataset, with no missing values -mses = np.zeros((8, N_SPLITS)) -rf_estimator = RandomForestRegressor(random_state=0, n_estimators=100) -mses[0, :] = cross_val_score(rf_estimator, X_full, y_full, +mses = np.zeros((7, N_SPLITS)) +br_estimator = BayesianRidge() +mses[0, :] = cross_val_score(br_estimator, X_full, y_full, scoring='neg_mean_squared_error', cv=N_SPLITS) -# Add a single missing value in 75% of the lines +# Add a single missing value to each row X_missing = X_full.copy() y_missing = y_full.copy() -missing_rate = 0.75 -n_missing_samples = int(np.floor(n_samples * missing_rate)) +n_missing_samples = int(np.floor(n_samples)) missing_samples = rng.choice(n_samples, n_missing_samples, replace=False) missing_features = rng.choice(n_features, n_missing_samples, replace=True) X_missing[missing_samples, missing_features] = np.nan @@ -69,7 +69,7 @@ for i, strategy in enumerate(['mean', 'median']): estimator = make_pipeline( SimpleImputer(missing_values=np.nan, strategy=strategy), - 
rf_estimator + br_estimator ) mses[i + 1, :] = cross_val_score(estimator, X_missing, y_missing, scoring='neg_mean_squared_error', @@ -79,19 +79,16 @@ # with different predictors predictors = [ RidgeCV(alphas=(1e-7, 0.01, 0.1, 1.0, 10.0)), - HuberRegressor(), DecisionTreeRegressor(random_state=0, max_features='sqrt'), - # Random Forest predictor with default values set as in missForest docs - RandomForestRegressor(random_state=0, - n_estimators=100, - max_features='sqrt'), - KNeighborsRegressor(n_neighbors=15) + KNeighborsRegressor(n_neighbors=15), + ExtraTreesRegressor(n_estimators=10) ] for i, predictor in enumerate(predictors): + print(predictor) estimator = make_pipeline( IterativeImputer(random_state=0, predictor=predictor), - rf_estimator + br_estimator ) mses[i + 3, :] = cross_val_score(estimator, X_missing, y_missing, scoring='neg_mean_squared_error', @@ -102,13 +99,12 @@ 'SimpleImputer w/ Mean Strategy', 'SimpleImputer w/ Median Strategy', 'IterativeImputer w/ RidgeCV', - 'IterativeImputer w/ HuberRegressor', 'IterativeImputer w/ DecisionTreeRegressor', - 'IterativeImputer w/ RandomForestRegressor', - 'IterativeImputer w/ KNeighborsRegressor'] + 'IterativeImputer w/ KNeighborsRegressor', + 'IterativeImputer w/ ExtraTreesRegressor'] # plot boston results -fig, ax = plt.subplots(figsize=(14, 6)) +fig, ax = plt.subplots(figsize=(13, 6)) for i, j in enumerate(np.arange(mses.shape[0])): ax.barh( j, @@ -118,7 +114,7 @@ align='center' ) -ax.set_title('Boston Data Regression MSE With Different Imputation Methods') +ax.set_title('California Housing Regression with Different Imputation Methods') ax.set_xlabel('MSE (smaller is better)') ax.set_yticks(np.arange(mses.shape[0])) ax.invert_yaxis() From 75f94afc03b0f578758c88705d49b6bfadc3024b Mon Sep 17 00:00:00 2001 From: Sergey Feldman Date: Wed, 16 Jan 2019 07:10:39 -0800 Subject: [PATCH 15/33] Reverting v0.21 to master --- doc/whats_new/v0.21.rst | 223 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 211 insertions(+), 12 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 54549e629331c..1220019aef959 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -17,7 +17,13 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. -- please add class and reason here (see version 0.20 what's new) +- :class:`linear_model.BayesianRidge` |Fix| +- Decision trees and derived ensembles when both `max_depth` and + `max_leaf_nodes` are set. |Fix| +- :class:`linear_model.LogisticRegression` and + :class:`linear_model.LogisticRegressionCV` with 'saga' solver. |Fix| +- :class:`ensemble.GradientBoostingClassifier` for multiclass + classification. |Fix| Details are listed in the changelog below. @@ -37,29 +43,222 @@ Support for Python 3.4 and below has been officially dropped. section should be ordered according to the label ordering above. Entries should end with: :issue:`123456` by :user:`Joe Bloggs `. -- An entry goes here -- An entry goes here - :mod:`sklearn.cluster` ...................... - |MajorFeature| A new clustering algorithm: :class:`cluster.OPTICS`: an algoritm related to :class:`cluster.DBSCAN`, that has hyperparameters easier - to set and that scales better, by :user:`Shane ` and - :user:`Adrin Jalali `. + to set and that scales better, by :user:`Shane `, + :user:`Adrin Jalali `, and :user:`Erich Schubert `. + +:mod:`sklearn.datasets` + ...................... 
+ + - |Fix| Added support for 64-bit group IDs and pointers in SVMLight files + :class:`datasets.svmlight_format` :issue:`10727` by + :user:`Bryan K Woods `, + +:mod:`sklearn.discriminant_analysis` +.................................... + +- |Fix| A ``ChangedBehaviourWarning`` is now raised when + :class:`discriminant_analysis.LinearDiscriminantAnalysis` is given as + parameter ``n_components > min(n_features, n_classes - 1)``, and + ``n_components`` is changed to ``min(n_features, n_classes - 1)`` if so. + Previously the change was made, but silently. :issue:`11526` by + :user:`William de Vazelhes`. + +:mod:`sklearn.ensemble` +....................... + +- |Efficiency| Make :class:`ensemble.IsolationForest` prefer threads over + processes when running with ``n_jobs > 1`` as the underlying decision tree + fit calls do release the GIL. This changes reduces memory usage and + communication overhead. :issue:`12543` by :user:`Isaac Storch ` + and `Olivier Grisel`_. + +- |Fix| Fixed a bug in :class:`ensemble.GradientBoostingClassifier` where + the gradients would be incorrectly computed in multiclass classification + problems. :issue:`12715` by :user:`Nicolas Hug`. + +- |Fix| Fixed a bug in :mod:`ensemble` where the ``predict`` method would + error for multiclass multioutput forests models if any targets were strings. + :issue:`12834` by :user:`Elizabeth Sander `. + +- |Fix| Fixed a bug in :class:`ensemble.gradient_boosting.LossFunction` and + :class:`ensemble.gradient_boosting.LeastSquaresError` where the default + value of ``learning_rate`` in ``update_terminal_regions`` is not consistent + with the document and the caller functions. + :issue:`6463` by :user:`movelikeriver `. + +:mod:`sklearn.linear_model` +........................... + +- |Feature| :class:`linear_model.LogisticRegression` and + :class:`linear_model.LogisticRegressionCV` now support Elastic-Net penalty, + with the 'saga' solver. :issue:`11646` by :user:`Nicolas Hug `. + +- |Enhancement| :class:`linear_model.LogisticRegression` now supports an + unregularized objective by setting ``penalty`` to ``'none'``. This is + equivalent to setting ``C=np.inf`` with l2 regularization. Not supported + by the liblinear solver. :issue:`12860` by :user:`Nicolas Hug + `. + +- |Fix| Fixed a bug in :class:`linear_model.LogisticRegression` and + :class:`linear_model.LogisticRegressionCV` with 'saga' solver, where the + weights would not be correctly updated in some cases. + :issue:`11646` by `Tom Dupre la Tour`_. + +- |API| :func:`linear_model.logistic_regression_path` is deprecated + in version 0.21 and will be removed in version 0.23. + :issue:`12821` by :user:`Nicolas Hug `. + +:mod:`sklearn.manifold` +............................ + +- |Efficiency| Make :func:`manifold.tsne.trustworthiness` use an inverted index + instead of an `np.where` lookup to find the rank of neighbors in the input + space. This improves efficiency in particular when computed with + lots of neighbors and/or small datasets. + :issue:`9907` by :user:`William de Vazelhes `. + +:mod:`sklearn.metrics` +...................... + +- |Feature| Added the :func:`metrics.max_error` metric and a corresponding + ``'max_error'`` scorer for single output regression. + :issue:`12232` by :user:`Krishna Sangeeth `. + +- |Feature| Add :func:`metrics.multilabel_confusion_matrix`, which calculates a + confusion matrix with true positive, false positive, false negative and true + negative counts for each class. 
+  This facilitates the calculation of set-wise
+  metrics such as recall, specificity, fall out and miss rate.
+  :issue:`11179` by :user:`Shangwu Yao ` and `Joel Nothman`_.
+
+- |Enhancement| Use label `accuracy` instead of `micro-average` on
+  :func:`metrics.classification_report` to avoid confusion. `micro-average` is
+  only shown for multi-label or multi-class with a subset of classes because
+  it is otherwise identical to accuracy.
+  :issue:`12334` by :user:`Emmanuel Arias `,
+  `Joel Nothman`_ and `Andreas Müller`_
+
+- |API| The parameter ``labels`` in :func:`metrics.hamming_loss` is deprecated
+  in version 0.21 and will be removed in version 0.23.
+  :issue:`10580` by :user:`Reshama Shaikh ` and `Sandra
+  Mitrovic `.
+
+- |Fix| The metric :func:`metrics.r2_score` is degenerate with a single sample
+  and now it returns NaN and raises :class:`exceptions.UndefinedMetricWarning`.
+  :issue:`12855` by :user:`Pawel Sendyk `.
+
+:mod:`sklearn.model_selection`
+..............................
+
+- |Feature| Classes :class:`~model_selection.GridSearchCV` and
+  :class:`~model_selection.RandomizedSearchCV` now allow for refit=callable
+  to add flexibility in identifying the best
+  estimator. An example for this interface has been added.
+  :issue:`11354` by :user:`Wenhao Zhang `,
+  `Joel Nothman`_ and `Adrin Jalali`_
+
+- |Enhancement| Classes :class:`~model_selection.GridSearchCV`,
+  :class:`~model_selection.RandomizedSearchCV`, and methods
+  :func:`~model_selection.cross_val_score`,
+  :func:`~model_selection.cross_val_predict`,
+  :func:`~model_selection.cross_validate`, now print train scores when
+  `return_train_scores` is True and `verbose` > 2. For
+  :func:`~model_selection.learning_curve`, and
+  :func:`~model_selection.validation_curve` only the latter is required.
+  :issue:`12613` and :issue:`12669` by :user:`Marc Torrellas `.
+
+:mod:`sklearn.neighbors`
+........................
+
+- |API| Methods in :class:`neighbors.NearestNeighbors` :
+  :func:`~neighbors.NearestNeighbors.kneighbors`,
+  :func:`~neighbors.NearestNeighbors.radius_neighbors`,
+  :func:`~neighbors.NearestNeighbors.kneighbors_graph`,
+  :func:`~neighbors.NearestNeighbors.radius_neighbors_graph`
+  now raise ``NotFittedError``, rather than ``AttributeError``,
+  when called before ``fit`` :issue:`12279` by :user:`Krishna Sangeeth
+  `.
+
+:mod:`sklearn.pipeline`
+.......................
+
+- |API| :class:`pipeline.Pipeline` now supports using ``'passthrough'`` as a
+  transformer. :issue:`11144` by :user:`Thomas Fan `.
+
+:mod:`sklearn.preprocessing`
+............................
+
+- |Efficiency| Make :class:`preprocessing.MultiLabelBinarizer` to cache class
+  mappings instead of calculating it every time on the fly.
+  :issue:`12116` by :user:`Ekaterina Krivich ` and `Joel Nothman`_.
+
+- |Efficiency| :class:`preprocessing.PolynomialFeatures` now supports compressed
+  sparse row (CSR) matrices as input for degrees 2 and 3. This is typically much
+  faster than the dense case as it scales with matrix density and expansion degree
+  (on the order of density^degree), and is much, much faster than the compressed
+  sparse column (CSC) case. :issue:`12197` by :user:`Andrew Nystrom `.
+
+- |Efficiency| |API| Speed improvement in :class:`preprocessing.PolynomialFeatures`,
+  in the dense case. Also added a new parameter ``order`` which controls output
+  order for further speed performances. :issue:`12251` by `Tom Dupre la Tour`_.
+
+:mod:`sklearn.tree`
+...................
+- |Feature| Decision Trees can now be plotted with matplotlib using
+  :func:`tree.plot_tree` without relying on the ``dot`` library,
+  removing a hard-to-install dependency. :issue:`8508` by `Andreas Müller`_.
+
+- |Feature| ``get_n_leaves()`` and ``get_depth()`` have been added to
+  :class:`tree.BaseDecisionTree` and consequently all estimators based
+  on it, including :class:`tree.DecisionTreeClassifier`,
+  :class:`tree.DecisionTreeRegressor`, :class:`tree.ExtraTreeClassifier`,
+  and :class:`tree.ExtraTreeRegressor`.
+  :issue:`12300` by :user:`Adrin Jalali `.
+
+- |Fix| Fixed an issue with :class:`tree.BaseDecisionTree`
+  and consequently all estimators based
+  on it, including :class:`tree.DecisionTreeClassifier`,
+  :class:`tree.DecisionTreeRegressor`, :class:`tree.ExtraTreeClassifier`,
+  and :class:`tree.ExtraTreeRegressor`, where they used to exceed the given
+  ``max_depth`` by 1 while expanding the tree if ``max_leaf_nodes`` and
+  ``max_depth`` were both specified by the user. Please note that this also
+  affects all ensemble methods using decision trees.
+  :issue:`12344` by :user:`Adrin Jalali `.
+
+
+:mod:`sklearn.linear_model`
+...........................
+
+- |Fix| Fixed the posterior mean, posterior covariance and returned
+  regularization parameters in :class:`linear_model.BayesianRidge`. The
+  posterior mean and the posterior covariance were not the ones computed
+  with the last update of the regularization parameters and the returned
+  regularization parameters were not the final ones. Also fixed the formula of
+  the log marginal likelihood used to compute the score when
+  `compute_score=True`. :issue:`12174` by
+  :user:`Albert Thomas `.
 
 Multiple modules
 ................
 
+- The `__repr__()` method of all estimators (used when calling
+  `print(estimator)`) has been entirely re-written, building on Python's
+  pretty printing standard library. All parameters are printed by default,
+  but this can be altered with the ``print_changed_only`` option in
+  :func:`sklearn.set_config`. :issue:`11705` by :user:`Nicolas Hug
+  `.
+
 Changes to estimator checks
 ---------------------------
 
 These changes mostly affect library developers.
+
+- Add ``check_fit_idempotent`` to
+  :func:`~utils.estimator_checks.check_estimator`, which checks that
+  when `fit` is called twice with the same data, the output of
+  `predict`, `predict_proba`, `transform`, and `decision_function` does not
+  change. :issue:`12328` by :user:`Nicolas Hug `

From 755dde51e1dbe7a7e80ea0b8d0de54a849583e1d Mon Sep 17 00:00:00 2001
From: Sergey Feldman
Date: Wed, 16 Jan 2019 07:18:10 -0800
Subject: [PATCH 16/33] Responding to reviewer comments.

---
 .../plot_iterative_imputer_variants_comparison.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py
index 75acb7642f4c5..8ae458bf8e101 100644
--- a/examples/impute/plot_iterative_imputer_variants_comparison.py
+++ b/examples/impute/plot_iterative_imputer_variants_comparison.py
@@ -20,9 +20,13 @@
 we have chosen to use ``ExtraTreesRegressor`` instead of
 ``RandomForestRegressor`` (as in missForest) due to its increased speed.
 
+Note that ``KNeighborsRegressor`` is different from KNN imputation, which
+learns from samples with missing values by using a distance metric that
+accounts for missing values, rather than imputing them.
+
 The goal is to compare different predictors to see which one is best for
 the `IterativeImputer` when using a ``BayesianRidge`` estimator on the
-California housing dataset.
+California housing dataset with a single value randomly removed from each row.
 
 For this particular pattern of missing values we see that
 ``ExtraTreesRegressor`` gives the best results.
@@ -59,10 +63,9 @@
 
 # Add a single missing value to each row
 X_missing = X_full.copy()
-y_missing = y_full.copy()
-n_missing_samples = int(np.floor(n_samples))
-missing_samples = rng.choice(n_samples, n_missing_samples, replace=False)
-missing_features = rng.choice(n_features, n_missing_samples, replace=True)
+y_missing = y_full
+missing_samples = np.arange(n_samples)
+missing_features = rng.choice(n_features, n_samples, replace=True)
 X_missing[missing_samples, missing_features] = np.nan

From 22dc1ef0a3a336097f208abb92f5203c7b14a07b Mon Sep 17 00:00:00 2001
From: sergeyf
Date: Wed, 16 Jan 2019 14:24:25 -0800
Subject: [PATCH 17/33] Updating v0.20

---
 doc/whats_new/v0.21.rst | 148 +++------------------------------------
 1 file changed, 11 insertions(+), 137 deletions(-)

diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index d51a294105be5..2159e39dc126d 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -17,13 +17,7 @@
 parameters, may produce different models from the previous version. This often
 occurs due to changes in the modelling logic (bug fixes or enhancements), or in
 random sampling procedures.
 
-- :class:`linear_model.BayesianRidge` |Fix|
-- Decision trees and derived ensembles when both `max_depth` and
-  `max_leaf_nodes` are set. |Fix|
-- :class:`linear_model.LogisticRegression` and
-  :class:`linear_model.LogisticRegressionCV` with 'saga' solver. |Fix|
-- :class:`ensemble.GradientBoostingClassifier` for multiclass
-  classification. |Fix|
+- please add class and reason here (see version 0.20 what's new)
 
 Details are listed in the changelog below.
 
@@ -37,141 +31,21 @@ Support for Python 3.4 and below has been officially dropped.
   section should be ordered according to the label ordering above.
   Entries should end with: :issue:`123456` by :user:`Joe Bloggs `.
 
-:mod:`sklearn.cluster`
-- |Feature| Added the :func:`metrics.max_error` metric and a corresponding
-  ``'max_error'`` scorer for single output regression.
-  :issue:`12232` by :user:`Krishna Sangeeth `.
-
-- |Feature| Add :func:`metrics.multilabel_confusion_matrix`, which calculates a
-  confusion matrix with true positive, false positive, false negative and true
-  negative counts for each class. This facilitates the calculation of set-wise
-  metrics such as recall, specificity, fall out and miss rate.
-  :issue:`11179` by :user:`Shangwu Yao ` and `Joel Nothman`_.
-
-- |Enhancement| Use label `accuracy` instead of `micro-average` on
-  :func:`metrics.classification_report` to avoid confusion. `micro-average` is
-  only shown for multi-label or multi-class with a subset of classes because
-  it is otherwise identical to accuracy.
-  :issue:`12334` by :user:`Emmanuel Arias `,
-  `Joel Nothman`_ and `Andreas Müller`_
-
-- |API| The parameter ``labels`` in :func:`metrics.hamming_loss` is deprecated
-  in version 0.21 and will be removed in version 0.23.
-  :issue:`10580` by :user:`Reshama Shaikh ` and `Sandra
-  Mitrovic `.
-
-- |Fix| The metric :func:`metrics.r2_score` is degenerate with a single sample
-  and now it returns NaN and raises :class:`exceptions.UndefinedMetricWarning`.
-  :issue:`12855` by :user:`Pawel Sendyk `.
-
-:mod:`sklearn.model_selection`
-..............................
-
-- |Feature| Classes :class:`~model_selection.GridSearchCV` and
-  :class:`~model_selection.RandomizedSearchCV` now allow for refit=callable
-  to add flexibility in identifying the best
-  estimator. An example for this interface has been added.
-  :issue:`11354` by :user:`Wenhao Zhang `,
-  `Joel Nothman`_ and `Adrin Jalali`_
-
-- |Enhancement| Classes :class:`~model_selection.GridSearchCV`,
-  :class:`~model_selection.RandomizedSearchCV`, and methods
-  :func:`~model_selection.cross_val_score`,
-  :func:`~model_selection.cross_val_predict`,
-  :func:`~model_selection.cross_validate`, now print train scores when
-  `return_train_scores` is True and `verbose` > 2. For
-  :func:`~model_selection.learning_curve`, and
-  :func:`~model_selection.validation_curve` only the latter is required.
-  :issue:`12613` and :issue:`12669` by :user:`Marc Torrellas `.
-
-:mod:`sklearn.neighbors`
-........................
-
-- |API| Methods in :class:`neighbors.NearestNeighbors` :
-  :func:`~neighbors.NearestNeighbors.kneighbors`,
-  :func:`~neighbors.NearestNeighbors.radius_neighbors`,
-  :func:`~neighbors.NearestNeighbors.kneighbors_graph`,
-  :func:`~neighbors.NearestNeighbors.radius_neighbors_graph`
-  now raise ``NotFittedError``, rather than ``AttributeError``,
-  when called before ``fit`` :issue:`12279` by :user:`Krishna Sangeeth
-  `.
-
-:mod:`sklearn.pipeline`
-.......................
-
-- |API| :class:`pipeline.Pipeline` now supports using ``'passthrough'`` as a
-  transformer. :issue:`11144` by :user:`Thomas Fan `.
-
-:mod:`sklearn.preprocessing`
-............................
-
-- |Efficiency| Make :class:`preprocessing.MultiLabelBinarizer` to cache class
-  mappings instead of calculating it every time on the fly.
-  :issue:`12116` by :user:`Ekaterina Krivich ` and `Joel Nothman`_.
-
-- |Efficiency| :class:`preprocessing.PolynomialFeatures` now supports compressed
-  sparse row (CSR) matrices as input for degrees 2 and 3. This is typically much
-  faster than the dense case as it scales with matrix density and expansion degree
-  (on the order of density^degree), and is much, much faster than the compressed
-  sparse column (CSC) case. :issue:`12197` by :user:`Andrew Nystrom `.
-
-- |Efficiency| |API| Speed improvement in :class:`preprocessing.PolynomialFeatures`,
-  in the dense case. Also added a new parameter ``order`` which controls output
-  order for further speed performances. :issue:`12251` by `Tom Dupre la Tour`_.
-
-:mod:`sklearn.tree`
-...................
-- |Feature| Decision Trees can now be plotted with matplotlib using
-  :func:`tree.plot_tree` without relying on the ``dot`` library,
-  removing a hard-to-install dependency. :issue:`8508` by `Andreas Müller`_.
-
-- |Feature| ``get_n_leaves()`` and ``get_depth()`` have been added to
-  :class:`tree.BaseDecisionTree` and consequently all estimators based
-  on it, including :class:`tree.DecisionTreeClassifier`,
-  :class:`tree.DecisionTreeRegressor`, :class:`tree.ExtraTreeClassifier`,
-  and :class:`tree.ExtraTreeRegressor`.
- -- |Fix| Fixed an issue with :class:`tree.BaseDecisionTree` - and consequently all estimators based - on it, including :class:`tree.DecisionTreeClassifier`, - :class:`tree.DecisionTreeRegressor`, :class:`tree.ExtraTreeClassifier`, - and :class:`tree.ExtraTreeRegressor`, where they used to exceed the given - ``max_depth`` by 1 while expanding the tree if ``max_leaf_nodes`` and - ``max_depth`` were both specified by the user. Please note that this also - affects all ensemble methods using decision trees. - :issue:`12344` by :user:`Adrin Jalali `. - - -:mod:`sklearn.linear_model` -........................... - -- |Fix| Fixed the posterior mean, posterior covariance and returned - regularization parameters in :class:`linear_model.BayesianRidge`. The - posterior mean and the posterior covariance were not the ones computed - with the last update of the regularization parameters and the returned - regularization parameters were not the final ones. Also fixed the formula of - the log marginal likelihood used to compute the score when - `compute_score=True`. :issue:`12174` by - :user:`Albert Thomas `. +- An entry goes here +- An entry goes here + +:mod:`sklearn.impute` +..................... + +- |MajorFeature| Added :class:`impute.IterativeImputer`, which is a strategy for + imputing missing values by modeling each feature with missing values as a + function of other features in a round-robin fashion. :issue:`8478` by + :user:`Sergey Feldman `. Multiple modules ................ -- The `__repr__()` method of all estimators (used when calling - `print(estimator)`) has been entirely re-written, building on Python's - pretty printing standard library. All parameters are printed by default, - but this can be altered with the ``print_changed_only`` option in - :func:`sklearn.set_config`. :issue:`11705` by :user:`Nicolas Hug - `. - Changes to estimator checks --------------------------- These changes mostly affect library developers. - -- Add ``check_fit_idempotent`` to - :func:`~utils.estimator_checks.check_estimator`, which checks that - when `fit` is called twice with the same data, the ouput of - `predict`, `predict_proba`, `transform`, and `decision_function` does not - change. :issue:`12328` by :user:`Nicolas Hug ` From 3f059df848dd83be6ade93d7d097d2906b80f256 Mon Sep 17 00:00:00 2001 From: sergeyf Date: Wed, 16 Jan 2019 18:09:53 -0800 Subject: [PATCH 18/33] Revert changes to v0.21.rst --- doc/whats_new/v0.21.rst | 230 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 223 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 2159e39dc126d..9540f855611ff 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -17,7 +17,13 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. -- please add class and reason here (see version 0.20 what's new) +- :class:`linear_model.BayesianRidge` |Fix| +- Decision trees and derived ensembles when both `max_depth` and + `max_leaf_nodes` are set. |Fix| +- :class:`linear_model.LogisticRegression` and + :class:`linear_model.LogisticRegressionCV` with 'saga' solver. |Fix| +- :class:`ensemble.GradientBoostingClassifier` for multiclass + classification. |Fix| Details are listed in the changelog below. @@ -37,21 +43,231 @@ Support for Python 3.4 and below has been officially dropped. section should be ordered according to the label ordering above. 
 Entries should end with:
 :issue:`123456` by :user:`Joe Bloggs `.

-- An entry goes here
-- An entry goes here
+:mod:`sklearn.cluster`
+......................
+
+- |MajorFeature| A new clustering algorithm: :class:`cluster.OPTICS`: an
+  algorithm related to :class:`cluster.DBSCAN` that has hyperparameters easier
+  to set and that scales better, by :user:`Shane `,
+  :user:`Adrin Jalali `, and :user:`Erich Schubert `.
+
+:mod:`sklearn.datasets`
+.......................
+
+- |Fix| Added support for 64-bit group IDs and pointers in SVMLight files
+  :class:`datasets.svmlight_format`. :issue:`10727` by
+  :user:`Bryan K Woods `.
+
+:mod:`sklearn.discriminant_analysis`
+....................................
+
+- |Fix| A ``ChangedBehaviourWarning`` is now raised when
+  :class:`discriminant_analysis.LinearDiscriminantAnalysis` is given as
+  parameter ``n_components > min(n_features, n_classes - 1)``, and
+  ``n_components`` is changed to ``min(n_features, n_classes - 1)`` if so.
+  Previously the change was made, but silently. :issue:`11526` by
+  :user:`William de Vazelhes`.
+
+:mod:`sklearn.ensemble`
+.......................
+
+- |Efficiency| Make :class:`ensemble.IsolationForest` prefer threads over
+  processes when running with ``n_jobs > 1`` as the underlying decision tree
+  fit calls do release the GIL. This change reduces memory usage and
+  communication overhead. :issue:`12543` by :user:`Isaac Storch `
+  and `Olivier Grisel`_.
+
+- |Fix| Fixed a bug in :class:`ensemble.GradientBoostingClassifier` where
+  the gradients would be incorrectly computed in multiclass classification
+  problems. :issue:`12715` by :user:`Nicolas Hug`.
+
+- |Fix| Fixed a bug in :mod:`ensemble` where the ``predict`` method would
+  error for multiclass multioutput forest models if any targets were strings.
+  :issue:`12834` by :user:`Elizabeth Sander `.
+
+- |Fix| Fixed a bug in :class:`ensemble.gradient_boosting.LossFunction` and
+  :class:`ensemble.gradient_boosting.LeastSquaresError` where the default
+  value of ``learning_rate`` in ``update_terminal_regions`` was not consistent
+  with the documentation and the caller functions.
+  :issue:`6463` by :user:`movelikeriver `.

 :mod:`sklearn.impute`
 .....................

-- |MajorFeature| Added :class:`impute.IterativeImputer`, which is a strategy for
-  imputing missing values by modeling each feature with missing values as a
-  function of other features in a round-robin fashion. :issue:`8478` by
-  :user:`Sergey Feldman `.
+- |MajorFeature| Added :class:`impute.IterativeImputer`, which is a strategy
+  for imputing missing values by modeling each feature with missing values as a
+  function of other features in a round-robin fashion. :issue:`8478` and
+  :issue:`12177` by :user:`Sergey Feldman ` and :user:`Ben Lawson
+  `.
+
+:mod:`sklearn.linear_model`
+...........................
+
+- |Feature| :class:`linear_model.LogisticRegression` and
+  :class:`linear_model.LogisticRegressionCV` now support Elastic-Net penalty,
+  with the 'saga' solver. :issue:`11646` by :user:`Nicolas Hug `.
+
+- |Enhancement| :class:`linear_model.LogisticRegression` now supports an
+  unregularized objective by setting ``penalty`` to ``'none'``. This is
+  equivalent to setting ``C=np.inf`` with l2 regularization. Not supported
+  by the liblinear solver. :issue:`12860` by :user:`Nicolas Hug
+  `.
+
+- |Fix| Fixed a bug in :class:`linear_model.LogisticRegression` and
+  :class:`linear_model.LogisticRegressionCV` with 'saga' solver, where the
+  weights would not be correctly updated in some cases.
+  :issue:`11646` by `Tom Dupre la Tour`_.
+
+- |API| :func:`linear_model.logistic_regression_path` is deprecated
+  in version 0.21 and will be removed in version 0.23.
+  :issue:`12821` by :user:`Nicolas Hug `.
+
+:mod:`sklearn.manifold`
+............................
+
+- |Efficiency| Make :func:`manifold.tsne.trustworthiness` use an inverted index
+  instead of an `np.where` lookup to find the rank of neighbors in the input
+  space. This improves efficiency in particular when computed with
+  lots of neighbors and/or small datasets.
+  :issue:`9907` by :user:`William de Vazelhes `.
+
+:mod:`sklearn.metrics`
+......................
+
+- |Feature| Added the :func:`metrics.max_error` metric and a corresponding
+  ``'max_error'`` scorer for single output regression.
+  :issue:`12232` by :user:`Krishna Sangeeth `.
+
+- |Feature| Add :func:`metrics.multilabel_confusion_matrix`, which calculates a
+  confusion matrix with true positive, false positive, false negative and true
+  negative counts for each class. This facilitates the calculation of set-wise
+  metrics such as recall, specificity, fall out and miss rate.
+  :issue:`11179` by :user:`Shangwu Yao ` and `Joel Nothman`_.
+
+- |Enhancement| Use label `accuracy` instead of `micro-average` on
+  :func:`metrics.classification_report` to avoid confusion. `micro-average` is
+  only shown for multi-label or multi-class with a subset of classes, because
+  it is otherwise identical to accuracy.
+  :issue:`12334` by :user:`Emmanuel Arias `,
+  `Joel Nothman`_ and `Andreas Müller`_.
+
+- |API| The parameter ``labels`` in :func:`metrics.hamming_loss` is deprecated
+  in version 0.21 and will be removed in version 0.23.
+  :issue:`10580` by :user:`Reshama Shaikh ` and `Sandra
+  Mitrovic `.
+
+- |Fix| The metric :func:`metrics.r2_score` is degenerate with a single sample
+  and now it returns NaN and raises :class:`exceptions.UndefinedMetricWarning`.
+  :issue:`12855` by :user:`Pawel Sendyk `.
+
+:mod:`sklearn.model_selection`
+..............................
+
+- |Feature| Classes :class:`~model_selection.GridSearchCV` and
+  :class:`~model_selection.RandomizedSearchCV` now allow for ``refit=callable``
+  to add flexibility in identifying the best
+  estimator. An example for this interface has been added.
+  :issue:`11354` by :user:`Wenhao Zhang `,
+  `Joel Nothman`_ and `Adrin Jalali`_.
+
+- |Enhancement| Classes :class:`~model_selection.GridSearchCV`,
+  :class:`~model_selection.RandomizedSearchCV`, and methods
+  :func:`~model_selection.cross_val_score`,
+  :func:`~model_selection.cross_val_predict`,
+  :func:`~model_selection.cross_validate`, now print train scores when
+  `return_train_scores` is True and `verbose` > 2. For
+  :func:`~model_selection.learning_curve`, and
+  :func:`~model_selection.validation_curve` only the latter is required.
+  :issue:`12613` and :issue:`12669` by :user:`Marc Torrellas `.
+
+:mod:`sklearn.neighbors`
+........................
+
+- |API| Methods in :class:`neighbors.NearestNeighbors`:
+  :func:`~neighbors.NearestNeighbors.kneighbors`,
+  :func:`~neighbors.NearestNeighbors.radius_neighbors`,
+  :func:`~neighbors.NearestNeighbors.kneighbors_graph`,
+  :func:`~neighbors.NearestNeighbors.radius_neighbors_graph`
+  now raise ``NotFittedError``, rather than ``AttributeError``,
+  when called before ``fit``. :issue:`12279` by :user:`Krishna Sangeeth
+  `.
+
+:mod:`sklearn.pipeline`
+.......................
+
+- |API| :class:`pipeline.Pipeline` now supports using ``'passthrough'`` as a
+  transformer. :issue:`11144` by :user:`Thomas Fan `.
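The ``'passthrough'`` entry above is easy to gloss over; a minimal sketch of
what it enables (hedged: the surrounding estimators are arbitrary choices made
for illustration, not part of the changelog)::

    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    pipe = Pipeline([('scale', StandardScaler()),
                     ('clf', LogisticRegression(solver='lbfgs'))])
    # A step can now be replaced by the identity, for example to toggle
    # preprocessing inside a grid search without rebuilding the pipeline.
    pipe.set_params(scale='passthrough')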
+
+:mod:`sklearn.preprocessing`
+............................
+
+- |Efficiency| Make :class:`preprocessing.MultiLabelBinarizer` cache class
+  mappings instead of calculating them every time on the fly.
+  :issue:`12116` by :user:`Ekaterina Krivich ` and `Joel Nothman`_.
+
+- |Efficiency| :class:`preprocessing.PolynomialFeatures` now supports compressed
+  sparse row (CSR) matrices as input for degrees 2 and 3. This is typically much
+  faster than the dense case as it scales with matrix density and expansion degree
+  (on the order of density^degree), and is much, much faster than the compressed
+  sparse column (CSC) case. :issue:`12197` by :user:`Andrew Nystrom `.
+
+- |Efficiency| |API| Speed improvement in :class:`preprocessing.PolynomialFeatures`,
+  in the dense case. Also added a new parameter ``order`` which controls output
+  order for further speed improvements. :issue:`12251` by `Tom Dupre la Tour`_.
+
+:mod:`sklearn.tree`
+...................
+- |Feature| Decision Trees can now be plotted with matplotlib using
+  :func:`tree.plot_tree` without relying on the ``dot`` library,
+  removing a hard-to-install dependency. :issue:`8508` by `Andreas Müller`_.
+
+- |Feature| ``get_n_leaves()`` and ``get_depth()`` have been added to
+  :class:`tree.BaseDecisionTree` and consequently all estimators based
+  on it, including :class:`tree.DecisionTreeClassifier`,
+  :class:`tree.DecisionTreeRegressor`, :class:`tree.ExtraTreeClassifier`,
+  and :class:`tree.ExtraTreeRegressor`.
+  :issue:`12300` by :user:`Adrin Jalali `.
+
+- |Fix| Fixed an issue with :class:`tree.BaseDecisionTree`
+  and consequently all estimators based
+  on it, including :class:`tree.DecisionTreeClassifier`,
+  :class:`tree.DecisionTreeRegressor`, :class:`tree.ExtraTreeClassifier`,
+  and :class:`tree.ExtraTreeRegressor`, where they used to exceed the given
+  ``max_depth`` by 1 while expanding the tree if ``max_leaf_nodes`` and
+  ``max_depth`` were both specified by the user. Please note that this also
+  affects all ensemble methods using decision trees.
+  :issue:`12344` by :user:`Adrin Jalali `.
+
+
+:mod:`sklearn.linear_model`
+...........................
+
+- |Fix| Fixed the posterior mean, posterior covariance and returned
+  regularization parameters in :class:`linear_model.BayesianRidge`. The
+  posterior mean and the posterior covariance were not the ones computed
+  with the last update of the regularization parameters, and the returned
+  regularization parameters were not the final ones. Also fixed the formula of
+  the log marginal likelihood used to compute the score when
+  `compute_score=True`. :issue:`12174` by
+  :user:`Albert Thomas `.

 Multiple modules
 ................

+- The `__repr__()` method of all estimators (used when calling
+  `print(estimator)`) has been entirely re-written, building on Python's
+  pretty printing standard library. All parameters are printed by default,
+  but this can be altered with the ``print_changed_only`` option in
+  :func:`sklearn.set_config`. :issue:`11705` by :user:`Nicolas Hug
+  `.
+
 Changes to estimator checks
 ---------------------------

 These changes mostly affect library developers.
+
+- Add ``check_fit_idempotent`` to
+  :func:`~utils.estimator_checks.check_estimator`, which checks that
+  when `fit` is called twice with the same data, the output of
+  `predict`, `predict_proba`, `transform`, and `decision_function` does not
+  change. 
:issue:`12328` by :user:`Nicolas Hug ` From 2ed49af15daeb311455a83f7c9363b5d23de1522 Mon Sep 17 00:00:00 2001 From: sergeyf Date: Wed, 16 Jan 2019 19:04:10 -0800 Subject: [PATCH 19/33] fixing doctest impute.rst --- doc/modules/impute.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 90a021078e812..53cb66fa14d52 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -110,7 +110,7 @@ round are returned. IterativeImputer(imputation_order='ascending', initial_strategy='mean', max_value=None, min_value=None, missing_values=nan, n_iter=10, n_nearest_features=None, predictor=None, random_state=0, - sample_posterior=False, verbose=False) + sample_posterior=False, verbose=0) >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] From dae2e3ae421aac6eb6593df18ae2bd1ad5f1db37 Mon Sep 17 00:00:00 2001 From: sergeyf Date: Wed, 16 Jan 2019 19:27:52 -0800 Subject: [PATCH 20/33] fixing doctest impute.rst v2 --- doc/modules/impute.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 53cb66fa14d52..f521f185e8cee 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -115,7 +115,7 @@ round are returned. >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] [ 6. 3.] - [24. 6.]] + [26. 6.]] Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. From cc9ae8baed28a5fad9f2836702eb0295b75abaa0 Mon Sep 17 00:00:00 2001 From: sergeyf Date: Wed, 16 Jan 2019 21:57:27 -0800 Subject: [PATCH 21/33] fixing doctest impute.rst v3 --- doc/modules/impute.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index f521f185e8cee..53cb66fa14d52 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -115,7 +115,7 @@ round are returned. >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] [ 6. 3.] - [26. 6.]] + [24. 6.]] Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. From bd0be1116640635a380d7a005719266c39343086 Mon Sep 17 00:00:00 2001 From: Sergey Feldman Date: Thu, 17 Jan 2019 08:57:13 -0800 Subject: [PATCH 22/33] One more try with expected/actual issue in impute.rst --- doc/modules/impute.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 53cb66fa14d52..f521f185e8cee 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -115,7 +115,7 @@ round are returned. >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] [ 6. 3.] - [24. 6.]] + [26. 6.]] Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. From d2c335775d0c056209ce22f8dad2befe13486929 Mon Sep 17 00:00:00 2001 From: sergeyf Date: Thu, 17 Jan 2019 12:49:43 -0800 Subject: [PATCH 23/33] updating to 26 --- doc/modules/impute.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 53cb66fa14d52..f521f185e8cee 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -115,7 +115,7 @@ round are returned. >>> print(np.round(imp.transform(X_test))) [[ 1. 2.] [ 6. 3.] - [24. 6.]] + [26. 
6.]] Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. From b2f2b540b5fea1cfd241fbf1ed0e8866d4bda61c Mon Sep 17 00:00:00 2001 From: Sergey Feldman Date: Wed, 23 Jan 2019 22:18:09 -0800 Subject: [PATCH 24/33] Updating RidgeCV to BayesianRidge to be more in line with the default --- .../impute/plot_iterative_imputer_variants_comparison.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 8ae458bf8e101..2c03d6f95c836 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -10,7 +10,7 @@ In this example we compare some predictors for the purpose of missing feature imputation with `IterativeImputer`:: - RidgeCV: regularized linear regression + BayesianRidge: regularized linear regression DecisionTreeRegressor: non-linear regression KNeighborsRegressor: comparable to other KNN imputation approaches ExtraTreesRegressor: similar to missForest in R @@ -81,7 +81,7 @@ # Estimate the score after iterative imputation of the missing values # with different predictors predictors = [ - RidgeCV(alphas=(1e-7, 0.01, 0.1, 1.0, 10.0)), + BayesianRidge(), DecisionTreeRegressor(random_state=0, max_features='sqrt'), KNeighborsRegressor(n_neighbors=15), ExtraTreesRegressor(n_estimators=10) @@ -101,7 +101,7 @@ x_labels = ['Full Data', 'SimpleImputer w/ Mean Strategy', 'SimpleImputer w/ Median Strategy', - 'IterativeImputer w/ RidgeCV', + 'IterativeImputer w/ BayesianRidge', 'IterativeImputer w/ DecisionTreeRegressor', 'IterativeImputer w/ KNeighborsRegressor', 'IterativeImputer w/ ExtraTreesRegressor'] From b305620371cc40f3f0e5e397931fbc926f66678f Mon Sep 17 00:00:00 2001 From: sergeyf Date: Thu, 24 Jan 2019 08:47:16 -0800 Subject: [PATCH 25/33] updating to glemaitre's plot --- ...t_iterative_imputer_variants_comparison.py | 115 +++++++++--------- 1 file changed, 56 insertions(+), 59 deletions(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 2c03d6f95c836..919edbe25d371 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -8,37 +8,43 @@ variable as an output in turn. In this example we compare some predictors for the purpose of missing feature -imputation with `IterativeImputer`:: - - BayesianRidge: regularized linear regression - DecisionTreeRegressor: non-linear regression - KNeighborsRegressor: comparable to other KNN imputation approaches - ExtraTreesRegressor: similar to missForest in R - -Of particular interest is the ability of ``IterativeImputer`` to mimic the -behavior of missForest, a popular imputation package for R. In this example, -we have chosen to use ``ExtraTreesRegressor`` instead of -``RandomForestRegressor`` (as in missForest) due to its increased speed. - -Note that ``KNeighborsRegressor``is different from KNN imputation, which -learns from samples with missing values by using a distance metric that -accounts for missing values, rather than imputing them. 
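Since the paragraph being rewritten here draws a subtle distinction, a minimal
sketch of what the neighbors-based variant actually does may help (hedged: it
uses the ``predictor`` keyword as it is named on this branch, and the array is
made up for illustration)::

    import numpy as np
    from sklearn.impute import IterativeImputer
    from sklearn.neighbors import KNeighborsRegressor

    X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, np.nan], [7.0, 8.0]])
    # The round-robin scheme regresses each incomplete feature on the others,
    # so the neighbors model is trained on complete/imputed rows; true KNN
    # imputation instead adapts its distance metric to the missing entries.
    imputer = IterativeImputer(predictor=KNeighborsRegressor(n_neighbors=2),
                               random_state=0)
    print(imputer.fit_transform(X))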
-
-The goal is to compare different predictors to see which one is best for
-the `IterativeImputer` when using a ``BayesianRidge`` estimator on the
-California housing dataset with a single value randomly removed from each row.
+imputation with :class:`sklearn.impute.IterativeImputer`::
+
+    :class:`sklearn.linear_model.BayesianRidge`: regularized linear regression
+    :class:`sklearn.tree.DecisionTreeRegressor`: non-linear regression
+    :class:`sklearn.neighbors.KNeighborsRegressor`: comparable to other KNN
+    imputation approaches
+    :class:`sklearn.ensemble.ExtraTreesRegressor`: similar to missForest in R
+
+Of particular interest is the ability of
+:class:`sklearn.impute.IterativeImputer` to mimic the behavior of missForest, a
+popular imputation package for R. In this example, we have chosen to use
+:class:`sklearn.ensemble.ExtraTreesRegressor` instead of
+:class:`sklearn.ensemble.RandomForestRegressor` (as in missForest) due to its
+increased speed.
+
+Note that :class:`sklearn.neighbors.KNeighborsRegressor` is different from KNN
+imputation, which learns from samples with missing values by using a distance
+metric that accounts for missing values, rather than imputing them.
+
+The goal is to compare different predictors to see which one is best for the
+:class:`sklearn.impute.IterativeImputer` when using a
+:class:`sklearn.linear_model.BayesianRidge` estimator on the California housing
+dataset with a single value randomly removed from each row.
 
 For this particular pattern of missing values we see that
-``ExtraTreesRegressor`` gives the best results.
+:class:`sklearn.ensemble.ExtraTreesRegressor`` and
+:class:`sklearn.linear_model.BayesianRidge` give the best results.
 """
 print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
+import pandas as pd
 
 from sklearn.datasets import fetch_california_housing
 from sklearn.impute import SimpleImputer, IterativeImputer
-from sklearn.linear_model import RidgeCV, BayesianRidge
+from sklearn.linear_model import BayesianRidge
 from sklearn.tree import DecisionTreeRegressor
 from sklearn.ensemble import ExtraTreesRegressor
 from sklearn.neighbors import KNeighborsRegressor
@@ -54,12 +60,13 @@
 n_features = X_full.shape[1]
 
 # Estimate the score on the entire dataset, with no missing values
-mses = np.zeros((7, N_SPLITS))
 br_estimator = BayesianRidge()
-mses[0, :] = cross_val_score(br_estimator, X_full, y_full,
-                             scoring='neg_mean_squared_error',
-                             cv=N_SPLITS)
-
+score_full_data = pd.DataFrame(
+    cross_val_score(
+        br_estimator, X_full, y_full, scoring='neg_mean_squared_error',
+        cv=N_SPLITS
+    )
+)
 
 # Add a single missing value to each row
 X_missing = X_full.copy()
@@ -69,58 +76,48 @@
 X_missing[missing_samples, missing_features] = np.nan
 
 # Estimate the score after imputation (mean strategy) of the missing values
-for i, strategy in enumerate(['mean', 'median']):
+score_simple_imputer = pd.DataFrame()
+for strategy in ('mean', 'median'):
     estimator = make_pipeline(
         SimpleImputer(missing_values=np.nan, strategy=strategy),
         br_estimator
     )
-    mses[i + 1, :] = cross_val_score(estimator, X_missing, y_missing,
-                                     scoring='neg_mean_squared_error',
-                                     cv=N_SPLITS)
+    score_simple_imputer[strategy] = cross_val_score(
+        estimator, X_missing, y_missing, scoring='neg_mean_squared_error',
+        cv=N_SPLITS
+    )
 
 # Estimate the score after iterative imputation of the missing values
 # with different predictors
 predictors = [
     BayesianRidge(),
-    DecisionTreeRegressor(random_state=0, max_features='sqrt'),
+    DecisionTreeRegressor(max_features='sqrt', random_state=0), 
KNeighborsRegressor(n_neighbors=15), - ExtraTreesRegressor(n_estimators=10) + ExtraTreesRegressor(n_estimators=10, n_jobs=-1, random_state=0) ] - -for i, predictor in enumerate(predictors): - print(predictor) +score_iterative_imputer = pd.DataFrame() +for predictor in predictors: estimator = make_pipeline( IterativeImputer(random_state=0, predictor=predictor), br_estimator ) - mses[i + 3, :] = cross_val_score(estimator, X_missing, y_missing, - scoring='neg_mean_squared_error', - cv=N_SPLITS) - -# Plot the results -x_labels = ['Full Data', - 'SimpleImputer w/ Mean Strategy', - 'SimpleImputer w/ Median Strategy', - 'IterativeImputer w/ BayesianRidge', - 'IterativeImputer w/ DecisionTreeRegressor', - 'IterativeImputer w/ KNeighborsRegressor', - 'IterativeImputer w/ ExtraTreesRegressor'] + score_iterative_imputer[predictor.__class__.__name__] = \ + cross_val_score( + estimator, X_missing, y_missing, scoring='neg_mean_squared_error', + cv=N_SPLITS + ) + +scores = pd.concat( + [score_full_data, score_simple_imputer, score_iterative_imputer], + keys=['Original', 'SimpleImputer', 'IterativeImputer'], axis=1 +) # plot boston results fig, ax = plt.subplots(figsize=(13, 6)) -for i, j in enumerate(np.arange(mses.shape[0])): - ax.barh( - j, - -np.mean(mses[j, :]), - xerr=np.std(mses[j, :]), - alpha=0.6, - align='center' - ) - +means = -scores.mean() +errors = scores.std() +means.plot.barh(xerr=errors, ax=ax) ax.set_title('California Housing Regression with Different Imputation Methods') ax.set_xlabel('MSE (smaller is better)') -ax.set_yticks(np.arange(mses.shape[0])) -ax.invert_yaxis() -ax.set_yticklabels(x_labels) plt.tight_layout(pad=1) plt.show() From c8dccb438d9dda9ee67e22af652495a3cf36a962 Mon Sep 17 00:00:00 2001 From: sergeyf Date: Thu, 24 Jan 2019 08:52:31 -0800 Subject: [PATCH 26/33] line lengths for impute.rst --- doc/modules/impute.rst | 60 +++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 79c1d84d0df4e..1db20e9c6dcdb 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -9,19 +9,19 @@ Imputation of missing values For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. Such datasets however are incompatible with scikit-learn estimators which assume that all values in an -array are numerical, and that all have and hold meaning. A basic strategy to use -incomplete datasets is to discard entire rows and/or columns containing missing -values. However, this comes at the price of losing data which may be valuable -(even though incomplete). A better strategy is to impute the missing values, -i.e., to infer them from the known part of the data. See the :ref:`glossary` -entry on imputation. +array are numerical, and that all have and hold meaning. A basic strategy to +use incomplete datasets is to discard entire rows and/or columns containing +missing values. However, this comes at the price of losing data which may be +valuable (even though incomplete). A better strategy is to impute the missing +values, i.e., to infer them from the known part of the data. See the +:ref:`glossary` entry on imputation. Univariate vs. 
Multivariate Imputation ====================================== -One type of imputation algorithm is univariate, which imputes values in the i-th -feature dimension using only non-missing values in that feature dimension +One type of imputation algorithm is univariate, which imputes values in the +i-th feature dimension using only non-missing values in that feature dimension (e.g. :class:`impute.SimpleImputer`). By contrast, multivariate imputation algorithms use the entire set of available feature dimensions to estimate the missing values (e.g. :class:`impute.IterativeImputer`). @@ -66,9 +66,9 @@ The :class:`SimpleImputer` class also supports sparse matrices:: [6. 3.] [7. 6.]] -Note that this format is not meant to be used to implicitly store missing values -in the matrix because it would densify it at transform time. Missing values encoded -by 0 must be used with dense input. +Note that this format is not meant to be used to implicitly store missing +values in the matrix because it would densify it at transform time. Missing +values encoded by 0 must be used with dense input. The :class:`SimpleImputer` class also supports categorical data represented as string values or pandas categoricals when using the ``'most_frequent'`` or @@ -118,8 +118,8 @@ round are returned. [ 6. 12.] [ 3. 6.]] -Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a Pipeline -as a way to build a composite estimator that supports imputation. +Both :class:`SimpleImputer` and :class:`IterativeImputer` can be used in a +Pipeline as a way to build a composite estimator that supports imputation. See :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`. Flexibility of IterativeImputer @@ -128,9 +128,9 @@ Flexibility of IterativeImputer There are many well-established imputation packages in the R data science ecosystem: Amelia, mi, mice, missForest, etc. missForest is popular, and turns out to be a particular instance of different sequential imputation algorithms -that can all be implemented with :class:`IterativeImputer` by passing in different -regressors to be used for predicting missing feature values. In the case of missForest, -this regressor is a Random Forest. +that can all be implemented with :class:`IterativeImputer` by passing in +different regressors to be used for predicting missing feature values. In the +case of missForest, this regressor is a Random Forest. See :ref:`sphx_glr_auto_examples_plot_iterative_imputer_variants_comparison.py`. @@ -139,14 +139,14 @@ See :ref:`sphx_glr_auto_examples_plot_iterative_imputer_variants_comparison.py`. Multiple vs. Single Imputation ------------------------------ -In the statistics community, it is common practice to perform multiple imputations, -generating, for example, ``m`` separate imputations for a single feature matrix. -Each of these ``m`` imputations is then put through the subsequent analysis pipeline -(e.g. feature engineering, clustering, regression, classification). The ``m`` final -analysis results (e.g. held-out validation errors) allow the data scientist -to obtain understanding of how analytic results may differ as a consequence -of the inherent uncertainty caused by the missing values. The above practice -is called multiple imputation. +In the statistics community, it is common practice to perform multiple +imputations, generating, for example, ``m`` separate imputations for a single +feature matrix. Each of these ``m`` imputations is then put through the +subsequent analysis pipeline (e.g. 
feature engineering, clustering, regression, +classification). The ``m`` final analysis results (e.g. held-out validation +errors) allow the data scientist to obtain understanding of how analytic +results may differ as a consequence of the inherent uncertainty caused by the +missing values. The above practice is called multiple imputation. Our implementation of :class:`IterativeImputer` was inspired by the R MICE package (Multivariate Imputation by Chained Equations) [1]_, but differs from @@ -156,13 +156,13 @@ it repeatedly to the same dataset with different random seeds when ``sample_posterior=True``. See [2]_, chapter 4 for more discussion on multiple vs. single imputations. -It is still an open problem as to how useful single vs. multiple imputation is in -the context of prediction and classification when the user is not interested in -measuring uncertainty due to missing values. +It is still an open problem as to how useful single vs. multiple imputation is +in the context of prediction and classification when the user is not +interested in measuring uncertainty due to missing values. -Note that a call to the ``transform`` method of :class:`IterativeImputer` is not -allowed to change the number of samples. Therefore multiple imputations cannot be -achieved by a single call to ``transform``. +Note that a call to the ``transform`` method of :class:`IterativeImputer` is +not allowed to change the number of samples. Therefore multiple imputations +cannot be achieved by a single call to ``transform``. References ========== From a102cec870f774c91c7f1da9fcbb805528d476b9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 24 Jan 2019 08:56:27 -0800 Subject: [PATCH 27/33] Update examples/impute/plot_iterative_imputer_variants_comparison.py Co-Authored-By: sergeyf --- examples/impute/plot_iterative_imputer_variants_comparison.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 919edbe25d371..7fe97b0c5fb50 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -43,7 +43,8 @@ import pandas as pd from sklearn.datasets import fetch_california_housing -from sklearn.impute import SimpleImputer, IterativeImputer +from sklearn.impute import SimpleImputer +from sklearn.impute import IterativeImputer from sklearn.linear_model import BayesianRidge from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import ExtraTreesRegressor From 57b83d684e4f60999798bd6ded110d0bf769eed3 Mon Sep 17 00:00:00 2001 From: sergeyf Date: Thu, 24 Jan 2019 09:07:47 -0800 Subject: [PATCH 28/33] fixing y-axis labels --- .../plot_iterative_imputer_variants_comparison.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 919edbe25d371..4fd44ecf36c4f 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -112,6 +112,14 @@ keys=['Original', 'SimpleImputer', 'IterativeImputer'], axis=1 ) +labels = ['Full Data', + 'SimpleImputer w/ Mean Strategy', + 'SimpleImputer w/ Median Strategy', + 'IterativeImputer w/ BayesianRidge', + 'IterativeImputer w/ DecisionTreeRegressor', + 'IterativeImputer w/ KNeighborsRegressor', + 'IterativeImputer w/ 
ExtraTreesRegressor'] + # plot boston results fig, ax = plt.subplots(figsize=(13, 6)) means = -scores.mean() @@ -119,5 +127,8 @@ means.plot.barh(xerr=errors, ax=ax) ax.set_title('California Housing Regression with Different Imputation Methods') ax.set_xlabel('MSE (smaller is better)') +ax.set_yticks(np.arange(means.shape[0])) +ax.invert_yaxis() +ax.set_yticklabels(labels) plt.tight_layout(pad=1) plt.show() From a86b5351300df3603a159a1bdd5176d22d38fe76 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 24 Jan 2019 09:10:22 -0800 Subject: [PATCH 29/33] Update examples/impute/plot_iterative_imputer_variants_comparison.py Co-Authored-By: sergeyf --- examples/impute/plot_iterative_imputer_variants_comparison.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 230dd290daf01..2a73e1813449a 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -33,7 +33,7 @@ dataset with a single value randomly removed from each row. For this particular pattern of missing values we see that -:class:`sklearn.ensemble.ExtraTreesRegressor`` and +:class:`sklearn.ensemble.ExtraTreesRegressor` and :class:`sklearn.linear_model.BayesianRidge` give the best results. """ print(__doc__) From c0b743914db6938862cc34c320759abdc6ce4b75 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 24 Jan 2019 09:10:32 -0800 Subject: [PATCH 30/33] Update examples/impute/plot_iterative_imputer_variants_comparison.py Co-Authored-By: sergeyf --- examples/impute/plot_iterative_imputer_variants_comparison.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 2a73e1813449a..6f5e4f11c83a8 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -76,7 +76,7 @@ missing_features = rng.choice(n_features, n_samples, replace=True) X_missing[missing_samples, missing_features] = np.nan -# Estimate the score after imputation (mean strategy) of the missing values +# Estimate the score after imputation (mean and median strategies) of the missing values score_simple_imputer = pd.DataFrame() for strategy in ('mean', 'median'): estimator = make_pipeline( From 332a01d2b9f661e2bd15da9fb7935138a744c3e1 Mon Sep 17 00:00:00 2001 From: sergeyf Date: Thu, 24 Jan 2019 09:38:48 -0800 Subject: [PATCH 31/33] minor change --- examples/impute/plot_iterative_imputer_variants_comparison.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 230dd290daf01..f6b2c577d71fe 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -57,8 +57,7 @@ rng = np.random.RandomState(0) X_full, y_full = fetch_california_housing(return_X_y=True) -n_samples = X_full.shape[0] -n_features = X_full.shape[1] +n_samples, n_features = X_full.shape # Estimate the score on the entire dataset, with no missing values br_estimator = BayesianRidge() From 46e21dc22c7cda5018928b773bcf89c5229e1cee Mon Sep 17 00:00:00 2001 From: sergeyf Date: Thu, 24 Jan 2019 09:45:33 -0800 Subject: [PATCH 
32/33] addressing reviewer comments

---
 ...plot_iterative_imputer_variants_comparison.py | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py
index e06f0b9b7be91..8d218ea82a006 100644
--- a/examples/impute/plot_iterative_imputer_variants_comparison.py
+++ b/examples/impute/plot_iterative_imputer_variants_comparison.py
@@ -65,7 +65,8 @@
     cross_val_score(
         br_estimator, X_full, y_full, scoring='neg_mean_squared_error',
         cv=N_SPLITS
-    )
+    ),
+    columns=['Full Data']
 )
 
 # Add a single missing value to each row
@@ -75,7 +76,7 @@
 missing_features = rng.choice(n_features, n_samples, replace=True)
 X_missing[missing_samples, missing_features] = np.nan
 
-# Estimate the score after imputation (mean and median strategies) of the missing values
+# Estimate the score after imputation (mean and median strategies)
 score_simple_imputer = pd.DataFrame()
 for strategy in ('mean', 'median'):
     estimator = make_pipeline(
@@ -112,14 +113,6 @@
     keys=['Original', 'SimpleImputer', 'IterativeImputer'], axis=1
 )
 
-labels = ['Full Data',
-          'SimpleImputer w/ Mean Strategy',
-          'SimpleImputer w/ Median Strategy',
-          'IterativeImputer w/ BayesianRidge',
-          'IterativeImputer w/ DecisionTreeRegressor',
-          'IterativeImputer w/ KNeighborsRegressor',
-          'IterativeImputer w/ ExtraTreesRegressor']
-
 # plot boston results
 fig, ax = plt.subplots(figsize=(13, 6))
 means = -scores.mean()
@@ -128,7 +121,6 @@
 ax.set_title('California Housing Regression with Different Imputation Methods')
 ax.set_xlabel('MSE (smaller is better)')
 ax.set_yticks(np.arange(means.shape[0]))
-ax.invert_yaxis()
-ax.set_yticklabels(labels)
+ax.set_yticklabels([" w/ ".join(label) for label in means.index.get_values()])
 plt.tight_layout(pad=1)
 plt.show()

From ddab109e1b1da313889ea34feebb28698c321ed3 Mon Sep 17 00:00:00 2001
From: sergeyf 
Date: Thu, 24 Jan 2019 18:04:08 -0800
Subject: [PATCH 33/33] reordering predictors

---
 .../plot_iterative_imputer_variants_comparison.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py
index 8d218ea82a006..a850deb273f24 100644
--- a/examples/impute/plot_iterative_imputer_variants_comparison.py
+++ b/examples/impute/plot_iterative_imputer_variants_comparison.py
@@ -10,11 +10,11 @@
 In this example we compare some predictors for the purpose of missing feature
 imputation with :class:`sklearn.impute.IterativeImputer`::
 
-    :class:`sklearn.linear_model.BayesianRidge`: regularized linear regression
-    :class:`sklearn.tree.DecisionTreeRegressor`: non-linear regression
-    :class:`sklearn.neighbors.KNeighborsRegressor`: comparable to other KNN
+    :class:`~sklearn.linear_model.BayesianRidge`: regularized linear regression
+    :class:`~sklearn.tree.DecisionTreeRegressor`: non-linear regression
+    :class:`~sklearn.ensemble.ExtraTreesRegressor`: similar to missForest in R
+    :class:`~sklearn.neighbors.KNeighborsRegressor`: comparable to other KNN
     imputation approaches
-    :class:`sklearn.ensemble.ExtraTreesRegressor`: similar to missForest in R
 
 Of particular interest is the ability of
 :class:`sklearn.impute.IterativeImputer` to mimic the behavior of missForest, a
@@ -93,8 +93,8 @@
 predictors = [
     BayesianRidge(),
    DecisionTreeRegressor(max_features='sqrt', random_state=0),
-    KNeighborsRegressor(n_neighbors=15),
-    ExtraTreesRegressor(n_estimators=10, n_jobs=-1, random_state=0)
+    ExtraTreesRegressor(n_estimators=10, n_jobs=-1, 
random_state=0) + ExtraTreesRegressor(n_estimators=10, n_jobs=-1, random_state=0), + KNeighborsRegressor(n_neighbors=15) ] score_iterative_imputer = pd.DataFrame() for predictor in predictors: