From 4458efa8c740d95d04369ddea789a2d73821dd60 Mon Sep 17 00:00:00 2001 From: maikia Date: Fri, 21 Feb 2020 11:19:43 +0100 Subject: [PATCH 01/62] first few comments --- examples/impute/plot_missing_values.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 5186cf0ba3bac..b924452084e69 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -23,6 +23,9 @@ """ print(__doc__) +# Authors: Maria Telenczuk + + import numpy as np import matplotlib.pyplot as plt @@ -113,6 +116,9 @@ def get_results(dataset): mses_boston = results_boston[:, 0] * -1 stds_boston = results_boston[:, 1] +# TODO: load Ames instead + + n_bars = len(mses_diabetes) xval = np.arange(n_bars) @@ -144,7 +150,9 @@ def get_results(dataset): ax2.barh(j, mses_boston[j], xerr=stds_boston[j], color=colors[j], alpha=0.6, align='center') -ax2.set_title('Imputation Techniques with Boston Data') +# plot Ames results + +ax2.set_title('Imputation Techniques with Ames Data') ax2.set_yticks(xval) ax2.set_xlabel('MSE') ax2.invert_yaxis() From 5d3c4e59ee6f6810cd3aaf6a3de9c872f8ec8e32 Mon Sep 17 00:00:00 2001 From: maikia Date: Fri, 21 Feb 2020 11:43:41 +0100 Subject: [PATCH 02/62] added new california dataset --- examples/impute/plot_missing_values.py | 31 ++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index b924452084e69..1ffe162a36e4f 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -24,7 +24,7 @@ print(__doc__) # Authors: Maria Telenczuk - +# License: BSD 3 clause import numpy as np import matplotlib.pyplot as plt @@ -44,6 +44,10 @@ N_SPLITS = 5 REGRESSOR = RandomForestRegressor(random_state=0) +############################################################################### +# 
+############################################################################### +# def get_scores_for_imputer(imputer, X_missing, y_missing): estimator = make_pipeline( @@ -54,9 +58,21 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): cv=N_SPLITS) return impute_scores +############################################################################### +# +############################################################################### +# def get_results(dataset): X_full, y_full = dataset.data, dataset.target + + # As California dataset is quite large [(20640, 8)] to speed up the + # calculations we will limit number of entries of both datasets to 440, + # however feel free to use the whole datasets + + X_full = X_full[:440] + y_full = y_full[:440] + n_samples = X_full.shape[0] n_features = X_full.shape[1] @@ -108,6 +124,12 @@ def get_results(dataset): (iterative_impute_scores.mean(), iterative_impute_scores.std())) +############################################################################### +# Download the data +############################################################################### +# +from sklearn.datasets import fetch_california_housing + results_diabetes = np.array(get_results(load_diabetes())) mses_diabetes = results_diabetes[:, 0] * -1 stds_diabetes = results_diabetes[:, 1] @@ -116,8 +138,13 @@ def get_results(dataset): mses_boston = results_boston[:, 0] * -1 stds_boston = results_boston[:, 1] -# TODO: load Ames instead +from sklearn.datasets import fetch_openml + +import pdb; pdb.set_trace() +results_california = np.array(get_results(fetch_california_housing())) +mses_california = results_boston[:, 0] * -1 +stds_california = results_boston[:, 1] n_bars = len(mses_diabetes) xval = np.arange(n_bars) From f51b73c03b33ac8a773a1a241582bf9eb366e388 Mon Sep 17 00:00:00 2001 From: maikia Date: Fri, 21 Feb 2020 12:46:00 +0100 Subject: [PATCH 03/62] removed boston dataset from the file --- examples/impute/plot_missing_values.py | 19 
+++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 1ffe162a36e4f..1e9f51ed79c19 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -32,7 +32,6 @@ # To use the experimental IterativeImputer, we need to explicitly ask for it: from sklearn.experimental import enable_iterative_imputer # noqa from sklearn.datasets import load_diabetes -from sklearn.datasets import load_boston from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import make_pipeline, make_union from sklearn.impute import ( @@ -134,17 +133,9 @@ def get_results(dataset): mses_diabetes = results_diabetes[:, 0] * -1 stds_diabetes = results_diabetes[:, 1] -results_boston = np.array(get_results(load_boston())) -mses_boston = results_boston[:, 0] * -1 -stds_boston = results_boston[:, 1] - -from sklearn.datasets import fetch_openml - -import pdb; pdb.set_trace() - results_california = np.array(get_results(fetch_california_housing())) -mses_california = results_boston[:, 0] * -1 -stds_california = results_boston[:, 1] +mses_california = results_california[:, 0] * -1 +stds_california = results_california[:, 1] n_bars = len(mses_diabetes) xval = np.arange(n_bars) @@ -171,15 +162,15 @@ def get_results(dataset): ax1.invert_yaxis() ax1.set_yticklabels(x_labels) -# plot boston results +# plot california results ax2 = plt.subplot(122) for j in xval: - ax2.barh(j, mses_boston[j], xerr=stds_boston[j], + ax2.barh(j, mses_california[j], xerr=stds_california[j], color=colors[j], alpha=0.6, align='center') # plot Ames results -ax2.set_title('Imputation Techniques with Ames Data') +ax2.set_title('Imputation Techniques with California Data') ax2.set_yticks(xval) ax2.set_xlabel('MSE') ax2.invert_yaxis() From 9bf0860340f852ac01adb53913f8b0f801647426 Mon Sep 17 00:00:00 2001 From: maikia Date: Fri, 21 Feb 2020 13:15:47 +0100 Subject: [PATCH 04/62] 
updating the DOCs --- examples/impute/plot_missing_values.py | 60 ++++++++++++++++---------- 1 file changed, 38 insertions(+), 22 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 1e9f51ed79c19..595547f8c0bb3 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -5,40 +5,24 @@ Missing values can be replaced by the mean, the median or the most frequent value using the basic :class:`sklearn.impute.SimpleImputer`. -The median is a more robust estimator for data with high magnitude variables -which could dominate results (otherwise known as a 'long tail'). -With ``KNNImputer``, missing values can be imputed using the weighted -or unweighted mean of the desired number of nearest neighbors. +In this example we will investigate different imputation techniques on two +datasets: Diabetes dataset which is the set of parameteres collected from the +diabetes patients and California Housing dataset for which the target is the +median house value for California districts. -Another option is the :class:`sklearn.impute.IterativeImputer`. This uses -round-robin linear regression, treating every variable as an output in -turn. The version implemented assumes Gaussian (output) variables. If your -features are obviously non-Normal, consider transforming them to look more -Normal so as to potentially improve performance. - -In addition of using an imputing method, we can also keep an indication of the -missing information using :func:`sklearn.impute.MissingIndicator` which might -carry some information. 
""" print(__doc__) # Authors: Maria Telenczuk # License: BSD 3 clause + import numpy as np -import matplotlib.pyplot as plt -# To use the experimental IterativeImputer, we need to explicitly ask for it: -from sklearn.experimental import enable_iterative_imputer # noqa -from sklearn.datasets import load_diabetes +rng = np.random.RandomState(0) from sklearn.ensemble import RandomForestRegressor -from sklearn.pipeline import make_pipeline, make_union -from sklearn.impute import ( - SimpleImputer, KNNImputer, IterativeImputer, MissingIndicator) -from sklearn.model_selection import cross_val_score -rng = np.random.RandomState(0) N_SPLITS = 5 REGRESSOR = RandomForestRegressor(random_state=0) @@ -48,6 +32,12 @@ ############################################################################### # +from sklearn.impute import MissingIndicator +from sklearn.model_selection import cross_val_score +from sklearn.pipeline import make_pipeline, make_union + + + def get_scores_for_imputer(imputer, X_missing, y_missing): estimator = make_pipeline( make_union(imputer, MissingIndicator(missing_values=0)), @@ -60,8 +50,34 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): ############################################################################### # ############################################################################### +#The median is a more robust estimator for data with high magnitude variables +#which could dominate results (otherwise known as a 'long tail'). +# +#With ``KNNImputer``, missing values can be imputed using the weighted +#or unweighted mean of the desired number of nearest neighbors. +# +# Another option is the :class:`sklearn.impute.IterativeImputer`. This uses +# round-robin linear regression, treating every variable as an output in +# turn. The version implemented assumes Gaussian (output) variables. If your +# features are obviously non-Normal, consider transforming them to look more +# Normal so as to potentially improve performance. 
+# +# In addition of using an imputing method, we can also keep an indication of the +# missing information using :func:`sklearn.impute.MissingIndicator` which might +#carry some information. # +import matplotlib.pyplot as plt + +# To use the experimental IterativeImputer, we need to explicitly ask for it: +from sklearn.experimental import enable_iterative_imputer # noqa +from sklearn.datasets import load_diabetes + + +from sklearn.impute import ( + SimpleImputer, KNNImputer, IterativeImputer) + + def get_results(dataset): X_full, y_full = dataset.data, dataset.target From d22619a7ee5a0a1d7e5806a8f28e6f8f5a5933bf Mon Sep 17 00:00:00 2001 From: maikia Date: Fri, 21 Feb 2020 13:44:23 +0100 Subject: [PATCH 05/62] adding a DOC for calculating the error --- examples/impute/plot_missing_values.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 595547f8c0bb3..97d7cd698a698 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -8,8 +8,13 @@ In this example we will investigate different imputation techniques on two datasets: Diabetes dataset which is the set of parameteres collected from the -diabetes patients and California Housing dataset for which the target is the -median house value for California districts. +diabetes patients with aim to predict disease progression and California +Housing dataset for which the target is the median house value for California +districts. + +Neither of those datasets has missing values. We will remove some of the values +and compare how will the results change if we use original data and the data +with imputed missing data by means of different techniques. 
""" print(__doc__) @@ -17,7 +22,6 @@ # Authors: Maria Telenczuk # License: BSD 3 clause - import numpy as np rng = np.random.RandomState(0) @@ -28,9 +32,11 @@ REGRESSOR = RandomForestRegressor(random_state=0) ############################################################################### -# +# Calculate the error ############################################################################### # +# We are going to calculate the score for the imputers using negative mean +# square error. from sklearn.impute import MissingIndicator from sklearn.model_selection import cross_val_score From 7b7efc2db5cdc77912632d0a6508b70bf96efa5f Mon Sep 17 00:00:00 2001 From: maikia Date: Fri, 21 Feb 2020 15:36:50 +0100 Subject: [PATCH 06/62] exchanged the order started writing functions on scoring the imputers --- examples/impute/plot_missing_values.py | 218 +++++++++++++++---------- 1 file changed, 135 insertions(+), 83 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 97d7cd698a698..73a9be0a87f67 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -12,9 +12,9 @@ Housing dataset for which the target is the median house value for California districts. -Neither of those datasets has missing values. We will remove some of the values -and compare how will the results change if we use original data and the data -with imputed missing data by means of different techniques. +Neither of those datasets has missing values. We will remove some of the +values and compare the results of RandomForestRegressor TODO: add link on the +full data and the data with the missing values imputed by different techniques. 
""" print(__doc__) @@ -22,27 +22,73 @@ # Authors: Maria Telenczuk # License: BSD 3 clause +############################################################################### +# Download the data and make missing values sets +############################################################################### +# +# First we are downloading the two datasets. Diabets dataset is shipped with +# scikit-learn. It has 442 entries, each with 10 features. California Housing +# dataset is much larger with 20640 entires and 8 features and we will need to +# fetch it using fetch_california_housing TODO:link function. We will only use +# the first 500 entries here for sake of speeding up the calculations but feel +# free to use the whole dataset. +# + import numpy as np -rng = np.random.RandomState(0) -from sklearn.ensemble import RandomForestRegressor +from sklearn.datasets import fetch_california_housing +from sklearn.datasets import load_diabetes -N_SPLITS = 5 -REGRESSOR = RandomForestRegressor(random_state=0) +X_diabetes, y_diabetes = load_diabetes(return_X_y=True) +X_california, y_california = fetch_california_housing(return_X_y=True) +X_california = X_california[:500] +y_california = y_california[:500] + +def add_missing_values(X_full, y_full): + n_samples = X_full.shape[0] + n_features = X_full.shape[1] + + # Add missing values in 75% of the lines + missing_rate = 0.75 + n_missing_samples = int(np.floor(n_samples * missing_rate)) + missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples, + dtype=np.bool), + np.ones(n_missing_samples, + dtype=np.bool))) + rng.shuffle(missing_samples) + missing_features = rng.randint(0, n_features, n_missing_samples) + X_missing = X_full.copy() + X_missing[np.where(missing_samples)[0], missing_features] = 0 + y_missing = y_full.copy() + + return X_missing, y_missing + +X_miss_california, y_miss_california = add_missing_values( + X_california, y_california) + +X_miss_diabetes, y_miss_diabetes = add_missing_values( + X_diabetes, 
y_diabetes) + ############################################################################### -# Calculate the error +# Impute the missing data and score ############################################################################### +# Now we will write a function which will impute the data given type of +# imputer, perform RandomForestRegresssor TODO: link on it and calculate the +# negative mean squared error # -# We are going to calculate the score for the imputers using negative mean -# square error. +rng = np.random.RandomState(0) + +from sklearn.ensemble import RandomForestRegressor from sklearn.impute import MissingIndicator from sklearn.model_selection import cross_val_score from sklearn.pipeline import make_pipeline, make_union +N_SPLITS = 5 +REGRESSOR = RandomForestRegressor(random_state=0) def get_scores_for_imputer(imputer, X_missing, y_missing): estimator = make_pipeline( @@ -53,103 +99,105 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): cv=N_SPLITS) return impute_scores -############################################################################### -# -############################################################################### -#The median is a more robust estimator for data with high magnitude variables -#which could dominate results (otherwise known as a 'long tail'). 
+x_labels = ['Full data', + 'Zero imputation', + 'Mean Imputation', + 'KNN Imputation', + 'Iterative Imputation'] + +mses_california = np.zeros(5) +stds_california = np.zeros(5) +mses_diabetes = np.zeros(5) +stds_diabetes = np.zeros(5) + +# Let's get a score for performing RandomForestRegresssor on a full data +# Estimate the score on the entire dataset, with no missing values + +def get_full_score(X_full, y_full) + full_scores = cross_val_score(REGRESSOR, X_full, y_full, + scoring='neg_mean_squared_error', + cv=N_SPLITS) + return full_scores.mean(), full_scores.std() + +mses_california[0], stds_california[0] = get_full_score( + X_miss_california, y_miss_california) +mses_diabetes[0], stds_diabetes[0] = get_full_score( + X_miss_diabetes, y_miss_diabetes) + +# The median is a more robust estimator for data with high magnitude variables +# which could dominate results (otherwise known as a 'long tail'). # -#With ``KNNImputer``, missing values can be imputed using the weighted -#or unweighted mean of the desired number of nearest neighbors. + + +def get_impute_zero_score(X_missing, y_missing): + # Estimate the score after replacing missing values by 0 + imputer = SimpleImputer(missing_values=0, + strategy='constant', + fill_value=0) + zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + +mses_california[1], stds_california[1] = get_impute_zero_score(X_missing, + y_missing) +mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(X_missing, + y_missing) + +# With ``KNNImputer``, missing values can be imputed using the weighted +# or unweighted mean of the desired number of nearest neighbors. # +# Estimate the score after kNN-imputation of the missing values + + +imputer = KNNImputer(missing_values=0) +knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + + # Another option is the :class:`sklearn.impute.IterativeImputer`. This uses # round-robin linear regression, treating every variable as an output in # turn. 
The version implemented assumes Gaussian (output) variables. If your # features are obviously non-Normal, consider transforming them to look more # Normal so as to potentially improve performance. # +# Estimate the score after imputation (mean strategy) of the missing values +imputer = SimpleImputer(missing_values=0, strategy="mean") +mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + + # In addition of using an imputing method, we can also keep an indication of the # missing information using :func:`sklearn.impute.MissingIndicator` which might #carry some information. # - -import matplotlib.pyplot as plt +# Estimate the score after iterative imputation of the missing values +imputer = IterativeImputer(missing_values=0, + random_state=0, + n_nearest_features=5, + sample_posterior=True) +iterative_impute_scores = get_scores_for_imputer(imputer, + X_missing, + y_missing) # To use the experimental IterativeImputer, we need to explicitly ask for it: from sklearn.experimental import enable_iterative_imputer # noqa -from sklearn.datasets import load_diabetes - - from sklearn.impute import ( SimpleImputer, KNNImputer, IterativeImputer) -def get_results(dataset): - X_full, y_full = dataset.data, dataset.target +''' - # As California dataset is quite large [(20640, 8)] to speed up the - # calculations we will limit number of entries of both datasets to 440, - # however feel free to use the whole datasets + return ((full_scores.mean(), full_scores.std()), + (zero_impute_scores.mean(), zero_impute_scores.std()), + (mean_impute_scores.mean(), mean_impute_scores.std()), + (knn_impute_scores.mean(), knn_impute_scores.std()), + (iterative_impute_scores.mean(), iterative_impute_scores.std())) +''' - X_full = X_full[:440] - y_full = y_full[:440] - n_samples = X_full.shape[0] - n_features = X_full.shape[1] - # Estimate the score on the entire dataset, with no missing values - full_scores = cross_val_score(REGRESSOR, X_full, y_full, - 
scoring='neg_mean_squared_error', - cv=N_SPLITS) - # Add missing values in 75% of the lines - missing_rate = 0.75 - n_missing_samples = int(np.floor(n_samples * missing_rate)) - missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples, - dtype=np.bool), - np.ones(n_missing_samples, - dtype=np.bool))) - rng.shuffle(missing_samples) - missing_features = rng.randint(0, n_features, n_missing_samples) - X_missing = X_full.copy() - X_missing[np.where(missing_samples)[0], missing_features] = 0 - y_missing = y_full.copy() - # Estimate the score after replacing missing values by 0 - imputer = SimpleImputer(missing_values=0, - strategy='constant', - fill_value=0) - zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) - # Estimate the score after imputation (mean strategy) of the missing values - imputer = SimpleImputer(missing_values=0, strategy="mean") - mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) - # Estimate the score after kNN-imputation of the missing values - imputer = KNNImputer(missing_values=0) - knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) - # Estimate the score after iterative imputation of the missing values - imputer = IterativeImputer(missing_values=0, - random_state=0, - n_nearest_features=5, - sample_posterior=True) - iterative_impute_scores = get_scores_for_imputer(imputer, - X_missing, - y_missing) - return ((full_scores.mean(), full_scores.std()), - (zero_impute_scores.mean(), zero_impute_scores.std()), - (mean_impute_scores.mean(), mean_impute_scores.std()), - (knn_impute_scores.mean(), knn_impute_scores.std()), - (iterative_impute_scores.mean(), iterative_impute_scores.std())) - - -############################################################################### -# Download the data -############################################################################### -# -from sklearn.datasets import fetch_california_housing results_diabetes = 
np.array(get_results(load_diabetes())) mses_diabetes = results_diabetes[:, 0] * -1 @@ -159,16 +207,20 @@ def get_results(dataset): mses_california = results_california[:, 0] * -1 stds_california = results_california[:, 1] + + +############################################################################### +# Plot the results +############################################################################### +# + n_bars = len(mses_diabetes) xval = np.arange(n_bars) -x_labels = ['Full data', - 'Zero imputation', - 'Mean Imputation', - 'KNN Imputation', - 'Iterative Imputation'] + colors = ['r', 'g', 'b', 'orange', 'black'] +import matplotlib.pyplot as plt # plot diabetes results plt.figure(figsize=(12, 6)) ax1 = plt.subplot(121) From 164ed5ac36b5b8f95603838b1d61ea2941daa86a Mon Sep 17 00:00:00 2001 From: maikia Date: Fri, 21 Feb 2020 15:42:25 +0100 Subject: [PATCH 07/62] finished writing functions for imputers --- examples/impute/plot_missing_values.py | 41 +++++++++++++++++++------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 73a9be0a87f67..00d2d61b50494 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -113,7 +113,7 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): # Let's get a score for performing RandomForestRegresssor on a full data # Estimate the score on the entire dataset, with no missing values -def get_full_score(X_full, y_full) +def get_full_score(X_full, y_full): full_scores = cross_val_score(REGRESSOR, X_full, y_full, scoring='neg_mean_squared_error', cv=N_SPLITS) @@ -135,20 +135,27 @@ def get_impute_zero_score(X_missing, y_missing): strategy='constant', fill_value=0) zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + return zero_impute_scores.mean(), zero_impute_scores.std() -mses_california[1], stds_california[1] = get_impute_zero_score(X_missing, - y_missing) 
-mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(X_missing, - y_missing) +mses_california[1], stds_california[1] = get_impute_zero_score(X_miss_california, + y_miss_california) +mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(X_miss_diabetes, + y_miss_diabetes) # With ``KNNImputer``, missing values can be imputed using the weighted # or unweighted mean of the desired number of nearest neighbors. # # Estimate the score after kNN-imputation of the missing values +def get_impute_KNN_score(X_missing, y_missing): + imputer = KNNImputer(missing_values=0) + knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + return knn_impute_scores.mean(), knn_impute_scores.std() -imputer = KNNImputer(missing_values=0) -knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) +mses_california[2], stds_california[2] = get_impute_KNN_score(X_miss_california, + y_miss_california) +mses_diabetes[2], stds_diabetes[2] = get_impute_KNN_score(X_miss_diabetes, + y_miss_diabetes) # Another option is the :class:`sklearn.impute.IterativeImputer`. This uses @@ -158,22 +165,34 @@ def get_impute_zero_score(X_missing, y_missing): # Normal so as to potentially improve performance. 
# # Estimate the score after imputation (mean strategy) of the missing values -imputer = SimpleImputer(missing_values=0, strategy="mean") -mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) +def get_impute_mean(X_missing, y_missing): + imputer = SimpleImputer(missing_values=0, strategy="mean") + mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + return mean_impute_scores.mean(), mean_impute_scores.std() +mses_california[3], stds_california[3] = get_impute_mean(X_miss_california, + y_miss_california) +mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes, + y_miss_diabetes) # In addition of using an imputing method, we can also keep an indication of the # missing information using :func:`sklearn.impute.MissingIndicator` which might #carry some information. # # Estimate the score after iterative imputation of the missing values -imputer = IterativeImputer(missing_values=0, + +def get_impute_iterative(X_missing, y_missing): + imputer = IterativeImputer(missing_values=0, random_state=0, n_nearest_features=5, sample_posterior=True) -iterative_impute_scores = get_scores_for_imputer(imputer, + iterative_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) +mses_california[4], stds_california[4] = get_impute_iterative(X_miss_california, + y_miss_california) +mses_diabetes[4], stds_diabetes[4] = get_impute_iterative(X_miss_diabetes, + y_miss_diabetes) # To use the experimental IterativeImputer, we need to explicitly ask for it: from sklearn.experimental import enable_iterative_imputer # noqa From 00b9067e28fe7c0382064af3ac83a7a93fd879ad Mon Sep 17 00:00:00 2001 From: maikia Date: Fri, 21 Feb 2020 16:09:49 +0100 Subject: [PATCH 08/62] finished writing functions and started on DOcs --- examples/impute/plot_missing_values.py | 43 +++++++++----------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py 
index 00d2d61b50494..93ba157c5bf26 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -40,6 +40,8 @@ from sklearn.datasets import load_diabetes +rng = np.random.RandomState(0) + X_diabetes, y_diabetes = load_diabetes(return_X_y=True) X_california, y_california = fetch_california_housing(return_X_y=True) X_california = X_california[:500] @@ -82,7 +84,11 @@ def add_missing_values(X_full, y_full): rng = np.random.RandomState(0) from sklearn.ensemble import RandomForestRegressor -from sklearn.impute import MissingIndicator + +# To use the experimental IterativeImputer, we need to explicitly ask for it: +from sklearn.experimental import enable_iterative_imputer # noqa +from sklearn.impute import (SimpleImputer, KNNImputer, IterativeImputer, + MissingIndicator) from sklearn.model_selection import cross_val_score from sklearn.pipeline import make_pipeline, make_union @@ -119,10 +125,9 @@ def get_full_score(X_full, y_full): cv=N_SPLITS) return full_scores.mean(), full_scores.std() -mses_california[0], stds_california[0] = get_full_score( - X_miss_california, y_miss_california) -mses_diabetes[0], stds_diabetes[0] = get_full_score( - X_miss_diabetes, y_miss_diabetes) +mses_california[0], stds_california[0] = get_full_score(X_california, + y_california) +mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes) # The median is a more robust estimator for data with high magnitude variables # which could dominate results (otherwise known as a 'long tail'). 
@@ -189,17 +194,12 @@ def get_impute_iterative(X_missing, y_missing): iterative_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + return iterative_impute_scores.mean(), iterative_impute_scores.std() + mses_california[4], stds_california[4] = get_impute_iterative(X_miss_california, y_miss_california) mses_diabetes[4], stds_diabetes[4] = get_impute_iterative(X_miss_diabetes, y_miss_diabetes) - -# To use the experimental IterativeImputer, we need to explicitly ask for it: -from sklearn.experimental import enable_iterative_imputer # noqa -from sklearn.impute import ( - SimpleImputer, KNNImputer, IterativeImputer) - - ''' return ((full_scores.mean(), full_scores.std()), @@ -209,23 +209,8 @@ def get_impute_iterative(X_missing, y_missing): (iterative_impute_scores.mean(), iterative_impute_scores.std())) ''' - - - - - - - - - -results_diabetes = np.array(get_results(load_diabetes())) -mses_diabetes = results_diabetes[:, 0] * -1 -stds_diabetes = results_diabetes[:, 1] - -results_california = np.array(get_results(fetch_california_housing())) -mses_california = results_california[:, 0] * -1 -stds_california = results_california[:, 1] - +mses_diabetes = mses_diabetes * -1 +mses_california = mses_california * -1 ############################################################################### From 5a2177b3e5bf3c334b3db4f7faf24066dbaacd55 Mon Sep 17 00:00:00 2001 From: maikia Date: Fri, 21 Feb 2020 16:25:26 +0100 Subject: [PATCH 09/62] working on the DOCs for imputers --- examples/impute/plot_missing_values.py | 37 +++++++++++++------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 93ba157c5bf26..ba10e4481048c 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -13,7 +13,7 @@ districts. Neither of those datasets has missing values. 
We will remove some of the -values and compare the results of RandomForestRegressor TODO: add link on the +values and compare the results of RandomForestRegressor on the full data and the data with the missing values imputed by different techniques. """ @@ -26,12 +26,11 @@ # Download the data and make missing values sets ############################################################################### # -# First we are downloading the two datasets. Diabets dataset is shipped with +# First we download the two datasets. Diabets dataset is shipped with # scikit-learn. It has 442 entries, each with 10 features. California Housing # dataset is much larger with 20640 entires and 8 features and we will need to -# fetch it using fetch_california_housing TODO:link function. We will only use -# the first 500 entries here for sake of speeding up the calculations but feel -# free to use the whole dataset. +# fetch it. We will only use the first 500 entries here for sake of speeding up +# the calculations but feel free to use the whole dataset. # import numpy as np @@ -76,9 +75,8 @@ def add_missing_values(X_full, y_full): ############################################################################### # Impute the missing data and score ############################################################################### -# Now we will write a function which will impute the data given type of -# imputer, perform RandomForestRegresssor TODO: link on it and calculate the -# negative mean squared error +# Now we will write a function which will score the results on the differently +# prepared data. 
Let's look at each imputer one by on # rng = np.random.RandomState(0) @@ -116,8 +114,8 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): mses_diabetes = np.zeros(5) stds_diabetes = np.zeros(5) -# Let's get a score for performing RandomForestRegresssor on a full data -# Estimate the score on the entire dataset, with no missing values +# First, we will calculate the score on the original data sets +# def get_full_score(X_full, y_full): full_scores = cross_val_score(REGRESSOR, X_full, y_full, @@ -129,21 +127,19 @@ def get_full_score(X_full, y_full): y_california) mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes) -# The median is a more robust estimator for data with high magnitude variables -# which could dominate results (otherwise known as a 'long tail'). +# Now we will estimate the score after replacing missing values by 0 # - def get_impute_zero_score(X_missing, y_missing): - # Estimate the score after replacing missing values by 0 + imputer = SimpleImputer(missing_values=0, strategy='constant', fill_value=0) zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return zero_impute_scores.mean(), zero_impute_scores.std() -mses_california[1], stds_california[1] = get_impute_zero_score(X_miss_california, - y_miss_california) +mses_california[1], stds_california[1] = get_impute_zero_score( + X_miss_california, y_miss_california) mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(X_miss_diabetes, y_miss_diabetes) @@ -157,8 +153,8 @@ def get_impute_KNN_score(X_missing, y_missing): knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return knn_impute_scores.mean(), knn_impute_scores.std() -mses_california[2], stds_california[2] = get_impute_KNN_score(X_miss_california, - y_miss_california) +mses_california[2], stds_california[2] = get_impute_KNN_score( + X_miss_california, y_miss_california) mses_diabetes[2], stds_diabetes[2] = get_impute_KNN_score(X_miss_diabetes, y_miss_diabetes) 
@@ -255,3 +251,8 @@ def get_impute_iterative(X_missing, y_missing): ax2.set_yticklabels([''] * n_bars) plt.show() + +# The median is a more robust estimator for data with high magnitude variables +# which could dominate results (otherwise known as a 'long tail'). +# + From 841efced6a8e590ebd0ad995b31bf939f013d580 Mon Sep 17 00:00:00 2001 From: maikia Date: Fri, 21 Feb 2020 16:30:46 +0100 Subject: [PATCH 10/62] cleaning up --- examples/impute/plot_missing_values.py | 59 +++++++++++++------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index ba10e4481048c..25ee78bf58e80 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -13,8 +13,8 @@ districts. Neither of those datasets has missing values. We will remove some of the -values and compare the results of RandomForestRegressor on the -full data and the data with the missing values imputed by different techniques. +values and compare the results of RandomForestRegressor on the full data and +the data with the missing values imputed by different techniques. """ print(__doc__) @@ -76,7 +76,7 @@ def add_missing_values(X_full, y_full): # Impute the missing data and score ############################################################################### # Now we will write a function which will score the results on the differently -# prepared data. Let's look at each imputer one by on +# prepared data. Let's look at each imputer separately # rng = np.random.RandomState(0) @@ -94,6 +94,11 @@ def add_missing_values(X_full, y_full): N_SPLITS = 5 REGRESSOR = RandomForestRegressor(random_state=0) +# In addition of using an imputing method, we can also keep an indication of +# the missing information using :func:`sklearn.impute.MissingIndicator` which +# might carry some information. 
+# + def get_scores_for_imputer(imputer, X_missing, y_missing): estimator = make_pipeline( make_union(imputer, MissingIndicator(missing_values=0)), @@ -114,7 +119,7 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): mses_diabetes = np.zeros(5) stds_diabetes = np.zeros(5) -# First, we will calculate the score on the original data sets +# Now, we will calculate the score on the original data sets # def get_full_score(X_full, y_full): @@ -127,7 +132,7 @@ def get_full_score(X_full, y_full): y_california) mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes) -# Now we will estimate the score after replacing missing values by 0 +# Next, we will estimate the score after replacing missing values by 0 # def get_impute_zero_score(X_missing, y_missing): @@ -143,10 +148,11 @@ def get_impute_zero_score(X_missing, y_missing): mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(X_miss_diabetes, y_miss_diabetes) +# Estimate the score after kNN-imputation of the missing values +# # With ``KNNImputer``, missing values can be imputed using the weighted # or unweighted mean of the desired number of nearest neighbors. # -# Estimate the score after kNN-imputation of the missing values def get_impute_KNN_score(X_missing, y_missing): imputer = KNNImputer(missing_values=0) @@ -158,14 +164,8 @@ def get_impute_KNN_score(X_missing, y_missing): mses_diabetes[2], stds_diabetes[2] = get_impute_KNN_score(X_miss_diabetes, y_miss_diabetes) - -# Another option is the :class:`sklearn.impute.IterativeImputer`. This uses -# round-robin linear regression, treating every variable as an output in -# turn. The version implemented assumes Gaussian (output) variables. If your -# features are obviously non-Normal, consider transforming them to look more -# Normal so as to potentially improve performance. 
-# # Estimate the score after imputation (mean strategy) of the missing values +# def get_impute_mean(X_missing, y_missing): imputer = SimpleImputer(missing_values=0, strategy="mean") @@ -176,11 +176,16 @@ def get_impute_mean(X_missing, y_missing): y_miss_california) mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes, y_miss_diabetes) -# In addition of using an imputing method, we can also keep an indication of the -# missing information using :func:`sklearn.impute.MissingIndicator` which might -#carry some information. -# + # Estimate the score after iterative imputation of the missing values +# +# Another option is the :class:`sklearn.impute.IterativeImputer`. This uses +# round-robin linear regression, treating every variable as an output in +# turn. +# The version implemented assumes Gaussian (output) variables. If your +# features are obviously non-Normal, consider transforming them to look more +# Normal so as to potentially improve performance. +# def get_impute_iterative(X_missing, y_missing): imputer = IterativeImputer(missing_values=0, @@ -196,31 +201,24 @@ def get_impute_iterative(X_missing, y_missing): y_miss_california) mses_diabetes[4], stds_diabetes[4] = get_impute_iterative(X_miss_diabetes, y_miss_diabetes) -''' - - return ((full_scores.mean(), full_scores.std()), - (zero_impute_scores.mean(), zero_impute_scores.std()), - (mean_impute_scores.mean(), mean_impute_scores.std()), - (knn_impute_scores.mean(), knn_impute_scores.std()), - (iterative_impute_scores.mean(), iterative_impute_scores.std())) -''' mses_diabetes = mses_diabetes * -1 mses_california = mses_california * -1 - ############################################################################### # Plot the results ############################################################################### # +# Finally we are going to visualize the score + +import matplotlib.pyplot as plt + n_bars = len(mses_diabetes) xval = np.arange(n_bars) - colors = ['r', 'g', 'b', 'orange', 'black'] 
-import matplotlib.pyplot as plt # plot diabetes results plt.figure(figsize=(12, 6)) ax1 = plt.subplot(121) @@ -252,7 +250,8 @@ def get_impute_iterative(X_missing, y_missing): plt.show() -# The median is a more robust estimator for data with high magnitude variables -# which could dominate results (otherwise known as a 'long tail'). +# You can also try different techniques. For instance, the median is a more +# robust estimator for data with high magnitude variables which could dominate +# results (otherwise known as a 'long tail'). # From f2248b8ffc6c7d9932abdd66007e84eac335a3e7 Mon Sep 17 00:00:00 2001 From: maikia Date: Fri, 21 Feb 2020 16:34:58 +0100 Subject: [PATCH 11/62] flake8 --- examples/impute/plot_missing_values.py | 32 ++++++++++++++++++-------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 25ee78bf58e80..89deb9470ff05 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -46,6 +46,7 @@ X_california = X_california[:500] y_california = y_california[:500] + def add_missing_values(X_full, y_full): n_samples = X_full.shape[0] n_features = X_full.shape[1] @@ -65,6 +66,7 @@ def add_missing_values(X_full, y_full): return X_missing, y_missing + X_miss_california, y_miss_california = add_missing_values( X_california, y_california) @@ -99,6 +101,7 @@ def add_missing_values(X_full, y_full): # might carry some information. 
# + def get_scores_for_imputer(imputer, X_missing, y_missing): estimator = make_pipeline( make_union(imputer, MissingIndicator(missing_values=0)), @@ -108,6 +111,7 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): cv=N_SPLITS) return impute_scores + x_labels = ['Full data', 'Zero imputation', 'Mean Imputation', @@ -122,12 +126,14 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): # Now, we will calculate the score on the original data sets # + def get_full_score(X_full, y_full): full_scores = cross_val_score(REGRESSOR, X_full, y_full, scoring='neg_mean_squared_error', cv=N_SPLITS) return full_scores.mean(), full_scores.std() + mses_california[0], stds_california[0] = get_full_score(X_california, y_california) mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes) @@ -135,6 +141,7 @@ def get_full_score(X_full, y_full): # Next, we will estimate the score after replacing missing values by 0 # + def get_impute_zero_score(X_missing, y_missing): imputer = SimpleImputer(missing_values=0, @@ -143,6 +150,7 @@ def get_impute_zero_score(X_missing, y_missing): zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return zero_impute_scores.mean(), zero_impute_scores.std() + mses_california[1], stds_california[1] = get_impute_zero_score( X_miss_california, y_miss_california) mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(X_miss_diabetes, @@ -154,11 +162,13 @@ def get_impute_zero_score(X_missing, y_missing): # or unweighted mean of the desired number of nearest neighbors. 
# + def get_impute_KNN_score(X_missing, y_missing): imputer = KNNImputer(missing_values=0) knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return knn_impute_scores.mean(), knn_impute_scores.std() + mses_california[2], stds_california[2] = get_impute_KNN_score( X_miss_california, y_miss_california) mses_diabetes[2], stds_diabetes[2] = get_impute_KNN_score(X_miss_diabetes, @@ -167,11 +177,13 @@ def get_impute_KNN_score(X_missing, y_missing): # Estimate the score after imputation (mean strategy) of the missing values # + def get_impute_mean(X_missing, y_missing): imputer = SimpleImputer(missing_values=0, strategy="mean") mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return mean_impute_scores.mean(), mean_impute_scores.std() + mses_california[3], stds_california[3] = get_impute_mean(X_miss_california, y_miss_california) mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes, @@ -187,20 +199,22 @@ def get_impute_mean(X_missing, y_missing): # Normal so as to potentially improve performance. # + def get_impute_iterative(X_missing, y_missing): imputer = IterativeImputer(missing_values=0, - random_state=0, - n_nearest_features=5, - sample_posterior=True) + random_state=0, + n_nearest_features=5, + sample_posterior=True) iterative_impute_scores = get_scores_for_imputer(imputer, - X_missing, - y_missing) + X_missing, + y_missing) return iterative_impute_scores.mean(), iterative_impute_scores.std() -mses_california[4], stds_california[4] = get_impute_iterative(X_miss_california, - y_miss_california) + +mses_california[4], stds_california[4] = get_impute_iterative( + X_miss_california, y_miss_california) mses_diabetes[4], stds_diabetes[4] = get_impute_iterative(X_miss_diabetes, - y_miss_diabetes) + y_miss_diabetes) mses_diabetes = mses_diabetes * -1 mses_california = mses_california * -1 @@ -253,5 +267,3 @@ def get_impute_iterative(X_missing, y_missing): # You can also try different techniques. 
For instance, the median is a more # robust estimator for data with high magnitude variables which could dominate # results (otherwise known as a 'long tail'). -# - From 4ab578fe726fae7c421b313022d40278fc0b7434 Mon Sep 17 00:00:00 2001 From: maikia Date: Mon, 24 Feb 2020 11:29:46 +0100 Subject: [PATCH 12/62] cleaning up --- examples/impute/plot_missing_values.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 89deb9470ff05..b98718558f10f 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -13,8 +13,8 @@ districts. Neither of those datasets has missing values. We will remove some of the -values and compare the results of RandomForestRegressor on the full data and -the data with the missing values imputed by different techniques. +values and compare the results of RandomForestRegressor on the original data +and the data with the missing values imputed by different techniques. """ print(__doc__) @@ -29,8 +29,8 @@ # First we download the two datasets. Diabets dataset is shipped with # scikit-learn. It has 442 entries, each with 10 features. California Housing # dataset is much larger with 20640 entires and 8 features and we will need to -# fetch it. We will only use the first 500 entries here for sake of speeding up -# the calculations but feel free to use the whole dataset. +# fetch it. We will only use the first 500 entries for sake of speeding up the +# calculations but feel free to use the whole dataset. 
# import numpy as np @@ -96,7 +96,8 @@ def add_missing_values(X_full, y_full): N_SPLITS = 5 REGRESSOR = RandomForestRegressor(random_state=0) -# In addition of using an imputing method, we can also keep an indication of +# +# In addition to using an imputing method, we can also keep an indication of # the missing information using :func:`sklearn.impute.MissingIndicator` which # might carry some information. # @@ -123,6 +124,7 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): mses_diabetes = np.zeros(5) stds_diabetes = np.zeros(5) +# # Now, we will calculate the score on the original data sets # @@ -138,6 +140,7 @@ def get_full_score(X_full, y_full): y_california) mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes) +# # Next, we will estimate the score after replacing missing values by 0 # @@ -156,6 +159,7 @@ def get_impute_zero_score(X_missing, y_missing): mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(X_miss_diabetes, y_miss_diabetes) +# # Estimate the score after kNN-imputation of the missing values # # With ``KNNImputer``, missing values can be imputed using the weighted @@ -174,6 +178,7 @@ def get_impute_KNN_score(X_missing, y_missing): mses_diabetes[2], stds_diabetes[2] = get_impute_KNN_score(X_miss_diabetes, y_miss_diabetes) +# # Estimate the score after imputation (mean strategy) of the missing values # From ca26d15cf8aeb32df0fb4e6a858a09f0e0a136fa Mon Sep 17 00:00:00 2001 From: maikia Date: Mon, 24 Feb 2020 13:04:35 +0100 Subject: [PATCH 13/62] cleaning up --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index b98718558f10f..30e648688c64a 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -29,7 +29,7 @@ # First we download the two datasets. Diabets dataset is shipped with # scikit-learn. 
It has 442 entries, each with 10 features. California Housing # dataset is much larger with 20640 entires and 8 features and we will need to -# fetch it. We will only use the first 500 entries for sake of speeding up the +# fetch it. We will only use the first 500 entries for sake of speeding up the # calculations but feel free to use the whole dataset. # From 1fbc80d608df85aeab15cb29e9367bf639ddcf2a Mon Sep 17 00:00:00 2001 From: maikia Date: Mon, 24 Feb 2020 13:30:55 +0100 Subject: [PATCH 14/62] restructuring the document --- examples/impute/plot_missing_values.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 30e648688c64a..cbeedf7140d52 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -96,12 +96,15 @@ def add_missing_values(X_full, y_full): N_SPLITS = 5 REGRESSOR = RandomForestRegressor(random_state=0) -# -# In addition to using an imputing method, we can also keep an indication of -# the missing information using :func:`sklearn.impute.MissingIndicator` which -# might carry some information. -# +""" +------------------------------------------------------------------------------- + Missing information +------------------------------------------------------------------------------- + In addition to using an imputing method, we can also keep an indication of + the missing information using :func:`sklearn.impute.MissingIndicator` which + might carry some information. 
+""" def get_scores_for_imputer(imputer, X_missing, y_missing): estimator = make_pipeline( @@ -124,10 +127,12 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): mses_diabetes = np.zeros(5) stds_diabetes = np.zeros(5) -# -# Now, we will calculate the score on the original data sets -# +""" +------------------------------------------------------------------------------- + Calculate score on original data +------------------------------------------------------------------------------- +""" def get_full_score(X_full, y_full): full_scores = cross_val_score(REGRESSOR, X_full, y_full, From 656fea4bf145f532f7eb2e521f97248a9812eb9e Mon Sep 17 00:00:00 2001 From: maikia Date: Mon, 24 Feb 2020 13:39:01 +0100 Subject: [PATCH 15/62] further text restructuring --- examples/impute/plot_missing_values.py | 36 +++++++++++++++++++------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index cbeedf7140d52..271bfc0b3d483 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -127,10 +127,12 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): mses_diabetes = np.zeros(5) stds_diabetes = np.zeros(5) + """ ------------------------------------------------------------------------------- - Calculate score on original data + Estimate the score ------------------------------------------------------------------------------- + First, we want to estimate the score on the original data """ @@ -145,9 +147,16 @@ def get_full_score(X_full, y_full): y_california) mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes) -# -# Next, we will estimate the score after replacing missing values by 0 -# + +""" +------------------------------------------------------------------------------- + Replace missing values by 0 +------------------------------------------------------------------------------- + + Now we will estimate the score 
on the data where the missing values are + replaced by 0 + +""" def get_impute_zero_score(X_missing, y_missing): @@ -164,12 +173,17 @@ def get_impute_zero_score(X_missing, y_missing): mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(X_miss_diabetes, y_miss_diabetes) -# -# Estimate the score after kNN-imputation of the missing values -# -# With ``KNNImputer``, missing values can be imputed using the weighted -# or unweighted mean of the desired number of nearest neighbors. -# + +""" +------------------------------------------------------------------------------- + kNN-imputation of the missing values +------------------------------------------------------------------------------- + + With ``KNNImputer``, missing values can be imputed using the weighted + or unweighted mean of the desired number of nearest neighbors. + +""" + def get_impute_KNN_score(X_missing, y_missing): @@ -183,6 +197,7 @@ def get_impute_KNN_score(X_missing, y_missing): mses_diabetes[2], stds_diabetes[2] = get_impute_KNN_score(X_miss_diabetes, y_miss_diabetes) + # # Estimate the score after imputation (mean strategy) of the missing values # @@ -199,6 +214,7 @@ def get_impute_mean(X_missing, y_missing): mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes, y_miss_diabetes) + # Estimate the score after iterative imputation of the missing values # # Another option is the :class:`sklearn.impute.IterativeImputer`. 
This uses From 02e5e3677e6e234afb7d384ef4cff3d73b93e4b6 Mon Sep 17 00:00:00 2001 From: maikia Date: Mon, 24 Feb 2020 13:41:04 +0100 Subject: [PATCH 16/62] text restructuring --- examples/impute/plot_missing_values.py | 31 +++++++++++++++----------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 271bfc0b3d483..0b37da3a97149 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -185,7 +185,6 @@ def get_impute_zero_score(X_missing, y_missing): """ - def get_impute_KNN_score(X_missing, y_missing): imputer = KNNImputer(missing_values=0) knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) @@ -198,9 +197,12 @@ def get_impute_KNN_score(X_missing, y_missing): y_miss_diabetes) -# -# Estimate the score after imputation (mean strategy) of the missing values -# +""" +------------------------------------------------------------------------------- + Impute missing values with mean +------------------------------------------------------------------------------- + +""" def get_impute_mean(X_missing, y_missing): @@ -215,15 +217,18 @@ def get_impute_mean(X_missing, y_missing): y_miss_diabetes) -# Estimate the score after iterative imputation of the missing values -# -# Another option is the :class:`sklearn.impute.IterativeImputer`. This uses -# round-robin linear regression, treating every variable as an output in -# turn. -# The version implemented assumes Gaussian (output) variables. If your -# features are obviously non-Normal, consider transforming them to look more -# Normal so as to potentially improve performance. -# +""" +------------------------------------------------------------------------------- +Iterative imputation of the missing values +------------------------------------------------------------------------------- + + Another option is the :class:`sklearn.impute.IterativeImputer`. 
This uses + round-robin linear regression, treating every variable as an output in turn. + The version implemented assumes Gaussian (output) variables. If your features + are obviously non-Normal, consider transforming them to look more Normal so as + to potentially improve performance. + +""" def get_impute_iterative(X_missing, y_missing): From f0e7ab034307c015091fa94fe64fbc4979639365 Mon Sep 17 00:00:00 2001 From: maikia Date: Mon, 24 Feb 2020 13:41:36 +0100 Subject: [PATCH 17/62] flake8 --- examples/impute/plot_missing_values.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 0b37da3a97149..db075f2863780 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -106,6 +106,7 @@ def add_missing_values(X_full, y_full): """ + def get_scores_for_imputer(imputer, X_missing, y_missing): estimator = make_pipeline( make_union(imputer, MissingIndicator(missing_values=0)), @@ -136,6 +137,7 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): """ + def get_full_score(X_full, y_full): full_scores = cross_val_score(REGRESSOR, X_full, y_full, scoring='neg_mean_squared_error', From 7e6efbc616e977493077a9bf0b556b8235a681ff Mon Sep 17 00:00:00 2001 From: maikia Date: Mon, 24 Feb 2020 14:13:15 +0100 Subject: [PATCH 18/62] reformatting --- examples/impute/plot_missing_values.py | 75 ++++++++++++-------------- 1 file changed, 33 insertions(+), 42 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index db075f2863780..77353eb9f8ae4 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -96,15 +96,13 @@ def add_missing_values(X_full, y_full): N_SPLITS = 5 REGRESSOR = RandomForestRegressor(random_state=0) -""" -------------------------------------------------------------------------------- - Missing information 
-------------------------------------------------------------------------------- - In addition to using an imputing method, we can also keep an indication of - the missing information using :func:`sklearn.impute.MissingIndicator` which - might carry some information. - -""" +############################################################################## +# Missing information +#------------------------------------------------------------------------------ +# In addition to using an imputing method, we can also keep an indication of +# the missing information using :func:`sklearn.impute.MissingIndicator` which +# might carry some information. +# def get_scores_for_imputer(imputer, X_missing, y_missing): @@ -129,13 +127,12 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): stds_diabetes = np.zeros(5) -""" -------------------------------------------------------------------------------- - Estimate the score -------------------------------------------------------------------------------- - First, we want to estimate the score on the original data - -""" +############################################################################## +# Estimate the score +#------------------------------------------------------------------------------ +# First, we want to estimate the score on the original data +# +# def get_full_score(X_full, y_full): @@ -150,15 +147,13 @@ def get_full_score(X_full, y_full): mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes) -""" -------------------------------------------------------------------------------- +############################################################################## Replace missing values by 0 -------------------------------------------------------------------------------- - - Now we will estimate the score on the data where the missing values are - replaced by 0 - -""" +#------------------------------------------------------------------------------ +# +# Now we will estimate the score on the data 
where the missing values are +# replaced by 0 +# def get_impute_zero_score(X_missing, y_missing): @@ -176,15 +171,13 @@ def get_impute_zero_score(X_missing, y_missing): y_miss_diabetes) -""" -------------------------------------------------------------------------------- - kNN-imputation of the missing values -------------------------------------------------------------------------------- - - With ``KNNImputer``, missing values can be imputed using the weighted - or unweighted mean of the desired number of nearest neighbors. +############################################################################### +# kNN-imputation of the missing values +#------------------------------------------------------------------------------ +# +# With ``KNNImputer``, missing values can be imputed using the weighted +# or unweighted mean of the desired number of nearest neighbors. -""" def get_impute_KNN_score(X_missing, y_missing): @@ -199,12 +192,10 @@ def get_impute_KNN_score(X_missing, y_missing): y_miss_diabetes) -""" -------------------------------------------------------------------------------- - Impute missing values with mean -------------------------------------------------------------------------------- - -""" +############################################################################### +# Impute missing values with mean +#------------------------------------------------------------------------------ +# def get_impute_mean(X_missing, y_missing): @@ -219,10 +210,10 @@ def get_impute_mean(X_missing, y_missing): y_miss_diabetes) -""" -------------------------------------------------------------------------------- -Iterative imputation of the missing values -------------------------------------------------------------------------------- +############################################################################### +# Iterative imputation of the missing values +#------------------------------------------------------------------------------ +# Another option is the 
:class:`sklearn.impute.IterativeImputer`. This uses round-robin linear regression, treating every variable as an output in turn. From 694835cff65ed0d1a4537b9f0e3d82edb6176b2d Mon Sep 17 00:00:00 2001 From: maikia Date: Mon, 24 Feb 2020 14:42:53 +0100 Subject: [PATCH 19/62] flake8 --- examples/impute/plot_missing_values.py | 33 +++++++++++--------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 77353eb9f8ae4..c24c359448cfa 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -98,7 +98,7 @@ def add_missing_values(X_full, y_full): ############################################################################## # Missing information -#------------------------------------------------------------------------------ +# ----------------------------------------------------------------------------- # In addition to using an imputing method, we can also keep an indication of # the missing information using :func:`sklearn.impute.MissingIndicator` which # might carry some information. 
@@ -129,7 +129,7 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): ############################################################################## # Estimate the score -#------------------------------------------------------------------------------ +# ----------------------------------------------------------------------------- # First, we want to estimate the score on the original data # # @@ -148,8 +148,8 @@ def get_full_score(X_full, y_full): ############################################################################## - Replace missing values by 0 -#------------------------------------------------------------------------------ +# Replace missing values by 0 +# ----------------------------------------------------------------------------- # # Now we will estimate the score on the data where the missing values are # replaced by 0 @@ -173,13 +173,11 @@ def get_impute_zero_score(X_missing, y_missing): ############################################################################### # kNN-imputation of the missing values -#------------------------------------------------------------------------------ +# ----------------------------------------------------------------------------- # # With ``KNNImputer``, missing values can be imputed using the weighted # or unweighted mean of the desired number of nearest neighbors. 
- - def get_impute_KNN_score(X_missing, y_missing): imputer = KNNImputer(missing_values=0) knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) @@ -194,10 +192,9 @@ def get_impute_KNN_score(X_missing, y_missing): ############################################################################### # Impute missing values with mean -#------------------------------------------------------------------------------ +# ----------------------------------------------------------------------------- # - def get_impute_mean(X_missing, y_missing): imputer = SimpleImputer(missing_values=0, strategy="mean") mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) @@ -212,17 +209,14 @@ def get_impute_mean(X_missing, y_missing): ############################################################################### # Iterative imputation of the missing values -#------------------------------------------------------------------------------ +# ----------------------------------------------------------------------------- +# +# Another option is the :class:`sklearn.impute.IterativeImputer`. This uses +# round-robin linear regression, treating every variable as an output in turn. +# The version implemented assumes Gaussian (output) variables. If your features +# are obviously non-Normal, consider transforming them to look more Normal so +# as to potentially improve performance. # - - Another option is the :class:`sklearn.impute.IterativeImputer`. This uses - round-robin linear regression, treating every variable as an output in turn. - The version implemented assumes Gaussian (output) variables. If your features - are obviously non-Normal, consider transforming them to look more Normal so as - to potentially improve performance. 
- -""" - def get_impute_iterative(X_missing, y_missing): imputer = IterativeImputer(missing_values=0, @@ -248,6 +242,7 @@ def get_impute_iterative(X_missing, y_missing): ############################################################################### # # Finally we are going to visualize the score +# import matplotlib.pyplot as plt From 304a1b0568b68b9c43f4be67bcf6f2a8e98a8ef0 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 11 Mar 2020 16:08:34 +0100 Subject: [PATCH 20/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Olivier Grisel --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index c24c359448cfa..c6c1ca5f5bd5a 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -26,7 +26,7 @@ # Download the data and make missing values sets ############################################################################### # -# First we download the two datasets. Diabets dataset is shipped with +# First we download the two datasets. Diabetes dataset is shipped with # scikit-learn. It has 442 entries, each with 10 features. California Housing # dataset is much larger with 20640 entires and 8 features and we will need to # fetch it. 
We will only use the first 500 entries for sake of speeding up the From 05cc639f6932d49c1bf1481377efc14e1ffefa31 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 11 Mar 2020 16:15:59 +0100 Subject: [PATCH 21/62] updated the intro --- examples/impute/plot_missing_values.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index c6c1ca5f5bd5a..c182acd29bf6b 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -6,11 +6,18 @@ Missing values can be replaced by the mean, the median or the most frequent value using the basic :class:`sklearn.impute.SimpleImputer`. -In this example we will investigate different imputation techniques on two -datasets: Diabetes dataset which is the set of parameteres collected from the -diabetes patients with aim to predict disease progression and California -Housing dataset for which the target is the median house value for California -districts. +In this example we will investigate different imputation techniques: + +- imputation by the constant 0 value +- imputation by the mean value of each feature combined with a missing-ness +indicator auxiliary variable +- k nearest neighbor imputation +- iterative imputation + +We will use two datasets: Diabetes dataset which consists of set of parameteres +collected from the diabetes patients with aim to predict disease progression +and California Housing dataset for which the target is the median house value +for California districts. Neither of those datasets has missing values. 
We will remove some of the values and compare the results of RandomForestRegressor on the original data From 6d431937eb2613aaf48d0b92fafa7cdfe3312891 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 25 Mar 2020 15:06:30 +0100 Subject: [PATCH 22/62] improve bullet point rendering --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index c182acd29bf6b..8c801d760e598 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -10,7 +10,7 @@ - imputation by the constant 0 value - imputation by the mean value of each feature combined with a missing-ness -indicator auxiliary variable + indicator auxiliary variable - k nearest neighbor imputation - iterative imputation From b17b5ea35d038c668931db35ed072eba49a4df05 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 25 Mar 2020 15:09:44 +0100 Subject: [PATCH 23/62] spelling --- examples/impute/plot_missing_values.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 8c801d760e598..253d0116d05a0 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -35,7 +35,7 @@ # # First we download the two datasets. Diabetes dataset is shipped with # scikit-learn. It has 442 entries, each with 10 features. California Housing -# dataset is much larger with 20640 entires and 8 features and we will need to +# dataset is much larger with 20640 entries and 8 features and we will need to # fetch it. We will only use the first 500 entries for sake of speeding up the # calculations but feel free to use the whole dataset. 
# @@ -280,7 +280,7 @@ def get_impute_iterative(X_missing, y_missing): ax2.barh(j, mses_california[j], xerr=stds_california[j], color=colors[j], alpha=0.6, align='center') -# plot Ames results +# plot California results ax2.set_title('Imputation Techniques with California Data') ax2.set_yticks(xval) From df93916c4fa2da3f64883e27918649723c32be19 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 25 Mar 2020 15:10:24 +0100 Subject: [PATCH 24/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 253d0116d05a0..5a20cdeb00986 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -36,7 +36,7 @@ # First we download the two datasets. Diabetes dataset is shipped with # scikit-learn. It has 442 entries, each with 10 features. California Housing # dataset is much larger with 20640 entries and 8 features and we will need to -# fetch it. We will only use the first 500 entries for sake of speeding up the +# fetch it. We will only use the first 500 entries for the sake of speeding up the # calculations but feel free to use the whole dataset. 
# From b0531edd0b3261235fb18ae3ae6390777bfb1f9f Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 25 Mar 2020 15:10:35 +0100 Subject: [PATCH 25/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 5a20cdeb00986..269a718c8952a 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -46,7 +46,7 @@ from sklearn.datasets import load_diabetes -rng = np.random.RandomState(0) +rng = np.random.RandomState(42) X_diabetes, y_diabetes = load_diabetes(return_X_y=True) X_california, y_california = fetch_california_housing(return_X_y=True) From b5e87f53d8b0096b61987231703bcc877642d893 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 25 Mar 2020 15:10:49 +0100 Subject: [PATCH 26/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 269a718c8952a..d020c27ad47cb 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -55,7 +55,7 @@ def add_missing_values(X_full, y_full): - n_samples = X_full.shape[0] + n_samples, n_features = X_full.shape n_features = X_full.shape[1] # Add missing values in 75% of the lines From dca599cc4f361724d692fb1959fa8c2a272dbb24 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 25 Mar 2020 15:11:03 +0100 Subject: [PATCH 27/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index d020c27ad47cb..a6c3e1fdc2362 
100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -83,7 +83,7 @@ def add_missing_values(X_full, y_full): ############################################################################### # Impute the missing data and score -############################################################################### +# ######################################## # Now we will write a function which will score the results on the differently # prepared data. Let's look at each imputer separately # From 1c8e158ddc86990e3d14c114c7623bfa93a0ad69 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 25 Mar 2020 15:11:13 +0100 Subject: [PATCH 28/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort --- examples/impute/plot_missing_values.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index a6c3e1fdc2362..8b4e7c2a61bea 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -139,7 +139,6 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): # ----------------------------------------------------------------------------- # First, we want to estimate the score on the original data # -# def get_full_score(X_full, y_full): From a36553f29f5879637009fdcb480cd494987a2d1c Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 25 Mar 2020 15:11:47 +0100 Subject: [PATCH 29/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 8b4e7c2a61bea..c16caeb84dfcf 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -60,7 +60,7 @@ def add_missing_values(X_full, y_full): # Add missing values in 75% of the lines missing_rate = 0.75 - 
n_missing_samples = int(np.floor(n_samples * missing_rate)) + n_missing_samples = int(n_samples * missing_rate) missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples, dtype=np.bool), np.ones(n_missing_samples, From 5ef729c43a31b94a0cbaa282cc686efb599f7785 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 25 Mar 2020 15:13:43 +0100 Subject: [PATCH 30/62] changed the naming --- examples/impute/plot_missing_values.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index c16caeb84dfcf..881073ddbb06a 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -184,15 +184,15 @@ def get_impute_zero_score(X_missing, y_missing): # With ``KNNImputer``, missing values can be imputed using the weighted # or unweighted mean of the desired number of nearest neighbors. -def get_impute_KNN_score(X_missing, y_missing): +def get_impute_knn_score(X_missing, y_missing): imputer = KNNImputer(missing_values=0) knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return knn_impute_scores.mean(), knn_impute_scores.std() -mses_california[2], stds_california[2] = get_impute_KNN_score( +mses_california[2], stds_california[2] = get_impute_knn_score( X_miss_california, y_miss_california) -mses_diabetes[2], stds_diabetes[2] = get_impute_KNN_score(X_miss_diabetes, +mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score(X_miss_diabetes, y_miss_diabetes) From fb903d33f8b47c038341de2f85af161bffe0121c Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 25 Mar 2020 15:23:50 +0100 Subject: [PATCH 31/62] restructuring text --- examples/impute/plot_missing_values.py | 34 ++++++++++++-------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 881073ddbb06a..b22b87cd08c82 100644 --- a/examples/impute/plot_missing_values.py +++ 
b/examples/impute/plot_missing_values.py @@ -31,7 +31,7 @@ ############################################################################### # Download the data and make missing values sets -############################################################################### +################################################ # # First we download the two datasets. Diabetes dataset is shipped with # scikit-learn. It has 442 entries, each with 10 features. California Housing @@ -56,15 +56,14 @@ def add_missing_values(X_full, y_full): n_samples, n_features = X_full.shape - n_features = X_full.shape[1] # Add missing values in 75% of the lines missing_rate = 0.75 n_missing_samples = int(n_samples * missing_rate) - missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples, - dtype=np.bool), - np.ones(n_missing_samples, - dtype=np.bool))) + + missing_samples = np.zeros(n_samples, dtype=np.bool) + missing_samples[: n_missing_samples] = True + rng.shuffle(missing_samples) missing_features = rng.randint(0, n_features, n_missing_samples) X_missing = X_full.copy() @@ -83,7 +82,7 @@ def add_missing_values(X_full, y_full): ############################################################################### # Impute the missing data and score -# ######################################## +# ################################# # Now we will write a function which will score the results on the differently # prepared data. 
Let's look at each imputer separately # @@ -103,9 +102,9 @@ def add_missing_values(X_full, y_full): N_SPLITS = 5 REGRESSOR = RandomForestRegressor(random_state=0) -############################################################################## +############################################################################### # Missing information -# ----------------------------------------------------------------------------- +# ------------------- # In addition to using an imputing method, we can also keep an indication of # the missing information using :func:`sklearn.impute.MissingIndicator` which # might carry some information. @@ -133,10 +132,9 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): mses_diabetes = np.zeros(5) stds_diabetes = np.zeros(5) - -############################################################################## +############################################################################### # Estimate the score -# ----------------------------------------------------------------------------- +# ------------------ # First, we want to estimate the score on the original data # @@ -153,9 +151,9 @@ def get_full_score(X_full, y_full): mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes) -############################################################################## +############################################################################### # Replace missing values by 0 -# ----------------------------------------------------------------------------- +# --------------------------- # # Now we will estimate the score on the data where the missing values are # replaced by 0 @@ -179,7 +177,7 @@ def get_impute_zero_score(X_missing, y_missing): ############################################################################### # kNN-imputation of the missing values -# ----------------------------------------------------------------------------- +# ------------------------------------ # # With ``KNNImputer``, missing values 
can be imputed using the weighted # or unweighted mean of the desired number of nearest neighbors. @@ -198,7 +196,7 @@ def get_impute_knn_score(X_missing, y_missing): ############################################################################### # Impute missing values with mean -# ----------------------------------------------------------------------------- +# ------------------------------- # def get_impute_mean(X_missing, y_missing): @@ -215,7 +213,7 @@ def get_impute_mean(X_missing, y_missing): ############################################################################### # Iterative imputation of the missing values -# ----------------------------------------------------------------------------- +# ------------------------------------------ # # Another option is the :class:`sklearn.impute.IterativeImputer`. This uses # round-robin linear regression, treating every variable as an output in turn. @@ -245,7 +243,7 @@ def get_impute_iterative(X_missing, y_missing): ############################################################################### # Plot the results -############################################################################### +################## # # Finally we are going to visualize the score # From 040f11d2b9328d5cf7c528385df6fc0ff3c61549 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 25 Mar 2020 15:25:22 +0100 Subject: [PATCH 32/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index b22b87cd08c82..cb086268c1069 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -67,7 +67,7 @@ def add_missing_values(X_full, y_full): rng.shuffle(missing_samples) missing_features = rng.randint(0, n_features, n_missing_samples) X_missing = X_full.copy() - X_missing[np.where(missing_samples)[0], 
missing_features] = 0 + X_missing[missing_samples, missing_features] = 0 y_missing = y_full.copy() return X_missing, y_missing From c178eb1e244786c7a121b8dcd4c367d639764ea8 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 25 Mar 2020 15:29:26 +0100 Subject: [PATCH 33/62] flake8 --- examples/impute/plot_missing_values.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index cb086268c1069..448f5c69d6083 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -36,8 +36,8 @@ # First we download the two datasets. Diabetes dataset is shipped with # scikit-learn. It has 442 entries, each with 10 features. California Housing # dataset is much larger with 20640 entries and 8 features and we will need to -# fetch it. We will only use the first 500 entries for the sake of speeding up the -# calculations but feel free to use the whole dataset. +# fetch it. We will only use the first 500 entries for the sake of speeding up +# the calculations but feel free to use the whole dataset. 
# import numpy as np From 2cdfad55fbb7f2fc4720ffc92452a673b80a1e58 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 25 Mar 2020 18:11:38 +0100 Subject: [PATCH 34/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 448f5c69d6083..b42afcd6523b0 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -67,7 +67,7 @@ def add_missing_values(X_full, y_full): rng.shuffle(missing_samples) missing_features = rng.randint(0, n_features, n_missing_samples) X_missing = X_full.copy() - X_missing[missing_samples, missing_features] = 0 + X_missing[missing_samples, missing_features] = np.nan y_missing = y_full.copy() return X_missing, y_missing From 436c18bf65f606da62ad33cb10e8bd47b49197f5 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 25 Mar 2020 18:11:47 +0100 Subject: [PATCH 35/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index b42afcd6523b0..85aa3793d9fd4 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -183,7 +183,7 @@ def get_impute_zero_score(X_missing, y_missing): # or unweighted mean of the desired number of nearest neighbors. 
def get_impute_knn_score(X_missing, y_missing): - imputer = KNNImputer(missing_values=0) + imputer = KNNImputer(missing_values=np.nan) knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return knn_impute_scores.mean(), knn_impute_scores.std() From 5d449c403a8c43702b72a3d044018c2985ff0cab Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 25 Mar 2020 18:11:58 +0100 Subject: [PATCH 36/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 85aa3793d9fd4..2010973107fb4 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -200,7 +200,7 @@ def get_impute_knn_score(X_missing, y_missing): # def get_impute_mean(X_missing, y_missing): - imputer = SimpleImputer(missing_values=0, strategy="mean") + imputer = SimpleImputer(missing_values=np.nan, strategy="mean") mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return mean_impute_scores.mean(), mean_impute_scores.std() From c33cabd9bd11f56023bf06adc826af6cd52946b2 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 25 Mar 2020 18:12:05 +0100 Subject: [PATCH 37/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 2010973107fb4..9a0f330324d0e 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -162,7 +162,7 @@ def get_full_score(X_full, y_full): def get_impute_zero_score(X_missing, y_missing): - imputer = SimpleImputer(missing_values=0, + imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0) zero_impute_scores = 
get_scores_for_imputer(imputer, X_missing, y_missing) From faf5fd44eeb1cded86fc56d3df90409b7f3d2416 Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 25 Mar 2020 18:12:15 +0100 Subject: [PATCH 38/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 9a0f330324d0e..9d91efd83744f 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -243,7 +243,7 @@ def get_impute_iterative(X_missing, y_missing): ############################################################################### # Plot the results -################## +# ################ # # Finally we are going to visualize the score # From 719c494498f249d5eddfdca4ec3b621e5bbb758e Mon Sep 17 00:00:00 2001 From: maikia Date: Wed, 25 Mar 2020 19:08:18 +0100 Subject: [PATCH 39/62] changing missing values from 0 to nan --- examples/impute/plot_missing_values.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 9d91efd83744f..8983e71f3c764 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -113,7 +113,7 @@ def add_missing_values(X_full, y_full): def get_scores_for_imputer(imputer, X_missing, y_missing): estimator = make_pipeline( - make_union(imputer, MissingIndicator(missing_values=0)), + make_union(imputer, MissingIndicator(missing_values=np.nan)), REGRESSOR) impute_scores = cross_val_score(estimator, X_missing, y_missing, scoring='neg_mean_squared_error', @@ -223,7 +223,7 @@ def get_impute_mean(X_missing, y_missing): # def get_impute_iterative(X_missing, y_missing): - imputer = IterativeImputer(missing_values=0, + imputer = IterativeImputer(missing_values=np.nan, random_state=0, n_nearest_features=5, 
sample_posterior=True) @@ -271,14 +271,12 @@ def get_impute_iterative(X_missing, y_missing): ax1.invert_yaxis() ax1.set_yticklabels(x_labels) -# plot california results +# plot california dataset results ax2 = plt.subplot(122) for j in xval: ax2.barh(j, mses_california[j], xerr=stds_california[j], color=colors[j], alpha=0.6, align='center') -# plot California results - ax2.set_title('Imputation Techniques with California Data') ax2.set_yticks(xval) ax2.set_xlabel('MSE') From 8e6d125d7ac738a7609b67a56c47b0ba442555b4 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 26 Mar 2020 17:58:53 +0100 Subject: [PATCH 40/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 8983e71f3c764..9f9c8ac761315 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -100,7 +100,7 @@ def add_missing_values(X_full, y_full): N_SPLITS = 5 -REGRESSOR = RandomForestRegressor(random_state=0) +regressor = RandomForestRegressor(random_state=0) ############################################################################### # Missing information From 0bda8150b26ca862bf0895ebf2682d2b5a5277db Mon Sep 17 00:00:00 2001 From: maikia Date: Mon, 30 Mar 2020 11:28:24 +0200 Subject: [PATCH 41/62] REGRESSOR to regressor --- examples/impute/plot_missing_values.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 9f9c8ac761315..c7e46fc3806a5 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -114,7 +114,7 @@ def add_missing_values(X_full, y_full): def get_scores_for_imputer(imputer, X_missing, y_missing): estimator = make_pipeline( make_union(imputer, MissingIndicator(missing_values=np.nan)), 
- REGRESSOR) + regressor) impute_scores = cross_val_score(estimator, X_missing, y_missing, scoring='neg_mean_squared_error', cv=N_SPLITS) @@ -140,7 +140,7 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): def get_full_score(X_full, y_full): - full_scores = cross_val_score(REGRESSOR, X_full, y_full, + full_scores = cross_val_score(regressor, X_full, y_full, scoring='neg_mean_squared_error', cv=N_SPLITS) return full_scores.mean(), full_scores.std() From 9e18cae179851d82d665c100aa2a4d477f8186e0 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 16 Apr 2020 15:10:10 +0200 Subject: [PATCH 42/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index c7e46fc3806a5..955bc2b1a12b6 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -8,7 +8,7 @@ In this example we will investigate different imputation techniques: -- imputation by the constant 0 value +- imputation by the constant value 0 - imputation by the mean value of each feature combined with a missing-ness indicator auxiliary variable - k nearest neighbor imputation From cea8551ca79c37eb0b430f13eaecf715eb2f21d9 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 16 Apr 2020 15:11:05 +0200 Subject: [PATCH 43/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 955bc2b1a12b6..329a6ab2de784 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -84,7 +84,7 @@ def add_missing_values(X_full, y_full): # Impute the missing data and score # ################################# # Now we will write a function 
which will score the results on the differently -# prepared data. Let's look at each imputer separately +# imputed data. Let's look at each imputer separately: # rng = np.random.RandomState(0) From 5a88c12a5534206631974a27d0f13a93bd02bc75 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 16 Apr 2020 15:28:09 +0200 Subject: [PATCH 44/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 329a6ab2de784..52b005f2551d1 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -245,7 +245,7 @@ def get_impute_iterative(X_missing, y_missing): # Plot the results # ################ # -# Finally we are going to visualize the score +# Finally we are going to visualize the score: # import matplotlib.pyplot as plt From 62d291e03a7c3509aa10d4f7a44fb73514b4dadd Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 16 Apr 2020 18:25:27 +0200 Subject: [PATCH 45/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu --- examples/impute/plot_missing_values.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 52b005f2551d1..deec1e7613013 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -218,8 +218,8 @@ def get_impute_mean(X_missing, y_missing): # Another option is the :class:`sklearn.impute.IterativeImputer`. This uses # round-robin linear regression, treating every variable as an output in turn. # The version implemented assumes Gaussian (output) variables. If your features -# are obviously non-Normal, consider transforming them to look more Normal so -# as to potentially improve performance. 
+# are obviously non-normal, consider transforming them to look more normal +# to potentially improve performance. # def get_impute_iterative(X_missing, y_missing): From 3f22f482b944567e40e50910b05d7637d263b075 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 16 Apr 2020 18:26:06 +0200 Subject: [PATCH 46/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu --- examples/impute/plot_missing_values.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index deec1e7613013..852af3186c92e 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -19,9 +19,11 @@ and California Housing dataset for which the target is the median house value for California districts. -Neither of those datasets has missing values. We will remove some of the -values and compare the results of RandomForestRegressor on the original data -and the data with the missing values imputed by different techniques. +As neither of these datasets have missing values, we will remove some +values to create new versions with artificially missing data. The performance of +:class:`~sklearn.ensemble.RandomForestRegressor` on the full original dataset is then +compared the performance on the altered datasets with the artificially missing values +imputed using different techniques. 
""" print(__doc__) From 2664c9d080c3c06dfcf1614bd92992cd53ea2f64 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 16 Apr 2020 18:26:17 +0200 Subject: [PATCH 47/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 852af3186c92e..430dfdba82da6 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -181,7 +181,7 @@ def get_impute_zero_score(X_missing, y_missing): # kNN-imputation of the missing values # ------------------------------------ # -# With ``KNNImputer``, missing values can be imputed using the weighted +# :class:`sklearn.impute.KNNImputer` imputes missing values using the weighted # or unweighted mean of the desired number of nearest neighbors. def get_impute_knn_score(X_missing, y_missing): From 02e3b71a7f43520e10f477438e9ad03765909fe6 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 16 Apr 2020 18:26:27 +0200 Subject: [PATCH 48/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu --- examples/impute/plot_missing_values.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 430dfdba82da6..7aad27348e5d6 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -218,7 +218,8 @@ def get_impute_mean(X_missing, y_missing): # ------------------------------------------ # # Another option is the :class:`sklearn.impute.IterativeImputer`. This uses -# round-robin linear regression, treating every variable as an output in turn. +# round-robin linear regression, modeling each feature with missing values as a +# function of other features, in turn. # The version implemented assumes Gaussian (output) variables. 
If your features # are obviously non-normal, consider transforming them to look more normal # to potentially improve performance. From e211514c01759af07a53915729fe25a071578f4a Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 16 Apr 2020 18:26:40 +0200 Subject: [PATCH 49/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 7aad27348e5d6..0de098a23b827 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -158,7 +158,7 @@ def get_full_score(X_full, y_full): # --------------------------- # # Now we will estimate the score on the data where the missing values are -# replaced by 0 +# replaced by 0: # From dfe208c3027ab3f087d4566cd6d9c5b7860ebca6 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 16 Apr 2020 18:27:00 +0200 Subject: [PATCH 50/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu --- examples/impute/plot_missing_values.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 0de098a23b827..cc39a0a80aa64 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -107,8 +107,8 @@ def add_missing_values(X_full, y_full): ############################################################################### # Missing information # ------------------- -# In addition to using an imputing method, we can also keep an indication of -# the missing information using :func:`sklearn.impute.MissingIndicator` which +# In addition to imputing the missing values, we can also mark the values +# that were missing using :func:`sklearn.impute.MissingIndicator`, which # might carry some information. 
# From f91a9d2e12d0de32618417defe0e95c1824c1c6b Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 16 Apr 2020 18:27:14 +0200 Subject: [PATCH 51/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index cc39a0a80aa64..0222f16f9f308 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -137,7 +137,7 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): ############################################################################### # Estimate the score # ------------------ -# First, we want to estimate the score on the original data +# First, we want to estimate the score on the original data: # From 959e907b918d5bd41b543567e22f77be8b933145 Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 16 Apr 2020 18:27:33 +0200 Subject: [PATCH 52/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu --- examples/impute/plot_missing_values.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 0222f16f9f308..8133873c80109 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -37,8 +37,8 @@ # # First we download the two datasets. Diabetes dataset is shipped with # scikit-learn. It has 442 entries, each with 10 features. California Housing -# dataset is much larger with 20640 entries and 8 features and we will need to -# fetch it. We will only use the first 500 entries for the sake of speeding up +# dataset is much larger with 20640 entries and 8 features. It needs to be +# downloaded. We will only use the first 500 entries for the sake of speeding up # the calculations but feel free to use the whole dataset. 
# From 4f638397ea829829bc4aec502defa06830b734dc Mon Sep 17 00:00:00 2001 From: maikia Date: Thu, 16 Apr 2020 18:27:48 +0200 Subject: [PATCH 53/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu --- examples/impute/plot_missing_values.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 8133873c80109..15305f8523468 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -14,8 +14,8 @@ - k nearest neighbor imputation - iterative imputation -We will use two datasets: Diabetes dataset which consists of set of parameteres -collected from the diabetes patients with aim to predict disease progression +We will use two datasets: Diabetes dataset which consists of 10 feature variables +collected from diabetes patients with an aim to predict disease progression and California Housing dataset for which the target is the median house value for California districts. From c335760296959d900995730c8fb0bf3939c9f07e Mon Sep 17 00:00:00 2001 From: maikia Date: Fri, 24 Apr 2020 11:18:29 +0200 Subject: [PATCH 54/62] flake8 --- examples/impute/plot_missing_values.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 15305f8523468..9e1b719c60350 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -14,16 +14,17 @@ - k nearest neighbor imputation - iterative imputation -We will use two datasets: Diabetes dataset which consists of 10 feature variables -collected from diabetes patients with an aim to predict disease progression -and California Housing dataset for which the target is the median house value -for California districts. 
+We will use two datasets: Diabetes dataset which consists of 10 feature +variables collected from diabetes patients with an aim to predict disease +progression and California Housing dataset for which the target is the median +house value for California districts. As neither of these datasets have missing values, we will remove some -values to create new versions with artificially missing data. The performance of -:class:`~sklearn.ensemble.RandomForestRegressor` on the full original dataset is then -compared the performance on the altered datasets with the artificially missing values -imputed using different techniques. +values to create new versions with artificially missing data. The performance +of +:class:`~sklearn.ensemble.RandomForestRegressor` on the full original dataset +is then compared to the performance on the altered datasets with the artificially +missing values imputed using different techniques. """ print(__doc__) @@ -38,8 +39,8 @@ # First we download the two datasets. Diabetes dataset is shipped with # scikit-learn. It has 442 entries, each with 10 features. California Housing # dataset is much larger with 20640 entries and 8 features. It needs to be -# downloaded. We will only use the first 500 entries for the sake of speeding up -# the calculations but feel free to use the whole dataset. +# downloaded. We will only use the first 500 entries for the sake of speeding +# up the calculations but feel free to use the whole dataset. 
# import numpy as np From a3bd39dab2da87895d03c50428e0482385d74c96 Mon Sep 17 00:00:00 2001 From: maikia Date: Mon, 27 Apr 2020 11:06:08 +0200 Subject: [PATCH 55/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Thomas J Fan --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 9e1b719c60350..d93790067cd89 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -165,7 +165,7 @@ def get_full_score(X_full, y_full): def get_impute_zero_score(X_missing, y_missing): - imputer = SimpleImputer(missing_values=np.nan, + imputer = SimpleImputer(missing_values=np.nan, add_indicator=True, strategy='constant', fill_value=0) zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) From 55b5de01784ab2f824a52862702f15f9769f50a5 Mon Sep 17 00:00:00 2001 From: maikia Date: Mon, 27 Apr 2020 11:06:25 +0200 Subject: [PATCH 56/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Thomas J Fan --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index d93790067cd89..5f99584d6c871 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -186,7 +186,7 @@ def get_impute_zero_score(X_missing, y_missing): # or unweighted mean of the desired number of nearest neighbors. 
def get_impute_knn_score(X_missing, y_missing): - imputer = KNNImputer(missing_values=np.nan) + imputer = KNNImputer(missing_values=np.nan, add_indicator=True) knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return knn_impute_scores.mean(), knn_impute_scores.std() From eb9ba0e60563e44da485ac1637a754966461842d Mon Sep 17 00:00:00 2001 From: maikia Date: Mon, 27 Apr 2020 11:06:38 +0200 Subject: [PATCH 57/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Thomas J Fan --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 5f99584d6c871..ae0eb1bf1e62e 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -203,7 +203,7 @@ def get_impute_knn_score(X_missing, y_missing): # def get_impute_mean(X_missing, y_missing): - imputer = SimpleImputer(missing_values=np.nan, strategy="mean") + imputer = SimpleImputer(missing_values=np.nan, strategy="mean", add_indicator=True) mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return mean_impute_scores.mean(), mean_impute_scores.std() From f3a8607ff8ecac9fe69e36630c4af37231b3ac75 Mon Sep 17 00:00:00 2001 From: maikia Date: Mon, 27 Apr 2020 11:06:53 +0200 Subject: [PATCH 58/62] Update examples/impute/plot_missing_values.py Co-Authored-By: Thomas J Fan --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index ae0eb1bf1e62e..57f854d9de8d3 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -227,7 +227,7 @@ def get_impute_mean(X_missing, y_missing): # def get_impute_iterative(X_missing, y_missing): - imputer = IterativeImputer(missing_values=np.nan, + imputer = IterativeImputer(missing_values=np.nan, 
add_indicator=True, random_state=0, n_nearest_features=5, sample_posterior=True) From a96f9373d80fd14726c9ad314b772ae9ccfa52bf Mon Sep 17 00:00:00 2001 From: maikia Date: Mon, 27 Apr 2020 11:43:36 +0200 Subject: [PATCH 59/62] flake8 --- examples/impute/plot_missing_values.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 57f854d9de8d3..32f9a136814f6 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -203,7 +203,8 @@ def get_impute_knn_score(X_missing, y_missing): # def get_impute_mean(X_missing, y_missing): - imputer = SimpleImputer(missing_values=np.nan, strategy="mean", add_indicator=True) + imputer = SimpleImputer(missing_values=np.nan, strategy="mean", + add_indicator=True) mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return mean_impute_scores.mean(), mean_impute_scores.std() From 18bae2d456b98cac3379a3f690b2917f252a2f18 Mon Sep 17 00:00:00 2001 From: maikia Date: Mon, 27 Apr 2020 12:48:00 +0200 Subject: [PATCH 60/62] reducting number of samples used from california dataset --- examples/impute/plot_missing_values.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 32f9a136814f6..72500b2ae7cbc 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -39,7 +39,7 @@ # First we download the two datasets. Diabetes dataset is shipped with # scikit-learn. It has 442 entries, each with 10 features. California Housing # dataset is much larger with 20640 entries and 8 features. It needs to be -# downloaded. We will only use the first 500 entries for the sake of speeding +# downloaded. We will only use the first 400 entries for the sake of speeding # up the calculations but feel free to use the whole dataset. 
# @@ -53,8 +53,8 @@ X_diabetes, y_diabetes = load_diabetes(return_X_y=True) X_california, y_california = fetch_california_housing(return_X_y=True) -X_california = X_california[:500] -y_california = y_california[:500] +X_california = X_california[:400] +y_california = y_california[:400] def add_missing_values(X_full, y_full): From f96b7ee165dcd041f7560b214d24adeea0be3d1a Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 27 Apr 2020 14:59:02 -0400 Subject: [PATCH 61/62] CLN Removes the need for MissingIndicator --- examples/impute/plot_missing_values.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 72500b2ae7cbc..2ba7dc05d16b6 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -96,10 +96,9 @@ def add_missing_values(X_full, y_full): # To use the experimental IterativeImputer, we need to explicitly ask for it: from sklearn.experimental import enable_iterative_imputer # noqa -from sklearn.impute import (SimpleImputer, KNNImputer, IterativeImputer, - MissingIndicator) +from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer from sklearn.model_selection import cross_val_score -from sklearn.pipeline import make_pipeline, make_union +from sklearn.pipeline import make_pipeline N_SPLITS = 5 @@ -108,16 +107,14 @@ def add_missing_values(X_full, y_full): ############################################################################### # Missing information # ------------------- -# In addition to imputing the missing values, we can also mark the values -# that were missing using :func:`sklearn.impute.MissingIndicator`, which +# In addition to imputing the missing values, the imputers have an +# `add_indicator` parameter that marks the values that were missing, which # might carry some information. 
# def get_scores_for_imputer(imputer, X_missing, y_missing): - estimator = make_pipeline( - make_union(imputer, MissingIndicator(missing_values=np.nan)), - regressor) + estimator = make_pipeline(imputer, regressor) impute_scores = cross_val_score(estimator, X_missing, y_missing, scoring='neg_mean_squared_error', cv=N_SPLITS) @@ -166,8 +163,7 @@ def get_full_score(X_full, y_full): def get_impute_zero_score(X_missing, y_missing): imputer = SimpleImputer(missing_values=np.nan, add_indicator=True, - strategy='constant', - fill_value=0) + strategy='constant', fill_value=0) zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return zero_impute_scores.mean(), zero_impute_scores.std() @@ -229,8 +225,7 @@ def get_impute_mean(X_missing, y_missing): def get_impute_iterative(X_missing, y_missing): imputer = IterativeImputer(missing_values=np.nan, add_indicator=True, - random_state=0, - n_nearest_features=5, + random_state=0, n_nearest_features=5, sample_posterior=True) iterative_impute_scores = get_scores_for_imputer(imputer, X_missing, From 2eb35ec43085de7fa6bbc58622cd631eac859587 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 27 Apr 2020 19:18:43 -0400 Subject: [PATCH 62/62] FIX Unrelated bug but is stopping the CI from passing --- doc/developers/contributing.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index c886119e908c1..e13b6850d50eb 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -434,8 +434,9 @@ You can check for common programming errors with the following tools: must not produce new errors in your pull request. 
Using `# type: ignore` annotation can be a workaround for a few cases that are not supported by mypy, in particular, - - when importing C or Cython modules - - on properties with decorators + + - when importing C or Cython modules + - on properties with decorators Bonus points for contributions that include a performance analysis with a benchmark script and profiling output (please report on the mailing