From a8fa2c4886835faf02c9e722f3de06f815a59d82 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 18 Nov 2019 15:58:31 +0100 Subject: [PATCH 01/85] Add example on interpretation of linear model coefficients. --- ...linear_model_coefficient_interpretation.py | 191 ++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 examples/inspection/plot_linear_model_coefficient_interpretation.py diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py new file mode 100644 index 0000000000000..e7077a953364a --- /dev/null +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -0,0 +1,191 @@ +""" +=============================================== +Interpretation of coefficients in linear models +=============================================== + +Linear models describe situations in which the target value is expected to be +a linear combination of the features (see the :ref:`linear_model` User guide +section for a description of a set of linear model methods available in +scikit-learn). +It is important to emphasize that linear models compute conditional links. +The interpretation of the coefficient gives the relationship between the +feature and the target given that other features remain constant. + +This example will show some hints in interpreting coefficient in linear models, +using data from the "Current Population Survey" from 1985. +""" + +print(__doc__) + +from time import time +import numpy as np +import scipy as sp +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns + +from sklearn.datasets import fetch_openml +from sklearn.model_selection import train_test_split +from sklearn.compose import make_column_transformer +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import OrdinalEncoder +from sklearn.pipeline import make_pipeline +from sklearn.linear_model import RidgeCV +from sklearn.compose import TransformedTargetRegressor +from sklearn.metrics import median_absolute_error + +############################################################################# +# Determinants of Wages from the 1985 Current Population Survey +# ------------------------------------------------------------- +# +# First of all we fetch the data from `OpenML `_. +# Note that setting the parameter `as_frame` to `True` will retrieve the data +# as a pandas dataframe. +# Then, we identify features (`X`) and targets (`y`): the column 'WAGE' is our +# target variable (i.e., the variable which we want to predict). + +survey = fetch_openml(data_id=534, as_frame=True) + +X = survey.data[survey.feature_names] +y = survey.target.values.ravel() +ax = sns.kdeplot(y, shade=True, color="r") +plt.xlabel(survey.target_names) +plt.show() + +############################################################################## +# Note that the "WAGE" distribution has a long tail and we could take its log +# to simplify our problem getting closer to a normal distribution. +# +# The dataset is composed by columns with different data types and we need to +# apply a specific preprocessing for each data types. +# Our pre-processor will +# +# - one-hot encode (i.e., generate a column by category) the categorical +# columns; +# - replace by 0 and 1 the categories of binary columns; +# - keep numerical values as they are. 
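+#
+# As a quick illustration of the first two steps, the encoders can be tried
+# on a small made-up column (not part of the survey data):
+
+toy_df = pd.DataFrame({'SEX': ['male', 'female', 'female']})
+# one column per category ('female', 'male'):
+print(OneHotEncoder().fit_transform(toy_df).toarray())
+# categories replaced by 0 and 1 ('female' -> 0, 'male' -> 1):
+print(OrdinalEncoder().fit_transform(toy_df))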
+ +categorical_columns = ['RACE', 'OCCUPATION', 'SECTOR'] +binary_columns = ['MARR', 'UNION', 'SEX', 'SOUTH'] +numerical_columns = ['EDUCATION', 'EXPERIENCE', 'AGE'] + +preprocessor = make_column_transformer( + (OneHotEncoder(), categorical_columns), + (OrdinalEncoder(), binary_columns), + remainder='passthrough' +) + +############################################################################## +# Modeling the data +# ................. +# +# We will fit a ridge regressor and transform the target before the fit using +# a log transform. +# But before computing the model we split the sample in a train and a test +# dataset. + +model = make_pipeline( + preprocessor, + TransformedTargetRegressor( + regressor=RidgeCV(), + func=np.log10, + inverse_func=sp.special.exp10 + ) +) + +X_train, X_test, y_train, y_test = train_test_split( + X, y, random_state=42 +) + +model.fit(X_train, y_train); + + +############################################################################## +# Scoring the model +# ................. +# +# We can check the performance of the computed model using, for example, the +# median absolute error of the model. + +def mae_scorer(model, X_train, X_test, y_train, y_test): + y_pred = model.predict(X_train) + string_score = f'MAE on training set: {median_absolute_error(y_train, y_pred):.2f} $/hour' + y_pred = model.predict(X_test) + string_score += f'\nMAE on testing set: {median_absolute_error(y_test, y_pred):.2f} $/hour' + return string_score + +fig, ax = plt.subplots(figsize=(6, 6)) +y_pred = model.predict(X_test) +sns.regplot(y_test, y_pred) + +plt.text(3, 20, mae_scorer(model, X_train, X_test, y_train, y_test)) + +plt.ylabel('Model predictions') +plt.xlabel('Truths') +plt.xlim([0, 27]) +plt.ylim([0, 27]); + +############################################################################## +# The model learnt is far to be a good model making accurate prediction. +# As interpretation tools characterize model rather than the generative process +# of the data itself, it needs to be emphasized that interpretations are correct +# if the model is correct as well. + +############################################################################## +# Interpreting coefficients +# ......................... +# +# First of all, we can plot the values of the coefficients of the regressor we +# have fitted. + +feature_names = (model.named_steps['columntransformer'] + .named_transformers_['onehotencoder'] + .get_feature_names(input_features=categorical_columns)) +feature_names = np.concatenate([feature_names, binary_columns, numerical_columns]) + +coefs = pd.DataFrame( + model.named_steps['transformedtargetregressor'].regressor_.coef_, + columns=['Coefficients'], index=feature_names +) +coefs.plot(kind='barh', figsize=(9, 7)) +plt.axvline(x=0, color='.5'); + +############################################################################### +# Soon we realize that we cannot compare different coefficients since we did +# not scale the data before the fit features having different value ranges. +# For instance, the "AGE" coefficient is expressed in $/hours/leaving years +# while the "EDUCATION" is expressed in $/hours/years of education. +# This is evident if we compare feature standard deviations. + +X_train_preprocessed = pd.DataFrame( + model.named_steps['columntransformer'].transform(X_train), + columns=feature_names +) +X_train_preprocessed.std().plot(kind='barh', figsize=(9, 7)) +plt.title('Features std. 
dev.'); + +############################################################################### +# We can then normalize the coefficients by the standard deviation and we will +# be able to compare them. + +coefs = pd.DataFrame( + model.named_steps['transformedtargetregressor'].regressor_.coef_ * + X_train_preprocessed.std(), + columns=['Coefficients'], index=feature_names +) +coefs.plot(kind='barh', figsize=(9, 7)) +plt.axvline(x=0, color='.5'); + +############################################################################### +# The plot above tells us that an increase of the "AGE" will induce a decrease +# of the "WAGE" when all other features remain constant. Also an increase of +# the "EXPERIENCE" will induce an increase of the "WAGE" when all other +# features remain constant. +# +# The first interpretation might look counter-intuitive at first, if one relates +# the relationship between "AGE" and "WAGE" as a marginal link. +# However, as previously mentioned, a linear model computes a conditional +# link between "AGE" and "WAGE" given all other features. +# Therefore, one should also interpret that for a given experience (and all other +# features constant as well ...), a younger person would have an higher wage. + From 407439c9642df13331656e2288646c8340641ad8 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 18 Nov 2019 17:14:28 +0100 Subject: [PATCH 02/85] Add coefficient instability. --- ...linear_model_coefficient_interpretation.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index e7077a953364a..de556bf486bad 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -33,6 +33,8 @@ from sklearn.linear_model import RidgeCV from sklearn.compose import TransformedTargetRegressor from sklearn.metrics import median_absolute_error +from sklearn.model_selection import cross_validate +from sklearn.model_selection import RepeatedKFold ############################################################################# # Determinants of Wages from the 1985 Current Population Survey @@ -188,4 +190,39 @@ def mae_scorer(model, X_train, X_test, y_train, y_test): # link between "AGE" and "WAGE" given all other features. # Therefore, one should also interpret that for a given experience (and all other # features constant as well ...), a younger person would have an higher wage. +# +# Checking the coefficient stability +# .................................. +# +# The stability of the coefficients is a guarantee of the robustness of the +# model. We can check the coefficient stability through cross-validation. + +cv_model = cross_validate( + model, X, y, cv=RepeatedKFold(n_splits=5, n_repeats=5), + return_estimator=True, n_jobs=-1 +) +coefs = pd.DataFrame( + [est.named_steps['transformedtargetregressor'].regressor_.coef_ * + X_train_preprocessed.std() + for est in cv_model['estimator']], + columns=feature_names +) +plt.figure(figsize=(9, 7)) +sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) +sns.boxenplot(data=coefs, orient='h', color='C0') +plt.axvline(x=0, color='.5') +plt.title('Stability of coefficients'); + +############################################################################### +# The "AGE" and "EXPERIENCE" coefficients are highly instable which might be +# due to the collinearity between the 2 features. 
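+#
+# As a rough numerical check of that collinearity, one can also print the
+# correlation between the two columns before looking at their scatter plot
+# below:
+
+print(np.corrcoef(survey.data['AGE'], survey.data['EXPERIENCE']))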
+ +age = survey.data['AGE'].values +experience = survey.data['EXPERIENCE'].values +sns.regplot(age,experience,scatter_kws={"color": "black", "alpha": 0.2, "s": 30}, + line_kws={"color": "red"}) + +############################################################################## +# We can remove one of the 2 features and check what is the impact on the +# features stability. From e70b5aa6b15288e0f2b7a19a41cf28a7ed71749a Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 19 Nov 2019 17:06:24 +0100 Subject: [PATCH 03/85] Working on coefficient stability --- ...linear_model_coefficient_interpretation.py | 35 ++++++++++++++++--- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index de556bf486bad..f04cdd8ad6b37 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -8,7 +8,7 @@ section for a description of a set of linear model methods available in scikit-learn). It is important to emphasize that linear models compute conditional links. -The interpretation of the coefficient gives the relationship between the +The interpretation of the coefficients gives the relationship between the feature and the target given that other features remain constant. This example will show some hints in interpreting coefficient in linear models, @@ -24,6 +24,10 @@ import matplotlib.pyplot as plt import seaborn as sns + +from matplotlib.axes._axes import _log as matplotlib_axes_logger +matplotlib_axes_logger.setLevel('ERROR') + from sklearn.datasets import fetch_openml from sklearn.model_selection import train_test_split from sklearn.compose import make_column_transformer @@ -111,9 +115,11 @@ def mae_scorer(model, X_train, X_test, y_train, y_test): y_pred = model.predict(X_train) - string_score = f'MAE on training set: {median_absolute_error(y_train, y_pred):.2f} $/hour' + string_score = f'MAE on training set: \ + {median_absolute_error(y_train, y_pred):.2f} $/hour' y_pred = model.predict(X_test) - string_score += f'\nMAE on testing set: {median_absolute_error(y_test, y_pred):.2f} $/hour' + string_score += f'\nMAE on testing set: \ + {median_absolute_error(y_test, y_pred):.2f} $/hour' return string_score fig, ax = plt.subplots(figsize=(6, 6)) @@ -219,10 +225,29 @@ def mae_scorer(model, X_train, X_test, y_train, y_test): age = survey.data['AGE'].values experience = survey.data['EXPERIENCE'].values -sns.regplot(age,experience,scatter_kws={"color": "black", "alpha": 0.2, "s": 30}, - line_kws={"color": "red"}) +sns.regplot(age,experience,scatter_kws={"color": "black", "alpha": 0.2, "s": 30} + , line_kws={"color": "red"}) ############################################################################## # We can remove one of the 2 features and check what is the impact on the # features stability. 
+column_to_drop = ['AGE'] + +cv_model = cross_validate( + model, X.drop(columns=column_to_drop), y, + cv=RepeatedKFold(n_splits=5, n_repeats=5), + return_estimator=True, n_jobs=-1 +) +coefs = pd.DataFrame( + [est.named_steps['transformedtargetregressor'].regressor_.coef_ * + X_train_preprocessed.drop(columns=column_to_drop).std() + for est in cv_model['estimator']], + columns=feature_names[:-1] +) +plt.figure(figsize=(9, 7)) +sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) +sns.boxenplot(data=coefs, orient='h', color='C0') +plt.axvline(x=0, color='.5') +plt.title('Stability of coefficients'); + From e5eb09ba883101fc513c3effc21e098574cd97c5 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 21 Nov 2019 17:09:42 +0100 Subject: [PATCH 04/85] Fix plot properties. --- ...linear_model_coefficient_interpretation.py | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index f04cdd8ad6b37..8ccb5c81dc8d3 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -54,14 +54,8 @@ X = survey.data[survey.feature_names] y = survey.target.values.ravel() -ax = sns.kdeplot(y, shade=True, color="r") -plt.xlabel(survey.target_names) -plt.show() ############################################################################## -# Note that the "WAGE" distribution has a long tail and we could take its log -# to simplify our problem getting closer to a normal distribution. -# # The dataset is composed by columns with different data types and we need to # apply a specific preprocessing for each data types. # Our pre-processor will @@ -85,8 +79,7 @@ # Modeling the data # ................. # -# We will fit a ridge regressor and transform the target before the fit using -# a log transform. +# We will fit a ridge regressor. # But before computing the model we split the sample in a train and a test # dataset. @@ -105,6 +98,13 @@ model.fit(X_train, y_train); +############################################################################## +# Note that the "WAGE" distribution has a long tail and we could take its log +# to simplify our problem getting closer to a normal distribution. +# That's why a log transform has been applied to the target before the fit. + +ax = sns.kdeplot(y, shade=True, color="r") +plt.xlabel(survey.target_names) ############################################################################## # Scoring the model @@ -157,6 +157,8 @@ def mae_scorer(model, X_train, X_test, y_train, y_test): ) coefs.plot(kind='barh', figsize=(9, 7)) plt.axvline(x=0, color='.5'); +plt.subplots_adjust(left=.3) +plt.show() ############################################################################### # Soon we realize that we cannot compare different coefficients since we did @@ -171,6 +173,7 @@ def mae_scorer(model, X_train, X_test, y_train, y_test): ) X_train_preprocessed.std().plot(kind='barh', figsize=(9, 7)) plt.title('Features std. 
dev.'); +plt.subplots_adjust(left=.3) ############################################################################### # We can then normalize the coefficients by the standard deviation and we will @@ -183,6 +186,7 @@ def mae_scorer(model, X_train, X_test, y_train, y_test): ) coefs.plot(kind='barh', figsize=(9, 7)) plt.axvline(x=0, color='.5'); +plt.subplots_adjust(left=.3) ############################################################################### # The plot above tells us that an increase of the "AGE" will induce a decrease @@ -218,6 +222,7 @@ def mae_scorer(model, X_train, X_test, y_train, y_test): sns.boxenplot(data=coefs, orient='h', color='C0') plt.axvline(x=0, color='.5') plt.title('Stability of coefficients'); +plt.subplots_adjust(left=.3) ############################################################################### # The "AGE" and "EXPERIENCE" coefficients are highly instable which might be @@ -227,6 +232,8 @@ def mae_scorer(model, X_train, X_test, y_train, y_test): experience = survey.data['EXPERIENCE'].values sns.regplot(age,experience,scatter_kws={"color": "black", "alpha": 0.2, "s": 30} , line_kws={"color": "red"}) +plt.ylabel('EXPERIENCE') +plt.xlabel('AGE') ############################################################################## # We can remove one of the 2 features and check what is the impact on the @@ -250,4 +257,5 @@ def mae_scorer(model, X_train, X_test, y_train, y_test): sns.boxenplot(data=coefs, orient='h', color='C0') plt.axvline(x=0, color='.5') plt.title('Stability of coefficients'); +plt.subplots_adjust(left=.3) From dc3fca805fd97b962890faa95bec3ec5e4ec2858 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 22 Nov 2019 15:20:12 +0100 Subject: [PATCH 05/85] Rearrange plot. Introduce more dataset inspection. --- ...linear_model_coefficient_interpretation.py | 143 ++++++++++-------- 1 file changed, 80 insertions(+), 63 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 8ccb5c81dc8d3..edd4cdcd39ba8 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -4,7 +4,7 @@ =============================================== Linear models describe situations in which the target value is expected to be -a linear combination of the features (see the :ref:`linear_model` User guide +a linear combination of the features (see the :ref:`linear_model` User Guide section for a description of a set of linear model methods available in scikit-learn). It is important to emphasize that linear models compute conditional links. 
@@ -24,38 +24,54 @@ import matplotlib.pyplot as plt import seaborn as sns - +# this import hids some warnings in axis plotting from matplotlib.axes._axes import _log as matplotlib_axes_logger matplotlib_axes_logger.setLevel('ERROR') -from sklearn.datasets import fetch_openml -from sklearn.model_selection import train_test_split -from sklearn.compose import make_column_transformer -from sklearn.preprocessing import OneHotEncoder -from sklearn.preprocessing import OrdinalEncoder -from sklearn.pipeline import make_pipeline -from sklearn.linear_model import RidgeCV -from sklearn.compose import TransformedTargetRegressor -from sklearn.metrics import median_absolute_error -from sklearn.model_selection import cross_validate -from sklearn.model_selection import RepeatedKFold - ############################################################################# # Determinants of Wages from the 1985 Current Population Survey # ------------------------------------------------------------- # -# First of all we fetch the data from `OpenML `_. +# The dataset +# ........... +# +# We fetch the data from `OpenML `_. # Note that setting the parameter `as_frame` to `True` will retrieve the data # as a pandas dataframe. -# Then, we identify features (`X`) and targets (`y`): the column 'WAGE' is our -# target variable (i.e., the variable which we want to predict). +from sklearn.datasets import fetch_openml + survey = fetch_openml(data_id=534, as_frame=True) +############################################################################## +# Then, we identify features (`X`) and targets (`y`): the column 'WAGE' is our +# target variable (i.e., the variable which we want to predict). + X = survey.data[survey.feature_names] +print(X.head()) +############################################################################## y = survey.target.values.ravel() +print(survey.target.head()) + +############################################################################## +# First, let's get some insights by looking at the marginal links between the +# different variables. Only numerical variables will be used. + +sns.pairplot(survey.frame, diag_kind='kde') ############################################################################## +# Note that the "WAGE" distribution has a long tail and we could take its log +# to simplify our problem getting closer to a normal distribution. +# For all 3 variables, "EDUCATION", "EXPERIENCE", and "AGE", the "WAGE" is +# increasing when these variables are increasing. Also, the "EXPERIENCE" and +# "AGE" are correlated. +# +# The pipeline +# ............ + +survey.data.info() + +############################################################################# # The dataset is composed by columns with different data types and we need to # apply a specific preprocessing for each data types. # Our pre-processor will @@ -65,6 +81,10 @@ # - replace by 0 and 1 the categories of binary columns; # - keep numerical values as they are. +from sklearn.compose import make_column_transformer +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import OrdinalEncoder + categorical_columns = ['RACE', 'OCCUPATION', 'SECTOR'] binary_columns = ['MARR', 'UNION', 'SEX', 'SOUTH'] numerical_columns = ['EDUCATION', 'EXPERIENCE', 'AGE'] @@ -76,12 +96,12 @@ ) ############################################################################## -# Modeling the data -# ................. -# -# We will fit a ridge regressor. -# But before computing the model we split the sample in a train and a test -# dataset. 
+# To describe the dataset as a linear model we choose to use a ridge regressor +# and to model le log of the "WAGE". + +from sklearn.pipeline import make_pipeline +from sklearn.linear_model import RidgeCV +from sklearn.compose import TransformedTargetRegressor model = make_pipeline( preprocessor, @@ -92,54 +112,53 @@ ) ) +############################################################################## +# Processing the dataset +# ...................... +# +# First of all we split the sample in a train and a test dataset. + +from sklearn.model_selection import train_test_split + X_train, X_test, y_train, y_test = train_test_split( X, y, random_state=42 ) -model.fit(X_train, y_train); - ############################################################################## -# Note that the "WAGE" distribution has a long tail and we could take its log -# to simplify our problem getting closer to a normal distribution. -# That's why a log transform has been applied to the target before the fit. +# Then we fit the model -ax = sns.kdeplot(y, shade=True, color="r") -plt.xlabel(survey.target_names) +model.fit(X_train, y_train); ############################################################################## -# Scoring the model -# ................. -# # We can check the performance of the computed model using, for example, the # median absolute error of the model. -def mae_scorer(model, X_train, X_test, y_train, y_test): - y_pred = model.predict(X_train) - string_score = f'MAE on training set: \ - {median_absolute_error(y_train, y_pred):.2f} $/hour' - y_pred = model.predict(X_test) - string_score += f'\nMAE on testing set: \ - {median_absolute_error(y_test, y_pred):.2f} $/hour' - return string_score +from sklearn.metrics import median_absolute_error -fig, ax = plt.subplots(figsize=(6, 6)) +y_pred = model.predict(X_train) +mae = median_absolute_error(y_train, y_pred) +string_score = 'MAE on training set: {0:.2f} $/hour'.format(mae) y_pred = model.predict(X_test) +mae = median_absolute_error(y_test, y_pred) + +string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) + +fig, ax = plt.subplots(figsize=(6, 6)) sns.regplot(y_test, y_pred) -plt.text(3, 20, mae_scorer(model, X_train, X_test, y_train, y_test)) +plt.text(3, 20, string_score) plt.ylabel('Model predictions') plt.xlabel('Truths') plt.xlim([0, 27]) -plt.ylim([0, 27]); +plt.ylim([0, 27]) ############################################################################## # The model learnt is far to be a good model making accurate prediction. # As interpretation tools characterize model rather than the generative process # of the data itself, it needs to be emphasized that interpretations are correct # if the model is correct as well. - -############################################################################## +# # Interpreting coefficients # ......................... # @@ -158,11 +177,10 @@ def mae_scorer(model, X_train, X_test, y_train, y_test): coefs.plot(kind='barh', figsize=(9, 7)) plt.axvline(x=0, color='.5'); plt.subplots_adjust(left=.3) -plt.show() ############################################################################### # Soon we realize that we cannot compare different coefficients since we did -# not scale the data before the fit features having different value ranges. +# not scale the data before the fit, features having different value ranges. # For instance, the "AGE" coefficient is expressed in $/hours/leaving years # while the "EDUCATION" is expressed in $/hours/years of education. 
# This is evident if we compare feature standard deviations. @@ -176,8 +194,8 @@ def mae_scorer(model, X_train, X_test, y_train, y_test): plt.subplots_adjust(left=.3) ############################################################################### -# We can then normalize the coefficients by the standard deviation and we will -# be able to compare them. +# We should then normalize the coefficients by the standard deviation and we +# will be able to compare them. coefs = pd.DataFrame( model.named_steps['transformedtargetregressor'].regressor_.coef_ * @@ -198,8 +216,9 @@ def mae_scorer(model, X_train, X_test, y_train, y_test): # the relationship between "AGE" and "WAGE" as a marginal link. # However, as previously mentioned, a linear model computes a conditional # link between "AGE" and "WAGE" given all other features. -# Therefore, one should also interpret that for a given experience (and all other -# features constant as well ...), a younger person would have an higher wage. +# Therefore, one should also interpret that for a given experience and all +# other features constant as well, a younger person would have an higher +# wage. # # Checking the coefficient stability # .................................. @@ -207,6 +226,9 @@ def mae_scorer(model, X_train, X_test, y_train, y_test): # The stability of the coefficients is a guarantee of the robustness of the # model. We can check the coefficient stability through cross-validation. +from sklearn.model_selection import cross_validate +from sklearn.model_selection import RepeatedKFold + cv_model = cross_validate( model, X, y, cv=RepeatedKFold(n_splits=5, n_repeats=5), return_estimator=True, n_jobs=-1 @@ -227,17 +249,9 @@ def mae_scorer(model, X_train, X_test, y_train, y_test): ############################################################################### # The "AGE" and "EXPERIENCE" coefficients are highly instable which might be # due to the collinearity between the 2 features. - -age = survey.data['AGE'].values -experience = survey.data['EXPERIENCE'].values -sns.regplot(age,experience,scatter_kws={"color": "black", "alpha": 0.2, "s": 30} - , line_kws={"color": "red"}) -plt.ylabel('EXPERIENCE') -plt.xlabel('AGE') - -############################################################################## -# We can remove one of the 2 features and check what is the impact on the -# features stability. +# +# In order to verify our interpretation we can remove one of the 2 features and +# check what is the impact on the features stability. column_to_drop = ['AGE'] @@ -259,3 +273,6 @@ def mae_scorer(model, X_train, X_test, y_train, y_test): plt.title('Stability of coefficients'); plt.subplots_adjust(left=.3) +############################################################################### +# The estimation of the "EXPERIENCE" coefficient is now more stable with +# respect to cross_validation. From 860050706b03cf7c37eeb6087e9b1bde1c0d9583 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 22 Nov 2019 15:34:11 +0100 Subject: [PATCH 06/85] Fix lint issues. 
--- ...linear_model_coefficient_interpretation.py | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index edd4cdcd39ba8..46c36cab31679 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -11,20 +11,19 @@ The interpretation of the coefficients gives the relationship between the feature and the target given that other features remain constant. -This example will show some hints in interpreting coefficient in linear models, +This example will show some hints in interpreting coefficient in linear models, using data from the "Current Population Survey" from 1985. """ print(__doc__) -from time import time import numpy as np import scipy as sp import pandas as pd import matplotlib.pyplot as plt import seaborn as sns -# this import hids some warnings in axis plotting +# this import hids some warnings in axis plotting from matplotlib.axes._axes import _log as matplotlib_axes_logger matplotlib_axes_logger.setLevel('ERROR') @@ -38,7 +37,7 @@ # We fetch the data from `OpenML `_. # Note that setting the parameter `as_frame` to `True` will retrieve the data # as a pandas dataframe. - + from sklearn.datasets import fetch_openml survey = fetch_openml(data_id=534, as_frame=True) @@ -55,7 +54,7 @@ ############################################################################## # First, let's get some insights by looking at the marginal links between the -# different variables. Only numerical variables will be used. +# different variables. Only numerical variables will be used. sns.pairplot(survey.frame, diag_kind='kde') @@ -75,7 +74,7 @@ # The dataset is composed by columns with different data types and we need to # apply a specific preprocessing for each data types. # Our pre-processor will -# +# # - one-hot encode (i.e., generate a column by category) the categorical # columns; # - replace by 0 and 1 the categories of binary columns; @@ -127,11 +126,11 @@ ############################################################################## # Then we fit the model -model.fit(X_train, y_train); +model.fit(X_train, y_train) ############################################################################## # We can check the performance of the computed model using, for example, the -# median absolute error of the model. +# median absolute error of the model. from sklearn.metrics import median_absolute_error @@ -156,8 +155,8 @@ ############################################################################## # The model learnt is far to be a good model making accurate prediction. # As interpretation tools characterize model rather than the generative process -# of the data itself, it needs to be emphasized that interpretations are correct -# if the model is correct as well. +# of the data itself, it needs to be emphasized that interpretations are +# correct if the model is correct as well. # # Interpreting coefficients # ......................... 
@@ -168,14 +167,15 @@ feature_names = (model.named_steps['columntransformer'] .named_transformers_['onehotencoder'] .get_feature_names(input_features=categorical_columns)) -feature_names = np.concatenate([feature_names, binary_columns, numerical_columns]) +feature_names = np.concatenate([feature_names, binary_columns, + numerical_columns]) coefs = pd.DataFrame( model.named_steps['transformedtargetregressor'].regressor_.coef_, columns=['Coefficients'], index=feature_names ) coefs.plot(kind='barh', figsize=(9, 7)) -plt.axvline(x=0, color='.5'); +plt.axvline(x=0, color='.5') plt.subplots_adjust(left=.3) ############################################################################### @@ -190,7 +190,7 @@ columns=feature_names ) X_train_preprocessed.std().plot(kind='barh', figsize=(9, 7)) -plt.title('Features std. dev.'); +plt.title('Features std. dev.') plt.subplots_adjust(left=.3) ############################################################################### @@ -203,7 +203,7 @@ columns=['Coefficients'], index=feature_names ) coefs.plot(kind='barh', figsize=(9, 7)) -plt.axvline(x=0, color='.5'); +plt.axvline(x=0, color='.5') plt.subplots_adjust(left=.3) ############################################################################### @@ -212,8 +212,8 @@ # the "EXPERIENCE" will induce an increase of the "WAGE" when all other # features remain constant. # -# The first interpretation might look counter-intuitive at first, if one relates -# the relationship between "AGE" and "WAGE" as a marginal link. +# The first interpretation might look counter-intuitive at first, if one +# relates the relationship between "AGE" and "WAGE" as a marginal link. # However, as previously mentioned, a linear model computes a conditional # link between "AGE" and "WAGE" given all other features. # Therefore, one should also interpret that for a given experience and all @@ -243,7 +243,7 @@ sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) sns.boxenplot(data=coefs, orient='h', color='C0') plt.axvline(x=0, color='.5') -plt.title('Stability of coefficients'); +plt.title('Stability of coefficients') plt.subplots_adjust(left=.3) ############################################################################### @@ -270,7 +270,7 @@ sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) sns.boxenplot(data=coefs, orient='h', color='C0') plt.axvline(x=0, color='.5') -plt.title('Stability of coefficients'); +plt.title('Stability of coefficients') plt.subplots_adjust(left=.3) ############################################################################### From 159d028279e632d7285a90972d2f67db1fbb6a6c Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 22 Nov 2019 15:36:15 +0100 Subject: [PATCH 07/85] Fix overindentation. 
--- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 46c36cab31679..ba8149e0be872 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -168,7 +168,7 @@ .named_transformers_['onehotencoder'] .get_feature_names(input_features=categorical_columns)) feature_names = np.concatenate([feature_names, binary_columns, - numerical_columns]) + numerical_columns]) coefs = pd.DataFrame( model.named_steps['transformedtargetregressor'].regressor_.coef_, From 43125faf93d871590c5e561a3913dffddc7dbafb Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 22 Nov 2019 15:40:55 +0100 Subject: [PATCH 08/85] Fix underindentation :) . --- .../plot_linear_model_coefficient_interpretation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index ba8149e0be872..68d5126d9fb7e 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -167,8 +167,8 @@ feature_names = (model.named_steps['columntransformer'] .named_transformers_['onehotencoder'] .get_feature_names(input_features=categorical_columns)) -feature_names = np.concatenate([feature_names, binary_columns, - numerical_columns]) +feature_names = np.concatenate( + [feature_names, binary_columns, numerical_columns]) coefs = pd.DataFrame( model.named_steps['transformedtargetregressor'].regressor_.coef_, From 9df008afab9c7a6b446e2ad50af1074d807457d2 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 22 Nov 2019 15:59:03 +0100 Subject: [PATCH 09/85] Add seaborn in build doc conda environment. --- build_tools/circle/build_doc.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 59c0fe659a2ad..15196aca1e723 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -169,7 +169,7 @@ conda create -n $CONDA_ENV_NAME --yes --quiet python="${PYTHON_VERSION:-*}" \ cython="${CYTHON_VERSION:-*}" pytest coverage \ matplotlib="${MATPLOTLIB_VERSION:-*}" sphinx=2.1.2 pillow \ scikit-image="${SCIKIT_IMAGE_VERSION:-*}" pandas="${PANDAS_VERSION:-*}" \ - joblib memory_profiler packaging + joblib memory_profiler packaging seaborn source activate testenv pip install sphinx-gallery==0.3.1 From da237628bd236b49704eed82d8a8609a099d8b9a Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 22 Nov 2019 16:20:07 +0100 Subject: [PATCH 10/85] Remove matplotlib logger (too recent for build doc-min-dependencies?). 
--- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 68d5126d9fb7e..00ca2ca280cc6 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -24,8 +24,6 @@ import seaborn as sns # this import hids some warnings in axis plotting -from matplotlib.axes._axes import _log as matplotlib_axes_logger -matplotlib_axes_logger.setLevel('ERROR') ############################################################################# # Determinants of Wages from the 1985 Current Population Survey From c875e4b6d8fd030c34b6ba2b616841e5e915b311 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 25 Nov 2019 13:20:28 +0100 Subject: [PATCH 11/85] Change boxenplot to boxplot to avoid warnings. --- .../plot_linear_model_coefficient_interpretation.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 00ca2ca280cc6..b784be4d3171f 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -23,8 +23,6 @@ import matplotlib.pyplot as plt import seaborn as sns -# this import hids some warnings in axis plotting - ############################################################################# # Determinants of Wages from the 1985 Current Population Survey # ------------------------------------------------------------- @@ -239,7 +237,7 @@ ) plt.figure(figsize=(9, 7)) sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) -sns.boxenplot(data=coefs, orient='h', color='C0') +sns.boxplot(data=coefs, orient='h', color='C0') plt.axvline(x=0, color='.5') plt.title('Stability of coefficients') plt.subplots_adjust(left=.3) @@ -266,11 +264,12 @@ ) plt.figure(figsize=(9, 7)) sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) -sns.boxenplot(data=coefs, orient='h', color='C0') +sns.boxplot(data=coefs, orient='h', color='C0') plt.axvline(x=0, color='.5') plt.title('Stability of coefficients') plt.subplots_adjust(left=.3) +plt.show() ############################################################################### # The estimation of the "EXPERIENCE" coefficient is now more stable with # respect to cross_validation. From 6dcd6bc95d0f10a86b91c6af6b017c500868efa1 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 25 Nov 2019 13:22:48 +0100 Subject: [PATCH 12/85] Remove debug lines. --- .../inspection/plot_linear_model_coefficient_interpretation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index b784be4d3171f..a328d1e6347ab 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -269,7 +269,6 @@ plt.title('Stability of coefficients') plt.subplots_adjust(left=.3) -plt.show() ############################################################################### # The estimation of the "EXPERIENCE" coefficient is now more stable with # respect to cross_validation. 
From 9bdff51c8e1c86ca42493eca8ca2882d287b6f3b Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 25 Nov 2019 13:43:48 +0100 Subject: [PATCH 13/85] Fix 'Invalid rgb arg C0' Error for doc min dependencies build. --- .../plot_linear_model_coefficient_interpretation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index a328d1e6347ab..db3b7fac6799a 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -237,7 +237,7 @@ ) plt.figure(figsize=(9, 7)) sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) -sns.boxplot(data=coefs, orient='h', color='C0') +sns.boxplot(data=coefs, orient='h', color='blue') plt.axvline(x=0, color='.5') plt.title('Stability of coefficients') plt.subplots_adjust(left=.3) @@ -264,7 +264,7 @@ ) plt.figure(figsize=(9, 7)) sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) -sns.boxplot(data=coefs, orient='h', color='C0') +sns.boxplot(data=coefs, orient='h', color='blue') plt.axvline(x=0, color='.5') plt.title('Stability of coefficients') plt.subplots_adjust(left=.3) From d152cd10996c3d7b46f5584ac5a52601fa3e1d6e Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 28 Nov 2019 15:08:19 +0100 Subject: [PATCH 14/85] Trigger CI From 46ada5d26739a2e2a71035374ca914e05866be63 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 3 Dec 2019 21:20:05 +0100 Subject: [PATCH 15/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Nicolas Hug --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index db3b7fac6799a..feb20b43130bb 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -11,7 +11,7 @@ The interpretation of the coefficients gives the relationship between the feature and the target given that other features remain constant. -This example will show some hints in interpreting coefficient in linear models, +This example will provide some hints in interpreting coefficient in linear models, using data from the "Current Population Survey" from 1985. 
""" From 7bfd0c7861ba9dc7b61f0ff0e12897a10a8d6c07 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 3 Dec 2019 21:20:43 +0100 Subject: [PATCH 16/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Nicolas Hug --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index feb20b43130bb..da0e9b973e53a 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -39,7 +39,7 @@ survey = fetch_openml(data_id=534, as_frame=True) ############################################################################## -# Then, we identify features (`X`) and targets (`y`): the column 'WAGE' is our +# Then, we identify features `X` and targets `y`: the column WAGE is our # target variable (i.e., the variable which we want to predict). X = survey.data[survey.feature_names] From c023cff070a12f547ee6f66004f583e79e23ef27 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 3 Dec 2019 21:21:50 +0100 Subject: [PATCH 17/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Nicolas Hug --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index da0e9b973e53a..c23eb14a73e03 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -149,7 +149,7 @@ plt.ylim([0, 27]) ############################################################################## -# The model learnt is far to be a good model making accurate prediction. +# The model learnt is far from being a good model making accurate predictions. # As interpretation tools characterize model rather than the generative process # of the data itself, it needs to be emphasized that interpretations are # correct if the model is correct as well. From 64380b2036c900ff2782399ada48ce702ee6298d Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 4 Dec 2019 12:07:37 +0100 Subject: [PATCH 18/85] Add seaborn in doc dependencies, fix easy comments. --- doc/developers/contributing.rst | 3 +- ...linear_model_coefficient_interpretation.py | 48 +++++++++---------- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 863ecfb7741b3..43d7daf357914 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -542,7 +542,8 @@ the development version. 
Building the documentation requires installing some additional packages:: - pip install sphinx sphinx-gallery numpydoc matplotlib Pillow pandas scikit-image packaging + pip install sphinx sphinx-gallery numpydoc matplotlib Pillow pandas \ + scikit-image packaging seaborn To build the documentation, you need to be in the ``doc`` folder:: diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index c23eb14a73e03..6b8a31cdca942 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -8,11 +8,11 @@ section for a description of a set of linear model methods available in scikit-learn). It is important to emphasize that linear models compute conditional links. -The interpretation of the coefficients gives the relationship between the -feature and the target given that other features remain constant. +The interpretation of a coefficient gives the relationship between the given +feature and the target assuming that other features remain constant. -This example will provide some hints in interpreting coefficient in linear models, -using data from the "Current Population Survey" from 1985. +This example will provide some hints in interpreting coefficient in linear +models, using data from the "Current Population Survey" from 1985. """ print(__doc__) @@ -31,7 +31,7 @@ # ........... # # We fetch the data from `OpenML `_. -# Note that setting the parameter `as_frame` to `True` will retrieve the data +# Note that setting the parameter `as_frame` to True will retrieve the data # as a pandas dataframe. from sklearn.datasets import fetch_openml @@ -43,10 +43,10 @@ # target variable (i.e., the variable which we want to predict). X = survey.data[survey.feature_names] -print(X.head()) +X.head() ############################################################################## y = survey.target.values.ravel() -print(survey.target.head()) +survey.target.head() ############################################################################## # First, let's get some insights by looking at the marginal links between the @@ -55,11 +55,11 @@ sns.pairplot(survey.frame, diag_kind='kde') ############################################################################## -# Note that the "WAGE" distribution has a long tail and we could take its log +# Note that the WAGE distribution has a long tail and we could take its log # to simplify our problem getting closer to a normal distribution. -# For all 3 variables, "EDUCATION", "EXPERIENCE", and "AGE", the "WAGE" is -# increasing when these variables are increasing. Also, the "EXPERIENCE" and -# "AGE" are correlated. +# For all 3 variables, EDUCATION, EXPERIENCE, and AGE, the WAGE is +# increasing when these variables are increasing. Also, the EXPERIENCE and +# AGE are correlated. # # The pipeline # ............ @@ -92,7 +92,7 @@ ############################################################################## # To describe the dataset as a linear model we choose to use a ridge regressor -# and to model le log of the "WAGE". +# and to model le log of the WAGE. from sklearn.pipeline import make_pipeline from sklearn.linear_model import RidgeCV @@ -177,8 +177,8 @@ ############################################################################### # Soon we realize that we cannot compare different coefficients since we did # not scale the data before the fit, features having different value ranges. 
-# For instance, the "AGE" coefficient is expressed in $/hours/leaving years -# while the "EDUCATION" is expressed in $/hours/years of education. +# For instance, the AGE coefficient is expressed in $/hours/leaving years +# while the EDUCATION is expressed in $/hours/years of education. # This is evident if we compare feature standard deviations. X_train_preprocessed = pd.DataFrame( @@ -203,17 +203,17 @@ plt.subplots_adjust(left=.3) ############################################################################### -# The plot above tells us that an increase of the "AGE" will induce a decrease -# of the "WAGE" when all other features remain constant. Also an increase of -# the "EXPERIENCE" will induce an increase of the "WAGE" when all other +# The plot above tells us that an increase of the AGE will induce a decrease +# of the WAGE when all other features remain constant. Also an increase of +# the EXPERIENCE will induce an increase of the WAGE when all other # features remain constant. # # The first interpretation might look counter-intuitive at first, if one -# relates the relationship between "AGE" and "WAGE" as a marginal link. +# relates the relationship between AGE and WAGE as a marginal link. # However, as previously mentioned, a linear model computes a conditional -# link between "AGE" and "WAGE" given all other features. +# link between AGE and WAGE given all other features. # Therefore, one should also interpret that for a given experience and all -# other features constant as well, a younger person would have an higher +# other features constant as well, a younger person would have a higher # wage. # # Checking the coefficient stability @@ -243,10 +243,10 @@ plt.subplots_adjust(left=.3) ############################################################################### -# The "AGE" and "EXPERIENCE" coefficients are highly instable which might be +# The AGE and EXPERIENCE coefficients are highly unstable which might be # due to the collinearity between the 2 features. # -# In order to verify our interpretation we can remove one of the 2 features and +# In order to verify our interpretation we remove one of the 2 features and # check what is the impact on the features stability. column_to_drop = ['AGE'] @@ -270,5 +270,5 @@ plt.subplots_adjust(left=.3) ############################################################################### -# The estimation of the "EXPERIENCE" coefficient is now more stable with -# respect to cross_validation. +# The estimation of the EXPERIENCE coefficient is now more stable and +# remain important for all predictors trained during cross-validation. From b8cd2c13680fb6d8948b6f54780c3184b34e8285 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 5 Dec 2019 16:58:13 +0100 Subject: [PATCH 19/85] Add intersphinx_mapping for seaborn. Address some of the comments. 
--- doc/conf.py | 1 + ...linear_model_coefficient_interpretation.py | 24 +++++++++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 7959a0862f547..e4ea45e3eb767 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -244,6 +244,7 @@ 'matplotlib': ('https://matplotlib.org/', None), 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None), 'joblib': ('https://joblib.readthedocs.io/en/latest/', None), + 'seaborn': ('https://seaborn.pydata.org/', None), } v = parse(release) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 6b8a31cdca942..a0db7da621338 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -7,9 +7,9 @@ a linear combination of the features (see the :ref:`linear_model` User Guide section for a description of a set of linear model methods available in scikit-learn). -It is important to emphasize that linear models compute conditional links. -The interpretation of a coefficient gives the relationship between the given -feature and the target assuming that other features remain constant. +It is important to emphasize that coefficients in multiple linear models +represent the relationship between the given feature and the target +assuming that other features remain constant. This example will provide some hints in interpreting coefficient in linear models, using data from the "Current Population Survey" from 1985. @@ -41,9 +41,19 @@ ############################################################################## # Then, we identify features `X` and targets `y`: the column WAGE is our # target variable (i.e., the variable which we want to predict). - +# X = survey.data[survey.feature_names] +X.describe(include="all") + +############################################################################## +# Notice that the dataset contains categorical and numerical variables. +# Some of the categorical variables are binary variables. +# About the numerical ones we can observe that AGE and EXPERIENCE have similar +# distributions while the EDUCATION distribution is narrower. +# This will give us directions on how to preprocess the data thereafter. + X.head() + ############################################################################## y = survey.target.values.ravel() survey.target.head() @@ -67,14 +77,14 @@ survey.data.info() ############################################################################# -# The dataset is composed by columns with different data types and we need to -# apply a specific preprocessing for each data types. +# As seen previously, the dataset contains columns with different data types +# and we need to apply a specific preprocessing for each data types. # Our pre-processor will # # - one-hot encode (i.e., generate a column by category) the categorical # columns; # - replace by 0 and 1 the categories of binary columns; -# - keep numerical values as they are. +# - as a first approach, keep numerical values as they are. from sklearn.compose import make_column_transformer from sklearn.preprocessing import OneHotEncoder From a8b2f39add25d9db1621086bb505d03a0aadc06c Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 9 Dec 2019 17:12:39 +0100 Subject: [PATCH 20/85] Address some comments. test_trai_split moved above. 
--- ...linear_model_coefficient_interpretation.py | 45 ++++++++++++------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index a0db7da621338..472819e02b55e 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -13,6 +13,8 @@ This example will provide some hints in interpreting coefficient in linear models, using data from the "Current Population Survey" from 1985. + +A description of the dataset follows. """ print(__doc__) @@ -58,33 +60,55 @@ y = survey.target.values.ravel() survey.target.head() +############################################################################### +# We split the sample in a train and a test dataset, +# Only the train dataset will be used in the following exploratory analysis. +# This is a way to emulate a real situation where predictions are performed on +# an unknown target. + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split( + X, y, random_state=42 +) + ############################################################################## # First, let's get some insights by looking at the marginal links between the # different variables. Only numerical variables will be used. -sns.pairplot(survey.frame, diag_kind='kde') +train_dataset = X_train.copy() +train_dataset.insert(0,"WAGE",y_train) +sns.pairplot(train_dataset, diag_kind='kde') +plt.show() ############################################################################## -# Note that the WAGE distribution has a long tail and we could take its log -# to simplify our problem getting closer to a normal distribution. +# Looking closely at the WAGE distribution it could be noticed that it has a +# long tail and we could take its logarithm +# to simplify our problem and approximate a normal distribution. # For all 3 variables, EDUCATION, EXPERIENCE, and AGE, the WAGE is # increasing when these variables are increasing. Also, the EXPERIENCE and # AGE are correlated. # # The pipeline # ............ +# +# To design our machine-learning pipeline, we will manually +# check the type of data that we are dealing with: survey.data.info() ############################################################################# # As seen previously, the dataset contains columns with different data types # and we need to apply a specific preprocessing for each data types. +# In particular categorical variables cannot be included in linear model if not +# coded as integers first. # Our pre-processor will # # - one-hot encode (i.e., generate a column by category) the categorical # columns; # - replace by 0 and 1 the categories of binary columns; -# - as a first approach, keep numerical values as they are. +# - as a first approach (we will see after how the normalisation of numerical +# values will affect our discussion), keep numerical values as they are. from sklearn.compose import make_column_transformer from sklearn.preprocessing import OneHotEncoder @@ -121,16 +145,7 @@ # Processing the dataset # ...................... # -# First of all we split the sample in a train and a test dataset. 
- -from sklearn.model_selection import train_test_split - -X_train, X_test, y_train, y_test = train_test_split( - X, y, random_state=42 -) - -############################################################################## -# Then we fit the model +# First we fit the model model.fit(X_train, y_train) @@ -274,7 +289,7 @@ ) plt.figure(figsize=(9, 7)) sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) -sns.boxplot(data=coefs, orient='h', color='blue') +sns.boxplot(data=coefs, orient='h', color='cyan') plt.axvline(x=0, color='.5') plt.title('Stability of coefficients') plt.subplots_adjust(left=.3) From d7392f8fbd69a6d11795c0f4fd53d7a7ed8ae791 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 9 Dec 2019 19:30:59 +0100 Subject: [PATCH 21/85] Fix lint error. --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 472819e02b55e..056a1ba87647c 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -77,7 +77,7 @@ # different variables. Only numerical variables will be used. train_dataset = X_train.copy() -train_dataset.insert(0,"WAGE",y_train) +train_dataset.insert(0, "WAGE", y_train) sns.pairplot(train_dataset, diag_kind='kde') plt.show() From 916736cbed180b309776ab28461e7207c128d047 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 9 Dec 2019 19:33:46 +0100 Subject: [PATCH 22/85] Fix trailing space. --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 056a1ba87647c..c238e0f78067e 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -61,7 +61,7 @@ survey.target.head() ############################################################################### -# We split the sample in a train and a test dataset, +# We split the sample in a train and a test dataset, # Only the train dataset will be used in the following exploratory analysis. # This is a way to emulate a real situation where predictions are performed on # an unknown target. From 2d2244e8376b8a19e44ccae58122a7e0476c8d8a Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 9 Dec 2019 21:15:53 +0100 Subject: [PATCH 23/85] Fix some comments. --- .../plot_linear_model_coefficient_interpretation.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index c238e0f78067e..f660e0cdacfb7 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -13,6 +13,8 @@ This example will provide some hints in interpreting coefficient in linear models, using data from the "Current Population Survey" from 1985. +We will be interested in the prediction of the wage as a function of various +features such as experience, age, or education. A description of the dataset follows. 
""" @@ -85,14 +87,13 @@ # Looking closely at the WAGE distribution it could be noticed that it has a # long tail and we could take its logarithm # to simplify our problem and approximate a normal distribution. -# For all 3 variables, EDUCATION, EXPERIENCE, and AGE, the WAGE is -# increasing when these variables are increasing. Also, the EXPERIENCE and -# AGE are correlated. +# The WAGE is increasing when EDUCATION is increasing. +# Also, the EXPERIENCE and AGE are linearly correlated. # # The pipeline # ............ # -# To design our machine-learning pipeline, we will manually +# To design our machine-learning pipeline, we manually # check the type of data that we are dealing with: survey.data.info() @@ -101,7 +102,8 @@ # As seen previously, the dataset contains columns with different data types # and we need to apply a specific preprocessing for each data types. # In particular categorical variables cannot be included in linear model if not -# coded as integers first. +# coded as integers first. In addition, to avoid categorical features to be +# treated as ordered values, we need to one-hot-encode them. # Our pre-processor will # # - one-hot encode (i.e., generate a column by category) the categorical From ef0593f60dfcc4b69882ea601a88f55fdcb97772 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 10 Dec 2019 16:52:14 +0100 Subject: [PATCH 24/85] Add r2 score and alpha discussion. Address some other comments. --- ...linear_model_coefficient_interpretation.py | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index f660e0cdacfb7..abb9aa2f6f926 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -128,7 +128,8 @@ ############################################################################## # To describe the dataset as a linear model we choose to use a ridge regressor -# and to model le log of the WAGE. +# and to model the logarithm of the WAGE. +# We sample the complexity parameter space between 1.e-10 and 1.e10. from sklearn.pipeline import make_pipeline from sklearn.linear_model import RidgeCV @@ -137,7 +138,7 @@ model = make_pipeline( preprocessor, TransformedTargetRegressor( - regressor=RidgeCV(), + regressor=RidgeCV(alphas=np.logspace(-10,10,21)), func=np.log10, inverse_func=sp.special.exp10 ) @@ -147,13 +148,17 @@ # Processing the dataset # ...................... # -# First we fit the model +# First, we fit the model and we verify which value for :math:`\alpha` has been +# selected. model.fit(X_train, y_train) +model[-1].regressor_.alpha_ ############################################################################## -# We can check the performance of the computed model using, for example, the -# median absolute error of the model. +# Once verified that the :math:`\alpha` parameter is not at the boundary of +# the sampled parameter space, we can check the performance of the computed +# model using, for example, the median absolute error of the model and the R +# squared coefficient. 
from sklearn.metrics import median_absolute_error @@ -162,9 +167,10 @@ string_score = 'MAE on training set: {0:.2f} $/hour'.format(mae) y_pred = model.predict(X_test) mae = median_absolute_error(y_test, y_pred) +r2score = model.score(X_test,y_test) string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) - +string_score += '\nR2 score: {0:.4f}'.format(r2score) fig, ax = plt.subplots(figsize=(6, 6)) sns.regplot(y_test, y_pred) @@ -176,10 +182,14 @@ plt.ylim([0, 27]) ############################################################################## -# The model learnt is far from being a good model making accurate predictions. +# The model learnt is far from being a good model making accurate predictions: +# the R squared score is very low. # As interpretation tools characterize model rather than the generative process # of the data itself, it needs to be emphasized that interpretations are # correct if the model is correct as well. +# In this case, we are more interested in providing a methodology than in +# having a good description of the data: a bad example illustrates the +# importance of cross checking the results. # # Interpreting coefficients # ......................... @@ -264,7 +274,7 @@ ) plt.figure(figsize=(9, 7)) sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) -sns.boxplot(data=coefs, orient='h', color='blue') +sns.boxplot(data=coefs, orient='h', color='cyan') plt.axvline(x=0, color='.5') plt.title('Stability of coefficients') plt.subplots_adjust(left=.3) @@ -274,7 +284,7 @@ # due to the collinearity between the 2 features. # # In order to verify our interpretation we remove one of the 2 features and -# check what is the impact on the features stability. +# check what is the impact on the model stability. column_to_drop = ['AGE'] From ec9b982ea74fec9e9970b38b2d9f28437f99eb7b Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 10 Dec 2019 16:57:17 +0100 Subject: [PATCH 25/85] Fix lint error. --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index abb9aa2f6f926..ff3d9b35c84a3 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -167,7 +167,7 @@ string_score = 'MAE on training set: {0:.2f} $/hour'.format(mae) y_pred = model.predict(X_test) mae = median_absolute_error(y_test, y_pred) -r2score = model.score(X_test,y_test) +r2score = model.score(X_test, y_test) string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) string_score += '\nR2 score: {0:.4f}'.format(r2score) From f33cfddfad9519416f9a2ecf1d02e6df1b12cc66 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 10 Dec 2019 17:01:42 +0100 Subject: [PATCH 26/85] Fix lint error.. again. 
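For readers unfamiliar with the TransformedTargetRegressor used in these patches: it roughly amounts to fitting the inner regressor on func(y) and applying inverse_func to its predictions. A hand-written equivalent, shown only for illustration (the variable names are hypothetical), would be:

# Rough manual equivalent of the TransformedTargetRegressor pipeline above:
# fit the ridge model on log10(y) and undo the transform at prediction time.
X_train_enc = preprocessor.fit_transform(X_train)
ridge = RidgeCV(alphas=np.logspace(-10, 10, 21)).fit(X_train_enc, np.log10(y_train))
y_pred_manual = 10 ** ridge.predict(preprocessor.transform(X_test))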
--- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index ff3d9b35c84a3..eb2e489f03c18 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -138,7 +138,7 @@ model = make_pipeline( preprocessor, TransformedTargetRegressor( - regressor=RidgeCV(alphas=np.logspace(-10,10,21)), + regressor=RidgeCV(alphas=np.logspace(-10, 10, 21)), func=np.log10, inverse_func=sp.special.exp10 ) From 9ad495219a87e582d1dcd60875ca51f471edce86 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 12 Dec 2019 12:22:49 +0100 Subject: [PATCH 27/85] Address all comments but big ones. --- ...linear_model_coefficient_interpretation.py | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index eb2e489f03c18..034d955c1a5cf 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -228,7 +228,9 @@ ############################################################################### # We should then normalize the coefficients by the standard deviation and we -# will be able to compare them. +# will be able to compare them and helps interpretation: the greater the +# variance of a feature, the large the impact of the corresponding coefficent +# on the output. coefs = pd.DataFrame( model.named_steps['transformedtargetregressor'].regressor_.coef_ * @@ -253,11 +255,12 @@ # other features constant as well, a younger person would have a higher # wage. # -# Checking the coefficient stability -# .................................. +# Checking the coefficient variability +# .................................... # -# The stability of the coefficients is a guarantee of the robustness of the -# model. We can check the coefficient stability through cross-validation. +# We can check the coefficient variability through cross-validation. +# If coefficients vary in a significant way changing the input dataset +# the robustness of the model is not guaranteed. from sklearn.model_selection import cross_validate from sklearn.model_selection import RepeatedKFold @@ -276,12 +279,12 @@ sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) sns.boxplot(data=coefs, orient='h', color='cyan') plt.axvline(x=0, color='.5') -plt.title('Stability of coefficients') +plt.title('Coefficient variability') plt.subplots_adjust(left=.3) ############################################################################### -# The AGE and EXPERIENCE coefficients are highly unstable which might be -# due to the collinearity between the 2 features. +# The AGE and EXPERIENCE coefficients are affected by strong variability which +# might be due to the collinearity between the 2 features. # # In order to verify our interpretation we remove one of the 2 features and # check what is the impact on the model stability. 
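The collinearity suspected here can also be checked directly on the training data; the following line is only an illustrative aside, not part of the patch.

# Pearson correlation between the two features suspected to be collinear.
print(X_train[['AGE', 'EXPERIENCE']].corr())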
@@ -303,9 +306,9 @@ sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) sns.boxplot(data=coefs, orient='h', color='cyan') plt.axvline(x=0, color='.5') -plt.title('Stability of coefficients') +plt.title('Coefficient variability') plt.subplots_adjust(left=.3) ############################################################################### -# The estimation of the EXPERIENCE coefficient is now more stable and +# The estimation of the EXPERIENCE coefficient is now less variable and # remain important for all predictors trained during cross-validation. From b01892e44cb914c7e92f71bf873497fece9c6119 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 12 Dec 2019 12:26:57 +0100 Subject: [PATCH 28/85] Fix lint issue. :( --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 034d955c1a5cf..b5234d76c20ce 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -260,7 +260,7 @@ # # We can check the coefficient variability through cross-validation. # If coefficients vary in a significant way changing the input dataset -# the robustness of the model is not guaranteed. +# the robustness of the model is not guaranteed. from sklearn.model_selection import cross_validate from sklearn.model_selection import RepeatedKFold From 3bfeda29f721c4eaa10e2fd1962026cdfb93b7d5 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 12 Dec 2019 15:23:51 +0100 Subject: [PATCH 29/85] Fix some new comments and start discussion about dataset normalisation. --- ...linear_model_coefficient_interpretation.py | 93 ++++++++++++++++++- 1 file changed, 90 insertions(+), 3 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index b5234d76c20ce..d49430821fdae 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -81,7 +81,6 @@ train_dataset = X_train.copy() train_dataset.insert(0, "WAGE", y_train) sns.pairplot(train_dataset, diag_kind='kde') -plt.show() ############################################################################## # Looking closely at the WAGE distribution it could be noticed that it has a @@ -90,6 +89,8 @@ # The WAGE is increasing when EDUCATION is increasing. # Also, the EXPERIENCE and AGE are linearly correlated. # +# .. _the-pipeline: +# # The pipeline # ............ # @@ -228,8 +229,8 @@ ############################################################################### # We should then normalize the coefficients by the standard deviation and we -# will be able to compare them and helps interpretation: the greater the -# variance of a feature, the large the impact of the corresponding coefficent +# will be able to compare them helping interpretation: the greater the +# variance of a feature, the larger the impact of the corresponding coefficent # on the output. coefs = pd.DataFrame( @@ -312,3 +313,89 @@ ############################################################################### # The estimation of the EXPERIENCE coefficient is now less variable and # remain important for all predictors trained during cross-validation. 
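For context on "all predictors trained during cross-validation" (an aside, not part of the patch): the RepeatedKFold scheme used in the example fits n_splits * n_repeats separate models, so each coefficient gets that many estimates.

# The repeated 5-fold scheme yields 5 * 5 = 25 fitted models, hence 25
# estimates per coefficient.
cv = RepeatedKFold(n_splits=5, n_repeats=5)
print(cv.get_n_splits(X))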
+# +# Preprocessing numerical variables +# ................................. +# +# As said above (see :ref:`the-pipeline`), we could also choose to scale +# numerical values before training the model. +# The preprocessor is redefined in order to subtract the mean and scale +# variables to unit variance. + +from sklearn.preprocessing import StandardScaler + +preprocessor = make_column_transformer( + (OneHotEncoder(), categorical_columns), + (OrdinalEncoder(), binary_columns), + (StandardScaler(), numerical_columns), + remainder='passthrough' +) + +############################################################################### +# The model will stay unchanged. + +model = make_pipeline( + preprocessor, + TransformedTargetRegressor( + regressor=RidgeCV(alphas=np.logspace(-10, 10, 21)), + func=np.log10, + inverse_func=sp.special.exp10 + ) +) + +model.fit(X_train, y_train) +model[-1].regressor_.alpha_ + +############################################################################## +# Again, we check the performance of the computed +# model using, for example, the median absolute error of the model and the R +# squared coefficient. + +y_pred = model.predict(X_train) +mae = median_absolute_error(y_train, y_pred) +string_score = 'MAE on training set: {0:.2f} $/hour'.format(mae) +y_pred = model.predict(X_test) +mae = median_absolute_error(y_test, y_pred) +r2score = model.score(X_test, y_test) + +string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) +string_score += '\nR2 score: {0:.4f}'.format(r2score) +fig, ax = plt.subplots(figsize=(6, 6)) +sns.regplot(y_test, y_pred) + +plt.text(3, 20, string_score) + +plt.ylabel('Model predictions') +plt.xlabel('Truths') +plt.xlim([0, 27]) +plt.ylim([0, 27]) + +############################################################################## +# Coefficients do not need to be rescaled this time + +coefs = pd.DataFrame( + model.named_steps['transformedtargetregressor'].regressor_.coef_, + columns=['Coefficients'], index=feature_names +) +coefs.plot(kind='barh', figsize=(9, 7)) +plt.axvline(x=0, color='.5') +plt.subplots_adjust(left=.3) + +############################################################################## +# Cross validation for coefficients + +cv_model = cross_validate( + model, X, y, cv=RepeatedKFold(n_splits=5, n_repeats=5), + return_estimator=True, n_jobs=-1 +) +coefs = pd.DataFrame( + [est.named_steps['transformedtargetregressor'].regressor_.coef_ + for est in cv_model['estimator']], + columns=feature_names +) +plt.figure(figsize=(9, 7)) +sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) +sns.boxplot(data=coefs, orient='h', color='cyan') +plt.axvline(x=0, color='.5') +plt.title('Coefficient variability') +plt.subplots_adjust(left=.3) From b539dbb28d07290699eb2acf8a7f875cdd007fa8 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 13 Dec 2019 11:29:47 +0100 Subject: [PATCH 30/85] Add comparison between scaled and unscaled data analysis. Address some comments. 
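Relating to the StandardScaler preprocessing introduced just above (an illustrative aside, not part of the patch): one quick way to see why the coefficients no longer need rescaling is to confirm that the transformed numerical columns now have a standard deviation close to one. This assumes `model` is the pipeline refitted with the scaling preprocessor, as in the example.

# After scaling, the numerical columns have (approximately) unit standard
# deviation, so the fitted coefficients can be compared directly.
X_train_scaled = pd.DataFrame(
    model.named_steps['columntransformer'].transform(X_train),
    columns=feature_names
)
print(X_train_scaled[numerical_columns].std(axis=0))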
--- ...linear_model_coefficient_interpretation.py | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index d49430821fdae..45f5baff0c910 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -223,7 +223,7 @@ model.named_steps['columntransformer'].transform(X_train), columns=feature_names ) -X_train_preprocessed.std().plot(kind='barh', figsize=(9, 7)) +X_train_preprocessed.std(axis=0).plot(kind='barh', figsize=(9, 7)) plt.title('Features std. dev.') plt.subplots_adjust(left=.3) @@ -235,7 +235,7 @@ coefs = pd.DataFrame( model.named_steps['transformedtargetregressor'].regressor_.coef_ * - X_train_preprocessed.std(), + X_train_preprocessed.std(axis=0), columns=['Coefficients'], index=feature_names ) coefs.plot(kind='barh', figsize=(9, 7)) @@ -272,13 +272,13 @@ ) coefs = pd.DataFrame( [est.named_steps['transformedtargetregressor'].regressor_.coef_ * - X_train_preprocessed.std() + X_train_preprocessed.std(axis=0) for est in cv_model['estimator']], columns=feature_names ) plt.figure(figsize=(9, 7)) sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) -sns.boxplot(data=coefs, orient='h', color='cyan') +sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5) plt.axvline(x=0, color='.5') plt.title('Coefficient variability') plt.subplots_adjust(left=.3) @@ -299,13 +299,13 @@ ) coefs = pd.DataFrame( [est.named_steps['transformedtargetregressor'].regressor_.coef_ * - X_train_preprocessed.drop(columns=column_to_drop).std() + X_train_preprocessed.drop(columns=column_to_drop).std(axis=0) for est in cv_model['estimator']], columns=feature_names[:-1] ) plt.figure(figsize=(9, 7)) sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) -sns.boxplot(data=coefs, orient='h', color='cyan') +sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5) plt.axvline(x=0, color='.5') plt.title('Coefficient variability') plt.subplots_adjust(left=.3) @@ -317,7 +317,7 @@ # Preprocessing numerical variables # ................................. # -# As said above (see :ref:`the-pipeline`), we could also choose to scale +# As said above (see ":ref:`the-pipeline`"), we could also choose to scale # numerical values before training the model. # The preprocessor is redefined in order to subtract the mean and scale # variables to unit variance. @@ -371,7 +371,8 @@ plt.ylim([0, 27]) ############################################################################## -# Coefficients do not need to be rescaled this time +# The R squared coefficient is not better than for the non- normalized case. +# For the coefficient analysis scaling is not needed this time. coefs = pd.DataFrame( model.named_steps['transformedtargetregressor'].regressor_.coef_, @@ -382,7 +383,7 @@ plt.subplots_adjust(left=.3) ############################################################################## -# Cross validation for coefficients +# We cross validate the coefficients. 
cv_model = cross_validate( model, X, y, cv=RepeatedKFold(n_splits=5, n_repeats=5), @@ -395,7 +396,13 @@ ) plt.figure(figsize=(9, 7)) sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) -sns.boxplot(data=coefs, orient='h', color='cyan') +sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5) plt.axvline(x=0, color='.5') plt.title('Coefficient variability') plt.subplots_adjust(left=.3) + +############################################################################## +# The result is significantly different. +# AGE and EXPERIENCE coefficients are less variable than other coefficients. +# They are also smaller, meaning that the model is less influenced by those two +# variables than by other features. From 730547ac2546725f18929ced848a9d3a1b1dc55f Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 16 Dec 2019 11:47:21 +0100 Subject: [PATCH 31/85] More discussion about feature normalization. --- ...linear_model_coefficient_interpretation.py | 47 ++++++++++++++----- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 45f5baff0c910..7f3666233db39 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -149,7 +149,7 @@ # Processing the dataset # ...................... # -# First, we fit the model and we verify which value for :math:`\alpha` has been +# First, we fit the model and we verify which value of :math:`\alpha` has been # selected. model.fit(X_train, y_train) @@ -189,7 +189,7 @@ # of the data itself, it needs to be emphasized that interpretations are # correct if the model is correct as well. # In this case, we are more interested in providing a methodology than in -# having a good description of the data: a bad example illustrates the +# having a good description of the data: a bad example could illustrate the # importance of cross checking the results. # # Interpreting coefficients @@ -228,10 +228,10 @@ plt.subplots_adjust(left=.3) ############################################################################### -# We should then normalize the coefficients by the standard deviation and we -# will be able to compare them helping interpretation: the greater the +# We should then scale the coefficients by the standard deviation and we +# will be able to better compare them: indeed, the greater the # variance of a feature, the larger the impact of the corresponding coefficent -# on the output. +# on the output, all else being equal. coefs = pd.DataFrame( model.named_steps['transformedtargetregressor'].regressor_.coef_ * @@ -286,9 +286,21 @@ ############################################################################### # The AGE and EXPERIENCE coefficients are affected by strong variability which # might be due to the collinearity between the 2 features. +# To verify this interpretation we plot the variability of the AGE and +# EXPERIENCE coefficient: + +plt.ylabel('Age coefficient') +plt.xlabel('Experience coefficient') +plt.grid(True) +plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) + +############################################################################### +# Two regions are populated: when the EXPERIENCE coefficient is +# positive the AGE one is in general negative and viceversa, except for a small +# number of positive points around zero. 
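The visual impression described above can be quantified; this is an illustrative aside (not part of the patch) and assumes `coefs` holds the per-fold coefficients plotted here.

# Correlation between the AGE and EXPERIENCE estimates across folds; a negative
# value is consistent with the two coefficients trading off against each other.
print(coefs[['AGE', 'EXPERIENCE']].corr())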
# -# In order to verify our interpretation we remove one of the 2 features and -# check what is the impact on the model stability. +# To go further we remove one of the 2 features and check what is the impact +# on the model stability. column_to_drop = ['AGE'] @@ -319,6 +331,8 @@ # # As said above (see ":ref:`the-pipeline`"), we could also choose to scale # numerical values before training the model. +# In the following we will check how this approach will modify the analysis on +# coefficient variability and interpretation. # The preprocessor is redefined in order to subtract the mean and scale # variables to unit variance. @@ -371,8 +385,8 @@ plt.ylim([0, 27]) ############################################################################## -# The R squared coefficient is not better than for the non- normalized case. -# For the coefficient analysis scaling is not needed this time. +# The R squared coefficient is not better than for the non-normalized case. +# For the coefficient analysis, scaling is not needed this time. coefs = pd.DataFrame( model.named_steps['transformedtargetregressor'].regressor_.coef_, @@ -403,6 +417,15 @@ ############################################################################## # The result is significantly different. -# AGE and EXPERIENCE coefficients are less variable than other coefficients. -# They are also smaller, meaning that the model is less influenced by those two -# variables than by other features. +# AGE and EXPERIENCE coefficients are less variable than other coefficients, +# they are both positive. + +plt.ylabel('Age coefficient') +plt.xlabel('Experience coefficient') +plt.grid(True) +plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) + +############################################################################## +# Even if the model is still not able to provide a good description of the +# dataset, the normalization of numerical features clearly provides more +# reliable results for the coefficients. From 2d82432926555490a21a92331614a1e8b8b49344 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 16 Dec 2019 13:24:59 +0100 Subject: [PATCH 32/85] Cleaning marginal and conditional concepts. --- ...linear_model_coefficient_interpretation.py | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 7f3666233db39..2063c11fba5c5 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -63,7 +63,7 @@ survey.target.head() ############################################################################### -# We split the sample in a train and a test dataset, +# We split the sample in a train and a test dataset. # Only the train dataset will be used in the following exploratory analysis. # This is a way to emulate a real situation where predictions are performed on # an unknown target. @@ -75,8 +75,9 @@ ) ############################################################################## -# First, let's get some insights by looking at the marginal links between the -# different variables. Only numerical variables will be used. +# First, let's get some insights by looking at the variable distributions and +# at the pairwise relationships between them. Only numerical +# variables will be used. 
train_dataset = X_train.copy() train_dataset.insert(0, "WAGE", y_train) @@ -244,17 +245,9 @@ ############################################################################### # The plot above tells us that an increase of the AGE will induce a decrease -# of the WAGE when all other features remain constant. Also an increase of -# the EXPERIENCE will induce an increase of the WAGE when all other -# features remain constant. -# -# The first interpretation might look counter-intuitive at first, if one -# relates the relationship between AGE and WAGE as a marginal link. -# However, as previously mentioned, a linear model computes a conditional -# link between AGE and WAGE given all other features. -# Therefore, one should also interpret that for a given experience and all -# other features constant as well, a younger person would have a higher -# wage. +# of the WAGE when all other features remain constant, for instance at a +# constant EXPERIENCE. An increase of the EXPERIENCE will induce an increase +# of the WAGE when all other features remain constant. # # Checking the coefficient variability # .................................... From 4b02e1fca8f1389a11798fe8163ace2a7656d9c4 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 20 Dec 2019 16:45:27 +0100 Subject: [PATCH 33/85] Rewrite introduction. --- ...t_linear_model_coefficient_interpretation.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 2063c11fba5c5..6a77e6b772698 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -7,14 +7,19 @@ a linear combination of the features (see the :ref:`linear_model` User Guide section for a description of a set of linear model methods available in scikit-learn). -It is important to emphasize that coefficients in multiple linear models -represent the relationship between the given feature and the target -assuming that other features remain constant. +Coefficients in multiple linear models represent the relationship between the +given feature (`X[i]`) and the target (`y`) assuming that all the other +features remain constant. +This is not the same thing than plotting `X[i]` versus `y` and fitting a linear +relationship: in that case all possible values of the other features are +added to the estimation. This example will provide some hints in interpreting coefficient in linear -models, using data from the "Current Population Survey" from 1985. -We will be interested in the prediction of the wage as a function of various -features such as experience, age, or education. +models, pointing at problems that arise when either the linear model is not +appropriate to describe the dataset, or features are correlated. + +We will use data from the "Current Population Survey" from 1985 to predict +wage as a function of various features such as experience, age, or education. A description of the dataset follows. """ From c722102b838d2ada5a6828acc088b07bc08f9e3a Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 20 Dec 2019 16:52:04 +0100 Subject: [PATCH 34/85] Minor rephrasing. 
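To make the marginal-versus-conditional distinction of the rewritten introduction concrete, here is a small illustrative sketch (not part of the patches, variable name hypothetical): regressing WAGE on AGE alone estimates the marginal association, whose sign can be contrasted with the conditional AGE coefficient of the full model discussed in the example.

# Marginal association: a univariate fit of WAGE on AGE only.
from sklearn.linear_model import LinearRegression

marginal_age = LinearRegression().fit(X_train[['AGE']], y_train)
print('Marginal AGE slope: {:.3f} $/hour per year'.format(marginal_age.coef_[0]))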
--- .../inspection/plot_linear_model_coefficient_interpretation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 6a77e6b772698..fdcde53ac9763 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -220,7 +220,8 @@ ############################################################################### # Soon we realize that we cannot compare different coefficients since we did -# not scale the data before the fit, features having different value ranges. +# not scale the data before the fit, features having different value ranges +# because of their different unit of measure. # For instance, the AGE coefficient is expressed in $/hours/leaving years # while the EDUCATION is expressed in $/hours/years of education. # This is evident if we compare feature standard deviations. From feaedd235344187bdc0bf7ffdeff38bb3e35025d Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 20 Dec 2019 16:55:31 +0100 Subject: [PATCH 35/85] Fix white spaces. --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index fdcde53ac9763..eb8ec3827fde1 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -16,7 +16,7 @@ This example will provide some hints in interpreting coefficient in linear models, pointing at problems that arise when either the linear model is not -appropriate to describe the dataset, or features are correlated. +appropriate to describe the dataset, or features are correlated. We will use data from the "Current Population Survey" from 1985 to predict wage as a function of various features such as experience, age, or education. From f65670cf321dbca13473922f6d234bef964bb5c9 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 9 Jan 2020 15:01:35 +0100 Subject: [PATCH 36/85] Explain coefficient scaling by multiplication. --- .../plot_linear_model_coefficient_interpretation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index eb8ec3827fde1..ca1964470af15 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -235,10 +235,10 @@ plt.subplots_adjust(left=.3) ############################################################################### -# We should then scale the coefficients by the standard deviation and we -# will be able to better compare them: indeed, the greater the -# variance of a feature, the larger the impact of the corresponding coefficent -# on the output, all else being equal. +# We should then multiply the coefficients by the standard deviation and we +# will be able to better compare them: in that way, we emphasize that the +# greater the variance of a feature, the larger the impact of the corresponding +# coefficent on the output, all else being equal. 
coefs = pd.DataFrame( model.named_steps['transformedtargetregressor'].regressor_.coef_ * From a767eb3b670c6d7decd62b56c2a1ed22f5b79874 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 9 Jan 2020 16:15:45 +0100 Subject: [PATCH 37/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Gael Varoquaux --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index ca1964470af15..8438a62d27273 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -136,7 +136,7 @@ ############################################################################## # To describe the dataset as a linear model we choose to use a ridge regressor # and to model the logarithm of the WAGE. -# We sample the complexity parameter space between 1.e-10 and 1.e10. +# We sample the regularization parameter space between 1.e-10 and 1.e10. from sklearn.pipeline import make_pipeline from sklearn.linear_model import RidgeCV From 4477ae038b07a00e41f246fe65b7fab5ce9b8665 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 9 Jan 2020 16:17:09 +0100 Subject: [PATCH 38/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Gael Varoquaux --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 8438a62d27273..858bf5e8d3e95 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -219,7 +219,7 @@ plt.subplots_adjust(left=.3) ############################################################################### -# Soon we realize that we cannot compare different coefficients since we did +# Soon we realize that we cannot compare different coefficients since the # not scale the data before the fit, features having different value ranges # because of their different unit of measure. # For instance, the AGE coefficient is expressed in $/hours/leaving years From d5b095394e63ba1e1e9130424103ffd22fa32210 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 9 Jan 2020 16:17:35 +0100 Subject: [PATCH 39/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Gael Varoquaux --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 858bf5e8d3e95..78f16789ba62b 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -220,7 +220,7 @@ ############################################################################### # Soon we realize that we cannot compare different coefficients since the -# not scale the data before the fit, features having different value ranges +# features have different natural scales and hence value ranges # because of their different unit of measure. 
# For instance, the AGE coefficient is expressed in $/hours/leaving years # while the EDUCATION is expressed in $/hours/years of education. From 8da36af5dbb5a15a863cc18c2054e9d17cb2648f Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 9 Jan 2020 16:18:22 +0100 Subject: [PATCH 40/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Gael Varoquaux --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 78f16789ba62b..989225ade2cc4 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -222,7 +222,7 @@ # Soon we realize that we cannot compare different coefficients since the # features have different natural scales and hence value ranges # because of their different unit of measure. -# For instance, the AGE coefficient is expressed in $/hours/leaving years +# For instance, the AGE coefficient is expressed in $/hours/living years # while the EDUCATION is expressed in $/hours/years of education. # This is evident if we compare feature standard deviations. From abe46af0ff8d36fd8ee42cfbb20c78ef132d255f Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 14 Jan 2020 15:36:51 +0100 Subject: [PATCH 41/85] Address some comments. --- ...ot_linear_model_coefficient_interpretation.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 989225ade2cc4..2677e624b7b4a 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -238,12 +238,12 @@ # We should then multiply the coefficients by the standard deviation and we # will be able to better compare them: in that way, we emphasize that the # greater the variance of a feature, the larger the impact of the corresponding -# coefficent on the output, all else being equal. +# coefficient on the output, all else being equal. coefs = pd.DataFrame( model.named_steps['transformedtargetregressor'].regressor_.coef_ * X_train_preprocessed.std(axis=0), - columns=['Coefficients'], index=feature_names + columns=['Coefficient importance'], index=feature_names ) coefs.plot(kind='barh', figsize=(9, 7)) plt.axvline(x=0, color='.5') @@ -251,12 +251,12 @@ ############################################################################### # The plot above tells us that an increase of the AGE will induce a decrease -# of the WAGE when all other features remain constant, for instance at a -# constant EXPERIENCE. An increase of the EXPERIENCE will induce an increase -# of the WAGE when all other features remain constant. +# of the WAGE when all other features remain constant. On the contrary, an +# increase of the EXPERIENCE will induce an increase of the WAGE when all +# other features remain constant. # -# Checking the coefficient variability -# .................................... +# Checking the variability of the coefficients +# ............................................ # # We can check the coefficient variability through cross-validation. 
# If coefficients vary in a significant way changing the input dataset @@ -292,6 +292,7 @@ plt.xlabel('Experience coefficient') plt.grid(True) plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) +plt.title('Variations of coefficients for AGE and EXPERIENCE across folds') ############################################################################### # Two regions are populated: when the EXPERIENCE coefficient is @@ -423,6 +424,7 @@ plt.xlabel('Experience coefficient') plt.grid(True) plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) +plt.title('Variations of coefficients for AGE and EXPERIENCE across folds') ############################################################################## # Even if the model is still not able to provide a good description of the From b7587ba01bdabf4ce7ecca16c1dc89a4faeb4fa9 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 14 Jan 2020 17:11:28 +0100 Subject: [PATCH 42/85] Add toc. Change section levels. Add definition of marginal dependence. --- ...linear_model_coefficient_interpretation.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 2677e624b7b4a..2d494c6f76f81 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -3,6 +3,10 @@ Interpretation of coefficients in linear models =============================================== +.. contents:: + :local: + :depth: 1 + Linear models describe situations in which the target value is expected to be a linear combination of the features (see the :ref:`linear_model` User Guide section for a description of a set of linear model methods available in @@ -33,11 +37,8 @@ import seaborn as sns ############################################################################# -# Determinants of Wages from the 1985 Current Population Survey -# ------------------------------------------------------------- -# # The dataset -# ........... +# ----------- # # We fetch the data from `OpenML `_. # Note that setting the parameter `as_frame` to True will retrieve the data @@ -93,12 +94,15 @@ # long tail and we could take its logarithm # to simplify our problem and approximate a normal distribution. # The WAGE is increasing when EDUCATION is increasing. +# It should be noted that the dependence between WAGE and EDUCATION +# represented here is a marginal dependence, i.e., it describe the behavior +# of a specific variable without fixing the others. # Also, the EXPERIENCE and AGE are linearly correlated. # # .. _the-pipeline: # # The pipeline -# ............ +# ------------ # # To design our machine-learning pipeline, we manually # check the type of data that we are dealing with: @@ -153,7 +157,7 @@ ############################################################################## # Processing the dataset -# ...................... +# ---------------------- # # First, we fit the model and we verify which value of :math:`\alpha` has been # selected. @@ -199,7 +203,7 @@ # importance of cross checking the results. # # Interpreting coefficients -# ......................... +# ------------------------- # # First of all, we can plot the values of the coefficients of the regressor we # have fitted. @@ -256,7 +260,7 @@ # other features remain constant. # # Checking the variability of the coefficients -# ............................................ 
+# --------------------------------------------
 #
 # We can check the coefficient variability through cross-validation.
 # If coefficients vary in a significant way changing the input dataset
@@ -327,7 +331,7 @@
 # remain important for all predictors trained during cross-validation.
 #
 # Preprocessing numerical variables
-# .................................
+# ---------------------------------
 #
 # As said above (see ":ref:`the-pipeline`"), we could also choose to scale
 # numerical values before training the model.

From e0c121e269a167d831a6f19c803a904ce6b76deb Mon Sep 17 00:00:00 2001
From: Chiara Marmo
Date: Wed, 15 Jan 2020 14:05:14 +0100
Subject: [PATCH 43/85] Add definition of conditional dependence. More details about scaling coefficient.

---
 ...linear_model_coefficient_interpretation.py | 33 +++++++++++--------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py
index 2d494c6f76f81..303c9059965dc 100644
--- a/examples/inspection/plot_linear_model_coefficient_interpretation.py
+++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py
@@ -13,10 +13,11 @@
 scikit-learn).
 Coefficients in multiple linear models represent the relationship between the
 given feature (`X[i]`) and the target (`y`) assuming that all the other
-features remain constant.
+features remain constant (`conditional dependence
+`_).
 This is not the same thing than plotting `X[i]` versus `y` and fitting a linear
 relationship: in that case all possible values of the other features are
-added to the estimation.
+added to the estimation (marginal dependence).
 This example will provide some hints in interpreting coefficient in linear
@@ -195,12 +196,11 @@
 ##############################################################################
 # The model learnt is far from being a good model making accurate predictions:
 # the R squared score is very low.
-# As interpretation tools characterize model rather than the generative process
-# of the data itself, it needs to be emphasized that interpretations are
-# correct if the model is correct as well.
-# In this case, we are more interested in providing a methodology than in
-# having a good description of the data: a bad example could illustrate the
-# importance of cross checking the results.
+# In the following section, we will interpret the coefficients of the model.
+# While we do so, we should keep in mind that any conclusion we may draw will
+# be about
+# the model that we build, rather than about the true (real-world) generative
+# process of the data.
 #
 # Interpreting coefficients
 # -------------------------
@@ -227,7 +227,10 @@
 # features have different natural scales and hence value ranges
 # because of their different unit of measure.
 # For instance, the AGE coefficient is expressed in $/hours/living years
-# while the EDUCATION is expressed in $/hours/years of education.
+# while the EDUCATION one is expressed in $/hours/years of education.
+# Looking at the coefficient plot to extrapolate feature importance could be
+# misleading as some of them vary on a small scale (as UNION or SEX that are
+# either 0 or 1), while a feature like AGE varies a lot more, over several decades.
 # This is evident if we compare feature standard deviations.
X_train_preprocessed = pd.DataFrame( @@ -239,9 +242,11 @@ plt.subplots_adjust(left=.3) ############################################################################### -# We should then multiply the coefficients by the standard deviation and we -# will be able to better compare them: in that way, we emphasize that the -# greater the variance of a feature, the larger the impact of the corresponding +# For the reasons explained above, multiplying the coefficients by the +# standard deviation of the related feature would improve our understanding on +# feature importance on the model. +# In that way, we emphasize that the +# greater the variance of a feature, the larger the weight of the corresponding # coefficient on the output, all else being equal. coefs = pd.DataFrame( @@ -254,7 +259,9 @@ plt.subplots_adjust(left=.3) ############################################################################### -# The plot above tells us that an increase of the AGE will induce a decrease +# The plot above tells us about dependencies between a specific feature and +# the target when all other features remain constant, i.e., conditional +# dependencies. An increase of the AGE will induce a decrease # of the WAGE when all other features remain constant. On the contrary, an # increase of the EXPERIENCE will induce an increase of the WAGE when all # other features remain constant. From fff0f5df4fdb156492b43bffd102269a1b6936cf Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 15 Jan 2020 14:08:30 +0100 Subject: [PATCH 44/85] Fix lint error. --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 303c9059965dc..261acaf3e9626 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -244,7 +244,7 @@ ############################################################################### # For the reasons explained above, multiplying the coefficients by the # standard deviation of the related feature would improve our understanding on -# feature importance on the model. +# feature importance on the model. # In that way, we emphasize that the # greater the variance of a feature, the larger the weight of the corresponding # coefficient on the output, all else being equal. From 7351fe71d2c3b3c9924241387f5b9afcee071d87 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 31 Jan 2020 14:44:33 +0100 Subject: [PATCH 45/85] Add regularization discussion. --- ...linear_model_coefficient_interpretation.py | 106 +++++++++++++----- 1 file changed, 77 insertions(+), 29 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 261acaf3e9626..54bbb97ed5054 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -1,7 +1,7 @@ """ -=============================================== -Interpretation of coefficients in linear models -=============================================== +================================================================== +Common pitfalls in interpretation of coefficients of linear models +================================================================== .. 
contents:: :local: @@ -140,17 +140,17 @@ ############################################################################## # To describe the dataset as a linear model we choose to use a ridge regressor -# and to model the logarithm of the WAGE. -# We sample the regularization parameter space between 1.e-10 and 1.e10. +# with a very small regularization and to model the logarithm of the WAGE. + from sklearn.pipeline import make_pipeline -from sklearn.linear_model import RidgeCV +from sklearn.linear_model import Ridge from sklearn.compose import TransformedTargetRegressor model = make_pipeline( preprocessor, TransformedTargetRegressor( - regressor=RidgeCV(alphas=np.logspace(-10, 10, 21)), + regressor=Ridge(alpha=1e-10), func=np.log10, inverse_func=sp.special.exp10 ) @@ -160,15 +160,12 @@ # Processing the dataset # ---------------------- # -# First, we fit the model and we verify which value of :math:`\alpha` has been -# selected. +# First, we fit the model. model.fit(X_train, y_train) -model[-1].regressor_.alpha_ ############################################################################## -# Once verified that the :math:`\alpha` parameter is not at the boundary of -# the sampled parameter space, we can check the performance of the computed +# Then we check the performance of the computed # model using, for example, the median absolute error of the model and the R # squared coefficient. @@ -290,7 +287,7 @@ sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5) plt.axvline(x=0, color='.5') -plt.title('Coefficient variability') +plt.title('Coefficient importance variability') plt.subplots_adjust(left=.3) ############################################################################### @@ -307,8 +304,7 @@ ############################################################################### # Two regions are populated: when the EXPERIENCE coefficient is -# positive the AGE one is in general negative and viceversa, except for a small -# number of positive points around zero. +# positive the AGE one is negative and viceversa. # # To go further we remove one of the 2 features and check what is the impact # on the model stability. @@ -342,8 +338,6 @@ # # As said above (see ":ref:`the-pipeline`"), we could also choose to scale # numerical values before training the model. -# In the following we will check how this approach will modify the analysis on -# coefficient variability and interpretation. # The preprocessor is redefined in order to subtract the mean and scale # variables to unit variance. @@ -362,14 +356,13 @@ model = make_pipeline( preprocessor, TransformedTargetRegressor( - regressor=RidgeCV(alphas=np.logspace(-10, 10, 21)), + regressor=Ridge(alpha=1e-10), func=np.log10, inverse_func=sp.special.exp10 ) ) model.fit(X_train, y_train) -model[-1].regressor_.alpha_ ############################################################################## # Again, we check the performance of the computed @@ -427,17 +420,72 @@ plt.subplots_adjust(left=.3) ############################################################################## -# The result is significantly different. -# AGE and EXPERIENCE coefficients are less variable than other coefficients, -# they are both positive. +# The result is quite similar to the non-normalised case. +# +# Linear models with regularization +# --------------------------------- +# +# In practice, Ridge Regression is more often used with some regularization. 
+# Regularization improves the conditioning of the problem and reduces the +# variance of the estimates. RidgeCV applies cross validation in order to +# determine which value of the regularization parameter (`alpha`) is best +# suited for the model estimation. -plt.ylabel('Age coefficient') -plt.xlabel('Experience coefficient') -plt.grid(True) -plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) -plt.title('Variations of coefficients for AGE and EXPERIENCE across folds') +from sklearn.linear_model import RidgeCV +from sklearn.compose import TransformedTargetRegressor + +model = make_pipeline( + preprocessor, + TransformedTargetRegressor( + regressor=RidgeCV(alphas=np.logspace(-10, 10, 21)), + func=np.log10, + inverse_func=sp.special.exp10 + ) +) + +model.fit(X_train, y_train) + +############################################################################## +# First we verify which value of :math:`\alpha` has been selected. + +model[-1].regressor_.alpha_ + +############################################################################## +# Then we check the quality of the predictions. + +y_pred = model.predict(X_train) +mae = median_absolute_error(y_train, y_pred) +string_score = 'MAE on training set: {0:.2f} $/hour'.format(mae) +y_pred = model.predict(X_test) +mae = median_absolute_error(y_test, y_pred) +r2score = model.score(X_test, y_test) + +string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) +string_score += '\nR2 score: {0:.4f}'.format(r2score) +fig, ax = plt.subplots(figsize=(6, 6)) +sns.regplot(y_test, y_pred) + +plt.text(3, 20, string_score) + +plt.ylabel('Model predictions') +plt.xlabel('Truths') +plt.xlim([0, 27]) +plt.ylim([0, 27]) + +############################################################################## +# The R squared coefficient is similar to the non-regularized case. + +coefs = pd.DataFrame( + model.named_steps['transformedtargetregressor'].regressor_.coef_, + columns=['Coefficients'], index=feature_names +) +coefs.plot(kind='barh', figsize=(9, 7)) +plt.axvline(x=0, color='.5') +plt.subplots_adjust(left=.3) ############################################################################## +# Coefficients are significantly different. +# AGE and EXPERIENCE coefficients are both positive. # Even if the model is still not able to provide a good description of the -# dataset, the normalization of numerical features clearly provides more -# reliable results for the coefficients. +# dataset, the regularization manages to lower the influence of correlated +# variables on the model. 
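A minimal, self-contained sketch of the effect described above, using synthetic, hypothetical data: with two nearly collinear columns, a negligible `alpha` lets the split of the weight between them drift, while a larger `alpha` shares the weight more evenly between the two.

import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
x = rng.normal(size=1000)
# two almost identical (collinear) features
X = np.c_[x, x + rng.normal(scale=0.01, size=1000)]
y = x + rng.normal(scale=0.1, size=1000)

for alpha in [1e-10, 1e-2, 1e2]:
    # a tiny alpha makes the split between the two columns unstable;
    # increasing alpha pulls both coefficients toward similar values
    print(alpha, Ridge(alpha=alpha).fit(X, y).coef_)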
From 53a9b5b69d80231ed2525d5a89338a3d5528dc1a Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 3 Feb 2020 17:05:27 +0100 Subject: [PATCH 46/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Gael Varoquaux --- .../inspection/plot_linear_model_coefficient_interpretation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 54bbb97ed5054..5a191db47f4ef 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -66,6 +66,7 @@ X.head() ############################################################################## +# Our target for prediction: the wage y = survey.target.values.ravel() survey.target.head() From 77f9cd4ed0077cd23c6856570979fe321fdc261d Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 3 Feb 2020 17:06:06 +0100 Subject: [PATCH 47/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Gael Varoquaux --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 5a191db47f4ef..9e2ed8910d23d 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -38,7 +38,7 @@ import seaborn as sns ############################################################################# -# The dataset +# The dataset: wages # ----------- # # We fetch the data from `OpenML `_. From 43ad1199baba35181e086e5dbc74941faab8deaa Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 3 Feb 2020 17:06:21 +0100 Subject: [PATCH 48/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Gael Varoquaux --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 9e2ed8910d23d..79b72ee494ed6 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -39,7 +39,7 @@ ############################################################################# # The dataset: wages -# ----------- +# ------------------ # # We fetch the data from `OpenML `_. # Note that setting the parameter `as_frame` to True will retrieve the data From 521fdd305c57ee4f301cd15414cd6afaafd756e3 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 3 Feb 2020 17:07:18 +0100 Subject: [PATCH 49/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Gael Varoquaux --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 79b72ee494ed6..6e5640eeef61b 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -103,7 +103,7 @@ # # .. 
_the-pipeline: # -# The pipeline +# The machine-learning pipeline # ------------ # # To design our machine-learning pipeline, we manually From 8d42670827fc4fc9a280cb75d9faf815b1c50a07 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 3 Feb 2020 17:07:34 +0100 Subject: [PATCH 50/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Gael Varoquaux --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 6e5640eeef61b..d40b3ad1e0e78 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -104,7 +104,7 @@ # .. _the-pipeline: # # The machine-learning pipeline -# ------------ +# ------------------------------------ # # To design our machine-learning pipeline, we manually # check the type of data that we are dealing with: From 81c1c349d863050f74a5302f229f4d584902efe4 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 3 Feb 2020 17:07:54 +0100 Subject: [PATCH 51/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Gael Varoquaux --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index d40b3ad1e0e78..371987c73514c 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -181,7 +181,7 @@ string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) string_score += '\nR2 score: {0:.4f}'.format(r2score) -fig, ax = plt.subplots(figsize=(6, 6)) +fig, ax = plt.subplots(figsize=(5, 5)) sns.regplot(y_test, y_pred) plt.text(3, 20, string_score) From 0d968d8f26c7d0118b8a8d6d48db39a1d5e07f48 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 3 Feb 2020 17:08:24 +0100 Subject: [PATCH 52/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Gael Varoquaux --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 371987c73514c..0dd2f7a83ee0d 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -200,7 +200,7 @@ # the model that we build, rather than about the true (real-world) generative # process of the data. 
# -# Interpreting coefficients +# Interpreting coefficients: scale matters # ------------------------- # # First of all, we can plot the values of the coefficients of the regressor we From 7cd9701960759cd0f10d6d71b9b58597f32087d0 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 3 Feb 2020 17:08:40 +0100 Subject: [PATCH 53/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Gael Varoquaux --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 0dd2f7a83ee0d..05028395e9fe6 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -201,7 +201,7 @@ # process of the data. # # Interpreting coefficients: scale matters -# ------------------------- +# --------------------------------------------- # # First of all, we can plot the values of the coefficients of the regressor we # have fitted. From 54a00a0f5d011571206e464dce212d7286507403 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 3 Feb 2020 17:11:09 +0100 Subject: [PATCH 54/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Gael Varoquaux --- .../inspection/plot_linear_model_coefficient_interpretation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 05028395e9fe6..1a6ce9bd36ebb 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -288,6 +288,7 @@ sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5) plt.axvline(x=0, color='.5') +plt.xlabel('Coefficient importance') plt.title('Coefficient importance variability') plt.subplots_adjust(left=.3) From 56de1527b2830300557f3eebb35730c76661c0b5 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 3 Feb 2020 17:12:12 +0100 Subject: [PATCH 55/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Gael Varoquaux --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 1a6ce9bd36ebb..378c7c91c274a 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -302,7 +302,7 @@ plt.xlabel('Experience coefficient') plt.grid(True) plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) -plt.title('Variations of coefficients for AGE and EXPERIENCE across folds') +plt.title('Co-variations of coefficients for AGE and EXPERIENCE across folds') ############################################################################### # Two regions are populated: when the EXPERIENCE coefficient is From 2a9b953841c6b1dbce550fb797fe09ef3668967e Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 3 Feb 2020 17:34:30 +0100 Subject: [PATCH 56/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Gael Varoquaux --- 
.../inspection/plot_linear_model_coefficient_interpretation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 378c7c91c274a..829bb7d8017aa 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -293,6 +293,8 @@ plt.subplots_adjust(left=.3) ############################################################################### +# The problem of correlated variables +# ------------------------------------------------- # The AGE and EXPERIENCE coefficients are affected by strong variability which # might be due to the collinearity between the 2 features. # To verify this interpretation we plot the variability of the AGE and From 62360af50dedd6971fbf55cebc6d96c5abbce192 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 3 Feb 2020 17:35:42 +0100 Subject: [PATCH 57/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Nicolas Hug --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 829bb7d8017aa..50b04f5c413f1 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -21,7 +21,7 @@ This example will provide some hints in interpreting coefficient in linear models, pointing at problems that arise when either the linear model is not -appropriate to describe the dataset, or features are correlated. +appropriate to describe the dataset, or when features are correlated. We will use data from the "Current Population Survey" from 1985 to predict wage as a function of various features such as experience, age, or education. From 24d1981c8685e7ab71899226c862d70d387600d8 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 3 Feb 2020 17:38:06 +0100 Subject: [PATCH 58/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Nicolas Hug --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 50b04f5c413f1..ee66e8eec3f49 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -228,7 +228,7 @@ # while the EDUCATION one is expressed in $/hours/years of education. # Looking at the coefficient plot to extrapolate feature importance could be # misleading as some of them vary on a small scale (as UNION or SEX that are -# either 0 or 1), while feature like AGE varies a lot more, several decades. +# either 0 or 1), while a feature like AGE varies a lot more, several decades. # This is evident if we compare feature standard deviations. X_train_preprocessed = pd.DataFrame( From 07f708ca1453697252a0516d7ef76343462cbd5b Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 3 Feb 2020 18:41:22 +0100 Subject: [PATCH 59/85] Address some comments. 
--- ...linear_model_coefficient_interpretation.py | 53 +++++++++++-------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index ee66e8eec3f49..071c9a5bedb98 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -3,19 +3,15 @@ Common pitfalls in interpretation of coefficients of linear models ================================================================== -.. contents:: - :local: - :depth: 1 - Linear models describe situations in which the target value is expected to be a linear combination of the features (see the :ref:`linear_model` User Guide section for a description of a set of linear model methods available in scikit-learn). Coefficients in multiple linear models represent the relationship between the -given feature (`X[i]`) and the target (`y`) assuming that all the other +given feature, :math:`X_i` and the target, :math:`y`, assuming that all the other features remain constant (`conditional dependence `_). -This is not the same thing than plotting `X[i]` versus `y` and fitting a linear +This is different from plotting :math:`X_i` versus :math:`y` and fitting a linear relationship: in that case all possible values of the other features are added to the estimation (marginal dependence). @@ -26,7 +22,9 @@ We will use data from the "Current Population Survey" from 1985 to predict wage as a function of various features such as experience, age, or education. -A description of the dataset follows. +.. contents:: + :local: + :depth: 1 """ print(__doc__) @@ -59,8 +57,6 @@ ############################################################################## # Notice that the dataset contains categorical and numerical variables. # Some of the categorical variables are binary variables. -# About the numerical ones we can observe that AGE and EXPERIENCE have similar -# distributions while the EDUCATION distribution is narrower. # This will give us directions on how to preprocess the data thereafter. X.head() @@ -74,7 +70,8 @@ # We split the sample in a train and a test dataset. # Only the train dataset will be used in the following exploratory analysis. # This is a way to emulate a real situation where predictions are performed on -# an unknown target. +# an unknown target, and we don't want our analysis and decisions to be biased +# by our knowledge of the test data. from sklearn.model_selection import train_test_split @@ -85,7 +82,7 @@ ############################################################################## # First, let's get some insights by looking at the variable distributions and # at the pairwise relationships between them. Only numerical -# variables will be used. +# variables will be used. In the following plot, each dot represents a sample. train_dataset = X_train.copy() train_dataset.insert(0, "WAGE", y_train) @@ -104,7 +101,7 @@ # .. _the-pipeline: # # The machine-learning pipeline -# ------------------------------------ +# ----------------------------- # # To design our machine-learning pipeline, we manually # check the type of data that we are dealing with: @@ -257,9 +254,14 @@ plt.subplots_adjust(left=.3) ############################################################################### +# .. warning:: +# +# Why does the plot above suggest that an increase in age leads to a +# decrease in wage? Is that counter-intuitive? 
+# # The plot above tells us about dependencies between a specific feature and -# the target when all other features remain constant, i.e., conditional -# dependencies. An increase of the AGE will induce a decrease +# the target when all other features remain constant, i.e., **conditional +# dependencies**. An increase of the AGE will induce a decrease # of the WAGE when all other features remain constant. On the contrary, an # increase of the EXPERIENCE will induce an increase of the WAGE when all # other features remain constant. @@ -269,7 +271,8 @@ # # We can check the coefficient variability through cross-validation. # If coefficients vary in a significant way changing the input dataset -# the robustness of the model is not guaranteed. +# their robustness is not guaranteed, and they should probably be interpreted +# with caution. from sklearn.model_selection import cross_validate from sklearn.model_selection import RepeatedKFold @@ -289,16 +292,19 @@ sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5) plt.axvline(x=0, color='.5') plt.xlabel('Coefficient importance') -plt.title('Coefficient importance variability') +plt.title('Coefficient importance and its variability') plt.subplots_adjust(left=.3) ############################################################################### # The problem of correlated variables -# ------------------------------------------------- +# ----------------------------------- +# # The AGE and EXPERIENCE coefficients are affected by strong variability which -# might be due to the collinearity between the 2 features. +# might be due to the collinearity between the 2 features: as AGE and +# EXPERIENCE vary together in the data, their effect is difficult to tease +# apart. # To verify this interpretation we plot the variability of the AGE and -# EXPERIENCE coefficient: +# EXPERIENCE coefficient. plt.ylabel('Age coefficient') plt.xlabel('Experience coefficient') @@ -330,7 +336,8 @@ sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5) sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5) plt.axvline(x=0, color='.5') -plt.title('Coefficient variability') +plt.title('Coefficient importance and its variability') +plt.xlabel('Coefficient importance') plt.subplots_adjust(left=.3) ############################################################################### @@ -342,6 +349,8 @@ # # As said above (see ":ref:`the-pipeline`"), we could also choose to scale # numerical values before training the model. +# This can be useful to apply a similar amount regularization to all of them +# in the Ridge. # The preprocessor is redefined in order to subtract the mean and scale # variables to unit variance. @@ -429,7 +438,9 @@ # Linear models with regularization # --------------------------------- # -# In practice, Ridge Regression is more often used with some regularization. +# In machine-learning practice, Ridge Regression is more often used with +# non-negligible regularization. +# Above, we limited this regularization to a very little amount. # Regularization improves the conditioning of the problem and reduces the # variance of the estimates. RidgeCV applies cross validation in order to # determine which value of the regularization parameter (`alpha`) is best From 17f8201e52495f27a1fcc2947058981d5c8d45ad Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 3 Feb 2020 18:48:11 +0100 Subject: [PATCH 60/85] Fix lint errors. 
--- .../plot_linear_model_coefficient_interpretation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 071c9a5bedb98..962f90257b666 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -8,11 +8,11 @@ section for a description of a set of linear model methods available in scikit-learn). Coefficients in multiple linear models represent the relationship between the -given feature, :math:`X_i` and the target, :math:`y`, assuming that all the other -features remain constant (`conditional dependence +given feature, :math:`X_i` and the target, :math:`y`, assuming that all the +other features remain constant (`conditional dependence `_). -This is different from plotting :math:`X_i` versus :math:`y` and fitting a linear -relationship: in that case all possible values of the other features are +This is different from plotting :math:`X_i` versus :math:`y` and fitting a +linear relationship: in that case all possible values of the other features are added to the estimation (marginal dependence). This example will provide some hints in interpreting coefficient in linear @@ -255,7 +255,7 @@ ############################################################################### # .. warning:: -# +# # Why does the plot above suggest that an increase in age leads to a # decrease in wage? Is that counter-intuitive? # From 05a02b1bba4540df5249efcc76f046b39c0f39bc Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 4 Feb 2020 16:27:48 +0100 Subject: [PATCH 61/85] Add marginal regressions in pairplot. --- .../plot_linear_model_coefficient_interpretation.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 962f90257b666..13ac075f3e5e9 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -83,10 +83,12 @@ # First, let's get some insights by looking at the variable distributions and # at the pairwise relationships between them. Only numerical # variables will be used. In the following plot, each dot represents a sample. +# +# .. _marginal_dependencies: train_dataset = X_train.copy() train_dataset.insert(0, "WAGE", y_train) -sns.pairplot(train_dataset, diag_kind='kde') +sns.pairplot(train_dataset, kind='reg', diag_kind='kde') ############################################################################## # Looking closely at the WAGE distribution it could be noticed that it has a @@ -160,7 +162,7 @@ # # First, we fit the model. -model.fit(X_train, y_train) +_ = model.fit(X_train, y_train) ############################################################################## # Then we check the performance of the computed @@ -257,7 +259,8 @@ # .. warning:: # # Why does the plot above suggest that an increase in age leads to a -# decrease in wage? Is that counter-intuitive? +# decrease in wage? Why this is different from the :ref:`initial pairplot +# `? 
# # The plot above tells us about dependencies between a specific feature and # the target when all other features remain constant, i.e., **conditional @@ -375,7 +378,7 @@ ) ) -model.fit(X_train, y_train) +_ = model.fit(X_train, y_train) ############################################################################## # Again, we check the performance of the computed @@ -458,7 +461,7 @@ ) ) -model.fit(X_train, y_train) +_ = model.fit(X_train, y_train) ############################################################################## # First we verify which value of :math:`\alpha` has been selected. From 66abf43614bccec66be7b335aa5f26fff329ff74 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 4 Feb 2020 17:15:26 +0100 Subject: [PATCH 62/85] Explicit discussion about coefficient multiplication. --- ...linear_model_coefficient_interpretation.py | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 13ac075f3e5e9..08e700cce7cbb 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -184,7 +184,7 @@ sns.regplot(y_test, y_pred) plt.text(3, 20, string_score) - +plt.title('Ridge model, small regularization') plt.ylabel('Model predictions') plt.xlabel('Truths') plt.xlim([0, 27]) @@ -215,19 +215,23 @@ model.named_steps['transformedtargetregressor'].regressor_.coef_, columns=['Coefficients'], index=feature_names ) +plt.title('Ridge model, small regularization') coefs.plot(kind='barh', figsize=(9, 7)) plt.axvline(x=0, color='.5') plt.subplots_adjust(left=.3) ############################################################################### +# In the plot above, the AGE coefficient is expressed in $/hours/living years +# while the EDUCATION one is expressed in $/hours/years of education. +# On the other hand, categorical variables (as UNION or SEX) are adimensional +# numbers taking the value either of 0 or 1. Their coefficients are expressed +# in $/hours. +# Looking at the coefficient plot to extrapolate feature importance could be +# misleading as some of them vary on a small scale, while others, like AGE, +# varies a lot more, several decades. # Soon we realize that we cannot compare different coefficients since the # features have different natural scales and hence value ranges # because of their different unit of measure. -# For instance, the AGE coefficient is expressed in $/hours/living years -# while the EDUCATION one is expressed in $/hours/years of education. -# Looking at the coefficient plot to extrapolate feature importance could be -# misleading as some of them vary on a small scale (as UNION or SEX that are -# either 0 or 1), while a feature like AGE varies a lot more, several decades. # This is evident if we compare feature standard deviations. X_train_preprocessed = pd.DataFrame( @@ -239,9 +243,10 @@ plt.subplots_adjust(left=.3) ############################################################################### -# For the reasons explained above, multiplying the coefficients by the -# standard deviation of the related feature would improve our understanding on -# feature importance on the model. +# Multiplying the coefficients by the standard deviation of the related +# feature would reduce all the coefficients to the same unit of measure. 
+# As we will see :ref:`after` this is equivalent to normalize +# numerical variables to their standard deviation. # In that way, we emphasize that the # greater the variance of a feature, the larger the weight of the corresponding # coefficient on the output, all else being equal. @@ -251,6 +256,7 @@ X_train_preprocessed.std(axis=0), columns=['Coefficient importance'], index=feature_names ) +plt.title('Ridge model, small regularization') coefs.plot(kind='barh', figsize=(9, 7)) plt.axvline(x=0, color='.5') plt.subplots_adjust(left=.3) @@ -347,6 +353,8 @@ # The estimation of the EXPERIENCE coefficient is now less variable and # remain important for all predictors trained during cross-validation. # +# .. _scaling_num: +# # Preprocessing numerical variables # --------------------------------- # From dc560a450ddcd978daba76b9bdf99c93d9f26015 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 4 Feb 2020 17:16:37 +0100 Subject: [PATCH 63/85] Fix trailing spaces. --- .../plot_linear_model_coefficient_interpretation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 08e700cce7cbb..ce2508bef50cc 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -246,7 +246,7 @@ # Multiplying the coefficients by the standard deviation of the related # feature would reduce all the coefficients to the same unit of measure. # As we will see :ref:`after` this is equivalent to normalize -# numerical variables to their standard deviation. +# numerical variables to their standard deviation. # In that way, we emphasize that the # greater the variance of a feature, the larger the weight of the corresponding # coefficient on the output, all else being equal. @@ -354,7 +354,7 @@ # remain important for all predictors trained during cross-validation. # # .. _scaling_num: -# +# # Preprocessing numerical variables # --------------------------------- # From 662a7956e7ae5ae067ad94c7ff0f6f016d59ab91 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 5 Feb 2020 15:43:16 +0100 Subject: [PATCH 64/85] Add plot titles, explicit discussion about coeffitient scaling. 
--- ...linear_model_coefficient_interpretation.py | 55 +++++++++++++------ 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index ce2508bef50cc..dbb4b789a0a58 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -172,14 +172,16 @@ from sklearn.metrics import median_absolute_error y_pred = model.predict(X_train) + mae = median_absolute_error(y_train, y_pred) string_score = 'MAE on training set: {0:.2f} $/hour'.format(mae) y_pred = model.predict(X_test) mae = median_absolute_error(y_test, y_pred) -r2score = model.score(X_test, y_test) - string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) -string_score += '\nR2 score: {0:.4f}'.format(r2score) +r2score = model.score(X_train, y_train) +string_score += '\nR2 score on training set: {0:.4f}'.format(r2score) +r2score = model.score(X_test, y_test) +string_score += '\nR2 score on testing set: {0:.4f}'.format(r2score) fig, ax = plt.subplots(figsize=(5, 5)) sns.regplot(y_test, y_pred) @@ -215,23 +217,28 @@ model.named_steps['transformedtargetregressor'].regressor_.coef_, columns=['Coefficients'], index=feature_names ) -plt.title('Ridge model, small regularization') coefs.plot(kind='barh', figsize=(9, 7)) +plt.title('Ridge model, small regularization') plt.axvline(x=0, color='.5') plt.subplots_adjust(left=.3) ############################################################################### -# In the plot above, the AGE coefficient is expressed in $/hours/living years -# while the EDUCATION one is expressed in $/hours/years of education. +# In the plot above, the AGE coefficient is expressed in +# :math:`$/hours/(living\ years)` +# while the EDUCATION one is expressed in :math:`$/hours/(years\ of\ education)`. # On the other hand, categorical variables (as UNION or SEX) are adimensional # numbers taking the value either of 0 or 1. Their coefficients are expressed -# in $/hours. +# in :math:`$/hours`. An increase of +# :math:`1` in AGE is not comparable with an increase of :math:`1` in UNION. +# We cannot compare different coefficients since the +# features have different natural scales and hence value ranges +# because of their different unit of measure. +# Indeed, from this plot the most important factor in determining WAGE is the +# variable UNION, even if it is plausible that variables like EXPERIENCE +# should have more impact. # Looking at the coefficient plot to extrapolate feature importance could be # misleading as some of them vary on a small scale, while others, like AGE, # varies a lot more, several decades. -# Soon we realize that we cannot compare different coefficients since the -# features have different natural scales and hence value ranges -# because of their different unit of measure. # This is evident if we compare feature standard deviations. X_train_preprocessed = pd.DataFrame( @@ -246,7 +253,9 @@ # Multiplying the coefficients by the standard deviation of the related # feature would reduce all the coefficients to the same unit of measure. # As we will see :ref:`after` this is equivalent to normalize -# numerical variables to their standard deviation. +# numerical variables to their standard deviation, +# as :math:`y = \sum{coeff_i * X_i} = \sum{(coeff_i * std_i) * (X_i / std_i)}`. 
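The identity above can be checked on toy data. The sketch below is only illustrative: synthetic values, and a plain `LinearRegression` standing in for the almost unpenalized ridge used in the example.

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
# three features with very different scales
X = rng.normal(size=(200, 3)) * [1., 10., 100.]
y = X @ [0.5, -0.2, 0.03] + rng.normal(size=200)

coef_raw = LinearRegression().fit(X, y).coef_
coef_std = LinearRegression().fit(StandardScaler().fit_transform(X), y).coef_
# the coefficient learnt on X_i / std_i equals (coefficient on raw X_i) * std_i
print(np.allclose(coef_raw * X.std(axis=0), coef_std))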
+# # In that way, we emphasize that the # greater the variance of a feature, the larger the weight of the corresponding # coefficient on the output, all else being equal. @@ -256,8 +265,8 @@ X_train_preprocessed.std(axis=0), columns=['Coefficient importance'], index=feature_names ) -plt.title('Ridge model, small regularization') coefs.plot(kind='barh', figsize=(9, 7)) +plt.title('Ridge model, small regularization') plt.axvline(x=0, color='.5') plt.subplots_adjust(left=.3) @@ -268,12 +277,15 @@ # decrease in wage? Why this is different from the :ref:`initial pairplot # `? # +# Now that the coefficients have been scaled, we can start to interpret them. # The plot above tells us about dependencies between a specific feature and # the target when all other features remain constant, i.e., **conditional # dependencies**. An increase of the AGE will induce a decrease # of the WAGE when all other features remain constant. On the contrary, an # increase of the EXPERIENCE will induce an increase of the WAGE when all # other features remain constant. +# Also, AGE, EXPERIENCE and EDUCATION are the three variables that most +# influence the model. # # Checking the variability of the coefficients # -------------------------------------------- @@ -398,15 +410,17 @@ string_score = 'MAE on training set: {0:.2f} $/hour'.format(mae) y_pred = model.predict(X_test) mae = median_absolute_error(y_test, y_pred) -r2score = model.score(X_test, y_test) - string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) -string_score += '\nR2 score: {0:.4f}'.format(r2score) +r2score = model.score(X_train, y_train) +string_score += '\nR2 score on training set: {0:.4f}'.format(r2score) +r2score = model.score(X_test, y_test) +string_score += '\nR2 score on testing set: {0:.4f}'.format(r2score) fig, ax = plt.subplots(figsize=(6, 6)) sns.regplot(y_test, y_pred) plt.text(3, 20, string_score) +plt.title('Ridge model, small regularization, normalized variables') plt.ylabel('Model predictions') plt.xlabel('Truths') plt.xlim([0, 27]) @@ -421,6 +435,7 @@ columns=['Coefficients'], index=feature_names ) coefs.plot(kind='barh', figsize=(9, 7)) +plt.title('Ridge model, small regularization, normalized variables') plt.axvline(x=0, color='.5') plt.subplots_adjust(left=.3) @@ -444,7 +459,7 @@ plt.subplots_adjust(left=.3) ############################################################################## -# The result is quite similar to the non-normalised case. +# The result is quite similar to the non-normalized case. 
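This is expected: with a negligible penalty, standardizing the inputs is an invertible reparametrization, so the fitted predictions (and hence the MAE and R2 scores) barely move; only the coefficient values change. A quick check on hypothetical, synthetic data:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = rng.normal(size=(300, 4)) * [1., 5., 50., 500.]
y = X @ [1.0, -0.3, 0.02, 0.001] + rng.normal(size=300)

scaler = StandardScaler().fit(X)
pred_raw = Ridge(alpha=1e-10).fit(X, y).predict(X)
pred_scaled = Ridge(alpha=1e-10).fit(
    scaler.transform(X), y).predict(scaler.transform(X))
# predictions are essentially identical even though the coefficients differ
print(np.allclose(pred_raw, pred_scaled))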
# # Linear models with regularization # --------------------------------- @@ -484,15 +499,18 @@ string_score = 'MAE on training set: {0:.2f} $/hour'.format(mae) y_pred = model.predict(X_test) mae = median_absolute_error(y_test, y_pred) +string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) +r2score = model.score(X_train, y_train) +string_score += '\nR2 score on training set: {0:.4f}'.format(r2score) r2score = model.score(X_test, y_test) +string_score += '\nR2 score on testing set: {0:.4f}'.format(r2score) -string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) -string_score += '\nR2 score: {0:.4f}'.format(r2score) fig, ax = plt.subplots(figsize=(6, 6)) sns.regplot(y_test, y_pred) plt.text(3, 20, string_score) +plt.title('Ridge model, regularization, normalized variables') plt.ylabel('Model predictions') plt.xlabel('Truths') plt.xlim([0, 27]) @@ -506,6 +524,7 @@ columns=['Coefficients'], index=feature_names ) coefs.plot(kind='barh', figsize=(9, 7)) +plt.title('Ridge model, regularization, normalized variables') plt.axvline(x=0, color='.5') plt.subplots_adjust(left=.3) From cbcbb228d0c75d9c6b621339a4d15fbdb165d5b4 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 5 Feb 2020 19:18:32 +0100 Subject: [PATCH 65/85] Finalize Lasso discussion. Add link to the example in UG. --- doc/inspection.rst | 4 + doc/modules/linear_model.rst | 3 +- ...linear_model_coefficient_interpretation.py | 73 ++++++++++++++++++- 3 files changed, 78 insertions(+), 2 deletions(-) diff --git a/doc/inspection.rst b/doc/inspection.rst index b53aeb436b4cd..1304a1030abb9 100644 --- a/doc/inspection.rst +++ b/doc/inspection.rst @@ -17,6 +17,10 @@ predictions from a model and what affects them. This can be used to evaluate assumptions and biases of a model, design a better model, or to diagnose issues with model performance. +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_inspection_plot_linear_model_coefficient_interpretation.py` + .. toctree:: modules/partial_dependence diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 19205385f311b..cfc47aa2b7aec 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -146,7 +146,7 @@ a linear kernel. * :ref:`sphx_glr_auto_examples_linear_model_plot_ridge_path.py` * :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py` - + * :ref:`sphx_glr_auto_examples_inspection_plot_linear_model_coefficient_interpretation.py` Ridge Complexity ---------------- @@ -232,6 +232,7 @@ computes the coefficients along the full path of possible values. * :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_and_elasticnet.py` * :ref:`sphx_glr_auto_examples_applications_plot_tomography_l1_reconstruction.py` + * :ref:`sphx_glr_auto_examples_inspection_plot_linear_model_coefficient_interpretation.py` .. note:: **Feature selection with Lasso** diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index dbb4b789a0a58..6f07c7527456a 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -473,7 +473,6 @@ # suited for the model estimation. 
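For intuition, the selection that `RidgeCV` automates can be approximated by an explicit cross-validated grid search over `alpha`. The sketch below uses synthetic data and 5-fold cross-validation, whereas `RidgeCV` defaults to an efficient leave-one-out scheme, so it only illustrates the idea.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

X_toy, y_toy = make_regression(n_samples=200, n_features=10, noise=10.,
                               random_state=0)
alphas = np.logspace(-10, 10, 21)
cv_scores = [cross_val_score(Ridge(alpha=a), X_toy, y_toy, cv=5).mean()
             for a in alphas]
# keep the alpha with the best mean cross-validated R2
print(alphas[int(np.argmax(cv_scores))])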
from sklearn.linear_model import RidgeCV -from sklearn.compose import TransformedTargetRegressor model = make_pipeline( preprocessor, @@ -534,3 +533,75 @@ # Even if the model is still not able to provide a good description of the # dataset, the regularization manages to lower the influence of correlated # variables on the model. +# +# Linear models with sparse coefficients +# -------------------------------------- +# +# Another possibility to take into account correlated variables in the dataset, +# is to estimate sparse coefficients. In some way we already did it manually +# when we dropped the AGE column in a previous Ridge estimation. +# +# Lasso models (see the :ref:`lasso` User Guide section) estimates sparse +# coefficients. LassoCV applies cross validation in order to +# determine which value of the regularization parameter (`alpha`) is best +# suited for the model estimation. + +from sklearn.linear_model import LassoCV + +model = make_pipeline( + preprocessor, + TransformedTargetRegressor( + regressor=LassoCV(alphas=np.logspace(-10, 10, 21), max_iter=100000), + func=np.log10, + inverse_func=sp.special.exp10 + ) +) + +_ = model.fit(X_train, y_train) + +############################################################################## +# First we verify which value of :math:`\alpha` has been selected. + +model[-1].regressor_.alpha_ + +############################################################################## +# Then we check the quality of the predictions. + +y_pred = model.predict(X_train) +mae = median_absolute_error(y_train, y_pred) +string_score = 'MAE on training set: {0:.2f} $/hour'.format(mae) +y_pred = model.predict(X_test) +mae = median_absolute_error(y_test, y_pred) +string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) +r2score = model.score(X_train, y_train) +string_score += '\nR2 score on training set: {0:.4f}'.format(r2score) +r2score = model.score(X_test, y_test) +string_score += '\nR2 score on testing set: {0:.4f}'.format(r2score) + +fig, ax = plt.subplots(figsize=(6, 6)) +sns.regplot(y_test, y_pred) + +plt.text(3, 20, string_score) + +plt.title('Lasso model, regularization, normalized variables') +plt.ylabel('Model predictions') +plt.xlabel('Truths') +plt.xlim([0, 27]) +plt.ylim([0, 27]) + +############################################################################## +# For our dataset the R squared coefficient is of the same order than all other +# models. + +coefs = pd.DataFrame( + model.named_steps['transformedtargetregressor'].regressor_.coef_, + columns=['Coefficients'], index=feature_names +) +coefs.plot(kind='barh', figsize=(9, 7)) +plt.title('Lasso model, regularization, normalized variables') +plt.axvline(x=0, color='.5') +plt.subplots_adjust(left=.3) + +############################################################################# +# It is worth noticing that a Lasso model identifies the correlation between +# AGE and EXPERIENCE and suppresses one of them. From c993af8b65b60dbbd8a4471578a657d4b3547bb6 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 5 Feb 2020 19:20:08 +0100 Subject: [PATCH 66/85] Fix lint errors. 
--- .../plot_linear_model_coefficient_interpretation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 6f07c7527456a..1f24c70439207 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -224,8 +224,8 @@ ############################################################################### # In the plot above, the AGE coefficient is expressed in -# :math:`$/hours/(living\ years)` -# while the EDUCATION one is expressed in :math:`$/hours/(years\ of\ education)`. +# :math:`$/hours/(living\ years)` while the EDUCATION one is expressed +# in :math:`$/hours/(years\ of\ education)`. # On the other hand, categorical variables (as UNION or SEX) are adimensional # numbers taking the value either of 0 or 1. Their coefficients are expressed # in :math:`$/hours`. An increase of From 12c83f9abe2bda6f2f0b4db9a4a8c149d3eda661 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 24 Feb 2020 17:19:01 +0100 Subject: [PATCH 67/85] Removing OrdinalEncoder. --- .../plot_linear_model_coefficient_interpretation.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 1f24c70439207..e9e4f2d640ae4 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -56,7 +56,6 @@ ############################################################################## # Notice that the dataset contains categorical and numerical variables. -# Some of the categorical variables are binary variables. # This will give us directions on how to preprocess the data thereafter. X.head() @@ -120,21 +119,18 @@ # # - one-hot encode (i.e., generate a column by category) the categorical # columns; -# - replace by 0 and 1 the categories of binary columns; # - as a first approach (we will see after how the normalisation of numerical # values will affect our discussion), keep numerical values as they are. 
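As a reminder of what the one-hot encoding step above does in practice, here is a tiny illustration on a hypothetical column (the name and values are placeholders, not taken from the survey): each category becomes its own 0/1 indicator column.

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

toy = pd.DataFrame({'category': ['a', 'b', 'a', 'c']})
encoded = OneHotEncoder().fit_transform(toy)
# three categories -> three indicator columns
print(encoded.toarray())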
from sklearn.compose import make_column_transformer from sklearn.preprocessing import OneHotEncoder -from sklearn.preprocessing import OrdinalEncoder -categorical_columns = ['RACE', 'OCCUPATION', 'SECTOR'] -binary_columns = ['MARR', 'UNION', 'SEX', 'SOUTH'] +categorical_columns = ['RACE', 'OCCUPATION', 'SECTOR', + 'MARR', 'UNION', 'SEX', 'SOUTH'] numerical_columns = ['EDUCATION', 'EXPERIENCE', 'AGE'] preprocessor = make_column_transformer( (OneHotEncoder(), categorical_columns), - (OrdinalEncoder(), binary_columns), remainder='passthrough' ) @@ -211,7 +207,7 @@ .named_transformers_['onehotencoder'] .get_feature_names(input_features=categorical_columns)) feature_names = np.concatenate( - [feature_names, binary_columns, numerical_columns]) + [feature_names, numerical_columns]) coefs = pd.DataFrame( model.named_steps['transformedtargetregressor'].regressor_.coef_, @@ -381,7 +377,6 @@ preprocessor = make_column_transformer( (OneHotEncoder(), categorical_columns), - (OrdinalEncoder(), binary_columns), (StandardScaler(), numerical_columns), remainder='passthrough' ) From a71baaaefdb7c90275cc7128030cd1e8339de0d4 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 24 Feb 2020 17:35:46 +0100 Subject: [PATCH 68/85] Removing R2 from logarithmic models. --- ...linear_model_coefficient_interpretation.py | 32 ++++++------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index e9e4f2d640ae4..a8f65368410d7 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -174,13 +174,9 @@ y_pred = model.predict(X_test) mae = median_absolute_error(y_test, y_pred) string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) -r2score = model.score(X_train, y_train) -string_score += '\nR2 score on training set: {0:.4f}'.format(r2score) -r2score = model.score(X_test, y_test) -string_score += '\nR2 score on testing set: {0:.4f}'.format(r2score) fig, ax = plt.subplots(figsize=(5, 5)) -sns.regplot(y_test, y_pred) - +plt.scatter(y_test, y_pred) +ax.plot([0, 1], [0, 1], transform=ax.transAxes,ls="--", c="red") plt.text(3, 20, string_score) plt.title('Ridge model, small regularization') plt.ylabel('Model predictions') @@ -190,7 +186,8 @@ ############################################################################## # The model learnt is far from being a good model making accurate predictions: -# the R squared score is very low. +# this is obvious when looking at the plot above, where good predictions +# should lie on the red line. # In the following section, we will interpret the coefficients of the model. 
# While we do so, we should keep in mind that any conclusion we way draw will # be about @@ -406,12 +403,9 @@ y_pred = model.predict(X_test) mae = median_absolute_error(y_test, y_pred) string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) -r2score = model.score(X_train, y_train) -string_score += '\nR2 score on training set: {0:.4f}'.format(r2score) -r2score = model.score(X_test, y_test) -string_score += '\nR2 score on testing set: {0:.4f}'.format(r2score) fig, ax = plt.subplots(figsize=(6, 6)) -sns.regplot(y_test, y_pred) +plt.scatter(y_test, y_pred) +ax.plot([0, 1], [0, 1], transform=ax.transAxes,ls="--", c="red") plt.text(3, 20, string_score) @@ -494,13 +488,10 @@ y_pred = model.predict(X_test) mae = median_absolute_error(y_test, y_pred) string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) -r2score = model.score(X_train, y_train) -string_score += '\nR2 score on training set: {0:.4f}'.format(r2score) -r2score = model.score(X_test, y_test) -string_score += '\nR2 score on testing set: {0:.4f}'.format(r2score) fig, ax = plt.subplots(figsize=(6, 6)) -sns.regplot(y_test, y_pred) +plt.scatter(y_test, y_pred) +ax.plot([0, 1], [0, 1], transform=ax.transAxes,ls="--", c="red") plt.text(3, 20, string_score) @@ -568,13 +559,10 @@ y_pred = model.predict(X_test) mae = median_absolute_error(y_test, y_pred) string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) -r2score = model.score(X_train, y_train) -string_score += '\nR2 score on training set: {0:.4f}'.format(r2score) -r2score = model.score(X_test, y_test) -string_score += '\nR2 score on testing set: {0:.4f}'.format(r2score) fig, ax = plt.subplots(figsize=(6, 6)) -sns.regplot(y_test, y_pred) +plt.scatter(y_test, y_pred) +ax.plot([0, 1], [0, 1], transform=ax.transAxes,ls="--", c="red") plt.text(3, 20, string_score) From fa0b41cf4a4f0dcc4d6399fab4954b0173850191 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Mon, 24 Feb 2020 21:30:09 +0100 Subject: [PATCH 69/85] Fix lint errors. 
--- .../plot_linear_model_coefficient_interpretation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index a8f65368410d7..cf8e3690e1f5f 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -176,7 +176,7 @@ string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) fig, ax = plt.subplots(figsize=(5, 5)) plt.scatter(y_test, y_pred) -ax.plot([0, 1], [0, 1], transform=ax.transAxes,ls="--", c="red") +ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="red") plt.text(3, 20, string_score) plt.title('Ridge model, small regularization') plt.ylabel('Model predictions') @@ -405,7 +405,7 @@ string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) fig, ax = plt.subplots(figsize=(6, 6)) plt.scatter(y_test, y_pred) -ax.plot([0, 1], [0, 1], transform=ax.transAxes,ls="--", c="red") +ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="red") plt.text(3, 20, string_score) @@ -491,7 +491,7 @@ fig, ax = plt.subplots(figsize=(6, 6)) plt.scatter(y_test, y_pred) -ax.plot([0, 1], [0, 1], transform=ax.transAxes,ls="--", c="red") +ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="red") plt.text(3, 20, string_score) @@ -562,7 +562,7 @@ fig, ax = plt.subplots(figsize=(6, 6)) plt.scatter(y_test, y_pred) -ax.plot([0, 1], [0, 1], transform=ax.transAxes,ls="--", c="red") +ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="red") plt.text(3, 20, string_score) From 97b4f9002d012af478e7f25ebd81a0d09c921d13 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 25 Feb 2020 10:04:14 +0100 Subject: [PATCH 70/85] Add details on non-normalized model. --- ...linear_model_coefficient_interpretation.py | 42 ++++++++++++------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index cf8e3690e1f5f..514584a00de2f 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -197,8 +197,8 @@ # Interpreting coefficients: scale matters # --------------------------------------------- # -# First of all, we can plot the values of the coefficients of the regressor we -# have fitted. +# First of all, we can take a look to the values of the coefficients of the +# regressor we have fitted. feature_names = (model.named_steps['columntransformer'] .named_transformers_['onehotencoder'] @@ -210,23 +210,33 @@ model.named_steps['transformedtargetregressor'].regressor_.coef_, columns=['Coefficients'], index=feature_names ) + +coefs + +############################################################################## +# The AGE coefficient is expressed in +# :math:`$/hours/(living\ years)` while the EDUCATION one is expressed +# in :math:`$/hours/(years\ of\ education)`. +# This representation of the coefficients has the advantage of making clear +# the practical predictions of the model: +# an increase of :math:`1` year in AGE means a decrease of :math:`0.030867$`, +# while an increase of :math:`1` year in EDUCATION means an increase of +# :math:`0.054699$`. +# On the other hand, categorical variables (as UNION or SEX) are adimensional +# numbers taking the value either of 0 or 1. 
Their coefficients are expressed +# in :math:`$/hours`. Then, we cannot compare the magnitude of different +# coefficients since the features have different natural scales, and hence +# value ranges, because of their different unit of measure. +# This is more evident if we plot the coefficients. + coefs.plot(kind='barh', figsize=(9, 7)) plt.title('Ridge model, small regularization') plt.axvline(x=0, color='.5') plt.subplots_adjust(left=.3) ############################################################################### -# In the plot above, the AGE coefficient is expressed in -# :math:`$/hours/(living\ years)` while the EDUCATION one is expressed -# in :math:`$/hours/(years\ of\ education)`. -# On the other hand, categorical variables (as UNION or SEX) are adimensional -# numbers taking the value either of 0 or 1. Their coefficients are expressed -# in :math:`$/hours`. An increase of -# :math:`1` in AGE is not comparable with an increase of :math:`1` in UNION. -# We cannot compare different coefficients since the -# features have different natural scales and hence value ranges -# because of their different unit of measure. -# Indeed, from this plot the most important factor in determining WAGE is the +# Indeed, from the plot above the most important factor in determining WAGE +# appears to be the # variable UNION, even if it is plausible that variables like EXPERIENCE # should have more impact. # Looking at the coefficient plot to extrapolate feature importance could be @@ -247,7 +257,8 @@ # feature would reduce all the coefficients to the same unit of measure. # As we will see :ref:`after` this is equivalent to normalize # numerical variables to their standard deviation, -# as :math:`y = \sum{coeff_i * X_i} = \sum{(coeff_i * std_i) * (X_i / std_i)}`. +# as :math:`y = \sum{coeff_i \times X_i} = +# \sum{(coeff_i \times std_i) \times (X_i / std_i)}`. # # In that way, we emphasize that the # greater the variance of a feature, the larger the weight of the corresponding @@ -264,13 +275,14 @@ plt.subplots_adjust(left=.3) ############################################################################### +# Now that the coefficients have been scaled, we can safely compare them. +# # .. warning:: # # Why does the plot above suggest that an increase in age leads to a # decrease in wage? Why this is different from the :ref:`initial pairplot # `? # -# Now that the coefficients have been scaled, we can start to interpret them. # The plot above tells us about dependencies between a specific feature and # the target when all other features remain constant, i.e., **conditional # dependencies**. 
An increase of the AGE will induce a decrease From 96dec51404caf28741698cd8629f7cb3b187ab8f Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 25 Feb 2020 13:54:15 +0100 Subject: [PATCH 71/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Joel Nothman --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 514584a00de2f..87a38474b592e 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -66,7 +66,7 @@ survey.target.head() ############################################################################### -# We split the sample in a train and a test dataset. +# We split the sample into a train and a test dataset. # Only the train dataset will be used in the following exploratory analysis. # This is a way to emulate a real situation where predictions are performed on # an unknown target, and we don't want our analysis and decisions to be biased From bc37c3e005e1c294c0061cc0ddf93a3e5f2b7440 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 25 Feb 2020 14:51:54 +0100 Subject: [PATCH 72/85] Address some comments. --- .../plot_linear_model_coefficient_interpretation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 87a38474b592e..cbe3d96fba06f 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -13,7 +13,7 @@ `_). This is different from plotting :math:`X_i` versus :math:`y` and fitting a linear relationship: in that case all possible values of the other features are -added to the estimation (marginal dependence). +taken into account in the estimation (marginal dependence). This example will provide some hints in interpreting coefficient in linear models, pointing at problems that arise when either the linear model is not @@ -368,7 +368,7 @@ ############################################################################### # The estimation of the EXPERIENCE coefficient is now less variable and -# remain important for all predictors trained during cross-validation. +# remain important for all models trained during cross-validation. # # .. _scaling_num: # From 68158023d11e0b7804bd4ebc32d6fc5bab76e8b2 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 27 Feb 2020 14:34:26 +0100 Subject: [PATCH 73/85] More details on Ridge with regularization. --- ...linear_model_coefficient_interpretation.py | 47 +++++++++++++++---- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index cbe3d96fba06f..3330f170dff40 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -331,10 +331,14 @@ # apart. # To verify this interpretation we plot the variability of the AGE and # EXPERIENCE coefficient. +# +# .. 
_covariation: plt.ylabel('Age coefficient') plt.xlabel('Experience coefficient') plt.grid(True) +plt.xlim(-0.4, 0.5) +plt.ylim(-0.4, 0.5) plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) plt.title('Co-variations of coefficients for AGE and EXPERIENCE across folds') @@ -514,7 +518,8 @@ plt.ylim([0, 27]) ############################################################################## -# The R squared coefficient is similar to the non-regularized case. +# The ability to reproduce the data of the regularized model is similar to +# the one of the non-regularized model. coefs = pd.DataFrame( model.named_steps['transformedtargetregressor'].regressor_.coef_, @@ -526,12 +531,37 @@ plt.subplots_adjust(left=.3) ############################################################################## -# Coefficients are significantly different. -# AGE and EXPERIENCE coefficients are both positive. -# Even if the model is still not able to provide a good description of the -# dataset, the regularization manages to lower the influence of correlated -# variables on the model. -# +# The coefficients are significantly different. +# AGE and EXPERIENCE coefficients are both positive but they have less +# influence on the prediction. +# The regularization manages to lower the influence of correlated +# variables on the model because the weight is shared between the two +# predictive variables, so neither alone would be very strongly weighted. +# On the other hand, those weights are more robust with respect to +# cross validation (see the :ref:`ridge_regression` User Guide section), +# as is shown in the plot below to be compared with the +# :ref:`previous one`. + +cv_model = cross_validate( + model, X, y, cv=RepeatedKFold(n_splits=5, n_repeats=5), + return_estimator=True, n_jobs=-1 +) +coefs = pd.DataFrame( + [est.named_steps['transformedtargetregressor'].regressor_.coef_ * + X_train_preprocessed.std(axis=0) + for est in cv_model['estimator']], + columns=feature_names +) + +plt.ylabel('Age coefficient') +plt.xlabel('Experience coefficient') +plt.grid(True) +plt.xlim(-0.4, 0.5) +plt.ylim(-0.4, 0.5) +plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) +plt.title('Co-variations of coefficients for AGE and EXPERIENCE across folds') + +############################################################################## # Linear models with sparse coefficients # -------------------------------------- # @@ -585,8 +615,7 @@ plt.ylim([0, 27]) ############################################################################## -# For our dataset the R squared coefficient is of the same order than all other -# models. +# For our dataset, again the model is not very predictive. coefs = pd.DataFrame( model.named_steps['transformedtargetregressor'].regressor_.coef_, From 12fe6909b507fc85d86eb975fa57ab0e5df7de4f Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 27 Feb 2020 14:35:42 +0100 Subject: [PATCH 74/85] Fix linting error. 
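The cross-validation pattern used above (refit the pipeline on repeated folds with cross_validate(..., return_estimator=True) and inspect how much each coefficient moves) also works as a quick stability check outside this example. Below is a minimal sketch on made-up data with two correlated predictors; the Ridge(alpha=1.0) setting and the column names are placeholders, not the settings of the survey model:

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import RepeatedKFold, cross_validate

# Two strongly correlated features (a stand-in for AGE/EXPERIENCE).
rng = np.random.RandomState(0)
x1 = rng.normal(size=500)
x2 = x1 + rng.normal(scale=0.1, size=500)
X = np.column_stack([x1, x2])
y = x1 + rng.normal(scale=1.0, size=500)

cv_model = cross_validate(
    Ridge(alpha=1.0), X, y,
    cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=0),
    return_estimator=True, n_jobs=-1,
)
coefs = pd.DataFrame(
    [est.coef_ for est in cv_model["estimator"]],
    columns=["x1", "x2"],
)
# The fold-to-fold spread is a cheap proxy for coefficient stability.
print(coefs.describe().loc[["mean", "std"]])

A large fold-to-fold standard deviation relative to the mean is a warning that the corresponding coefficient should not be over-interpreted.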
--- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 3330f170dff40..8571176b9c0a5 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -540,7 +540,7 @@ # On the other hand, those weights are more robust with respect to # cross validation (see the :ref:`ridge_regression` User Guide section), # as is shown in the plot below to be compared with the -# :ref:`previous one`. +# :ref:`previous one`. cv_model = cross_validate( model, X, y, cv=RepeatedKFold(n_splits=5, n_repeats=5), From 4f12268647d17ac200302c77265246dcff7e2658 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 28 Feb 2020 14:33:27 +0100 Subject: [PATCH 75/85] Make explicit the comparison between pairwise plot and coefficients of collinear variables (@agramfort suggestion). --- .../plot_linear_model_coefficient_interpretation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 8571176b9c0a5..39487c53452eb 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -280,8 +280,8 @@ # .. warning:: # # Why does the plot above suggest that an increase in age leads to a -# decrease in wage? Why this is different from the :ref:`initial pairplot -# `? +# decrease in wage? Why the :ref:`initial pairplot +# ` is telling the opposite? # # The plot above tells us about dependencies between a specific feature and # the target when all other features remain constant, i.e., **conditional @@ -432,7 +432,6 @@ plt.ylim([0, 27]) ############################################################################## -# The R squared coefficient is not better than for the non-normalized case. # For the coefficient analysis, scaling is not needed this time. coefs = pd.DataFrame( @@ -627,5 +626,6 @@ plt.subplots_adjust(left=.3) ############################################################################# -# It is worth noticing that a Lasso model identifies the correlation between -# AGE and EXPERIENCE and suppresses one of them. +# A Lasso model identifies the correlation between +# AGE and EXPERIENCE and suppresses one of them for the sake of the prediction. + From 5790006b36bcdcdd87c1cdff8a6835ececda02ba Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 28 Feb 2020 15:53:33 +0100 Subject: [PATCH 76/85] Add summary. --- .../plot_linear_model_coefficient_interpretation.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 39487c53452eb..4c03faee12268 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -628,4 +628,15 @@ ############################################################################# # A Lasso model identifies the correlation between # AGE and EXPERIENCE and suppresses one of them for the sake of the prediction. 
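The behaviour described above, Ridge spreading weight across correlated predictors while Lasso tends to keep one of them and drop the other, can be reproduced on a toy pair of near-duplicate features. This is only an illustration with invented data and arbitrary alpha values, not the tuned models of the example:

import numpy as np
from sklearn.linear_model import Lasso, LinearRegression, Ridge

# Two nearly identical features; the target really depends on one of them.
rng = np.random.RandomState(0)
x1 = rng.normal(size=1000)
x2 = x1 + rng.normal(scale=0.01, size=1000)
X = np.column_stack([x1, x2])
y = 2 * x1 + rng.normal(scale=0.5, size=1000)

for name, est in [("OLS", LinearRegression()),
                  ("Ridge", Ridge(alpha=10.0)),
                  ("Lasso", Lasso(alpha=0.1))]:
    est.fit(X, y)
    print(f"{name:6s} coefficients: {est.coef_.round(2)}")

# Ridge typically splits the weight roughly evenly between x1 and x2,
# while Lasso tends to keep a single non-zero coefficient.

Which of the two correlated features Lasso keeps is essentially arbitrary, which is exactly why sparse coefficients have to be interpreted with care.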
- +# +# Lessons learned +# --------------- +# +# * Feature importance could be extrapolated from the coefficients only after +# having scaled them to the same unit of measure. +# * Coefficients in multiple linear models represent conditional dependencies +# between a given feature and the target. +# * Correlated features induce variability in the coefficients of linear +# models. +# * Different linear models respond differently to feature correlation and +# coefficients could significantly vary from one another. From 4e3bb4a4f1e89c746431ea0aa9f542ad8d82840f Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 28 Feb 2020 15:57:06 +0100 Subject: [PATCH 77/85] Fix linting error. --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 4c03faee12268..1ce0f4b4be745 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -631,7 +631,7 @@ # # Lessons learned # --------------- -# +# # * Feature importance could be extrapolated from the coefficients only after # having scaled them to the same unit of measure. # * Coefficients in multiple linear models represent conditional dependencies From fcff805923b9132c533efb8e5aed8ca59b176fec Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 10 Mar 2020 15:19:28 +0100 Subject: [PATCH 78/85] Fix OneHotEncoder. Address last comments. --- ...linear_model_coefficient_interpretation.py | 37 ++++++++++--------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 1ce0f4b4be745..07743ee8cbd9b 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -19,7 +19,8 @@ models, pointing at problems that arise when either the linear model is not appropriate to describe the dataset, or when features are correlated. -We will use data from the "Current Population Survey" from 1985 to predict +We will use data from the `"Current Population Survey" +_` from 1985 to predict wage as a function of various features such as experience, age, or education. .. contents:: @@ -61,7 +62,8 @@ X.head() ############################################################################## -# Our target for prediction: the wage +# Our target for prediction: the wage. 
+# Wages are described as floating-point number in :math:`k$` y = survey.target.values.ravel() survey.target.head() @@ -87,7 +89,7 @@ train_dataset = X_train.copy() train_dataset.insert(0, "WAGE", y_train) -sns.pairplot(train_dataset, kind='reg', diag_kind='kde') +_ = sns.pairplot(train_dataset, kind='reg', diag_kind='kde') ############################################################################## # Looking closely at the WAGE distribution it could be noticed that it has a @@ -130,7 +132,7 @@ numerical_columns = ['EDUCATION', 'EXPERIENCE', 'AGE'] preprocessor = make_column_transformer( - (OneHotEncoder(), categorical_columns), + (OneHotEncoder(drop='if_binary'), categorical_columns), remainder='passthrough' ) @@ -161,19 +163,19 @@ _ = model.fit(X_train, y_train) ############################################################################## -# Then we check the performance of the computed -# model using, for example, the median absolute error of the model and the R -# squared coefficient. +# Then we check the performance of the computed model plotting its predictions +# on the test set and computing, +# for example, the median absolute error of the model. from sklearn.metrics import median_absolute_error y_pred = model.predict(X_train) mae = median_absolute_error(y_train, y_pred) -string_score = 'MAE on training set: {0:.2f} $/hour'.format(mae) +string_score = f'MAE on training set: {mae:.2f} $/hour' y_pred = model.predict(X_test) mae = median_absolute_error(y_test, y_pred) -string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) +string_score += f'\nMAE on testing set: {mae:.2f} $/hour' fig, ax = plt.subplots(figsize=(5, 5)) plt.scatter(y_test, y_pred) ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="red") @@ -182,7 +184,7 @@ plt.ylabel('Model predictions') plt.xlabel('Truths') plt.xlim([0, 27]) -plt.ylim([0, 27]) +_ = plt.ylim([0, 27]) ############################################################################## # The model learnt is far from being a good model making accurate predictions: @@ -248,6 +250,7 @@ model.named_steps['columntransformer'].transform(X_train), columns=feature_names ) + X_train_preprocessed.std(axis=0).plot(kind='barh', figsize=(9, 7)) plt.title('Features std. 
dev.') plt.subplots_adjust(left=.3) @@ -389,7 +392,7 @@ from sklearn.preprocessing import StandardScaler preprocessor = make_column_transformer( - (OneHotEncoder(), categorical_columns), + (OneHotEncoder(drop='if_binary'), categorical_columns), (StandardScaler(), numerical_columns), remainder='passthrough' ) @@ -415,10 +418,10 @@ y_pred = model.predict(X_train) mae = median_absolute_error(y_train, y_pred) -string_score = 'MAE on training set: {0:.2f} $/hour'.format(mae) +string_score = f'MAE on training set: {mae:.2f} $/hour' y_pred = model.predict(X_test) mae = median_absolute_error(y_test, y_pred) -string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) +string_score += f'\nMAE on testing set: {mae:.2f} $/hour' fig, ax = plt.subplots(figsize=(6, 6)) plt.scatter(y_test, y_pred) ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="red") @@ -499,10 +502,10 @@ y_pred = model.predict(X_train) mae = median_absolute_error(y_train, y_pred) -string_score = 'MAE on training set: {0:.2f} $/hour'.format(mae) +string_score = f'MAE on training set: {mae:.2f} $/hour' y_pred = model.predict(X_test) mae = median_absolute_error(y_test, y_pred) -string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) +string_score += f'\nMAE on testing set: {mae:.2f} $/hour' fig, ax = plt.subplots(figsize=(6, 6)) plt.scatter(y_test, y_pred) @@ -596,10 +599,10 @@ y_pred = model.predict(X_train) mae = median_absolute_error(y_train, y_pred) -string_score = 'MAE on training set: {0:.2f} $/hour'.format(mae) +string_score = f'MAE on training set: {mae:.2f} $/hour' y_pred = model.predict(X_test) mae = median_absolute_error(y_test, y_pred) -string_score += '\nMAE on testing set: {0:.2f} $/hour'.format(mae) +string_score += f'\nMAE on testing set: {mae:.2f} $/hour' fig, ax = plt.subplots(figsize=(6, 6)) plt.scatter(y_test, y_pred) From 8e113869276d4eaf050fc6af6703b213b5a7c735 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 10 Mar 2020 15:28:00 +0100 Subject: [PATCH 79/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Guillaume Lemaitre --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 07743ee8cbd9b..f1b629a94fdc2 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -260,7 +260,7 @@ # feature would reduce all the coefficients to the same unit of measure. # As we will see :ref:`after` this is equivalent to normalize # numerical variables to their standard deviation, -# as :math:`y = \sum{coeff_i \times X_i} = +# as :math:`y = \sum{coef_i \times X_i} = # \sum{(coeff_i \times std_i) \times (X_i / std_i)}`. 
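The identity above, where rescaling a feature by its standard deviation while multiplying the matching coefficient by that same factor leaves the predicted values unchanged, is easy to verify numerically for an un-regularized fit. Here is a self-contained check on synthetic data, using ordinary least squares rather than the Ridge model of the example, since rescaling also changes the effective penalty of a regularized fit:

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
# Three features on deliberately different scales.
X = rng.normal(size=(200, 3)) * np.array([1.0, 10.0, 100.0])
y = X @ np.array([0.5, -0.2, 0.03]) + rng.normal(size=200)

std = X.std(axis=0)
raw = LinearRegression().fit(X, y)
scaled = LinearRegression().fit(X / std, y)

# Coefficients of the scaled fit equal the raw coefficients times the std...
print(np.allclose(scaled.coef_, raw.coef_ * std))
# ...and the predictions are identical, as the formula states.
print(np.allclose(scaled.predict(X / std), raw.predict(X)))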
# # In that way, we emphasize that the From 2e89d9c7ca035d91951a245b9c7f5e371fd2e09b Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 10 Mar 2020 15:30:30 +0100 Subject: [PATCH 80/85] Update examples/inspection/plot_linear_model_coefficient_interpretation.py Co-Authored-By: Guillaume Lemaitre --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index f1b629a94fdc2..060fe601fe025 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -261,7 +261,7 @@ # As we will see :ref:`after` this is equivalent to normalize # numerical variables to their standard deviation, # as :math:`y = \sum{coef_i \times X_i} = -# \sum{(coeff_i \times std_i) \times (X_i / std_i)}`. +# \sum{(coef_i \times std_i) \times (X_i / std_i)}`. # # In that way, we emphasize that the # greater the variance of a feature, the larger the weight of the corresponding From d07181d6d8d504d150f28b343b5f07def5c414e5 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 10 Mar 2020 18:45:34 +0100 Subject: [PATCH 81/85] Address comments. --- README.rst | 3 ++- doc/install.rst | 3 ++- .../plot_linear_model_coefficient_interpretation.py | 12 ++++++------ 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index fa0b665bbc8dd..171a19785dd73 100644 --- a/README.rst +++ b/README.rst @@ -58,7 +58,8 @@ scikit-learn 0.23 and later require Python 3.6 or newer. Scikit-learn plotting capabilities (i.e., functions start with ``plot_`` and classes end with "Display") require Matplotlib (>= 2.1.1). For running the examples Matplotlib >= 2.1.1 is required. A few examples require -scikit-image >= 0.13, a few examples require pandas >= 0.18.0. +scikit-image >= 0.13, a few examples require pandas >= 0.18.0, some examples +require seaborn >= 0.9.0. User installation ~~~~~~~~~~~~~~~~~ diff --git a/doc/install.rst b/doc/install.rst index 6a2b83605c1a6..9f8c277577a3c 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -134,7 +134,8 @@ it as ``scikit-learn[alldeps]``. Scikit-learn plotting capabilities (i.e., functions start with "plot\_" and classes end with "Display") require Matplotlib (>= 2.1.1). For running the examples Matplotlib >= 2.1.1 is required. A few examples require -scikit-image >= 0.13, a few examples require pandas >= 0.18.0. +scikit-image >= 0.13, a few examples require pandas >= 0.18.0, some examples +require seaborn >= 0.9.0. .. warning:: diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 060fe601fe025..0f96ced97cded 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -20,7 +20,7 @@ appropriate to describe the dataset, or when features are correlated. We will use data from the `"Current Population Survey" -_` from 1985 to predict +`_ from 1985 to predict wage as a function of various features such as experience, age, or education. .. 
contents:: @@ -343,7 +343,7 @@ plt.xlim(-0.4, 0.5) plt.ylim(-0.4, 0.5) plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) -plt.title('Co-variations of coefficients for AGE and EXPERIENCE across folds') +_ = plt.title('Co-variations of coefficients for AGE and EXPERIENCE across folds') ############################################################################### # Two regions are populated: when the EXPERIENCE coefficient is @@ -432,7 +432,7 @@ plt.ylabel('Model predictions') plt.xlabel('Truths') plt.xlim([0, 27]) -plt.ylim([0, 27]) +_ = plt.ylim([0, 27]) ############################################################################## # For the coefficient analysis, scaling is not needed this time. @@ -517,7 +517,7 @@ plt.ylabel('Model predictions') plt.xlabel('Truths') plt.xlim([0, 27]) -plt.ylim([0, 27]) +_ = plt.ylim([0, 27]) ############################################################################## # The ability to reproduce the data of the regularized model is similar to @@ -561,7 +561,7 @@ plt.xlim(-0.4, 0.5) plt.ylim(-0.4, 0.5) plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) -plt.title('Co-variations of coefficients for AGE and EXPERIENCE across folds') +_ = plt.title('Co-variations of coefficients for AGE and EXPERIENCE across folds') ############################################################################## # Linear models with sparse coefficients @@ -614,7 +614,7 @@ plt.ylabel('Model predictions') plt.xlabel('Truths') plt.xlim([0, 27]) -plt.ylim([0, 27]) +_ = plt.ylim([0, 27]) ############################################################################## # For our dataset, again the model is not very predictive. From 4f97c0600e71a8d9247e58f27aa1a7294475443d Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 10 Mar 2020 18:48:13 +0100 Subject: [PATCH 82/85] Add dataframe css. --- .../scikit-learn-modern/static/css/theme.css | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css index a77fb03e36f65..2b80d6fe2b762 100644 --- a/doc/themes/scikit-learn-modern/static/css/theme.css +++ b/doc/themes/scikit-learn-modern/static/css/theme.css @@ -963,6 +963,44 @@ div.sphx-glr-thumbcontainer { } } +/* Pandas dataframe css */ +/* Taken from: https://github.com/spatialaudio/nbsphinx/blob/fb3ba670fc1ba5f54d4c487573dbc1b4ecf7e9ff/src/nbsphinx.py#L587-L619 */ +/* FIXME: to be removed when sphinx-gallery >= 5.0 will be released */ + +table.dataframe { + border: none !important; + border-collapse: collapse; + border-spacing: 0; + border-color: transparent; + color: black; + font-size: 12px; + table-layout: fixed; +} +table.dataframe thead { + border-bottom: 1px solid black; + vertical-align: bottom; +} +table.dataframe tr, +table.dataframe th, +table.dataframe td { + text-align: right; + vertical-align: middle; + padding: 0.5em 0.5em; + line-height: normal; + white-space: normal; + max-width: none; + border: none; +} +table.dataframe th { + font-weight: bold; +} +table.dataframe tbody tr:nth-child(odd) { + background: #f5f5f5; +} +table.dataframe tbody tr:hover { + background: rgba(66, 165, 245, 0.2); +} + /* rellinks */ .sk-btn-rellink { From a2f29bc579d5b92ba508288ad6df89727bc6f049 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 10 Mar 2020 18:54:11 +0100 Subject: [PATCH 83/85] Fix lint errors. 
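For reference, the OneHotEncoder(drop='if_binary') option introduced by the "Fix OneHotEncoder" commit earlier in this series encodes two-category columns as a single 0/1 indicator instead of two redundant columns. A small sketch of the difference on a made-up frame, using the get_feature_names API that this example targets (more recent scikit-learn releases expose get_feature_names_out instead):

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

toy = pd.DataFrame({
    "SEX": ["male", "female", "female", "male"],              # binary
    "OCCUPATION": ["Sales", "Management", "Other", "Sales"],  # multi-category
})

enc = OneHotEncoder(drop="if_binary").fit(toy)
# The binary column keeps a single indicator; OCCUPATION keeps one per class.
print(enc.get_feature_names(["SEX", "OCCUPATION"]))
print(enc.transform(toy).toarray())

Keeping one column per binary feature also keeps the coefficient plot readable, with a single bar per binary variable.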
--- .../plot_linear_model_coefficient_interpretation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 0f96ced97cded..8287080e326ae 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -343,7 +343,8 @@ plt.xlim(-0.4, 0.5) plt.ylim(-0.4, 0.5) plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) -_ = plt.title('Co-variations of coefficients for AGE and EXPERIENCE across folds') +_ = plt.title('Co-variations of coefficients for AGE and EXPERIENCE \ + across folds') ############################################################################### # Two regions are populated: when the EXPERIENCE coefficient is @@ -561,7 +562,8 @@ plt.xlim(-0.4, 0.5) plt.ylim(-0.4, 0.5) plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) -_ = plt.title('Co-variations of coefficients for AGE and EXPERIENCE across folds') +_ = plt.title('Co-variations of coefficients for AGE and EXPERIENCE \ + across folds') ############################################################################## # Linear models with sparse coefficients From 2d2f6df658762aa9591154635db3be5bfaeb92be Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 10 Mar 2020 19:25:26 +0100 Subject: [PATCH 84/85] Update plot_linear_model_coefficient_interpretation.py --- .../plot_linear_model_coefficient_interpretation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 8287080e326ae..90c9176e504b0 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -343,8 +343,8 @@ plt.xlim(-0.4, 0.5) plt.ylim(-0.4, 0.5) plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) -_ = plt.title('Co-variations of coefficients for AGE and EXPERIENCE \ - across folds') +_ = plt.title('Co-variations of coefficients for AGE and EXPERIENCE ' + 'across folds') ############################################################################### # Two regions are populated: when the EXPERIENCE coefficient is @@ -562,8 +562,8 @@ plt.xlim(-0.4, 0.5) plt.ylim(-0.4, 0.5) plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) -_ = plt.title('Co-variations of coefficients for AGE and EXPERIENCE \ - across folds') +_ = plt.title('Co-variations of coefficients for AGE and EXPERIENCE ' + 'across folds') ############################################################################## # Linear models with sparse coefficients From ffd06d753aaca5896b81dfc8f367fe08f6b14a5e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 10 Mar 2020 19:27:24 +0100 Subject: [PATCH 85/85] PEP8 --- .../inspection/plot_linear_model_coefficient_interpretation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py index 90c9176e504b0..8fdad51a3ff7f 100644 --- a/examples/inspection/plot_linear_model_coefficient_interpretation.py +++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py @@ -344,7 +344,7 @@ plt.ylim(-0.4, 0.5) plt.scatter(coefs["AGE"], coefs["EXPERIENCE"]) _ = plt.title('Co-variations of coefficients for AGE and 
EXPERIENCE ' - 'across folds') + 'across folds') ############################################################################### # Two regions are populated: when the EXPERIENCE coefficient is