@@ -3,118 +3,160 @@
 Feature transformations with ensembles of trees
 ===============================================
 
-Transform your features into a higher dimensional, sparse space. Then
-train a linear model on these features.
+Transform your features into a higher dimensional, sparse space. Then train a
+linear model on these features.
 
-First fit an ensemble of trees (totally random trees, a random
-forest, or gradient boosted trees) on the training set. Then each leaf
-of each tree in the ensemble is assigned a fixed arbitrary feature
-index in a new feature space. These leaf indices are then encoded in a
-one-hot fashion.
+First fit an ensemble of trees (totally random trees, a random forest, or
+gradient boosted trees) on the training set. Then each leaf of each tree in the
+ensemble is assigned a fixed arbitrary feature index in a new feature space.
+These leaf indices are then encoded in a one-hot fashion.
 
-Each sample goes through the decisions of each tree of the ensemble
-and ends up in one leaf per tree. The sample is encoded by setting
-feature values for these leaves to 1 and the other feature values to 0.
+Each sample goes through the decisions of each tree of the ensemble and ends up
+in one leaf per tree. The sample is encoded by setting feature values for these
+leaves to 1 and the other feature values to 0.
 
 The resulting transformer has then learned a supervised, sparse,
 high-dimensional categorical embedding of the data.
-
 """
 
 # Author: Tim Head <betatim@gmail.com>
 #
 # License: BSD 3 clause
 
-import numpy as np
-np.random.seed(10)
+print(__doc__)
 
-import matplotlib.pyplot as plt
+from sklearn import set_config
+set_config(display='diagram')
+
+# %%
+# First, we will create a large dataset and split it into three sets:
+#
+# - a set to train the ensemble methods which are later used as a feature
+#   engineering transformer;
+# - a set to train the linear model;
+# - a set to test the linear model.
+#
+# It is important to split the data in such a way as to avoid overfitting by
+# data leakage.
 
 from sklearn.datasets import make_classification
-from sklearn.linear_model import LogisticRegression
-from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
-                              GradientBoostingClassifier)
-from sklearn.preprocessing import OneHotEncoder
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import roc_curve
+
+X, y = make_classification(n_samples=80000, random_state=10)
+
+X_full_train, X_test, y_full_train, y_test = train_test_split(
+    X, y, test_size=0.5, random_state=10)
+X_train_ensemble, X_train_linear, y_train_ensemble, y_train_linear = \
+    train_test_split(X_full_train, y_full_train, test_size=0.5,
+                     random_state=10)
+
+# %%
+# For each of the ensemble methods, we will use 10 estimators and a maximum
+# depth of 3 levels.
+
+n_estimators = 10
+max_depth = 3
+
+# %%
+# We will start by training the random forest and gradient boosting on the
+# separate training set.
+
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+
+random_forest = RandomForestClassifier(
+    n_estimators=n_estimators, max_depth=max_depth, random_state=10)
+random_forest.fit(X_train_ensemble, y_train_ensemble)
+
+gradient_boosting = GradientBoostingClassifier(
+    n_estimators=n_estimators, max_depth=max_depth, random_state=10)
+_ = gradient_boosting.fit(X_train_ensemble, y_train_ensemble)
+
+# %%
+# The :class:`~sklearn.ensemble.RandomTreesEmbedding` is an unsupervised method
+# and thus does not need to be trained independently.
+
+from sklearn.ensemble import RandomTreesEmbedding
+
+random_tree_embedding = RandomTreesEmbedding(
+    n_estimators=n_estimators, max_depth=max_depth, random_state=0)
+
+# %%
+# Now, we will create three pipelines that will use the above embedding as
+# a preprocessing stage.
+#
+# The random trees embedding can be directly pipelined with the logistic
+# regression because it is a standard scikit-learn transformer.
+
+from sklearn.linear_model import LogisticRegression
 from sklearn.pipeline import make_pipeline
 
-n_estimator = 10
-X, y = make_classification(n_samples=80000)
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
-
-# It is important to train the ensemble of trees on a different subset
-# of the training data than the linear regression model to avoid
-# overfitting, in particular if the total number of leaves is
-# similar to the number of training samples
-X_train, X_train_lr, y_train, y_train_lr = train_test_split(
-    X_train, y_train, test_size=0.5)
-
-# Unsupervised transformation based on totally random trees
-rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
-                          random_state=0)
-
-rt_lm = LogisticRegression(max_iter=1000)
-pipeline = make_pipeline(rt, rt_lm)
-pipeline.fit(X_train, y_train)
-y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
-fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)
-
-# Supervised transformation based on random forests
-rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
-rf_enc = OneHotEncoder()
-rf_lm = LogisticRegression(max_iter=1000)
-rf.fit(X_train, y_train)
-rf_enc.fit(rf.apply(X_train))
-rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
-
-y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
-fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)
-
-# Supervised transformation based on gradient boosted trees
-grd = GradientBoostingClassifier(n_estimators=n_estimator)
-grd_enc = OneHotEncoder()
-grd_lm = LogisticRegression(max_iter=1000)
-grd.fit(X_train, y_train)
-grd_enc.fit(grd.apply(X_train)[:, :, 0])
-grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
-
-y_pred_grd_lm = grd_lm.predict_proba(
-    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
-fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
-
-# The gradient boosted model by itself
-y_pred_grd = grd.predict_proba(X_test)[:, 1]
-fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)
-
-# The random forest model by itself
-y_pred_rf = rf.predict_proba(X_test)[:, 1]
-fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
-
-plt.figure(1)
-plt.plot([0, 1], [0, 1], 'k--')
-plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')
-plt.plot(fpr_rf, tpr_rf, label='RF')
-plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
-plt.plot(fpr_grd, tpr_grd, label='GBT')
-plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
-plt.xlabel('False positive rate')
-plt.ylabel('True positive rate')
-plt.title('ROC curve')
-plt.legend(loc='best')
-plt.show()
-
-plt.figure(2)
-plt.xlim(0, 0.2)
-plt.ylim(0.8, 1)
-plt.plot([0, 1], [0, 1], 'k--')
-plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')
-plt.plot(fpr_rf, tpr_rf, label='RF')
-plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
-plt.plot(fpr_grd, tpr_grd, label='GBT')
-plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
-plt.xlabel('False positive rate')
-plt.ylabel('True positive rate')
-plt.title('ROC curve (zoomed in at top left)')
-plt.legend(loc='best')
-plt.show()
+rt_model = make_pipeline(
+    random_tree_embedding, LogisticRegression(max_iter=1000))
+rt_model.fit(X_train_linear, y_train_linear)
+
+# %%
+# Then, we can pipeline the random forest or gradient boosting with a logistic
+# regression. However, the feature transformation happens by calling the method
+# `apply`, while a scikit-learn pipeline expects a call to `transform`.
+# Therefore, we wrap the call to `apply` within a `FunctionTransformer`.
+
+from sklearn.preprocessing import FunctionTransformer
+from sklearn.preprocessing import OneHotEncoder
+
+
+def rf_apply(X, model):
+    return model.apply(X)
+
+
+rf_leaves_yielder = FunctionTransformer(
+    rf_apply, kw_args={"model": random_forest})
+
+rf_model = make_pipeline(
+    rf_leaves_yielder, OneHotEncoder(handle_unknown="ignore"),
+    LogisticRegression(max_iter=1000))
+rf_model.fit(X_train_linear, y_train_linear)
+
+
+# %%
+def gbdt_apply(X, model):
+    return model.apply(X)[:, :, 0]
+
+
+gbdt_leaves_yielder = FunctionTransformer(
+    gbdt_apply, kw_args={"model": gradient_boosting})
+
+gbdt_model = make_pipeline(
+    gbdt_leaves_yielder, OneHotEncoder(handle_unknown="ignore"),
+    LogisticRegression(max_iter=1000))
+gbdt_model.fit(X_train_linear, y_train_linear)
+
+# %%
+# We can finally show the different ROC curves for all the models.
+
+import matplotlib.pyplot as plt
+from sklearn.metrics import plot_roc_curve
+
+fig, ax = plt.subplots()
+
+models = [
+    ("RT embedding -> LR", rt_model),
+    ("RF", random_forest),
+    ("RF embedding -> LR", rf_model),
+    ("GBDT", gradient_boosting),
+    ("GBDT embedding -> LR", gbdt_model),
+]
+
+model_displays = {}
+for name, pipeline in models:
+    model_displays[name] = plot_roc_curve(
+        pipeline, X_test, y_test, ax=ax, name=name)
+_ = ax.set_title('ROC curve')
+
+# %%
+fig, ax = plt.subplots()
+for name, pipeline in models:
+    model_displays[name].plot(ax=ax)
+
+ax.set_xlim(0, 0.2)
+ax.set_ylim(0.8, 1)
+_ = ax.set_title('ROC curve (zoomed in at top left)')
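
For readers skimming the diff, the following is a minimal, self-contained sketch of the leaf one-hot encoding idea that both versions of the script implement. It is not part of the committed example; the dataset size, estimator parameters, and variable names here are illustrative only, while the scikit-learn calls (`RandomForestClassifier.apply`, `OneHotEncoder`, `LogisticRegression`) are the same ones the example relies on.

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import OneHotEncoder

    # Illustrative data and split: one part for the trees, one for the linear model.
    X, y = make_classification(n_samples=1000, random_state=0)
    X_trees, X_linear, y_trees, y_linear = train_test_split(
        X, y, test_size=0.5, random_state=0)

    forest = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=0)
    forest.fit(X_trees, y_trees)

    # apply() returns an array of shape (n_samples, n_estimators) holding the
    # index of the leaf reached by each sample in each tree.
    leaves = forest.apply(X_linear)

    # One-hot encode the leaf indices: each observed leaf becomes one sparse feature.
    encoder = OneHotEncoder(handle_unknown="ignore")
    embedding = encoder.fit_transform(leaves)

    # Train the linear model on the sparse leaf embedding.
    linear_model = LogisticRegression(max_iter=1000)
    linear_model.fit(embedding, y_linear)
    print(embedding.shape)  # (500, total number of leaves observed across the trees)

The committed example wraps exactly this `apply` step in a `FunctionTransformer` so that it can live inside a pipeline next to the encoder and the logistic regression.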