EXA use pipeline in feature engineering example with RF,RT,GBDT (#18835) · scikit-learn/scikit-learn@07508a8 · GitHub

Commit 07508a8

EXA use pipeline in feature engineering example with RF,RT,GBDT (#18835)
Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
1 parent c0c7ca5 commit 07508a8

File tree

1 file changed: +140 −98 lines

examples/ensemble/plot_feature_transformation.py

Lines changed: 140 additions & 98 deletions
@@ -3,118 +3,160 @@
 Feature transformations with ensembles of trees
 ===============================================
 
-Transform your features into a higher dimensional, sparse space. Then
-train a linear model on these features.
+Transform your features into a higher dimensional, sparse space. Then train a
+linear model on these features.
 
-First fit an ensemble of trees (totally random trees, a random
-forest, or gradient boosted trees) on the training set. Then each leaf
-of each tree in the ensemble is assigned a fixed arbitrary feature
-index in a new feature space. These leaf indices are then encoded in a
-one-hot fashion.
+First fit an ensemble of trees (totally random trees, a random forest, or
+gradient boosted trees) on the training set. Then each leaf of each tree in the
+ensemble is assigned a fixed arbitrary feature index in a new feature space.
+These leaf indices are then encoded in a one-hot fashion.
 
-Each sample goes through the decisions of each tree of the ensemble
-and ends up in one leaf per tree. The sample is encoded by setting
-feature values for these leaves to 1 and the other feature values to 0.
+Each sample goes through the decisions of each tree of the ensemble and ends up
+in one leaf per tree. The sample is encoded by setting feature values for these
+leaves to 1 and the other feature values to 0.
 
 The resulting transformer has then learned a supervised, sparse,
 high-dimensional categorical embedding of the data.
-
 """
 
 # Author: Tim Head <betatim@gmail.com>
 #
 # License: BSD 3 clause
 
-import numpy as np
-np.random.seed(10)
+print(__doc__)
 
-import matplotlib.pyplot as plt
+from sklearn import set_config
+set_config(display='diagram')
+
+# %%
+# First, we will create a large dataset and split it into three sets:
+#
+# - a set to train the ensemble methods which are later used as a feature
+#   engineering transformer;
+# - a set to train the linear model;
+# - a set to test the linear model.
+#
+# It is important to split the data in such a way to avoid overfitting by
+# leaking data.
 
 from sklearn.datasets import make_classification
-from sklearn.linear_model import LogisticRegression
-from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
-                              GradientBoostingClassifier)
-from sklearn.preprocessing import OneHotEncoder
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import roc_curve
+
+X, y = make_classification(n_samples=80000, random_state=10)
+
+X_full_train, X_test, y_full_train, y_test = train_test_split(
+    X, y, test_size=0.5, random_state=10)
+X_train_ensemble, X_train_linear, y_train_ensemble, y_train_linear = \
+    train_test_split(X_full_train, y_full_train, test_size=0.5,
+                     random_state=10)
+
+# %%
+# For each of the ensemble methods, we will use 10 estimators and a maximum
+# depth of 3 levels.
+
+n_estimators = 10
+max_depth = 3
+
+# %%
+# First, we will start by training the random forest and gradient boosting on
+# the separated training set
+
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+
+random_forest = RandomForestClassifier(
+    n_estimators=n_estimators, max_depth=max_depth, random_state=10)
+random_forest.fit(X_train_ensemble, y_train_ensemble)
+
+gradient_boosting = GradientBoostingClassifier(
+    n_estimators=n_estimators, max_depth=max_depth, random_state=10)
+_ = gradient_boosting.fit(X_train_ensemble, y_train_ensemble)
+
+# %%
+# The :class:`~sklearn.ensemble.RandomTreesEmbedding` is an unsupervised method
+# and thus does not need to be trained independently.
+
+from sklearn.ensemble import RandomTreesEmbedding
+
+random_tree_embedding = RandomTreesEmbedding(
+    n_estimators=n_estimators, max_depth=max_depth, random_state=0)
+
+# %%
+# Now, we will create three pipelines that will use the above embedding as
+# a preprocessing stage.
+#
+# The random trees embedding can be directly pipelined with the logistic
+# regression because it is a standard scikit-learn transformer.
+
+from sklearn.linear_model import LogisticRegression
 from sklearn.pipeline import make_pipeline
 
-n_estimator = 10
-X, y = make_classification(n_samples=80000)
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
-
-# It is important to train the ensemble of trees on a different subset
-# of the training data than the linear regression model to avoid
-# overfitting, in particular if the total number of leaves is
-# similar to the number of training samples
-X_train, X_train_lr, y_train, y_train_lr = train_test_split(
-    X_train, y_train, test_size=0.5)
-
-# Unsupervised transformation based on totally random trees
-rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
-                          random_state=0)
-
-rt_lm = LogisticRegression(max_iter=1000)
-pipeline = make_pipeline(rt, rt_lm)
-pipeline.fit(X_train, y_train)
-y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
-fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)
-
-# Supervised transformation based on random forests
-rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
-rf_enc = OneHotEncoder()
-rf_lm = LogisticRegression(max_iter=1000)
-rf.fit(X_train, y_train)
-rf_enc.fit(rf.apply(X_train))
-rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
-
-y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
-fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)
-
-# Supervised transformation based on gradient boosted trees
-grd = GradientBoostingClassifier(n_estimators=n_estimator)
-grd_enc = OneHotEncoder()
-grd_lm = LogisticRegression(max_iter=1000)
-grd.fit(X_train, y_train)
-grd_enc.fit(grd.apply(X_train)[:, :, 0])
-grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
-
-y_pred_grd_lm = grd_lm.predict_proba(
-    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
-fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
-
-# The gradient boosted model by itself
-y_pred_grd = grd.predict_proba(X_test)[:, 1]
-fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)
-
-# The random forest model by itself
-y_pred_rf = rf.predict_proba(X_test)[:, 1]
-fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
-
-plt.figure(1)
-plt.plot([0, 1], [0, 1], 'k--')
-plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')
-plt.plot(fpr_rf, tpr_rf, label='RF')
-plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
-plt.plot(fpr_grd, tpr_grd, label='GBT')
-plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
-plt.xlabel('False positive rate')
-plt.ylabel('True positive rate')
-plt.title('ROC curve')
-plt.legend(loc='best')
-plt.show()
-
-plt.figure(2)
-plt.xlim(0, 0.2)
-plt.ylim(0.8, 1)
-plt.plot([0, 1], [0, 1], 'k--')
-plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')
-plt.plot(fpr_rf, tpr_rf, label='RF')
-plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
-plt.plot(fpr_grd, tpr_grd, label='GBT')
-plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
-plt.xlabel('False positive rate')
-plt.ylabel('True positive rate')
-plt.title('ROC curve (zoomed in at top left)')
-plt.legend(loc='best')
-plt.show()
+rt_model = make_pipeline(
+    random_tree_embedding, LogisticRegression(max_iter=1000))
+rt_model.fit(X_train_linear, y_train_linear)
+
+# %%
+# Then, we can pipeline random forest or gradient boosting with a logistic
+# regression. However, the feature transformation will happen by calling the
+# method `apply`. The pipeline in scikit-learn expects a call to `transform`.
+# Therefore, we wrapped the call to `apply` within a `FunctionTransformer`.
+
+from sklearn.preprocessing import FunctionTransformer
+from sklearn.preprocessing import OneHotEncoder
+
+
+def rf_apply(X, model):
+    return model.apply(X)
+
+
+rf_leaves_yielder = FunctionTransformer(
+    rf_apply, kw_args={"model": random_forest})
+
+rf_model = make_pipeline(
+    rf_leaves_yielder, OneHotEncoder(handle_unknown="ignore"),
+    LogisticRegression(max_iter=1000))
+rf_model.fit(X_train_linear, y_train_linear)
+
+
+# %%
+def gbdt_apply(X, model):
+    return model.apply(X)[:, :, 0]
+
+
+gbdt_leaves_yielder = FunctionTransformer(
+    gbdt_apply, kw_args={"model": gradient_boosting})
+
+gbdt_model = make_pipeline(
+    gbdt_leaves_yielder, OneHotEncoder(handle_unknown="ignore"),
+    LogisticRegression(max_iter=1000))
+gbdt_model.fit(X_train_linear, y_train_linear)
+
+# %%
+# We can finally show the different ROC curves for all the models.
+
+import matplotlib.pyplot as plt
+from sklearn.metrics import plot_roc_curve
+
+fig, ax = plt.subplots()
+
+models = [
+    ("RT embedding -> LR", rt_model),
+    ("RF", random_forest),
+    ("RF embedding -> LR", rf_model),
+    ("GBDT", gradient_boosting),
+    ("GBDT embedding -> LR", gbdt_model),
+]
+
+model_displays = {}
+for name, pipeline in models:
+    model_displays[name] = plot_roc_curve(
+        pipeline, X_test, y_test, ax=ax, name=name)
+_ = ax.set_title('ROC curve')
+
+# %%
+fig, ax = plt.subplots()
+for name, pipeline in models:
+    model_displays[name].plot(ax=ax)
+
+ax.set_xlim(0, 0.2)
+ax.set_ylim(0.8, 1)
+_ = ax.set_title('ROC curve (zoomed in at top left)')
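The key pattern introduced by this commit is wrapping an ensemble's `apply` method in a `FunctionTransformer`, so that leaf indices can be one-hot encoded and fed to a linear model inside a single pipeline. Below is a minimal, self-contained sketch of that pattern for the gradient boosting case; the dataset size, variable names, and split sizes are illustrative choices and are not taken from the committed example.

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

# Illustrative data and splits (sizes are arbitrary, not from the example).
X, y = make_classification(n_samples=2000, random_state=0)
X_ens, X_rest, y_ens, y_rest = train_test_split(X, y, test_size=0.5, random_state=0)
X_lin, X_val, y_lin, y_val = train_test_split(X_rest, y_rest, test_size=0.5, random_state=0)

# Fit the ensemble on its own split so the linear model does not see leaked data.
gbdt = GradientBoostingClassifier(n_estimators=10, max_depth=3, random_state=0)
gbdt.fit(X_ens, y_ens)


def gbdt_apply(X, model):
    # `apply` returns leaf indices with shape (n_samples, n_estimators, 1)
    # for binary classification; drop the trailing axis for the encoder.
    return model.apply(X)[:, :, 0]


# FunctionTransformer exposes `apply` through the `transform` API that Pipeline expects.
model = make_pipeline(
    FunctionTransformer(gbdt_apply, kw_args={"model": gbdt}),
    OneHotEncoder(handle_unknown="ignore"),
    LogisticRegression(max_iter=1000),
)
model.fit(X_lin, y_lin)

# Held-out ROC AUC of the leaf-embedding + linear model.
print(roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))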

0 commit comments
