Discrete versus Real AdaBoost
=============================

This notebook is based on Figure 10.2 from Hastie et al. 2009 [1]_ and
illustrates the difference in performance between the discrete SAMME [2]_
boosting algorithm and the real SAMME.R boosting algorithm. Both algorithms are
evaluated on a binary classification task where the target Y is a non-linear
function of 10 input features.

.. [1] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical
    Learning Ed. 2", Springer, 2009.

.. [2] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost",
    Statistics and Its Interface, 2009.

"""

# %%
# Preparing the data and baseline models
# --------------------------------------
# We start by generating the binary classification dataset
# used in Hastie et al. 2009, Example 10.2.

# Authors: Peter Prettenhofer <peter.prettenhofer@gmail.com>,
#          Noel Dawe <noel.dawe@gmail.com>
#
# License: BSD 3 clause

from sklearn import datasets

X, y = datasets.make_hastie_10_2(n_samples=12_000, random_state=1)

# %%
# Now, we set the hyperparameters for our AdaBoost classifiers.
# Be aware that a learning rate of 1.0 may not be optimal for both SAMME and SAMME.R.

n_estimators = 400
learning_rate = 1.0
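
# %%
# The check below is an illustrative aside, not part of the original example: one
# way to probe the "may not be optimal" caveat is to compare a few candidate
# learning rates by cross-validation on a small discrete AdaBoost model. The
# candidate values and the reduced `n_estimators=50` are arbitrary choices made
# only to keep this sketch cheap.

from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

for lr in (0.5, 1.0):
    probe = AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=1),
        n_estimators=50,
        learning_rate=lr,
        algorithm="SAMME",
    )
    # Mean 3-fold cross-validated accuracy for this candidate learning rate
    score = cross_val_score(probe, X, y, cv=3).mean()
    print(f"learning_rate={lr}: mean CV accuracy = {score:.3f}")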

# %%
# We split the data into a training and a test set.
# Then, we train our baseline classifiers, a `DecisionTreeClassifier` with `max_depth=9`
# and a "stump" `DecisionTreeClassifier` with `max_depth=1`, and compute their test errors.

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=2_000, shuffle=False
)

dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
dt_stump.fit(X_train, y_train)
dt_stump_err = 1.0 - dt_stump.score(X_test, y_test)

dt = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1)
dt.fit(X_train, y_train)
dt_err = 1.0 - dt.score(X_test, y_test)

# %%
# AdaBoost with discrete SAMME and real SAMME.R
# ---------------------------------------------
# We now define the discrete and real AdaBoost classifiers
# and fit them to the training set.

from sklearn.ensemble import AdaBoostClassifier

ada_discrete = AdaBoostClassifier(
    base_estimator=dt_stump,
    learning_rate=learning_rate,
    n_estimators=n_estimators,
    algorithm="SAMME",
)
ada_discrete.fit(X_train, y_train)

# %%

ada_real = AdaBoostClassifier(
    base_estimator=dt_stump,
    learning_rate=learning_rate,
    n_estimators=n_estimators,
    algorithm="SAMME.R",
)
ada_real.fit(X_train, y_train)

# %%
# Now, let's compute the train and test errors of the discrete and
# real AdaBoost classifiers after each new stump is added to the
# ensemble, up to `n_estimators` stumps.

import numpy as np
from sklearn.metrics import zero_one_loss

ada_discrete_err = np.zeros((n_estimators,))
for i, y_pred in enumerate(ada_discrete.staged_predict(X_test)):
    ada_discrete_err[i] = zero_one_loss(y_pred, y_test)

ada_discrete_err_train = np.zeros((n_estimators,))
for i, y_pred in enumerate(ada_discrete.staged_predict(X_train)):
    ada_discrete_err_train[i] = zero_one_loss(y_pred, y_train)

ada_real_err = np.zeros((n_estimators,))
for i, y_pred in enumerate(ada_real.staged_predict(X_test)):
    ada_real_err[i] = zero_one_loss(y_pred, y_test)

ada_real_err_train = np.zeros((n_estimators,))
for i, y_pred in enumerate(ada_real.staged_predict(X_train)):
    ada_real_err_train[i] = zero_one_loss(y_pred, y_train)

# %%
# Plotting the results
# --------------------
# Finally, we plot the train and test errors of our baselines
# and of the discrete and real AdaBoost classifiers.

import matplotlib.pyplot as plt
import seaborn as sns

fig = plt.figure()
ax = fig.add_subplot(111)

ax.plot([1, n_estimators], [dt_stump_err] * 2, "k-", label="Decision Stump Error")
ax.plot([1, n_estimators], [dt_err] * 2, "k--", label="Decision Tree Error")

colors = sns.color_palette("colorblind")

ax.plot(
    np.arange(n_estimators) + 1,
    ada_discrete_err,
    label="Discrete AdaBoost Test Error",
    color=colors[0],
)
ax.plot(
    np.arange(n_estimators) + 1,
    ada_discrete_err_train,
    label="Discrete AdaBoost Train Error",
    color=colors[1],
)
ax.plot(
    np.arange(n_estimators) + 1,
    ada_real_err,
    label="Real AdaBoost Test Error",
    color=colors[2],
)
ax.plot(
    np.arange(n_estimators) + 1,
    ada_real_err_train,
    label="Real AdaBoost Train Error",
    color=colors[4],
)

ax.set_ylim((0.0, 0.5))
ax.set_xlabel("Number of weak learners")
ax.set_ylabel("error rate")

leg = ax.legend(loc="upper right", fancybox=True)
leg.get_frame().set_alpha(0.7)

plt.show()

# %%
#
# Concluding remarks
# ------------------
#
# We observe that the error rate of real AdaBoost is lower than that of
# discrete AdaBoost on both the train and the test sets.
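
# %%
# As a quick supplementary check of this observation (not part of the original
# figure), we can print the final train and test errors reached after all
# boosting iterations; the exact values depend on the random seed and on the
# train/test split chosen above.

print(f"Discrete AdaBoost final train error: {ada_discrete_err_train[-1]:.3f}")
print(f"Discrete AdaBoost final test error:  {ada_discrete_err[-1]:.3f}")
print(f"Real AdaBoost final train error:     {ada_real_err_train[-1]:.3f}")
print(f"Real AdaBoost final test error:      {ada_real_err[-1]:.3f}")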