 #
 # Here, we use the modern
 # :class:`~sklearn.ensemble.HistGradientBoostingRegressor` with native support
-# for categorical features. Therefore, we only do minimal ordinal encoding for
-# the categorical variables and then
-# let the model know that it should treat those as categorical variables by
-# using a dedicated tree splitting rule. Since we use an ordinal encoder, we
-# pass the list of categorical values explicitly to use a logical order when
-# encoding the categories as integers instead of the lexicographical order.
-# This also has the added benefit of preventing any issue with unknown
-# categories when using cross-validation.
+# for categorical features. Therefore, we only need to set
+# `categorical_features="from_dtype"` such that features with categorical dtype
+# are considered categorical features. For reference, we extract the categorical
+# features from the dataframe based on the dtype. The internal trees use a dedicated
+# tree splitting rule for these features.
 #
 # The numerical variables need no preprocessing and, for the sake of simplicity,
 # we only try the default hyper-parameters for this model:
 from sklearn.compose import ColumnTransformer
 from sklearn.ensemble import HistGradientBoostingRegressor
 from sklearn.model_selection import cross_validate
 from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import OrdinalEncoder

-categorical_columns = [
-    "weather",
-    "season",
-    "holiday",
-    "workingday",
-]
-categories = [
-    ["clear", "misty", "rain"],
-    ["spring", "summer", "fall", "winter"],
-    ["False", "True"],
-    ["False", "True"],
-]
-ordinal_encoder = OrdinalEncoder(categories=categories)
-
-
-gbrt_pipeline = make_pipeline(
-    ColumnTransformer(
-        transformers=[
-            ("categorical", ordinal_encoder, categorical_columns),
-        ],
-        remainder="passthrough",
-        # Use short feature names to make it easier to specify the categorical
-        # variables in the HistGradientBoostingRegressor in the next
-        # step of the pipeline.
-        verbose_feature_names_out=False,
-    ),
-    HistGradientBoostingRegressor(
-        max_iter=300,
-        early_stopping=True,
-        validation_fraction=0.1,
-        categorical_features=categorical_columns,
-        random_state=42,
-    ),
-).set_output(transform="pandas")
+gbrt = HistGradientBoostingRegressor(categorical_features="from_dtype", random_state=42)
+categorical_columns = X.columns[X.dtypes == "category"]
+print("Categorical features:", categorical_columns.tolist())

 # %%
 #
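For context, a minimal sketch (not part of this diff) of how `categorical_features="from_dtype"` picks up pandas categorical dtypes; the toy frame and target values below are invented for illustration and assume scikit-learn >= 1.4, where this option was introduced:

import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor

# Toy frame: "weather" carries a pandas "category" dtype, "temp" is numeric.
X_demo = pd.DataFrame(
    {
        "weather": pd.Categorical(["clear", "misty", "rain", "clear"]),
        "temp": [0.30, 0.55, 0.20, 0.45],
    }
)
y_demo = [120, 60, 15, 95]

# With categorical_features="from_dtype", columns with a categorical dtype are
# split with the dedicated categorical rule; numeric columns need no encoding.
model = HistGradientBoostingRegressor(categorical_features="from_dtype")
model.fit(X_demo, y_demo)
print(model.predict(X_demo))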
|
@@ -256,14 +221,7 @@ def evaluate(model, X, y, cv, model_prop=None, model_step=None):
 )


-evaluate(
-    gbrt_pipeline,
-    X,
-    y,
-    cv=ts_cv,
-    model_prop="n_iter_",
-    model_step="histgradientboostingregressor",
-)
+evaluate(gbrt, X, y, cv=ts_cv, model_prop="n_iter_")

 # %%
 # We see that we set `max_iter` large enough such that early stopping took place.
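For context, the `evaluate` helper is defined earlier in this file (see the hunk header above); below is a rough sketch (not part of this diff) of the kind of inspection `model_prop="n_iter_"` enables, written with plain `cross_validate` rather than the helper's actual reporting:

from sklearn.model_selection import cross_validate

# Refit the model on each time-based split and keep the fitted estimators so
# that the number of boosting iterations (after early stopping, when it is
# active) can be read from each one.
cv_results = cross_validate(gbrt, X, y, cv=ts_cv, return_estimator=True)
n_iter_per_fold = [est.n_iter_ for est in cv_results["estimator"]]
print("Boosting iterations per fold:", n_iter_per_fold)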
|
@@ -735,8 +693,8 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
 # Let us now have a qualitative look at the predictions of the kernel models
 # and of the gradient boosted trees that should be able to better model
 # non-linear interactions between features:
-gbrt_pipeline.fit(X.iloc[train_0], y.iloc[train_0])
-gbrt_predictions = gbrt_pipeline.predict(X.iloc[test_0])
+gbrt.fit(X.iloc[train_0], y.iloc[train_0])
+gbrt_predictions = gbrt.predict(X.iloc[test_0])

 one_hot_poly_pipeline.fit(X.iloc[train_0], y.iloc[train_0])
 one_hot_poly_predictions = one_hot_poly_pipeline.predict(X.iloc[test_0])
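For context, a sketch (not part of this diff) of how the two prediction arrays could be eyeballed against the observed target on the first test split; the 96-hour window and plot styling are arbitrary choices, not taken from the example:

import matplotlib.pyplot as plt

# Compare the last 96 hourly samples (roughly four days) of the test split.
last_hours = slice(-96, None)
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(y.iloc[test_0].to_numpy()[last_hours], "x-", label="actual demand")
ax.plot(gbrt_predictions[last_hours], "x-", label="gradient boosted trees")
ax.plot(one_hot_poly_predictions[last_hours], "x-", label="one-hot + polynomial kernel")
ax.set_xlabel("hour in the test window")
_ = ax.legend()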
|
|