ENH Adds native pandas categorical support to gradient boosting (#26411) · scikit-learn/scikit-learn@4cf13d2 · GitHub

Commit 4cf13d2

thomasjpfan, ogrisel, and glemaitre authored
ENH Adds native pandas categorical support to gradient boosting (#26411)
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
1 parent 5d83a2e commit 4cf13d2

File tree

5 files changed: +338 −113 lines

doc/whats_new/v1.4.rst

Lines changed: 6 additions & 0 deletions
@@ -295,6 +295,12 @@ Changelog
   for classification or `squared_error`, `friedman_mse`, or `poisson`
   for regression. :pr:`26391` by `Thomas Fan`_.
 
+- |MajorFeature| :class:`ensemble.HistGradientBoostingClassifier` and
+  :class:`ensemble.HistGradientBoostingRegressor` support
+  `categorical_features="from_dtype"`, which treats columns with Pandas Categorical
+  dtype as categories in the algorithm. `categorical_features="from_dtype"` will become
+  the default in v1.6. :pr:`26411` by `Thomas Fan`_.
+
 - |Feature| :class:`ensemble.RandomForestClassifier`,
   :class:`ensemble.RandomForestRegressor`, :class:`ensemble.ExtraTreesClassifier`
   and :class:`ensemble.ExtraTreesRegressor` now support monotonic constraints,
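
A minimal usage sketch of the option described in this changelog entry (the toy dataframe below is made up for illustration and is not part of the commit): columns stored with a pandas Categorical dtype are detected automatically once `categorical_features="from_dtype"` is passed, while numeric columns keep being treated as continuous.

import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.default_rng(0)
X = pd.DataFrame(
    {
        # Stored as a pandas Categorical column, so it is picked up "from dtype".
        "weather": pd.Categorical(rng.choice(["clear", "misty", "rain"], size=100)),
        # Plain float column, treated as a continuous feature.
        "temperature": rng.normal(20.0, 5.0, size=100),
    }
)
y = rng.normal(size=100)

model = HistGradientBoostingRegressor(categorical_features="from_dtype", random_state=0)
model.fit(X, y)
print(model.predict(X.head()))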

examples/applications/plot_cyclical_feature_engineering.py

Lines changed: 11 additions & 53 deletions
@@ -172,57 +172,22 @@
 #
 # Here, we use the modern
 # :class:`~sklearn.ensemble.HistGradientBoostingRegressor` with native support
-# for categorical features. Therefore, we only do minimal ordinal encoding for
-# the categorical variables and then
-# let the model know that it should treat those as categorical variables by
-# using a dedicated tree splitting rule. Since we use an ordinal encoder, we
-# pass the list of categorical values explicitly to use a logical order when
-# encoding the categories as integers instead of the lexicographical order.
-# This also has the added benefit of preventing any issue with unknown
-# categories when using cross-validation.
+# for categorical features. Therefore, we only need to set
+# `categorical_features="from_dtype"` such that features with categorical dtype
+# are considered categorical features. For reference, we extract the categorical
+# features from the dataframe based on the dtype. The internal trees use a dedicated
+# tree splitting rule for these features.
 #
 # The numerical variables need no preprocessing and, for the sake of simplicity,
 # we only try the default hyper-parameters for this model:
 from sklearn.compose import ColumnTransformer
 from sklearn.ensemble import HistGradientBoostingRegressor
 from sklearn.model_selection import cross_validate
 from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import OrdinalEncoder
 
-categorical_columns = [
-    "weather",
-    "season",
-    "holiday",
-    "workingday",
-]
-categories = [
-    ["clear", "misty", "rain"],
-    ["spring", "summer", "fall", "winter"],
-    ["False", "True"],
-    ["False", "True"],
-]
-ordinal_encoder = OrdinalEncoder(categories=categories)
-
-
-gbrt_pipeline = make_pipeline(
-    ColumnTransformer(
-        transformers=[
-            ("categorical", ordinal_encoder, categorical_columns),
-        ],
-        remainder="passthrough",
-        # Use short feature names to make it easier to specify the categorical
-        # variables in the HistGradientBoostingRegressor in the next
-        # step of the pipeline.
-        verbose_feature_names_out=False,
-    ),
-    HistGradientBoostingRegressor(
-        max_iter=300,
-        early_stopping=True,
-        validation_fraction=0.1,
-        categorical_features=categorical_columns,
-        random_state=42,
-    ),
-).set_output(transform="pandas")
+gbrt = HistGradientBoostingRegressor(categorical_features="from_dtype", random_state=42)
+categorical_columns = X.columns[X.dtypes == "category"]
+print("Categorical features:", categorical_columns.tolist())
 
 # %%
 #
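
For context on the dtype-based detection used in the added lines above, here is a small standalone sketch (a toy dataframe, not the bike-sharing data from this example): only columns that actually carry a pandas Categorical dtype are selected, so string columns stored with the plain object dtype need an explicit cast via `astype("category")`.

import pandas as pd

X = pd.DataFrame(
    {
        "weather": ["clear", "misty", "rain", "clear"],  # plain object dtype
        "temperature": [20.1, 15.3, 10.2, 22.4],
    }
)
# Without this cast, "weather" would not be detected by "from_dtype".
X["weather"] = X["weather"].astype("category")

# Same dtype-based selection as in the diff above.
categorical_columns = X.columns[X.dtypes == "category"]
print("Categorical features:", categorical_columns.tolist())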
@@ -256,14 +221,7 @@ def evaluate(model, X, y, cv, model_prop=None, model_step=None):
 )
 
 
-evaluate(
-    gbrt_pipeline,
-    X,
-    y,
-    cv=ts_cv,
-    model_prop="n_iter_",
-    model_step="histgradientboostingregressor",
-)
+evaluate(gbrt, X, y, cv=ts_cv, model_prop="n_iter_")
 
 # %%
 # We see that we set `max_iter` large enough such that early stopping took place.
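
The `model_step` argument dropped above is only needed to reach into a pipeline by step name; it becomes unnecessary once `gbrt` is a bare estimator. The sketch below is a hedged reconstruction of what a helper with this signature could look like, based solely on the signature shown in the hunk header, not the example's actual implementation.

from sklearn.model_selection import cross_validate


def evaluate(model, X, y, cv, model_prop=None, model_step=None):
    # Keep the fitted estimators so that a fitted attribute (e.g. `n_iter_`)
    # can be inspected for each cross-validation fold.
    cv_results = cross_validate(model, X, y, cv=cv, return_estimator=model_prop is not None)
    if model_prop is not None:
        if model_step is not None:
            # The model is a pipeline: index it by step name first.
            values = [getattr(est[model_step], model_prop) for est in cv_results["estimator"]]
        else:
            values = [getattr(est, model_prop) for est in cv_results["estimator"]]
        print(f"Mean {model_prop}: {sum(values) / len(values):.1f}")
    scores = cv_results["test_score"]
    print(f"Mean test score: {scores.mean():.3f} +/- {scores.std():.3f}")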
@@ -735,8 +693,8 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
 # Let us now have a qualitative look at the predictions of the kernel models
 # and of the gradient boosted trees that should be able to better model
 # non-linear interactions between features:
-gbrt_pipeline.fit(X.iloc[train_0], y.iloc[train_0])
-gbrt_predictions = gbrt_pipeline.predict(X.iloc[test_0])
+gbrt.fit(X.iloc[train_0], y.iloc[train_0])
+gbrt_predictions = gbrt.predict(X.iloc[test_0])
 
 one_hot_poly_pipeline.fit(X.iloc[train_0], y.iloc[train_0])
 one_hot_poly_predictions = one_hot_poly_pipeline.predict(X.iloc[test_0])

examples/ensemble/plot_gradient_boosting_categorical.py

Lines changed: 16 additions & 20 deletions
@@ -138,26 +138,17 @@
 # -----------------------------------------------------------
 # We now create a :class:`~ensemble.HistGradientBoostingRegressor` estimator
 # that will natively handle categorical features. This estimator will not treat
-# categorical features as ordered quantities.
+# categorical features as ordered quantities. We set
+# `categorical_features="from_dtype"` such that features with categorical dtype
+# are considered categorical features.
 #
-# Since the :class:`~ensemble.HistGradientBoostingRegressor` requires category
-# values to be encoded in `[0, n_unique_categories - 1]`, we still rely on an
-# :class:`~preprocessing.OrdinalEncoder` to pre-process the data.
-#
-# The main difference between this pipeline and the previous one is that in
+# The main difference between this estimator and the previous one is that in
 # this one, we let the :class:`~ensemble.HistGradientBoostingRegressor` know
 # which features are categorical.
 
-# The ordinal encoder will first output the categorical features, and then the
-# continuous (passed-through) features
-
-hist_native = make_pipeline(
-    ordinal_encoder,
-    HistGradientBoostingRegressor(
-        random_state=42,
-        categorical_features=categorical_columns,
-    ),
-).set_output(transform="pandas")
+hist_native = HistGradientBoostingRegressor(
+    random_state=42, categorical_features="from_dtype"
+)
 
 # %%
 # Model comparison
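
As a point of comparison with `"from_dtype"`, and echoing the removed comment about values encoded in `[0, n_unique_categories - 1]`, `categorical_features` can also name columns explicitly when fitting on a dataframe. A hedged, standalone sketch with made-up data (not the dataset used in this example):

import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.default_rng(42)
X = pd.DataFrame(
    {
        # Categories already ordinal-encoded as integers 0, 1, 2.
        "neighborhood_code": rng.integers(0, 3, size=200),
        "area": rng.uniform(50.0, 200.0, size=200),
    }
)
y = X["area"] * 2.0 + X["neighborhood_code"] * 10.0 + rng.normal(size=200)

# Name the categorical column explicitly instead of relying on its dtype.
hist_by_name = HistGradientBoostingRegressor(
    random_state=42, categorical_features=["neighborhood_code"]
)
hist_by_name.fit(X, y)
print(hist_by_name.predict(X.head()))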
@@ -256,10 +247,15 @@ def plot_results(figure_title):
 # of trees and the depth of each tree.
 
 for pipe in (hist_dropped, hist_one_hot, hist_ordinal, hist_native):
-    pipe.set_params(
-        histgradientboostingregressor__max_depth=3,
-        histgradientboostingregressor__max_iter=15,
-    )
+    if pipe is hist_native:
+        # The native model does not use a pipeline, so we can set the parameters
+        # directly.
+        pipe.set_params(max_depth=3, max_iter=15)
+    else:
+        pipe.set_params(
+            histgradientboostingregressor__max_depth=3,
+            histgradientboostingregressor__max_iter=15,
+        )
 
 dropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)
 one_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)
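
For readers unfamiliar with the `histgradientboostingregressor__...` names used in the branch above, here is a small standalone sketch of scikit-learn's nested-parameter convention; the objects below are constructed purely for illustration and are not the ones defined in this example.

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

# Inside a pipeline, a step's parameters are addressed as "<step_name>__<param>",
# where make_pipeline derives the step name from the lower-cased class name.
pipe = make_pipeline(OrdinalEncoder(), HistGradientBoostingRegressor())
pipe.set_params(
    histgradientboostingregressor__max_depth=3,
    histgradientboostingregressor__max_iter=15,
)

# A bare estimator takes the same parameters without the prefix.
est = HistGradientBoostingRegressor()
est.set_params(max_depth=3, max_iter=15)

print(pipe.get_params()["histgradientboostingregressor__max_depth"], est.max_depth)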
