ENH Adds native pandas categorical support to gradient boosting (#26411) · scikit-learn/scikit-learn@4cf13d2 · GitHub

Commit 4cf13d2

thomasjpfan, ogrisel, and glemaitre authored
ENH Adds native pandas categorical support to gradient boosting (#26411)
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
1 parent 5d83a2e commit 4cf13d2

File tree

5 files changed: +338 −113 lines

doc/whats_new/v1.4.rst

Lines changed: 6 additions & 0 deletions
@@ -295,6 +295,12 @@ Changelog
   for classification or `squared_error`, `friedman_mse`, or `poisson`
   for regression. :pr:`26391` by `Thomas Fan`_.
 
+- |MajorFeature| :class:`ensemble.HistGradientBoostingClassifier` and
+  :class:`ensemble.HistGradientBoostingRegressor` support
+  `categorical_features="from_dtype"`, which treats columns with Pandas Categorical
+  dtype as categories in the algorithm. `categorical_features="from_dtype"` will become
+  the default in v1.6. :pr:`26411` by `Thomas Fan`_.
+
 - |Feature| :class:`ensemble.RandomForestClassifier`,
   :class:`ensemble.RandomForestRegressor`, :class:`ensemble.ExtraTreesClassifier`
   and :class:`ensemble.ExtraTreesRegressor` now support monotonic constraints,
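
A minimal usage sketch of the option described in this changelog entry (the toy dataframe below is made up for illustration and is not part of the commit): columns stored with a pandas Categorical dtype are detected automatically once `categorical_features="from_dtype"` is passed, while numeric columns keep being treated as continuous.

import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.default_rng(0)
X = pd.DataFrame(
    {
        # Stored as a pandas Categorical column, so it is picked up "from dtype".
        "weather": pd.Categorical(rng.choice(["clear", "misty", "rain"], size=100)),
        # Plain float column, treated as a continuous feature.
        "temperature": rng.normal(20.0, 5.0, size=100),
    }
)
y = rng.normal(size=100)

model = HistGradientBoostingRegressor(categorical_features="from_dtype", random_state=0)
model.fit(X, y)
print(model.predict(X.head()))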

examples/applications/plot_cyclical_feature_engineering.py

Lines changed: 11 additions & 53 deletions
@@ -172,57 +172,22 @@
 #
 # Here, we use the modern
 # :class:`~sklearn.ensemble.HistGradientBoostingRegressor` with native support
-# for categorical features. Therefore, we only do minimal ordinal encoding for
-# the categorical variables and then
-# let the model know that it should treat those as categorical variables by
-# using a dedicated tree splitting rule. Since we use an ordinal encoder, we
-# pass the list of categorical values explicitly to use a logical order when
-# encoding the categories as integers instead of the lexicographical order.
-# This also has the added benefit of preventing any issue with unknown
-# categories when using cross-validation.
+# for categorical features. Therefore, we only need to set
+# `categorical_features="from_dtype"` such that features with categorical dtype
+# are considered categorical features. For reference, we extract the categorical
+# features from the dataframe based on the dtype. The internal trees use a dedicated
+# tree splitting rule for these features.
 #
 # The numerical variables need no preprocessing and, for the sake of simplicity,
 # we only try the default hyper-parameters for this model:
 from sklearn.compose import ColumnTransformer
 from sklearn.ensemble import HistGradientBoostingRegressor
 from sklearn.model_selection import cross_validate
 from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import OrdinalEncoder
 
-categorical_columns = [
-    "weather",
-    "season",
-    "holiday",
-    "workingday",
-]
-categories = [
-    ["clear", "misty", "rain"],
-    ["spring", "summer", "fall", "winter"],
-    ["False", "True"],
-    ["False", "True"],
-]
-ordinal_encoder = OrdinalEncoder(categories=categories)
-
-
-gbrt_pipeline = make_pipeline(
-    ColumnTransformer(
-        transformers=[
-            ("categorical", ordinal_encoder, categorical_columns),
-        ],
-        remainder="passthrough",
-        # Use short feature names to make it easier to specify the categorical
-        # variables in the HistGradientBoostingRegressor in the next
-        # step of the pipeline.
-        verbose_feature_names_out=False,
-    ),
-    HistGradientBoostingRegressor(
-        max_iter=300,
-        early_stopping=True,
-        validation_fraction=0.1,
-        categorical_features=categorical_columns,
-        random_state=42,
-    ),
-).set_output(transform="pandas")
+gbrt = HistGradientBoostingRegressor(categorical_features="from_dtype", random_state=42)
+categorical_columns = X.columns[X.dtypes == "category"]
+print("Categorical features:", categorical_columns.tolist())
 
 # %%
 #
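
For context on the dtype-based detection used in the added lines above, here is a small standalone sketch (a toy dataframe, not the bike-sharing data from this example): only columns that actually carry a pandas Categorical dtype are selected, so string columns stored with the plain object dtype need an explicit cast via `astype("category")`.

import pandas as pd

X = pd.DataFrame(
    {
        "weather": ["clear", "misty", "rain", "clear"],  # plain object dtype
        "temperature": [20.1, 15.3, 10.2, 22.4],
    }
)
# Without this cast, "weather" would not be detected by "from_dtype".
X["weather"] = X["weather"].astype("category")

# Same dtype-based selection as in the diff above.
categorical_columns = X.columns[X.dtypes == "category"]
print("Categorical features:", categorical_columns.tolist())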
@@ -256,14 +221,7 @@ def evaluate(model, X, y, cv, model_prop=None, model_step=None):
 )
 
 
-evaluate(
-    gbrt_pipeline,
-    X,
-    y,
-    cv=ts_cv,
-    model_prop="n_iter_",
-    model_step="histgradientboostingregressor",
-)
+evaluate(gbrt, X, y, cv=ts_cv, model_prop="n_iter_")
 
 # %%
 # We see that we set `max_iter` large enough such that early stopping took place.
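
The `model_step` argument dropped above is only needed to reach into a pipeline by step name; it becomes unnecessary once `gbrt` is a bare estimator. The sketch below is a hedged reconstruction of what a helper with this signature could look like, based solely on the signature shown in the hunk header, not the example's actual implementation.

from sklearn.model_selection import cross_validate


def evaluate(model, X, y, cv, model_prop=None, model_step=None):
    # Keep the fitted estimators so that a fitted attribute (e.g. `n_iter_`)
    # can be inspected for each cross-validation fold.
    cv_results = cross_validate(model, X, y, cv=cv, return_estimator=model_prop is not None)
    if model_prop is not None:
        if model_step is not None:
            # The model is a pipeline: index it by step name first.
            values = [getattr(est[model_step], model_prop) for est in cv_results["estimator"]]
        else:
            values = [getattr(est, model_prop) for est in cv_results["estimator"]]
        print(f"Mean {model_prop}: {sum(values) / len(values):.1f}")
    scores = cv_results["test_score"]
    print(f"Mean test score: {scores.mean():.3f} +/- {scores.std():.3f}")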
@@ -735,8 +693,8 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
 # Let us now have a qualitative look at the predictions of the kernel models
 # and of the gradient boosted trees that should be able to better model
 # non-linear interactions between features:
-gbrt_pipeline.fit(X.iloc[train_0], y.iloc[train_0])
-gbrt_predictions = gbrt_pipeline.predict(X.iloc[test_0])
+gbrt.fit(X.iloc[train_0], y.iloc[train_0])
+gbrt_predictions = gbrt.predict(X.iloc[test_0])
 
 one_hot_poly_pipeline.fit(X.iloc[train_0], y.iloc[train_0])
 one_hot_poly_predictions = one_hot_poly_pipeline.predict(X.iloc[test_0])

examples/ensemble/plot_gradient_boosting_categorical.py

Lines changed: 16 additions & 20 deletions
@@ -138,26 +138,17 @@
 # -----------------------------------------------------------
 # We now create a :class:`~ensemble.HistGradientBoostingRegressor` estimator
 # that will natively handle categorical features. This estimator will not treat
-# categorical features as ordered quantities.
+# categorical features as ordered quantities. We set
+# `categorical_features="from_dtype"` such that features with categorical dtype
+# are considered categorical features.
 #
-# Since the :class:`~ensemble.HistGradientBoostingRegressor` requires category
-# values to be encoded in `[0, n_unique_categories - 1]`, we still rely on an
-# :class:`~preprocessing.OrdinalEncoder` to pre-process the data.
-#
-# The main difference between this pipeline and the previous one is that in
+# The main difference between this estimator and the previous one is that in
 # this one, we let the :class:`~ensemble.HistGradientBoostingRegressor` know
 # which features are categorical.
 
-# The ordinal encoder will first output the categorical features, and then the
-# continuous (passed-through) features
-
-hist_native = make_pipeline(
-    ordinal_encoder,
-    HistGradientBoostingRegressor(
-        random_state=42,
-        categorical_features=categorical_columns,
-    ),
-).set_output(transform="pandas")
+hist_native = HistGradientBoostingRegressor(
+    random_state=42, categorical_features="from_dtype"
+)
 
 # %%
 # Model comparison
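
As a point of comparison with `"from_dtype"`, and echoing the removed comment about values encoded in `[0, n_unique_categories - 1]`, `categorical_features` can also name columns explicitly when fitting on a dataframe. A hedged, standalone sketch with made-up data (not the dataset used in this example):

import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.default_rng(42)
X = pd.DataFrame(
    {
        # Categories already ordinal-encoded as integers 0, 1, 2.
        "neighborhood_code": rng.integers(0, 3, size=200),
        "area": rng.uniform(50.0, 200.0, size=200),
    }
)
y = X["area"] * 2.0 + X["neighborhood_code"] * 10.0 + rng.normal(size=200)

# Name the categorical column explicitly instead of relying on its dtype.
hist_by_name = HistGradientBoostingRegressor(
    random_state=42, categorical_features=["neighborhood_code"]
)
hist_by_name.fit(X, y)
print(hist_by_name.predict(X.head()))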
@@ -256,10 +247,15 @@ def plot_results(figure_title):
 # of trees and the depth of each tree.
 
 for pipe in (hist_dropped, hist_one_hot, hist_ordinal, hist_native):
-    pipe.set_params(
-        histgradientboostingregressor__max_depth=3,
-        histgradientboostingregressor__max_iter=15,
-    )
+    if pipe is hist_native:
+        # The native model does not use a pipeline, so we can set the parameters
+        # directly.
+        pipe.set_params(max_depth=3, max_iter=15)
+    else:
+        pipe.set_params(
+            histgradientboostingregressor__max_depth=3,
+            histgradientboostingregressor__max_iter=15,
+        )
 
 dropped_result = cross_validate(hist_dropped, X, y, cv=n_cv_folds, scoring=scoring)
 one_hot_result = cross_validate(hist_one_hot, X, y, cv=n_cv_folds, scoring=scoring)
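
For readers unfamiliar with the `histgradientboostingregressor__...` names used in the branch above, here is a small standalone sketch of scikit-learn's nested-parameter convention; the objects below are constructed purely for illustration and are not the ones defined in this example.

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

# Inside a pipeline, a step's parameters are addressed as "<step_name>__<param>",
# where make_pipeline derives the step name from the lower-cased class name.
pipe = make_pipeline(OrdinalEncoder(), HistGradientBoostingRegressor())
pipe.set_params(
    histgradientboostingregressor__max_depth=3,
    histgradientboostingregressor__max_iter=15,
)

# A bare estimator takes the same parameters without the prefix.
est = HistGradientBoostingRegressor()
est.set_params(max_depth=3, max_iter=15)

print(pipe.get_params()["histgradientboostingregressor__max_depth"], est.max_depth)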
