EXA Update OpenML dataset in plot_gradient_boosting_categorical (#21789) · thomasjpfan/scikit-learn@f3b36e8 · GitHub
[go: up one dir, main page]

Skip to content

Commit f3b36e8

Browse files
author
Sven Eschlbeck
authored
EXA Update OpenML dataset in plot_gradient_boosting_categorical (scikit-learn#21789)
1 parent f19bf4c commit f3b36e8

File tree

1 file changed

+20
-21
lines changed

1 file changed

+20
-21
lines changed

examples/ensemble/plot_gradient_boosting_categorical.py

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -30,36 +30,37 @@
3030
# are either categorical or numerical:
3131
from sklearn.datasets import fetch_openml
3232

33-
X, y = fetch_openml(data_id=41211, as_frame=True, return_X_y=True)
33+
X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
3434

3535
# Select only a subset of features of X to make the example faster to run
3636
categorical_columns_subset = [
37-
"Bldg_Type",
38-
"Garage_Finish",
39-
"Lot_Config",
37+
"BldgType",
38+
"GarageFinish",
39+
"LotConfig",
4040
"Functional",
41-
"Mas_Vnr_Type",
42-
"House_Style",
43-
"Fireplace_Qu",
44-
"Exter_Cond",
45-
"Exter_Qual",
46-
"Pool_QC",
41+
"MasVnrType",
42+
"HouseStyle",
43+
"FireplaceQu",
44+
"ExterCond",
45+
"ExterQual",
46+
"PoolQC",
4747
]
4848

4949
numerical_columns_subset = [
50-
"Three_season_porch",
50+
"3SsnPorch",
5151
"Fireplaces",
52-
"Bsmt_Half_Bath",
53-
"Half_Bath",
54-
"Garage_Cars",
55-
"TotRms_AbvGrd",
56-
"BsmtFin_SF_1",
57-
"BsmtFin_SF_2",
58-
"Gr_Liv_Area",
59-
"Screen_Porch",
52+
"BsmtHalfBath",
53+
"HalfBath",
54+
"GarageCars",
55+
"TotRmsAbvGrd",
56+
"BsmtFinSF1",
57+
"BsmtFinSF2",
58+
"GrLivArea",
59+
"ScreenPorch",
6060
]
6161

6262
X = X[categorical_columns_subset + numerical_columns_subset]
63+
X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
6364

6465
n_categorical_features = X.select_dtypes(include="category").shape[1]
6566
n_numerical_features = X.select_dtypes(include="number").shape[1]
@@ -153,7 +154,6 @@
153154
),
154155
)
155156

156-
157157
# %%
158158
# Model comparison
159159
# ----------------
@@ -230,7 +230,6 @@ def plot_results(figure_title):
230230
# %%
231231
# Limiting the number of splits
232232
# ------------------------------
233-
#
234233
# In general, one can expect poorer predictions from one-hot-encoded data,
235234
# especially when the tree depths or the number of nodes are limited: with
236235
# one-hot-encoded data, one needs more split points, i.e. more depth, in order

0 commit comments

Comments
 (0)
0