10000 DOC Adds Release Highlights for 1.4 (#27933) · punndcoder28/scikit-learn@fb35756 · GitHub
[go: up one dir, main page]

Skip to content

Commit fb35756

Browse files
thomasjpfan, adrinjalali, glemaitre, GaelVaroquaux, jeremiedbb
authored
DOC Adds Release Highlights for 1.4 (scikit-learn#27933)
Co-authored-by: adrinjalali <adrin.jalali@gmail.com> Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com> Co-authored-by: Gael Varoquaux <gael.varoquaux@normalesup.org> Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com>
1 parent 11b45b3 commit fb35756

File tree

6 files changed

+216
-2
lines changed

6 files changed

+216
-2
lines changed

build_tools/circle/doc_environment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ dependencies:
2828
- numpydoc
2929
- sphinx-prompt
3030
- plotly
31+
- polars
3132
- pooch
3233
- sphinxext-opengraph
3334
- pip

build_tools/circle/doc_linux-64_conda.lock

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Generated by conda-lock.
22
# platform: linux-64
3-
# input_hash: 74e9e451b651d0b84d1c066a106b93d1a0f711e6aa6c5a48d2169af2e01f4d90
3+
# input_hash: 0d62c56444fc81a1e285d3657990a983d2c40ceb6fb44130975b4e8e72626137
44
@EXPLICIT
55
https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81
66
https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2023.11.17-hbcca054_0.conda#01ffc8d36f9eba0ce0b3c1955fa780ee
@@ -178,6 +178,7 @@ https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-3.2.0-pyha21a80b_0.c
178178
https://conda.anaconda.org/conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_0.tar.bz2#f832c45a477c78bebd107098db465095
179179
https://conda.anaconda.org/conda-forge/noarch/tomli-2.0.1-pyhd8ed1ab_0.tar.bz2#5844808ffab9ebdb694585b50ba02a96
180180
https://conda.anaconda.org/conda-forge/linux-64/tornado-6.3.3-py39hd1e30aa_1.conda#cbe186eefb0bcd91e8f47c3908489874
181+
https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.9.0-pyha770c72_0.conda#a92a6440c3fe7052d63244f3aba2a4a7
181182
https://conda.anaconda.org/conda-forge/linux-64/unicodedata2-15.1.0-py39hd1e30aa_0.conda#1da984bbb6e765743e13388ba7b7b2c8
182183
https://conda.anaconda.org/conda-forge/noarch/wheel-0.42.0-pyhd8ed1ab_0.conda#1cdea58981c5cbc17b51973bcaddcea7
183184
https://conda.anaconda.org/conda-forge/linux-64/xcb-util-image-0.4.0-h8ee46fc_1.conda#9d7bcddf49cbf727730af10e71022c73
@@ -225,6 +226,7 @@ https://conda.anaconda.org/conda-forge/linux-64/imagecodecs-2023.9.18-py39hf9b8f
225226
https://conda.anaconda.org/conda-forge/noarch/imageio-2.33.1-pyh8c1a49c_0.conda#1c34d58ac469a34e7e96832861368bce
226227
https://conda.anaconda.org/conda-forge/linux-64/pandas-2.1.4-py39hddac248_0.conda#dcfd2f15c6f8f0bbf234412b18a2a5d0
227228
https://conda.anaconda.org/conda-forge/noarch/patsy-0.5.4-pyhd8ed1ab_0.conda#1184267eddebb57e47f8e1419c225595
229+
https://conda.anaconda.org/conda-forge/linux-64/polars-0.19.19-py39h90d8ae4_0.conda#9cefe0d7ce9208c3afbbac29951aff59
228230
https://conda.anaconda.org/conda-forge/noarch/pooch-1.8.0-pyhd8ed1ab_0.conda#134b2b57b7865d2316a7cce1915a51ed
229231
https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-2.5.0-pyhd8ed1ab_0.tar.bz2#1fdd1f3baccf0deb647385c677a1a48e
230232
https://conda.anaconda.org/conda-forge/linux-64/pywavelets-1.4.1-py39h44dd56e_1.conda#d037c20e3da2e85f03ebd20ad480c359

build_tools/circle/doc_min_dependencies_environment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ dependencies:
2828
- numpydoc=1.2.0 # min
2929
- sphinx-prompt=1.3.0 # min
3030
- plotly=5.14.0 # min
31+
- polars=0.19.12 # min
3132
- pooch
3233
- pip
3334
- pip:

build_tools/circle/doc_min_dependencies_linux-64_conda.lock

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Generated by conda-lock.
22
# platform: linux-64
3-
# input_hash: 35f943b65f19232746bf1ac103664d9fa08c9fce0bcc39d7ee2ecf873d996bff
3+
# input_hash: 63e92fdc759dcf030bf7e6d4a5d86bec102c98562cfb7ebd4d3d4991c895678b
44
@EXPLICIT
55
https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81
66
https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2023.11.17-hbcca054_0.conda#01ffc8d36f9eba0ce0b3c1955fa780ee
@@ -208,6 +208,7 @@ https://conda.anaconda.org/conda-forge/noarch/imageio-2.33.1-pyh8c1a49c_0.conda#
208208
https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.3.4-py39h2fa2bec_0.tar.bz2#9ec0b2186fab9121c54f4844f93ee5b7
209209
https://conda.anaconda.org/conda-forge/linux-64/pandas-1.1.5-py39hde0f152_0.tar.bz2#79fc4b5b3a865b90dd3701cecf1ad33c
210210
https://conda.anaconda.org/conda-forge/noarch/patsy-0.5.4-pyhd8ed1ab_0.conda#1184267eddebb57e47f8e1419c225595
211+
https://conda.anaconda.org/conda-forge/linux-64/polars-0.19.12-py39h90d8ae4_0.conda#191828961c95f8d59fa2b86a590f9905
211212
https://conda.anaconda.org/conda-forge/noarch/pooch-1.8.0-pyhd8ed1ab_0.conda#134b2b57b7865d2316a7cce1915a51ed
212213
https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-2.5.0-pyhd8ed1ab_0.tar.bz2#1fdd1f3baccf0deb647385c677a1a48e
213214
https://conda.anaconda.org/conda-forge/linux-64/pywavelets-1.3.0-py39hd257fcd_1.tar.bz2#c4b698994b2d8d2e659ae02202e6abe4

build_tools/update_environments_and_lock_files.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,7 @@ def remove_from(alist, to_remove):
276276
"numpydoc",
277277
"sphinx-prompt",
278278
"plotly",
279+
"polars",
279280
"pooch",
280281
],
281282
"pip_dependencies": ["sphinxext-opengraph"],
@@ -294,6 +295,7 @@ def remove_from(alist, to_remove):
294295
"sphinx-prompt": "min",
295296
"sphinxext-opengraph": "min",
296297
"plotly": "min",
298+
"polars": "min",
297299
},
298300
},
299301
{
@@ -312,6 +314,7 @@ def remove_from(alist, to_remove):
312314
"numpydoc",
313315
"sphinx-prompt",
314316
"plotly",
317+
"polars",
315318
"pooch",
316319
"sphinxext-opengraph",
317320
],
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
# ruff: noqa
2+
"""
3+
=======================================
4+
Release Highlights for scikit-learn 1.4
5+
=======================================
6+
7+
.. currentmodule:: sklearn
8+
9+
We are pleased to announce the release of scikit-learn 1.4! Many bug fixes
10+
and improvements were added, as well as some new key features. We detail
11+
below a few of the major features of this release. **For an exhaustive list of
12+
all the changes**, please refer to the :ref:`release notes <changes_1_4>`.
13+
14+
To install the latest version (with pip)::
15+
16+
pip install --upgrade scikit-learn
17+
18+
or with conda::
19+
20+
conda install -c conda-forge scikit-learn
21+
22+
"""
23+
24+
# %%
25+
# HistGradientBoosting Natively Supports Categorical DTypes in DataFrames
26+
# -----------------------------------------------------------------------
27+
# :class:`ensemble.HistGradientBoostingClassifier` and
28+
# :class:`ensemble.HistGradientBoostingRegressor` now directly support dataframes with
29+
# categorical features. Here we have a dataset with a mixture of
30+
# categorical and numerical features:
31+
from sklearn.datasets import fetch_openml
32+
33+
X_adult, y_adult = fetch_openml("adult", version=2, return_X_y=True)
34+
35+
# Remove redundant and non-feature columns
36+
X_adult = X_adult.drop(["education-num", "fnlwgt"], axis="columns")
37+
X_adult.dtypes
38+
39+
# %%
40+
# By setting `categorical_features="from_dtype"`, the gradient boosting classifier
41+
# treats the columns with categorical dtypes as categorical features in the
42+
# algorithm:
43+
from sklearn.ensemble import HistGradientBoostingClassifier
44+
from sklearn.model_selection import train_test_split
45+
from sklearn.metrics import roc_auc_score
46+
47+
X_train, X_test, y_train, y_test = train_test_split(X_adult, y_adult, random_state=0)
48+
hist = HistGradientBoostingClassifier(categorical_features="from_dtype")
49+
50+
hist.fit(X_train, y_train)
51+
y_decision = hist.decision_function(X_test)
52+
print(f"ROC AUC score is {roc_auc_score(y_test, y_decision)}")
53+
54+
# %%
55+
# Polars output in `set_output`
56+
# -----------------------------
57+
# scikit-learn's transformers now support polars output with the `set_output` API.
58+
import polars as pl
59+
from sklearn.preprocessing import StandardScaler
60+
from sklearn.preprocessing import OneHotEncoder
61+
from sklearn.compose import ColumnTransformer
62+
63+
df = pl.DataFrame(
64+
{"height": [120, 140, 150, 110, 100], "pet": ["dog", "cat", "dog", "cat", "cat"]}
65+
)
66+
preprocessor = ColumnTransformer(
67+
[
68+
("numerical", StandardScaler(), ["height"]),
69+
("categorical", OneHotEncoder(sparse_output=False), ["pet"]),
70+
],
71+
verbose_feature_names_out=False,
72+
)
73+
preprocessor.set_output(transform="polars")
74+
75+
df_out = preprocessor.fit_transform(df)
76+
print(f"Output type: {type(df_out)}")
77+
78+
# %%
79+
# Missing value support for Random Forest
80+
# ---------------------------------------
81+
# The classes :class:`ensemble.RandomForestClassifier` and
82+
# :class:`ensemble.RandomForestRegressor` now support missing values. When training
83+
# every individual tree, the splitter evaluates each potential threshold with the
84+
# missing values going to the left and right nodes. More details in the
85+
# :ref:`User Guide <tree_missing_value_support>`.
86+
import numpy as np
87+
from sklearn.ensemble import RandomForestClassifier
88+
89+
X = np.array([0, 1, 6, np.nan]).reshape(-1, 1)
90+
y = [0, 0, 1, 1]
91+
92+
forest = RandomForestClassifier(random_state=0).fit(X, y)
93+
forest.predict(X)
94+
95+
# %%
96+
# Add support for monotonic constraints in tree-based models
97+
# ----------------------------------------------------------
98+
# While we added support for monotonic constraints in histogram-based gradient boosting
99+
# in scikit-learn 0.23, we now support this feature for all other tree-based models such as
100+
# trees, random forests, extra-trees, and exact gradient boosting. Here, we show this
101+
# feature for random forest on a regression problem.
102+
import matplotlib.pyplot as plt
103+
from sklearn.inspection import PartialDependenceDisplay
104+
from sklearn.ensemble import RandomForestRegressor
105+
106+
n_samples = 500
107+
rng = np.random.RandomState(0)
108+
X = rng.randn(n_samples, 2)
109+
noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)
110+
y = 5 * X[:, 0] + np.sin(10 * np.pi * X[:, 0]) - noise
111+
112+
rf_no_cst = RandomForestRegressor().fit(X, y)
113+
rf_cst = RandomForestRegressor(monotonic_cst=[1, 0]).fit(X, y)
114+
115+
disp = PartialDependenceDisplay.from_estimator(
116+
rf_no_cst,
117+
X,
118+
features=[0],
119+
feature_names=["feature 0"],
120+
line_kw={"linewidth": 4, "label": "unconstrained", "color": "tab:blue"},
121+
)
122+
PartialDependenceDisplay.from_estimator(
123+
rf_cst,
124+
X,
125+
features=[0],
126+
line_kw={"linewidth": 4, "label": "constrained", "color": "tab:orange"},
127+
ax=disp.axes_,
128+
)
129+
disp.axes_[0, 0].plot(
130+
X[:, 0], y, "o", alpha=0.5, zorder=-1, label="samples", color="tab:green"
131+
)
132+
disp.axes_[0, 0].set_ylim(-3, 3)
133+
disp.axes_[0, 0].set_xlim(-1, 1)
134+
disp.axes_[0, 0].legend()
135+
plt.show()
136+
137+
# %%
138+
# Enriched estimator displays
139+
# ---------------------------
140+
# Estimator displays have been enriched: if we look at `forest`, defined above:
141+
forest
142+
143+
# %%
144+
# One can access the documentation of the estimator by clicking on the icon "?" on
145+
# the top right corner of the diagram.
146+
#
147+
# In addition, the display changes color, from orange to blue, when the estimator is
148+
# fitted. You can also get this information by hovering on the icon "i".
149+
from sklearn.base import clone
150+
151+
clone(forest) # the clone is not fitted
152+
153+
# %%
154+
# Metadata Routing Support
155+
# ------------------------
156+
# Many meta-estimators and cross-validation routines now support metadata
157+
# routing, which are listed in the :ref:`user guide
158+
# <_metadata_routing_models>`. For instance, this is how you can do a nested
159+
# cross-validation with sample weights and :class:`~model_selection.GroupKFold`:
160+
import sklearn
161+
from sklearn.metrics import get_scorer
162+
from sklearn.datasets import make_regression
163+
from sklearn.linear_model import Lasso
164+
from sklearn.model_selection import GridSearchCV, cross_validate, GroupKFold
165+
166+
# For now, metadata routing is disabled by default and needs to be explicitly
167+
# enabled.
168+
sklearn.set_config(enable_metadata_routing=True)
169+
170+
n_samples = 100
171+
X, y = make_regression(n_samples=n_samples, n_features=5, noise=0.5)
172+
rng = np.random.RandomState(7)
173+
groups = rng.randint(0, 10, size=n_samples)
174+
sample_weights = rng.rand(n_samples)
175+
estimator = Lasso().set_fit_request(sample_weight=True)
176+
hyperparameter_grid = {"alpha": [0.1, 0.5, 1.0, 2.0]}
177+
scoring_inner_cv = get_scorer("neg_mean_squared_error").set_score_request(
178+
sample_weight=True
179+
)
180+
inner_cv = GroupKFold(n_splits=5)
181+
182+
grid_search = GridSearchCV(
183+
estimator=estimator,
184+
param_grid=hyperparameter_grid,
185+
cv=inner_cv,
186+
scoring=scoring_inner_cv,
187+
)
188+
189+
outer_cv = GroupKFold(n_splits=5)
190+
scorers = {
191+
"mse": get_scorer("neg_mean_squared_error").set_score_request(sample_weight=True)
192+
}
193+
results = cross_validate(
194+
grid_search,
195+
X,
196+
y,
197+
cv=outer_cv,
198+
scoring=scorers,
199+
return_estimator=True,
200+
params={"sample_weight": sample_weights, "groups": groups},
201+
)
202+
print("cv error on test sets:", results["test_mse"])
203+
204+
# Setting the flag to the default `False` to avoid interference with other
205+
# scripts.
206+
sklearn.set_config(enable_metadata_routing=False)

0 commit comments

Comments
 (0)
0