DOC Release Highlights for 0.22 (#14846) · crankycoder/scikit-learn@cd3d502 · GitHub

Commit cd3d502

NicolasHug authored and jnothman committed
DOC Release Highlights for 0.22 (scikit-learn#14846)
1 parent a89462b commit cd3d502

File tree

3 files changed: +129 −0 lines changed


doc/whats_new/v0.22.rst

Lines changed: 4 additions & 0 deletions

@@ -9,6 +9,10 @@ Version 0.22.0
 
 **In Development**
 
+For a short description of the main highlights of the release, please
+refer to
+:ref:`sphx_glr_auto_examples_release_highlights_plot_release_highlights_0_22_0.py`.
+
 Changed models
 --------------
examples/release_highlights/README.txt

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
.. _release_highlights_examples:

Release Highlights
------------------

These examples illustrate the main features of the releases of scikit-learn.
examples/release_highlights/plot_release_highlights_0_22_0.py

Lines changed: 119 additions & 0 deletions

@@ -0,0 +1,119 @@
"""
========================================
Release Highlights for scikit-learn 0.22
========================================

We are pleased to announce the release of scikit-learn 0.22, which comes
with many bug fixes and new features! We detail below a few of the major
features of this release. For an exhaustive list of all the changes, please
refer to the :ref:`release notes <changes_0_22>`.

To install the latest version (with pip)::

    pip install --upgrade scikit-learn

or with conda::

    conda install scikit-learn
"""

##############################################################################
# Permutation-based feature importance
# ------------------------------------
#
# The :func:`~sklearn.inspection.permutation_importance` function can be used
# to get an estimate of the importance of each feature, for any fitted
# estimator:

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

X, y = make_classification(random_state=0, n_features=5, n_informative=3)
rf = RandomForestClassifier(random_state=0).fit(X, y)
result = permutation_importance(rf, X, y, n_repeats=10, random_state=0,
                                n_jobs=-1)

fig, ax = plt.subplots()
sorted_idx = result.importances_mean.argsort()
# label each box with the index of the feature it shows, in sorted order
ax.boxplot(result.importances[sorted_idx].T,
           vert=False, labels=sorted_idx)
ax.set_title("Permutation Importance of each feature")
ax.set_ylabel("Features")
fig.tight_layout()
plt.show()
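
# The averaged importances are also available directly on the result object;
# a minimal sketch of inspecting them numerically (``result`` as above):
print("Mean importances per feature:", result.importances_mean)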

##############################################################################
# Native support for missing values for gradient boosting
# -------------------------------------------------------
#
# The :class:`~sklearn.ensemble.HistGradientBoostingClassifier`
# and :class:`~sklearn.ensemble.HistGradientBoostingRegressor` now have native
# support for missing values (NaNs). This means that there is no need to
# impute data when training or predicting.

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
import numpy as np

X = np.array([0, 1, 2, np.nan]).reshape(-1, 1)
y = [0, 0, 1, 1]

gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y)
print(gbdt.predict(X))
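
# Unseen samples containing NaNs can be predicted on directly as well; a
# minimal sketch (``X_new`` is an illustrative array, not from the original
# example):
X_new = np.array([0.5, np.nan]).reshape(-1, 1)
print(gbdt.predict(X_new))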

##############################################################################
# New plotting API
# ----------------
#
# A new plotting API is available for creating visualizations. This new API
# allows for quickly adjusting the visuals of a plot without involving any
# recomputation. It is also possible to add different plots to the same
# figure. See more examples in the :ref:`User Guide <visualizations>`.

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import plot_roc_curve

X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

svc = SVC(random_state=42)
svc.fit(X_train, y_train)
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

svc_disp = plot_roc_curve(svc, X_test, y_test)
rfc_disp = plot_roc_curve(rfc, X_test, y_test, ax=svc_disp.ax_)
rfc_disp.figure_.suptitle("ROC curve comparison")

plt.show()
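
# The display object returned by :func:`~sklearn.metrics.plot_roc_curve`
# stores the computed curve, so it can be re-drawn on a fresh axes without
# re-running any predictions; a minimal sketch (the label is illustrative):
fig2, ax2 = plt.subplots()
svc_disp.plot(ax=ax2, name="SVC (re-drawn)")
plt.show()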

#############################################################################
# Tree pruning
# ------------
#
# It is now possible to prune most tree-based estimators once the trees are
# built. The pruning is based on minimal cost-complexity. Read more in the
# :ref:`User Guide <minimal_cost_complexity_pruning>`.

X, y = make_classification(random_state=0)

rf = RandomForestClassifier(random_state=0, ccp_alpha=0).fit(X, y)
print("Average number of nodes without pruning {:.1f}".format(
    np.mean([e.tree_.node_count for e in rf.estimators_])))

rf = RandomForestClassifier(random_state=0, ccp_alpha=0.05).fit(X, y)
print("Average number of nodes with pruning {:.1f}".format(
    np.mean([e.tree_.node_count for e in rf.estimators_])))
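
# For a single tree, the candidate pruning strengths can be inspected with
# ``cost_complexity_pruning_path``; a minimal sketch, reusing the same
# ``X, y`` as above:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(random_state=0)
path = tree.cost_complexity_pruning_path(X, y)
print("Candidate ccp_alpha values:", path.ccp_alphas)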

############################################################################
# Retrieve dataframes from OpenML
# -------------------------------
#
# :func:`~sklearn.datasets.fetch_openml` can now return a pandas dataframe and
# thus properly handle datasets with heterogeneous data:

from sklearn.datasets import fetch_openml

titanic = fetch_openml('titanic', version=1, as_frame=True)
print(titanic.data.head()[['pclass', 'embarked']])
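
# Because a dataframe is returned, the heterogeneous column dtypes are
# preserved; a quick way to verify this for the same two columns:
print(titanic.data.dtypes[['pclass', 'embarked']])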
