"""
========================================
Release Highlights for scikit-learn 0.22
========================================

We are pleased to announce the release of scikit-learn 0.22, which comes
with many bug fixes and new features! We detail below a few of the major
features of this release. For an exhaustive list of all the changes, please
refer to the :ref:`release notes <changes_0_22>`.

To install the latest version (with pip)::

    pip install --upgrade scikit-learn

or with conda::

    conda install scikit-learn
"""

##############################################################################
# Permutation-based feature importance
# ------------------------------------
#
# The :func:`~sklearn.inspection.permutation_importance` function can be used
# to get an estimate of the importance of each feature, for any fitted
# estimator:

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

X, y = make_classification(random_state=0, n_features=5, n_informative=3)
rf = RandomForestClassifier(random_state=0).fit(X, y)
result = permutation_importance(rf, X, y, n_repeats=10, random_state=0,
                                n_jobs=-1)

fig, ax = plt.subplots()
sorted_idx = result.importances_mean.argsort()
# the boxes are drawn in sorted order, so label them with the sorted feature
# indices rather than with 0..n_features - 1
ax.boxplot(result.importances[sorted_idx].T,
           vert=False, labels=sorted_idx)
ax.set_title("Permutation Importance of each feature")
ax.set_ylabel("Features")
fig.tight_layout()
plt.show()

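##############################################################################
# The raw numbers behind the plot can also be read off directly: ``result``
# is a :class:`~sklearn.utils.Bunch` exposing ``importances_mean``,
# ``importances_std`` and the full ``importances`` array (one row per
# feature, one column per repeat). A quick way to print them, most important
# feature first:

for i in sorted_idx[::-1]:
    print("feature {}: {:.3f} +/- {:.3f}".format(
        i, result.importances_mean[i], result.importances_std[i]))
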
##############################################################################
# Native support for missing values for gradient boosting
# --------------------------------------------------------
#
# The :class:`~sklearn.ensemble.HistGradientBoostingClassifier`
# and :class:`~sklearn.ensemble.HistGradientBoostingRegressor` now have native
# support for missing values (NaNs). This means that there is no need for
# imputing data when training or predicting.

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
import numpy as np

X = np.array([0, 1, 2, np.nan]).reshape(-1, 1)
y = [0, 0, 1, 1]

gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y)
print(gbdt.predict(X))

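##############################################################################
# Missing values are handled at prediction time as well, so data containing
# NaNs that was never seen during training can be scored directly, as a quick
# additional check:

X_unseen = np.array([0.5, np.nan]).reshape(-1, 1)
print(gbdt.predict(X_unseen))
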
##############################################################################
# New plotting API
# ----------------
#
# A new plotting API is available for creating visualizations. This new API
# allows for quickly adjusting the visuals of a plot without any
# recomputation. It is also possible to add different plots to the same
# figure. See more examples in the :ref:`User Guide <visualizations>`.

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import plot_roc_curve

X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

svc = SVC(random_state=42)
svc.fit(X_train, y_train)
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

svc_disp = plot_roc_curve(svc, X_test, y_test)
rfc_disp = plot_roc_curve(rfc, X_test, y_test, ax=svc_disp.ax_)
rfc_disp.figure_.suptitle("ROC curve comparison")

plt.show()

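##############################################################################
# Because the returned display objects store the computed ROC curves, their
# ``plot`` method can be called again to redraw or restyle them on another
# axes without re-scoring the models; a minimal sketch:

fig2, ax2 = plt.subplots()
svc_disp.plot(ax=ax2, alpha=0.8)
rfc_disp.plot(ax=ax2, alpha=0.8)
ax2.set_title("Same curves, drawn again without recomputation")
plt.show()
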
##############################################################################
# Tree pruning
# ------------
#
# It is now possible to prune most tree-based estimators once the trees are
# built. The pruning is based on minimal cost-complexity. See the
# :ref:`User Guide <minimal_cost_complexity_pruning>` for details.

X, y = make_classification(random_state=0)

rf = RandomForestClassifier(random_state=0, ccp_alpha=0).fit(X, y)
print("Average number of nodes without pruning {:.1f}".format(
    np.mean([e.tree_.node_count for e in rf.estimators_])))

rf = RandomForestClassifier(random_state=0, ccp_alpha=0.05).fit(X, y)
print("Average number of nodes with pruning {:.1f}".format(
    np.mean([e.tree_.node_count for e in rf.estimators_])))

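##############################################################################
# For a single tree, the effective alphas of the pruning path can be computed
# with ``cost_complexity_pruning_path``; a short sketch showing the candidate
# ``ccp_alpha`` values one could cross-validate over (the decision tree below
# is only an extra illustration):

from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=0)
path = clf.cost_complexity_pruning_path(X, y)
print("candidate ccp_alpha values:", path.ccp_alphas)
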
##############################################################################
# Retrieve dataframes from OpenML
# -------------------------------
#
# :func:`datasets.fetch_openml` can now return a pandas dataframe and thus
# properly handle datasets with heterogeneous data:

from sklearn.datasets import fetch_openml

titanic = fetch_openml('titanic', version=1, as_frame=True)
print(titanic.data.head()[['pclass', 'embarked']])
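
##############################################################################
# Since ``titanic.data`` is a pandas ``DataFrame``, each column keeps an
# appropriate dtype; a quick way to check the heterogeneous types:

print(titanic.data.dtypes)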