DOC use notebook-style for example plot_feature_selection.py (#22437) · scikit-learn/scikit-learn@69dc086

Commit 69dc086

DOC use notebook-style for example plot_feature_selection.py (#22437)
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
1 parent d8d5637 commit 69dc086


examples/feature_selection/plot_feature_selection.py

Lines changed: 64 additions & 40 deletions
@@ -3,36 +3,26 @@
 Univariate Feature Selection
 ============================
 
-An example showing univariate feature selection.
-
-Noisy (non informative) features are added to the iris data and
-univariate feature selection is applied. For each feature, we plot the
-p-values for the univariate feature selection and the corresponding
-weights of an SVM. We can see that univariate feature selection
-selects the informative features and that these have larger SVM weights.
-
-In the total set of features, only the 4 first ones are significant. We
-can see that they have the highest score with univariate feature
-selection. The SVM assigns a large weight to one of these features, but also
-Selects many of the non-informative features.
-Applying univariate feature selection before the SVM
-increases the SVM weight attributed to the significant features, and will
-thus improve classification.
+This notebook is an example of using univariate feature selection
+to improve classification accuracy on a noisy dataset.
+
+In this example, some noisy (non informative) features are added to
+the iris dataset. Support vector machine (SVM) is used to classify the
+dataset both before and after applying univariate feature selection.
+For each feature, we plot the p-values for the univariate feature selection
+and the corresponding weights of SVMs. With this, we will compare model
+accuracy and examine the impact of univariate feature selection on model
+weights.
 
 """
 
+# %%
+# Generate sample data
+# --------------------
+#
 import numpy as np
-import matplotlib.pyplot as plt
-
 from sklearn.datasets import load_iris
 from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import MinMaxScaler
-from sklearn.svm import LinearSVC
-from sklearn.pipeline import make_pipeline
-from sklearn.feature_selection import SelectKBest, f_classif
-
-# #############################################################################
-# Import some data to play with
 
 # The iris dataset
 X, y = load_iris(return_X_y=True)
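
The hunk above ends at loading the iris data; the step that actually adds the noisy, non-informative features described in the new docstring lives in lines this commit does not touch, so it is not visible in the diff. A minimal sketch of that setup, assuming a fixed random seed and 20 uniform noise features (the untouched lines of the example may use different values):

import numpy as np
from sklearn.datasets import load_iris

# The iris dataset: 150 samples, 4 informative features
X, y = load_iris(return_X_y=True)

# Append non-informative noise features so that feature selection has
# something to discard. The seed and the feature count are assumptions.
E = np.random.RandomState(42).uniform(0, 0.1, size=(X.shape[0], 20))
X = np.hstack((X, E))

print(X.shape)  # (150, 24): 4 informative features followed by 20 noisy ones
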
@@ -46,25 +36,46 @@
 # Split dataset to select feature and evaluate the classifier
 X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
 
-plt.figure(1)
-plt.clf()
-
-X_indices = np.arange(X.shape[-1])
+# %%
+# Univariate feature selection
+# ----------------------------
+#
+# Univariate feature selection with F-test for feature scoring.
+# We use the default selection function to select
+# the four most significant features.
+from sklearn.feature_selection import SelectKBest, f_classif
 
-# #############################################################################
-# Univariate feature selection with F-test for feature scoring
-# We use the default selection function to select the four
-# most significant features
 selector = SelectKBest(f_classif, k=4)
 selector.fit(X_train, y_train)
 scores = -np.log10(selector.pvalues_)
 scores /= scores.max()
-plt.bar(
-    X_indices - 0.45, scores, width=0.2, label=r"Univariate score ($-Log(p_{value})$)"
-)
 
-# #############################################################################
-# Compare to the weights of an SVM
+# %%
+import matplotlib.pyplot as plt
+
+X_indices = np.arange(X.shape[-1])
+plt.figure(1)
+plt.clf()
+plt.bar(X_indices - 0.05, scores, width=0.2)
+plt.title("Feature univariate score")
+plt.xlabel("Feature number")
+plt.ylabel(r"Univariate score ($-Log(p_{value})$)")
+plt.show()
+
+# %%
+# In the total set of features, only the 4 of the original features are significant.
+# We can see that they have the highest score with univariate feature
+# selection.
+
+# %%
+# Compare with SVMs
+# -----------------
+#
+# Without univariate feature selection
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.svm import LinearSVC
+
 clf = make_pipeline(MinMaxScaler(), LinearSVC())
 clf.fit(X_train, y_train)
 print(
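
As a companion to the selection cell introduced in this hunk, here is a self-contained sketch of what SelectKBest with f_classif computes: the ANOVA F-test p-values, the normalized -log10 scores used for the bar plot, and the indices of the four retained features. The noisy-feature setup is the assumed one from the sketch after the first hunk, not taken from the commit.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

# Assumed data setup: iris plus 20 uniform noise features (see earlier sketch).
X, y = load_iris(return_X_y=True)
X = np.hstack((X, np.random.RandomState(42).uniform(0, 0.1, size=(X.shape[0], 20))))
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

# Score each feature with the ANOVA F-test and keep the four best.
selector = SelectKBest(f_classif, k=4)
selector.fit(X_train, y_train)

# Small p-values map to large -log10(p); dividing by the maximum rescales
# the bar heights to [0, 1], as in the example's plot.
scores = -np.log10(selector.pvalues_)
scores /= scores.max()

print("selected feature indices:", np.flatnonzero(selector.get_support()))
print("normalized scores:", np.round(scores, 2))
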
@@ -76,8 +87,8 @@
 svm_weights = np.abs(clf[-1].coef_).sum(axis=0)
 svm_weights /= svm_weights.sum()
 
-plt.bar(X_indices - 0.25, svm_weights, width=0.2, label="SVM weight")
-
+# %%
+# After univariate feature selection
 clf_selected = make_pipeline(SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC())
 clf_selected.fit(X_train, y_train)
 print(
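
The print( calls visible as context above are cut off by the hunk boundaries, so the accuracy comparison itself does not appear in the diff. A sketch of how the two pipelines can be scored on the held-out split, again using the assumed noisy-data setup rather than the example's exact values:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC

# Assumed data setup: iris plus 20 uniform noise features (see earlier sketches).
X, y = load_iris(return_X_y=True)
X = np.hstack((X, np.random.RandomState(42).uniform(0, 0.1, size=(X.shape[0], 20))))
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

# Baseline: scale all features (informative and noisy) and fit a linear SVM.
clf = make_pipeline(MinMaxScaler(), LinearSVC())
clf.fit(X_train, y_train)

# Same model, but preceded by univariate selection of the four best features.
clf_selected = make_pipeline(SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC())
clf_selected.fit(X_train, y_train)

print("accuracy without selection: %.3f" % clf.score(X_test, y_test))
print("accuracy with selection:    %.3f" % clf_selected.score(X_test, y_test))
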
@@ -89,17 +100,30 @@
 svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)
 svm_weights_selected /= svm_weights_selected.sum()
 
+# %%
+plt.bar(
+    X_indices - 0.45, scores, width=0.2, label=r"Univariate score ($-Log(p_{value})$)"
+)
+
+plt.bar(X_indices - 0.25, svm_weights, width=0.2, label="SVM weight")
+
 plt.bar(
     X_indices[selector.get_support()] - 0.05,
     svm_weights_selected,
     width=0.2,
     label="SVM weights after selection",
 )
 
-
 plt.title("Comparing feature selection")
 plt.xlabel("Feature number")
 plt.yticks(())
 plt.axis("tight")
 plt.legend(loc="upper right")
 plt.show()
+
+# %%
+# Without univariate feature selection, the SVM assigns a large weight
+# to the first 4 original significant features, but also selects many of the
+# non-informative features. Applying univariate feature selection before
+# the SVM increases the SVM weight attributed to the significant features,
+# and will thus improve classification.
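
The "# %%" markers added throughout this commit are what "notebook-style" refers to: sphinx-gallery, which builds the scikit-learn example gallery, and editors such as VS Code or Spyder treat each marker as the start of a new cell, and comment lines placed directly beneath it are rendered as prose in the generated example page. A minimal illustration of the pattern, not taken from this file:

# %%
# A section title
# ---------------
#
# Comment lines placed directly under the marker become rendered text
# (reStructuredText) in the HTML page built by sphinx-gallery.
result = 2 + 2  # the code under a marker runs as one notebook-style cell

# %%
# A new marker starts the next cell; its output appears below that cell.
print(result)
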
