"""
============================
Univariate Feature Selection
============================

This notebook is an example of using univariate feature selection
to improve classification accuracy on a noisy dataset.

In this example, some noisy (non-informative) features are added to
the iris dataset. A support vector machine (SVM) is used to classify the
dataset both before and after applying univariate feature selection.
For each feature, we plot the p-values for the univariate feature selection
and the corresponding weights of the SVMs. With this, we will compare model
accuracy and examine the impact of univariate feature selection on model
weights.

"""

# %%
# Generate sample data
# --------------------
#
import numpy as np

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# The iris dataset
X, y = load_iris(return_X_y=True)

# Some noisy data not correlated with the target
E = np.random.RandomState(42).uniform(0, 0.1, size=(X.shape[0], 20))

# Add the noisy data to the informative features
X = np.hstack((X, E))

# Split dataset to select feature and evaluate the classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

# %%
# Univariate feature selection
# ----------------------------
#
# Univariate feature selection with F-test for feature scoring.
# We use the default selection function to select
# the four most significant features.
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(f_classif, k=4)
selector.fit(X_train, y_train)
# Convert the p-values to scores: the smaller the p-value, the larger the score
scores = -np.log10(selector.pvalues_)
# Scale the scores to [0, 1] for plotting
scores /= scores.max()
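
# %%
# As a quick sanity check (a minimal sketch reusing the training split from
# above), f_classif can also be called directly: it returns the ANOVA
# F-statistics and the p-values that SelectKBest uses to rank the features.
f_statistic, p_values = f_classif(X_train, y_train)
print("F-statistics:", np.round(f_statistic, 2))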

# %%
import matplotlib.pyplot as plt

X_indices = np.arange(X.shape[-1])
plt.figure(1)
plt.clf()
plt.bar(X_indices - 0.05, scores, width=0.2)
plt.title("Feature univariate score")
plt.xlabel("Feature number")
plt.ylabel(r"Univariate score ($-Log(p_{value})$)")
plt.show()

# %%
# In the total set of features, only the 4 original features are significant.
# We can see that they have the highest score with univariate feature
# selection.

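# %%
# This can be confirmed programmatically (a small sketch using the selector
# fitted above): get_support returns the indices of the features that
# SelectKBest kept.
print("Selected feature indices:", selector.get_support(indices=True))
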
# %%
# Compare with SVMs
# -----------------
#
# Without univariate feature selection
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC

clf = make_pipeline(MinMaxScaler(), LinearSVC())
clf.fit(X_train, y_train)
print(
    "Classification accuracy without selecting features: {:.3f}".format(
        clf.score(X_test, y_test)
    )
)

# Aggregate the absolute SVM coefficients over the classes and normalize them
svm_weights = np.abs(clf[-1].coef_).sum(axis=0)
svm_weights /= svm_weights.sum()

# %%
# After univariate feature selection
clf_selected = make_pipeline(SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC())
clf_selected.fit(X_train, y_train)
print(
    "Classification accuracy after univariate feature selection: {:.3f}".format(
        clf_selected.score(X_test, y_test)
    )
)

svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)
svm_weights_selected /= svm_weights_selected.sum()

# %%
plt.bar(
    X_indices - 0.45, scores, width=0.2, label=r"Univariate score ($-Log(p_{value})$)"
)

plt.bar(X_indices - 0.25, svm_weights, width=0.2, label="SVM weight")

plt.bar(
    X_indices[selector.get_support()] - 0.05,
    svm_weights_selected,
    width=0.2,
    label="SVM weights after selection",
)

plt.title("Comparing feature selection")
plt.xlabel("Feature number")
plt.yticks(())
plt.axis("tight")
plt.legend(loc="upper right")
plt.show()

# %%
# Without univariate feature selection, the SVM assigns a large weight to the
# first 4 (significant) original features, but also selects many of the
# non-informative features. Applying univariate feature selection before the
# SVM increases the SVM weight attributed to the significant features, and
# will thus improve classification.
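
# %%
# As a rough robustness check (a sketch, not part of the figure above), the two
# pipelines can also be compared with cross-validation on the full noisy
# dataset; the default 5-fold CV of cross_val_score is assumed here.
from sklearn.model_selection import cross_val_score

cv_without_selection = cross_val_score(
    make_pipeline(MinMaxScaler(), LinearSVC()), X, y
)
cv_with_selection = cross_val_score(
    make_pipeline(SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC()), X, y
)
print("CV accuracy without selection: {:.3f}".format(cv_without_selection.mean()))
print("CV accuracy with selection: {:.3f}".format(cv_with_selection.mean()))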