|
1 | 1 | """
|
2 |
| -=============================== |
| 2 | +============================ |
3 | 3 | Univariate Feature Selection
|
4 |
| -=============================== |
| 4 | +============================ |
5 | 5 |
|
6 | 6 | An example showing univariate feature selection.
|
7 | 7 |
|
|
24 | 24 | import numpy as np
|
25 | 25 | import matplotlib.pyplot as plt
|
26 | 26 |
|
27 |
| -from sklearn import datasets, svm |
28 |
| -from sklearn.feature_selection import SelectPercentile, f_classif |
| 27 | +from sklearn.datasets import load_iris |
| 28 | +from sklearn.model_selection import train_test_split |
| 29 | +from sklearn.preprocessing import MinMaxScaler |
| 30 | +from sklearn.svm import LinearSVC |
| 31 | +from sklearn.pipeline import make_pipeline |
| 32 | +from sklearn.feature_selection import SelectKBest, f_classif |
29 | 33 |
|
30 | 34 | # #############################################################################
|
31 | 35 | # Import some data to play with
|
32 | 36 |
|
33 | 37 | # The iris dataset
|
34 |
| -iris = datasets.load_iris() |
| 38 | +X, y = load_iris(return_X_y=True) |
35 | 39 |
|
36 | 40 | # Some noisy data not correlated
|
37 |
| -E = np.random.uniform(0, 0.1, size=(len(iris.data), 20)) |
| 41 | +E = np.random.RandomState(42).uniform(0, 0.1, size=(X.shape[0], 20)) |
38 | 42 |
|
39 | 43 | # Add the noisy data to the informative features
|
40 |
| -X = np.hstack((iris.data, E)) |
41 |
| -y = iris.target |
| 44 | +X = np.hstack((X, E)) |
| 45 | + |
| 46 | +# Split dataset to select feature and evaluate the classifier |
| 47 | +X_train, X_test, y_train, y_test = train_test_split( |
| 48 | + X, y, stratify=y, random_state=0 |
| 49 | +) |
42 | 50 |
|
43 | 51 | plt.figure(1)
|
44 | 52 | plt.clf()
|
|
47 | 55 |
|
48 | 56 | # #############################################################################
|
49 | 57 | # Univariate feature selection with F-test for feature scoring
|
50 |
| -# We use the default selection function: the 10% most significant features |
51 |
| -selector = SelectPercentile(f_classif, percentile=10) |
52 |
| -selector.fit(X, y) |
| 58 | +# We use the default selection function to select the four |
| 59 | +# most significant features |
| 60 | +selector = SelectKBest(f_classif, k=4) |
| 61 | +selector.fit(X_train, y_train) |
53 | 62 | scores = -np.log10(selector.pvalues_)
|
54 | 63 | scores /= scores.max()
|
55 | 64 | plt.bar(X_indices - .45, scores, width=.2,
|
|
58 | 67 |
|
59 | 68 | # #############################################################################
|
60 | 69 | # Compare to the weights of an SVM
|
61 |
| -clf = svm.SVC(kernel='linear') |
62 |
| -clf.fit(X, y) |
| 70 | +clf = make_pipeline(MinMaxScaler(), LinearSVC()) |
| 71 | +clf.fit(X_train, y_train) |
| 72 | +print('Classification accuracy without selecting features: {:.3f}' |
| 73 | + .format(clf.score(X_test, y_test))) |
63 | 74 |
|
64 |
| -svm_weights = (clf.coef_ ** 2).sum(axis=0) |
65 |
| -svm_weights /= svm_weights.max() |
| 75 | +svm_weights = np.abs(clf[-1].coef_).sum(axis=0) |
| 76 | +svm_weights /= svm_weights.sum() |
66 | 77 |
|
67 | 78 | plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight',
|
68 | 79 | color='navy', edgecolor='black')
|
69 | 80 |
|
70 |
| -clf_selected = svm.SVC(kernel='linear') |
71 |
| -clf_selected.fit(selector.transform(X), y) |
| 81 | +clf_selected = make_pipeline( |
| 82 | + SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC() |
| 83 | +) |
| 84 | +clf_selected.fit(X_train, y_train) |
| 85 | +print('Classification accuracy after univariate feature selection: {:.3f}' |
| 86 | + .format(clf_selected.score(X_test, y_test))) |
72 | 87 |
|
73 |
| -svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0) |
74 |
| -svm_weights_selected /= svm_weights_selected.max() |
| 88 | +svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0) |
| 89 | +svm_weights_selected /= svm_weights_selected.sum() |
75 | 90 |
|
76 | 91 | plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,
|
77 | 92 | width=.2, label='SVM weights after selection', color='c',
|
|
0 commit comments