=================================================

This example shows how to perform univariate feature selection before running a
-SVC (support vector classifier) to improve the classification scores.
+SVC (support vector classifier) to improve the classification scores. We use
+the iris dataset (4 features) and add 36 non-informative features. We find
+that the model achieves its best performance when we select around 10% of
+the features.
"""
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
-from sklearn.datasets import load_digits
+from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


# #############################################################################
# Import some data to play with
-X, y = load_digits(return_X_y=True)
-# Throw away data, to be in the curse of dimension settings
-X = X[:200]
-y = y[:200]
-n_samples = len(y)
-X = X.reshape((n_samples, -1))
-# add 200 non-informative features
-X = np.hstack((X, 2 * np.random.random((n_samples, 200))))
+X, y = load_iris(return_X_y=True)
+# Add non-informative features
+np.random.seed(0)
+X = np.hstack((X, 2 * np.random.random((X.shape[0], 36))))
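As a quick sanity check of the new data-loading block (a sketch, not part of
the patch): iris has 150 samples and 4 features, so after stacking the 36
random columns the design matrix should be 150 x 40::

    import numpy as np
    from sklearn.datasets import load_iris

    X, y = load_iris(return_X_y=True)
    np.random.seed(0)
    # 4 informative iris features plus 36 uniform noise columns in [0, 2)
    X = np.hstack((X, 2 * np.random.random((X.shape[0], 36))))
    assert X.shape == (150, 40)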

# #############################################################################
-# Create a feature-selection transform and an instance of SVM that we
+# Create a feature-selection transform, a scaler and an instance of SVM that we
# combine together to have a full-blown estimator
-
-transform = SelectPercentile(chi2)
-
-clf = Pipeline([('anova', transform), ('svc', SVC(gamma="auto"))])
+clf = Pipeline([('anova', SelectPercentile(chi2)),
+                ('scaler', StandardScaler()),
+                ('svc', SVC(gamma="auto"))])
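Not part of the patch, but to see which of the 40 columns the chi2 step keeps
at a given percentile, the fitted pipeline can be inspected through its named
steps. The 10% setting below is just illustrative; X, y and clf are the
objects built above::

    # Fit at percentile=10 and list the surviving column indices
    clf.set_params(anova__percentile=10)
    clf.fit(X, y)
    support = clf.named_steps['anova'].get_support()
    print("selected columns:", np.flatnonzero(support))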

# #############################################################################
# Plot the cross-validation score as a function of percentile of features
...

for percentile in percentiles:
    clf.set_params(anova__percentile=percentile)
-    # Compute cross-validation score using 1 CPU
-    this_scores = cross_val_score(clf, X, y, cv=5, n_jobs=1)
+    this_scores = cross_val_score(clf, X, y, cv=5)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

plt.errorbar(percentiles, score_means, np.array(score_stds))
-
plt.title(
    'Performance of the SVM-Anova varying the percentile of features selected')
+plt.xticks(np.linspace(0, 100, 11, endpoint=True))
plt.xlabel('Percentile')
-plt.ylabel('Prediction rate')
-
+plt.ylabel('Accuracy Score')
plt.axis('tight')
plt.show()
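Instead of eyeballing the error-bar plot for the best percentile, a small
sketch (not part of the patch, reusing the percentiles and score_means lists
filled in the loop above) can report it directly::

    # Print the percentile with the highest mean cross-validation accuracy
    best = int(np.argmax(score_means))
    print("best percentile: %d%% (mean CV accuracy %.3f)"
          % (percentiles[best], score_means[best]))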