EXA Improve example plot_svm_anova.py (#11731) · scikit-learn/scikit-learn@1deb95a · GitHub
Commit 1deb95a

qinhanmin2014 authored and jnothman committed

EXA Improve example plot_svm_anova.py (#11731)

1 parent 59a1ef7 commit 1deb95a

File tree

1 file changed: +16 −20 lines changed


examples/svm/plot_svm_anova.py

Lines changed: 16 additions & 20 deletions
@@ -4,37 +4,35 @@
 =================================================
 
 This example shows how to perform univariate feature selection before running a
-SVC (support vector classifier) to improve the classification scores.
+SVC (support vector classifier) to improve the classification scores. We use
+the iris dataset (4 features) and add 36 non-informative features. We can find
+that our model achieves best performance when we select around 10% of features.
 """
 print(__doc__)
 
 import numpy as np
 import matplotlib.pyplot as plt
-from sklearn.datasets import load_digits
+from sklearn.datasets import load_iris
 from sklearn.feature_selection import SelectPercentile, chi2
 from sklearn.model_selection import cross_val_score
 from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC
 
 
 # #############################################################################
 # Import some data to play with
-X, y = load_digits(return_X_y=True)
-# Throw away data, to be in the curse of dimension settings
-X = X[:200]
-y = y[:200]
-n_samples = len(y)
-X = X.reshape((n_samples, -1))
-# add 200 non-informative features
-X = np.hstack((X, 2 * np.random.random((n_samples, 200))))
+X, y = load_iris(return_X_y=True)
+# Add non-informative features
+np.random.seed(0)
+X = np.hstack((X, 2 * np.random.random((X.shape[0], 36))))
 
 # #############################################################################
-# Create a feature-selection transform and an instance of SVM that we
+# Create a feature-selection transform, a scaler and an instance of SVM that we
 # combine together to have an full-blown estimator
-
-transform = SelectPercentile(chi2)
-
-clf = Pipeline([('anova', transform), ('svc', SVC(gamma="auto"))])
+clf = Pipeline([('anova', SelectPercentile(chi2)),
+                ('scaler', StandardScaler()),
+                ('svc', SVC(gamma="auto"))])
 
 # #############################################################################
 # Plot the cross-validation score as a function of percentile of features
@@ -44,17 +42,15 @@
 
 for percentile in percentiles:
     clf.set_params(anova__percentile=percentile)
-    # Compute cross-validation score using 1 CPU
-    this_scores = cross_val_score(clf, X, y, cv=5, n_jobs=1)
+    this_scores = cross_val_score(clf, X, y, cv=5)
     score_means.append(this_scores.mean())
     score_stds.append(this_scores.std())
 
 plt.errorbar(percentiles, score_means, np.array(score_stds))
-
 plt.title(
     'Performance of the SVM-Anova varying the percentile of features selected')
+plt.xticks(np.linspace(0, 100, 11, endpoint=True))
 plt.xlabel('Percentile')
-plt.ylabel('Prediction rate')
-
+plt.ylabel('Accuracy Score')
 plt.axis('tight')
 plt.show()
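
For reference, below is a minimal, self-contained sketch of what the example does after this commit, assembled from the two hunks above: chi2-based univariate selection, then scaling, then an SVC, scored by 5-fold cross-validation at several selection percentiles. The percentile grid and the printed summary here are illustrative assumptions; the diff does not show the file's full percentiles tuple or the surrounding plotting setup.

# Sketch of the updated example's pipeline (assumptions noted in comments).
import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Iris has 4 informative features; pad with 36 uniform-noise columns so that
# keeping roughly 10% of the 40 features recovers the informative ones.
np.random.seed(0)
X, y = load_iris(return_X_y=True)
X = np.hstack((X, 2 * np.random.random((X.shape[0], 36))))

clf = Pipeline([('anova', SelectPercentile(chi2)),
                ('scaler', StandardScaler()),
                ('svc', SVC(gamma="auto"))])

for percentile in (5, 10, 20, 50, 100):  # illustrative grid, not the file's
    clf.set_params(anova__percentile=percentile)
    scores = cross_val_score(clf, X, y, cv=5)
    print("percentile=%3d  accuracy=%.3f +/- %.3f"
          % (percentile, scores.mean(), scores.std()))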

0 commit comments

Comments (0)