Add discretization example · scikit-learn/scikit-learn@ffd97a3 · GitHub

Commit ffd97a3

Add discretization example
1 parent 0a91bce commit ffd97a3

File tree

3 files changed (+169 −1 lines)

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
======================
Feature discretization
======================

A demonstration of feature discretization on synthetic classification datasets.
Feature discretization decomposes each feature into a set of bins, here
equally distributed in width. The discrete values are then one-hot encoded,
and given to a linear classifier. On the two non-linearly separable datasets,
feature discretization largely increases the performance of linear classifiers.

This should be taken with a grain of salt, as the intuition conveyed by
these examples does not necessarily carry over to real datasets.

Particularly in high-dimensional spaces, data can more easily be separated
linearly.

The plots show training points in solid colors and testing points
semi-transparent. The lower right shows the classification accuracy on the test
set.
"""
print(__doc__)

# Code source: Tom Dupré la Tour
# Adapted from plot_classifier_comparison by Gaël Varoquaux and Andreas Müller
#
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import GradientBoostingClassifier

h = .02  # step size in the mesh


def get_name(estimator):
    name = estimator.__class__.__name__
    if name == 'Pipeline':
        name = [get_name(est[1]) for est in estimator.steps]
        name = '\n'.join(name)
    return name


classifiers = [
    (LogisticRegression(solver='lbfgs', random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (LinearSVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (GradientBoostingClassifier(n_estimators=50, random_state=0), {
        'learning_rate': np.logspace(-4, 0, 10)
    }),
    (SVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (make_pipeline(
        KBinsDiscretizer(encode='onehot'),
        LogisticRegression(solver='lbfgs', random_state=0)), {
            'kbinsdiscretizer__n_bins': np.arange(2, 10),
            'logisticregression__C': np.logspace(-2, 7, 10),
        }),
    (make_pipeline(
        KBinsDiscretizer(encode='onehot'), LinearSVC(random_state=0)), {
            'kbinsdiscretizer__n_bins': np.arange(2, 10),
            'linearsvc__C': np.logspace(-2, 7, 10),
        }),
]

names = [get_name(e) for e, g in classifiers]

n_samples = 100
datasets = [
    make_moons(n_samples=n_samples, noise=0.2, random_state=0),
    make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1),
    make_classification(n_samples=n_samples, n_features=2, n_redundant=0,
                        n_informative=2, random_state=2,
                        n_clusters_per_class=1)
]

figure = plt.figure(figsize=(21, 9))
i = 1
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.5, random_state=42)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.PiYG
    cm_bright = ListedColormap(['#b30065', '#178000'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
               edgecolors='k')
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
               edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, (estimator, param_grid) in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print(ds_cnt, name, score)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                   edgecolors='k')
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   edgecolors='k', alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)
        ax.text(xx.max() - .3,
                yy.min() + .3, ('%.2f' % score).lstrip('0'), size=15,
                horizontalalignment='right')
        i += 1

plt.tight_layout()
plt.show()
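
Not part of the commit, but as a quick illustration of the mechanism the example's docstring describes, here is a minimal sketch of what KBinsDiscretizer with encode='onehot' produces on a toy one-feature array. The data values and bin count are chosen arbitrarily, and depending on the scikit-learn version a strategy parameter may also control how the bin edges are placed.

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

# Toy data: a single feature with six samples (illustrative values only).
X_toy = np.array([[-2.0], [-1.0], [-0.5], [0.5], [1.0], [2.0]])

# Cut the feature into 3 bins and one-hot encode each sample's bin index.
disc = KBinsDiscretizer(n_bins=3, encode='onehot')
X_binned = disc.fit_transform(X_toy)

print(disc.bin_edges_)     # per-feature array of bin edges learned from the data
print(X_binned.toarray())  # 6 x 3 indicator matrix: one column per bin

A linear model fit on this output gets one coefficient per bin rather than one per raw feature, which is how the pipelines above let LogisticRegression and LinearSVC draw non-linear decision boundaries.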

sklearn/preprocessing/discretization.py

Lines changed: 1 addition & 1 deletion
@@ -166,7 +166,7 @@ def _validate_n_bins(self, n_features, ignored):
         """
         orig_bins = self.n_bins
         if isinstance(orig_bins, numbers.Number):
-            if not isinstance(orig_bins, np.int):
+            if not isinstance(orig_bins, (np.int, np.integer)):
                 raise ValueError("{} received an invalid n_bins type. "
                                  "Received {}, expected int."
                                  .format(KBinsDiscretizer.__name__,
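
For context (not part of the commit), a minimal sketch of why the broadened check is needed: np.int was merely an alias for the built-in int (the alias has since been removed from NumPy, so the sketch spells it as int), while integer scalars taken out of a NumPy array are np.integer instances, so they previously tripped the "invalid n_bins type" error even though they are valid bin counts.

import numbers
import numpy as np

n_bins = np.array([2])[0]  # a NumPy integer scalar, e.g. np.int64

print(isinstance(n_bins, numbers.Number))     # True, so the inner type check is reached
print(isinstance(n_bins, int))                # False on most platforms: the old check rejected it
print(isinstance(n_bins, (int, np.integer)))  # True: np.integer covers np.int64, np.int32, ...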

sklearn/preprocessing/tests/test_discretization.py

Lines changed: 5 additions & 0 deletions
@@ -29,6 +29,11 @@ def test_fit_transform():
     assert_array_equal(expected, est.transform(X))


+def test_valid_n_bins():
+    KBinsDiscretizer(n_bins=2).fit_transform(X)
+    KBinsDiscretizer(n_bins=np.array([2])[0]).fit_transform(X)
+
+
 def test_invalid_n_bins():
     est = KBinsDiscretizer(n_bins=1)
     assert_raise_message(ValueError, "KBinsDiscretizer received an invalid "

0 commit comments
