DOC add a second example for KBinsDiscretizer (#10195) · scikit-learn/scikit-learn@430af30 · GitHub

Commit 430af30

TomDLT authored and jnothman committed
DOC add a second example for KBinsDiscretizer (#10195)
1 parent 2c9134e commit 430af30

File tree

3 files changed: +193 -1 lines changed
Lines changed: 187 additions & 0 deletions
@@ -0,0 +1,187 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
======================
Feature discretization
======================

A demonstration of feature discretization on synthetic classification datasets.
Feature discretization decomposes each feature into a set of bins, here equally
distributed in width. The discrete values are then one-hot encoded, and given
to a linear classifier. This preprocessing enables a non-linear behavior even
though the classifier is linear.

In this example, the first two rows represent linearly non-separable datasets
(moons and concentric circles) while the third is approximately linearly
separable. On the two linearly non-separable datasets, feature discretization
largely increases the performance of linear classifiers. On the linearly
separable dataset, feature discretization decreases the performance of linear
classifiers. Two non-linear classifiers are also shown for comparison.

This example should be taken with a grain of salt, as the intuition conveyed
does not necessarily carry over to real datasets. Particularly in
high-dimensional spaces, data can more easily be separated linearly. Moreover,
using feature discretization and one-hot encoding increases the number of
features, which easily leads to overfitting when the number of samples is small.

The plots show training points in solid colors and testing points
semi-transparent. The lower right shows the classification accuracy on the test
set.
"""
print(__doc__)

# Code source: Tom Dupré la Tour
# Adapted from plot_classifier_comparison by Gaël Varoquaux and Andreas Müller
#
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import GradientBoostingClassifier

h = .02  # step size in the mesh


def get_name(estimator):
    name = estimator.__class__.__name__
    if name == 'Pipeline':
        name = [get_name(est[1]) for est in estimator.steps]
        name = ' + '.join(name)
    return name


# list of (estimator, param_grid), where param_grid is used in GridSearchCV
classifiers = [
    (LogisticRegression(solver='lbfgs', random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (LinearSVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (make_pipeline(
        KBinsDiscretizer(encode='onehot'),
        LogisticRegression(solver='lbfgs', random_state=0)), {
            'kbinsdiscretizer__n_bins': np.arange(2, 10),
            'logisticregression__C': np.logspace(-2, 7, 10),
        }),
    (make_pipeline(
        KBinsDiscretizer(encode='onehot'), LinearSVC(random_state=0)), {
            'kbinsdiscretizer__n_bins': np.arange(2, 10),
            'linearsvc__C': np.logspace(-2, 7, 10),
        }),
    (GradientBoostingClassifier(n_estimators=50, random_state=0), {
        'learning_rate': np.logspace(-4, 0, 10)
    }),
    (SVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
]

names = [get_name(e) for e, g in classifiers]

n_samples = 100
datasets = [
    make_moons(n_samples=n_samples, noise=0.2, random_state=0),
    make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1),
    make_classification(n_samples=n_samples, n_features=2, n_redundant=0,
                        n_informative=2, random_state=2,
                        n_clusters_per_class=1)
]

figure = plt.figure(figsize=(21, 9))
cm = plt.cm.PiYG
cm_bright = ListedColormap(['#b30065', '#178000'])
i = 1
# iterate over datasets
for ds_cnt, (X, y) in enumerate(datasets):
    print('\ndataset %d\n---------' % ds_cnt)

    # preprocess dataset, split into training and test part
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.5, random_state=42)

    # create the grid for background colors
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # plot the dataset first
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
               edgecolors='k')
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
               edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, (estimator, param_grid) in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)

        clf = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print('%s: %.2f' % (name, score))

        # plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]*[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                   edgecolors='k')
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   edgecolors='k', alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())

        if ds_cnt == 0:
            ax.set_title(name.replace(' + ', '\n'))
        ax.text(0.95, 0.06, ('%.2f' % score).lstrip('0'), size=15,
                bbox=dict(boxstyle='round', alpha=0.8, facecolor='white'),
                transform=ax.transAxes, horizontalalignment='right')

        i += 1

plt.tight_layout()

# Add suptitles above the figure
plt.subplots_adjust(top=0.90)
suptitles = [
    'Linear classifiers',
    'Feature discretization and linear classifiers',
    'Non-linear classifiers',
]
for i, suptitle in zip([2, 4, 6], suptitles):
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    ax.text(1.05, 1.25, suptitle, transform=ax.transAxes,
            horizontalalignment='center', size='x-large')
plt.show()
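
Editor's note (not part of the commit): for a quick sense of the preprocessing this example demonstrates, stripped of the plotting and grid-search machinery, a minimal sketch is shown below. The dataset and the n_bins value are chosen arbitrarily for illustration.

import numpy as np
from sklearn.datasets import make_moons
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer

# A small non-linearly-separable dataset, as in the example above.
X, y = make_moons(n_samples=100, noise=0.2, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42)

# Bin each feature, one-hot encode the bin indices, then fit a linear model
# on the expanded features: the model can carve out axis-aligned regions even
# though it is linear in the one-hot features.
clf = make_pipeline(
    KBinsDiscretizer(n_bins=5, encode='onehot'),   # n_bins=5 is arbitrary here
    LogisticRegression(solver='lbfgs', random_state=0))
clf.fit(X_train, y_train)
print('test accuracy: %.3f' % clf.score(X_test, y_test))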

sklearn/preprocessing/discretization.py

Lines changed: 1 addition & 1 deletion
@@ -166,7 +166,7 @@ def _validate_n_bins(self, n_features, ignored):
         """
         orig_bins = self.n_bins
         if isinstance(orig_bins, numbers.Number):
-            if not isinstance(orig_bins, np.int):
+            if not isinstance(orig_bins, (numbers.Integral, np.integer)):
                 raise ValueError("{} received an invalid n_bins type. "
                                  "Received {}, expected int."
                                  .format(KBinsDiscretizer.__name__,
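
Editor's note: a likely motivation for this one-line change is that np.int is merely an alias for the built-in int, so NumPy integer scalars (for instance values drawn from np.arange(2, 10), as in the parameter grid of the new example) failed the old isinstance check even though they are valid bin counts. A small illustration follows; it is not part of the commit, and it avoids np.int itself since recent NumPy releases have removed that alias.

import numbers
import numpy as np

n_bins = np.arange(2, 10)[0]   # a NumPy integer scalar, e.g. np.int64(2)

# Old check: np.int was an alias for the built-in int, and NumPy integer
# scalars are not instances of int, so valid values were rejected.
print(isinstance(n_bins, int))                               # False
# New check: numbers.Integral / np.integer accept both plain Python ints
# and NumPy integer scalars.
print(isinstance(n_bins, (numbers.Integral, np.integer)))    # True
print(isinstance(2, (numbers.Integral, np.integer)))         # True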

sklearn/preprocessing/tests/test_discretization.py

Lines changed: 5 additions & 0 deletions
@@ -29,6 +29,11 @@ def test_fit_transform():
     assert_array_equal(expected, est.transform(X))


+def test_valid_n_bins():
+    KBinsDiscretizer(n_bins=2).fit_transform(X)
+    KBinsDiscretizer(n_bins=np.array([2])[0]).fit_transform(X)
+
+
 def test_invalid_n_bins():
     est = KBinsDiscretizer(n_bins=1)
     assert_raise_message(ValueError, "KBinsDiscretizer received an invalid "

0 commit comments
