8000 ML Docs: Update all kNN sections based on discussions · samaffolter/plotly.py@0b861e9 · GitHub
[go: up one dir, main page]

Skip to content

Commit 0b861e9

Browse files
author
xhlu
committed
ML Docs: Update all kNN sections based on discussions
1 parent f96b816 commit 0b861e9

File tree

1 file changed

+103
-38
lines changed

1 file changed

+103
-38
lines changed

doc/python/ml-knn.md

Lines changed: 103 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -36,56 +36,79 @@ jupyter:
3636

3737
## Basic binary classification with kNN
3838

39+
This section gets us started with displaying basic binary classification using 2D data. We first show how to display training versus testing data using [various marker styles](https://plot.ly/python/marker-style/), then demonstrate how to evaluate a kNN classifier's performance on the **test split** using a continuous color gradient to indicate the model's predicted score.
3940

40-
### Display training and test splits
41-
42-
```python
43-
44-
```
4541

46-
### Visualize predictions on test split
42+
### Display training and test splits
4743

48-
```python
4944

50-
```
45+
Here, we display all the negative labels as squares, and positive labels as circles. We differentiate the training and test sets by adding a dot to the center of the test data markers.
5146

5247
```python
5348
import numpy as np
5449
import plotly.express as px
5550
import plotly.graph_objects as go
5651
from sklearn.datasets import make_moons
52+
from sklearn.model_selection import train_test_split
5753
from sklearn.neighbors import KNeighborsClassifier
5854

5955
X, y = make_moons(noise=0.3, random_state=0)
60-
X_test, _ = make_moons(noise=0.3, random_state=1)
61-
62-
clf = KNeighborsClassifier(15)
63-
clf.fit(X, y.astype(str)) # Fit on training set
64-
y_pred = clf.predict(X_test) # Predict on new data
65-
66-
fig = px.scatter(x=X_test[:, 0], y=X_test[:, 1], color=y_pred, labels={'color': 'predicted'})
67-
fig.update_traces(marker_size=10)
56+
X_train, X_test, y_train, y_test = train_test_split(
57+
X, y.astype(str), test_size=0.25, random_state=0)
58+
59+
trace_specs = [
60+
[X_train, y_train, '0', 'Train', 'square'],
61+
[X_train, y_train, '1', 'Train', 'circle'],
62+
[X_test, y_test, '0', 'Test', 'square-dot'],
63+
[X_test, y_test, '1', 'Test', 'circle-dot']
64+
]
65+
66+
fig = go.Figure(data=[
67+
go.Scatter(
68+
x=X[y==label, 0], y=X[y==label, 1],
69+
name=f'{split} Split, Label {label}',
70+
mode='markers', marker_symbol=marker
71+
)
72+
for X, y, label, split, marker in trace_specs
73+
])
74+
fig.update_traces(
75+
marker_size=12, marker_line_width=1.5,
76+
marker_color="lightyellow"
77+
)
6878
fig.show()
6979
```
7080

71-
## Visualize Binary Prediction Scores
81+
### Visualize predictions on test split
82+
83+
84+
Now, we evaluate the model only on the test set. Notice that `px.scatter` requires only one function call to plot both negative and positive labels, and can additionally set a continuous color scale based on the `y_score` output by our kNN model.
7285

7386
```python
7487
import numpy as np
7588
import plotly.express as px
7689
import plotly.graph_objects as go
77-
from sklearn.datasets import make_classification
90+
from sklearn.datasets import make_moons
91+
from sklearn.model_selection import train_test_split
7892
from sklearn.neighbors import KNeighborsClassifier
7993

80-
X, y = make_classification(n_features=2, n_redundant=0, random_state=0)
81-
X_test, _ = make_classification(n_features=2, n_redundant=0, random_state=1)
94+
# Load and split data
95+
X, y = make_moons(noise=0.3, random_state=0)
96+
X_train, X_test, y_train, y_test = train_test_split(
97+
X, y.astype(str), test_size=0.25, random_state=0)
8298

99+
# Fit the model on training data, predict on test data
83100
clf = KNeighborsClassifier(15)
84-
clf.fit(X, y) # Fit on training set
85-
y_score = clf.predict_proba(X_test)[:, 1] # Predict on new data
86-
87-
fig = px.scatter(x=X_test[:, 0], y=X_test[:, 1], color=y_score, labels={'color': 'score'})
88-
fig.update_traces(marker_size=10)
101+
clf.fit(X_train, y_train)
102+
y_score = clf.predict_proba(X_test)[:, 1]
103+
104+
fig = px.scatter(
105+
X_test, x=0, y=1,
106+
color=y_score, color_continuous_scale='RdBu',
107+
symbol=y_test, symbol_map={'0': 'square-dot', '1': 'circle-dot'},
108+
labels={'symbol': 'Label', 'color': 'Score'}
109+
)
110+
fig.update_traces(marker_size=12, marker_line_width=1.5)
111+
fig.update_layout(legend_orientation='h')
89112
fig.show()
90113
```
91114

@@ -96,12 +119,16 @@ import numpy as np
96119
import plotly.express as px
97120
import plotly.graph_objects as go
98121
from sklearn.datasets import make_moons
122+
from sklearn.model_selection import train_test_split
99123
from sklearn.neighbors import KNeighborsClassifier
100124

101125
mesh_size = .02
102-
margin = 1
126+
margin = 0.25
103127

128+
# Load and split data
104129
X, y = make_moons(noise=0.3, random_state=0)
130+
X_train, X_test, y_train, y_test = train_test_split(
131+
X, y.astype(str), test_size=0.25, random_state=0)
105132

106133
# Create a mesh grid on which we will run our model
107134
x_min, x_max = X[:, 0].min() - margin, X[:, 0].max() + margin
@@ -116,24 +143,45 @@ clf.fit(X, y)
116143
Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
117144
Z = Z.reshape(xx.shape)
118145

119-
fig = px.scatter(X, x=0, y=1, color=y.astype(str), labels={'0':'', '1':''})
120-
fig.update_traces(marker_size=10, marker_line_width=1)
146+
trace_specs = [
147+
[X_train, y_train, '0', 'Train', 'square'],
148+
[X_train, y_train, '1', 'Train', 'circle'],
149+
[X_test, y_test, '0', 'Test', 'square-dot'],
150+
[X_test, y_test, '1', 'Test', 'circle-dot']
151+
]
152+
153+
fig = go.Figure(data=[
154+
go.Scatter(
155+
x=X[y==label, 0], y=X[y==label, 1],
156+
name=f'{split} Split, Label {label}',
157+
mode='markers', marker_symbol=marker
158+
)
159+
for X, y, label, split, marker in trace_specs
160+
])
161+
fig.update_traces(
162+
marker_size=12, marker_line_width=1.5,
163+
marker_color="lightyellow"
164+
)
165+
121166
fig.add_trace(
122167
go.Contour(
123168
x=xrange,
124169
y=yrange,
125170
z=Z,
126171
showscale=False,
127-
colorscale=['Blue', 'Red'],
172+
colorscale='RdBu',
128173
opacity=0.4,
129-
name='Score'
174+
name='Score',
175+
hoverinfo='skip'
130176
)
131177
)
132178
fig.show()
133179
```
134180

135181
## Multi-class prediction confidence with `go.Heatmap`
136182

183+
It is also possible to visualize the prediction confidence of the model using `go.Heatmap`. In this example, you can see how to compute how confident the model is about its prediction at every point in the 2D grid. Here, we define the confidence at a given point as the difference between the highest class score and the sum of the scores of the other classes.
184+
137185
```python
138186
import numpy as np
139187
import plotly.express as px
@@ -145,8 +193,9 @@ margin = 1
145193

146194
# We will use the iris data, which is included in px
147195
df = px.data.iris()
148-
X = df[['sepal_length', 'sepal_width']]
149-
y = df.species_id
196+
df_train, df_test = train_test_split(df, test_size=0.25, random_state=0)
197+
X_train = df_train[['sepal_length', 'sepal_width']]
198+
y_train = df_train.species_id
150199

151200
# Create a mesh grid on which we will run our model
152201
l_min, l_max = df.sepal_length.min() - margin, df.sepal_length.max() + margin
@@ -157,23 +206,35 @@ ll, ww = np.meshgrid(lrange, wrange)
157206

158207
# Create classifier, run predictions on grid
159208
clf = KNeighborsClassifier(15, weights='distance')
160-
clf.fit(X, y)
209+
clf.fit(X_train, y_train)
161210
Z = clf.predict(np.c_[ll.ravel(), ww.ravel()])
162211
Z = Z.reshape(ll.shape)
163212
proba = clf.predict_proba(np.c_[ll.ravel(), ww.ravel()])
164213
proba = proba.reshape(ll.shape + (3,))
165214

166-
fig = px.scatter(df, x='sepal_length', y='sepal_width', color='species')
167-
fig.update_traces(marker_size=10, marker_line_width=1)
215+
# Compute the confidence, which is the difference between the highest score and the sum of the other scores
216+
diff = proba.max(axis=-1) - (proba.sum(axis=-1) - proba.max(axis=-1))
217+
218+
fig = px.scatter(
219+
df_test, x='sepal_length', y='sepal_width',
220+
symbol='species',
221+
symbol_map={
222+
'setosa': 'square-dot',
223+
'versicolor': 'circle-dot',
224+
'virginica': 'diamond-dot'},
225+
)
226+
fig.update_traces(
227+
marker_size=12, marker_line_width=1.5,
228+
marker_color="lightyellow"
229+
)
168230
fig.add_trace(
169231
go.Heatmap(
170232
x=lrange,
171233
y=wrange,
172-
z=Z,
173-
showscale=False,
174-
colorscale=[[0.0, 'blue'], [0.5, 'red'], [1.0, 'green']],
234+
z=diff,
175235
opacity=0.25,
176236
customdata=proba,
237+
colorscale='RdBu',
177238
hovertemplate=(
178239
'sepal length: %{x} <br>'
179240
'sepal width: %{y} <br>'
@@ -183,6 +244,10 @@ fig.add_trace(
183244
)
184245
)
185246
)
247+
fig.update_layout(
248+
legend_orientation='h',
249+
title='Prediction Confidence on Test Split'
250+
)
186251
fig.show()
187252
```
188253

0 commit comments

Comments
 (0)
0