!pip install make-spirals

Nearest Centroid Classifier


The NearestCentroid classifier is a simple algorithm that represents each class by the centroid of its members. It works as follows:

1. Compute the centroid of each class as the per-feature mean of all the samples assigned to that class.
2. Assign each sample to the class represented by the closest centroid.

As you can see, there are no parameters to tune, which makes it a good baseline classifier. The short sketch below checks both steps against scikit-learn's implementation.
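As a minimal sketch (using a small make_blobs toy set that is not part of the original examples), the two steps above can be reproduced by hand and compared with scikit-learn's NearestCentroid:

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.neighbors import NearestCentroid

# toy data: 3 well-separated classes (illustrative choice)
X, y = make_blobs(n_samples=60, centers=3, cluster_std=1.0, random_state=0)

# step 1: centroid of each class = per-feature mean of its samples
centroids = np.array([X[y == c].mean(axis=0) for c in np.unique(y)])

# step 2: assign each sample to the class with the closest centroid
dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
y_pred = np.argmin(dists, axis=1)

# same result with scikit-learn (both checks should print True)
nc = NearestCentroid().fit(X, y)
print(np.allclose(centroids, nc.centroids_))
print(np.array_equal(y_pred, nc.predict(X)))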

Let's see how it works with an example:

import matplotlib.pyplot as plt
import numpy as np

from sklearn.neighbors import NearestCentroid
from sklearn.datasets import make_blobs, make_moons, make_circles

n_centers = [2, 3, 4, 5]

fig, axs = plt.subplots(nrows=2, ncols=len(n_centers),
                        figsize=(5*len(n_centers), 10))
fig.tight_layout(pad=4.0)

for i in range(len(n_centers)):
    # synthetic dataset with n_centers[i] Gaussian blobs
    X, y = make_blobs(n_samples=300, n_features=2, centers=n_centers[i],
                      cluster_std=2.0, random_state=42)

    # axis limits with a 10% margin
    min = np.amin(X, axis=0)
    max = np.amax(X, axis=0)
    diff = max - min
    min = min - 0.1 * diff
    max = max + 0.1 * diff

    # raw samples
    axs[0,i].set_title('Raw samples (' + str(n_centers[i]) + ' classes)')
    axs[0,i].set_xlabel('X1')
    axs[0,i].set_ylabel('X2')
    axs[0,i].set_xlim(min[0], max[0])
    axs[0,i].set_ylim(min[1], max[1])
    axs[0,i].scatter(X[:,0], X[:,1], c=y, cmap='rainbow')

    # fit the classifier and evaluate it on a grid to draw the decision regions
    nc = NearestCentroid().fit(X, y)

    xx, yy = np.meshgrid(np.linspace(min[0], max[0]),
                         np.linspace(min[1], max[1]))
    Z = nc.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # classified samples, decision regions and centroids (white dots)
    axs[1,i].set_title('Classified samples (' + str(n_centers[i]) + ' classes)')
    axs[1,i].set_xlabel('X1')
    axs[1,i].set_ylabel('X2')
    axs[1,i].set_xlim(min[0], max[0])
    axs[1,i].set_ylim(min[1], max[1])
    axs[1,i].contourf(xx, yy, Z, cmap='rainbow', alpha=0.6, antialiased=True)
    axs[1,i].scatter(X[:,0], X[:,1], c=y, edgecolor='black', cmap='rainbow')
    axs[1,i].scatter(nc.centroids_[:,0], nc.centroids_[:,1], c='white',
                     edgecolor='black', s=200)

Although its performance on the blobs dataset is acceptable, when we try to classify samples that are not linearly separable the algorithm gives very poor results.
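One way to make "very poor" concrete is to compare the training accuracy on each dataset. A quick sketch (the exact scores depend on the noise levels and random_state, so take them as indicative only):

from sklearn.datasets import make_blobs, make_moons, make_circles
from sklearn.neighbors import NearestCentroid

datasets = {
    'blobs': make_blobs(n_samples=300, centers=3, cluster_std=2.0, random_state=42),
    'moons': make_moons(n_samples=300, noise=0.15, random_state=42),
    'circles': make_circles(n_samples=300, noise=0.20, factor=0.2, random_state=42),
}

for name, (X, y) in datasets.items():
    nc = NearestCentroid().fit(X, y)
    # accuracy on the training data; noticeably lower for the non-linear datasets
    print(name, round(nc.score(X, y), 3))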

With moons:

import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.datasets import make_moons

fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
fig.tight_layout(pad=4.0)

X, y = make_moons(n_samples=300, noise=0.15, random_state=42)

# axis limits with a 10% margin
min = np.amin(X, axis=0)
max = np.amax(X, axis=0)
diff = max - min
min = min - 0.1 * diff
max = max + 0.1 * diff

axs[0].set_title('Raw samples')
axs[0].set_xlabel('X1')
axs[0].set_ylabel('X2')
axs[0].set_xlim(min[0], max[0])
axs[0].set_ylim(min[1], max[1])
axs[0].scatter(X[:,0], X[:,1], c=y, cmap=plt.cm.bwr, alpha=0.75)

nc = NearestCentroid().fit(X, y)

xx, yy = np.meshgrid(np.linspace(min[0], max[0]),
                     np.linspace(min[1], max[1]))
Z = nc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axs[1].set_title('Classified samples')
axs[1].set_xlabel('X1')
axs[1].set_ylabel('X2')
axs[1].set_xlim(min[0], max[0])
axs[1].set_ylim(min[1], max[1])
axs[1].contourf(xx, yy, Z, cmap='bwr', alpha=0.8, antialiased=True)
axs[1].scatter(X[:,0], X[:,1], c=y, edgecolor='black', cmap='bwr')
axs[1].scatter(nc.centroids_[:,0], nc.centroids_[:,1], c='white',
               edgecolor='black', s=200)

With circles:

import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.datasets import make_circles

fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
fig.tight_layout(pad=4.0)

X, y = make_circles(n_samples=300, noise=0.20, factor=0.2,
                    random_state=42)

# axis limits with a 10% margin
min = np.amin(X, axis=0)
max = np.amax(X, axis=0)
diff = max - min
min = min - 0.1 * diff
max = max + 0.1 * diff

axs[0].set_title('Raw samples')
axs[0].set_xlabel('X1')
axs[0].set_ylabel('X2')
axs[0].set_xlim(min[0], max[0])
axs[0].set_ylim(min[1], max[1])
axs[0].scatter(X[:,0], X[:,1], c=y, cmap=plt.cm.bwr, alpha=0.75)

nc = NearestCentroid().fit(X, y)

xx, yy = np.meshgrid(np.linspace(min[0], max[0]),
                     np.linspace(min[1], max[1]))
Z = nc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axs[1].set_title('Classified samples')
axs[1].set_xlabel('X1')
axs[1].set_ylabel('X2')
axs[1].set_xlim(min[0], max[0])
axs[1].set_ylim(min[1], max[1])
axs[1].contourf(xx, yy, Z, cmap='bwr', alpha=0.8, antialiased=True)
axs[1].scatter(X[:,0], X[:,1], c=y, edgecolor='black', cmap='bwr')
axs[1].scatter(nc.centroids_[:,0], nc.centroids_[:,1], c='white',
               edgecolor='black', s=200)

With spirals:

import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import NearestCentroid
from make_spirals import make_spirals

fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
fig.tight_layout(pad=4.0)

X, y = make_spirals(n_samples=1000, random_state=42)

# axis limits with a 10% margin
min = np.amin(X, axis=0)
max = np.amax(X, axis=0)
diff = max - min
min = min - 0.1 * diff
max = max + 0.1 * diff

axs[0].set_title('Raw samples')
axs[0].set_xlabel('X1')
axs[0].set_ylabel('X2')
axs[0].set_xlim(min[0], max[0])
axs[0].set_ylim(min[1], max[1])
axs[0].scatter(X[:,0], X[:,1], c=y, cmap=plt.cm.bwr, alpha=0.75)

nc = NearestCentroid().fit(X, y)

xx, yy = np.meshgrid(np.linspace(min[0], max[0]),
                     np.linspace(min[1], max[1]))
Z = nc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axs[1].set_title('Classified samples')
axs[1].set_xlabel('X1')
axs[1].set_ylabel('X2')
axs[1].set_xlim(min[0], max[0])
axs[1].set_ylim(min[1], max[1])
axs[1].contourf(xx, yy, Z, cmap='bwr', alpha=0.8, antialiased=True)
axs[1].scatter(X[:,0], X[:,1], c=y, edgecolor='black', cmap='bwr')
axs[1].scatter(nc.centroids_[:,0], nc.centroids_[:,1], c='white',
               edgecolor='black', s=200)

The algorithm lets you choose the distance metric through the metric parameter, and it accepts a shrink_threshold that modifies the centroid computation: each centroid's value for each feature is divided by the within-class variance of that feature and then reduced by the threshold. This makes the classifier less sensitive to datasets whose features have very different variances.
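Regarding metric, a small sketch of its effect (assuming the installed scikit-learn version still accepts metric='manhattan'; with that metric, according to the scikit-learn documentation, each class centroid is the component-wise median of its samples rather than the mean):

from sklearn.datasets import make_blobs
from sklearn.neighbors import NearestCentroid

X, y = make_blobs(n_samples=300, n_features=2, centers=3,
                  cluster_std=2.0, random_state=42)

nc_l2 = NearestCentroid(metric='euclidean').fit(X, y)   # centroids = class means
nc_l1 = NearestCentroid(metric='manhattan').fit(X, y)   # centroids = class medians (assumes manhattan is still supported)
print(nc_l2.centroids_)
print(nc_l1.centroids_)

The shrink_threshold option is illustrated below: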

import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.datasets import make_blobs

fig, axs = plt.subplots(nrows=1, ncols=4, figsize=(20, 5))
fig.tight_layout(pad=4.0)

# dataset with very different variances per feature
X, y = make_blobs(n_samples=100, n_features=2, centers=3,
                  cluster_std=1.0, random_state=42)
X = X * [1, 3]

# common axis limits with a 10% margin
min = np.min(X)
max = np.max(X)
diff = max - min
min = min - 0.1 * diff
max = max + 0.1 * diff
xx, yy = np.meshgrid(np.linspace(min, max), np.linspace(min, max))

# plot dataset
axs[0].set_title('Raw samples')
axs[0].set_xlabel('X1')
axs[0].set_ylabel('X2')
axs[0].set_xlim(min, max)
axs[0].set_ylim(min, max)
axs[0].scatter(X[:,0], X[:,1], c=y, cmap='rainbow')

# shrink_threshold=None
nc = NearestCentroid(shrink_threshold=None).fit(X, y)
Z = nc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axs[1].set_title('Classified samples (shrink_threshold=None)')
axs[1].set_xlabel('X1')
axs[1].set_ylabel('X2')
axs[1].set_xlim(min, max)
axs[1].set_ylim(min, max)
axs[1].contourf(xx, yy, Z, cmap='rainbow', alpha=0.7, antialiased=True)
axs[1].scatter(X[:,0], X[:,1], c=y, cmap='rainbow', edgecolor='black')
axs[1].scatter(nc.centroids_[:,0], nc.centroids_[:,1], c='white',
               edgecolor='black', s=200)

# shrink_threshold=2
nc = NearestCentroid(shrink_threshold=2).fit(X, y)
Z = nc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axs[2].set_title('Classified samples (shrink_threshold=2)')
axs[2].set_xlabel('X1')
axs[2].set_ylabel('X2')
axs[2].set_xlim(min, max)
axs[2].set_ylim(min, max)
axs[2].contourf(xx, yy, Z, cmap='rainbow', alpha=0.7, antialiased=True)
axs[2].scatter(X[:,0], X[:,1], c=y, cmap='rainbow', edgecolor='black')
axs[2].scatter(nc.centroids_[:,0], nc.centroids_[:,1], c='white',
               edgecolor='black', s=200)

# shrink_threshold=8
nc = NearestCentroid(shrink_threshold=8).fit(X, y)
Z = nc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

axs[3].set_title('Classified samples (shrink_threshold=8)')
axs[3].set_xlabel('X1')
axs[3].set_ylabel('X2')
axs[3].set_xlim(min, max)
axs[3].set_ylim(min, max)
axs[3].contourf(xx, yy, Z, cmap='rainbow', alpha=0.7, antialiased=True)
axs[3].scatter(X[:,0], X[:,1], c=y, cmap='rainbow', edgecolor='black')
axs[3].scatter(nc.centroids_[:,0], nc.centroids_[:,1], c='white',
               edgecolor='black', s=200)

Created by Fernando Ortega (fernando.ortega@upm.es)
