8000 DOC Improve narrative of DBSCAN example by ArturoAmorQ · Pull Request #24874 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

DOC Improve narrative of DBSCAN example #24874

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Nov 10, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 62 additions & 25 deletions examples/cluster/plot_dbscan.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,53 @@
# -*- coding: utf-8 -*-
"""
===================================
Demo of DBSCAN clustering algorithm
===================================

DBSCAN (Density-Based Spatial Clustering of Applications with Noise)
finds core samples of high density and expands clusters from them.
This algorithm is good for data which contains clusters of similar density.
DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core
samples in regions of high density and expands clusters from them. This
algorithm is good for data which contains clusters of similar density.

See the :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` example
for a demo of different clustering algorithms on 2D datasets.

"""

import numpy as np
# %%
# Data generation
# ---------------
#
# We use :class:`~sklearn.datasets.make_blobs` to create 3 synthetic clusters.

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler


# %%
# Generate sample data
# --------------------
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(
n_samples=750, centers=centers, cluster_std=0.4, random_state=0
)

X = StandardScaler().fit_transform(X)

# %%
# We can visualize the resulting data:

import matplotlib.pyplot as plt

plt.scatter(X[:, 0], X[:, 1])
plt.show()

# %%
# Compute DBSCAN
# --------------
#
# One can access the labels assigned by :class:`~sklearn.cluster.DBSCAN` using
# the `labels_` attribute. Noisy samples are given the label math:`-1`.

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics

db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
Expand All @@ -42,23 +56,46 @@

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))

# %%
# Clustering algorithms are fundamentally unsupervised learning methods.
# However, since :class:`~sklearn.datasets.make_blobs` gives access to the true
# labels of the synthetic clusters, it is possible to use evaluation metrics
# that leverage this "supervised" ground truth information to quantify the
# quality of the resulting clusters. Examples of such metrics are the
# homogeneity, completeness, V-measure, Rand-Index, Adjusted Rand-Index and
# Adjusted Mutual Information (AMI).
#
# If the ground truth labels are not known, evaluation can only be performed
# using the model results itself. In that case, the Silhouette Coefficient comes
# in handy.
#
# For more information, see the
# :ref:`sphx_glr_auto_examples_cluster_plot_adjusted_for_chance_measures.py`
# example or the :ref:`clustering_evaluation` module.

print(f"Homogeneity: {metrics.homogeneity_score(labels_true, labels):.3f}")
print(f"Completeness: {metrics.completeness_score(labels_true, labels):.3f}")
print(f"V-measure: {metrics.v_measure_score(labels_true, labels):.3f}")
print(f"Adjusted Rand Index: {metrics.adjusted_rand_score(labels_true, labels):.3f}")
print(
"Adjusted Mutual Information: %0.3f"
% metrics.adjusted_mutual_info_score(labels_true, labels)
"Adjusted Mutual Information:"
f" {metrics.adjusted_mutual_info_score(labels_true, labels):.3f}"
)
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))
print(f"Silhouette Coefficient: {metrics.silhouette_score(X, labels):.3f}")

# %%
# Plot result
# -----------
import matplotlib.pyplot as plt
# Plot results
# ------------
#
# Core samples (large dots) and non-core samples (small dots) are color-coded
# according to the asigned cluster. Samples tagged as noise are represented in
# black.

# Black removed and is used for noise instead.
unique_labels = set(labels)
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
if k == -1:
Expand Down Expand Up @@ -87,5 +124,5 @@
markersize=6,
)

plt.title("Estimated number of clusters: %d" % n_clusters_)
plt.title(f"Estimated number of clusters: {n_clusters_}")
plt.show()
0