diff --git a/examples/cluster/plot_dbscan.py b/examples/cluster/plot_dbscan.py
index 85df50c69cb87..74d71b2042b59 100644
--- a/examples/cluster/plot_dbscan.py
+++ b/examples/cluster/plot_dbscan.py
@@ -1,26 +1,26 @@
-# -*- coding: utf-8 -*-
 """
 ===================================
 Demo of DBSCAN clustering algorithm
 ===================================
 
-DBSCAN (Density-Based Spatial Clustering of Applications with Noise)
-finds core samples of high density and expands clusters from them.
-This algorithm is good for data which contains clusters of similar density.
+DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core
+samples in regions of high density and expands clusters from them. This
+algorithm is good for data which contains clusters of similar density.
+
+See the :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` example
+for a demo of different clustering algorithms on 2D datasets.
 
 """
 
-import numpy as np
+# %%
+# Data generation
+# ---------------
+#
+# We use :class:`~sklearn.datasets.make_blobs` to create 3 synthetic clusters.
 
-from sklearn.cluster import DBSCAN
-from sklearn import metrics
 from sklearn.datasets import make_blobs
 from sklearn.preprocessing import StandardScaler
 
-
-# %%
-# Generate sample data
-# --------------------
 centers = [[1, 1], [-1, -1], [1, -1]]
 X, labels_true = make_blobs(
     n_samples=750, centers=centers, cluster_std=0.4, random_state=0
@@ -28,12 +28,26 @@
 
 X = StandardScaler().fit_transform(X)
 
+# %%
+# We can visualize the resulting data:
+
+import matplotlib.pyplot as plt
+
+plt.scatter(X[:, 0], X[:, 1])
+plt.show()
+
 # %%
 # Compute DBSCAN
 # --------------
+#
+# One can access the labels assigned by :class:`~sklearn.cluster.DBSCAN` using
+# the `labels_` attribute. Noisy samples are given the label :math:`-1`.
+
+import numpy as np
+from sklearn.cluster import DBSCAN
+from sklearn import metrics
+
 db = DBSCAN(eps=0.3, min_samples=10).fit(X)
-core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
-core_samples_mask[db.core_sample_indices_] = True
 labels = db.labels_
 
 # Number of clusters in labels, ignoring noise if present.
@@ -42,23 +56,46 @@
 
 print("Estimated number of clusters: %d" % n_clusters_)
 print("Estimated number of noise points: %d" % n_noise_)
-print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
-print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
-print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
-print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
+
+# %%
+# Clustering algorithms are fundamentally unsupervised learning methods.
+# However, since :class:`~sklearn.datasets.make_blobs` gives access to the true
+# labels of the synthetic clusters, it is possible to use evaluation metrics
+# that leverage this "supervised" ground truth information to quantify the
+# quality of the resulting clusters. Examples of such metrics are the
+# homogeneity, completeness, V-measure, Rand-Index, Adjusted Rand-Index and
+# Adjusted Mutual Information (AMI).
+#
+# If the ground truth labels are not known, evaluation can only be performed
+# using the model results themselves. In that case, the Silhouette Coefficient
+# comes in handy.
+#
+# For more information, see the
+# :ref:`sphx_glr_auto_examples_cluster_plot_adjusted_for_chance_measures.py`
+# example or the :ref:`clustering_evaluation` module.
+ +print(f"Homogeneity: {metrics.homogeneity_score(labels_true, labels):.3f}") +print(f"Completeness: {metrics.completeness_score(labels_true, labels):.3f}") +print(f"V-measure: {metrics.v_measure_score(labels_true, labels):.3f}") +print(f"Adjusted Rand Index: {metrics.adjusted_rand_score(labels_true, labels):.3f}") print( - "Adjusted Mutual Information: %0.3f" - % metrics.adjusted_mutual_info_score(labels_true, labels) + "Adjusted Mutual Information:" + f" {metrics.adjusted_mutual_info_score(labels_true, labels):.3f}" ) -print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels)) +print(f"Silhouette Coefficient: {metrics.silhouette_score(X, labels):.3f}") # %% -# Plot result -# ----------- -import matplotlib.pyplot as plt +# Plot results +# ------------ +# +# Core samples (large dots) and non-core samples (small dots) are color-coded +# according to the asigned cluster. Samples tagged as noise are represented in +# black. -# Black removed and is used for noise instead. unique_labels = set(labels) +core_samples_mask = np.zeros_like(labels, dtype=bool) +core_samples_mask[db.core_sample_indices_] = True + colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] for k, col in zip(unique_labels, colors): if k == -1: @@ -87,5 +124,5 @@ markersize=6, ) -plt.title("Estimated number of clusters: %d" % n_clusters_) +plt.title(f"Estimated number of clusters: {n_clusters_}") plt.show()