diff --git a/examples/cluster/plot_dbscan.py b/examples/cluster/plot_dbscan.py deleted file mode 100644 index af56701db846f..0000000000000 --- a/examples/cluster/plot_dbscan.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -=================================== -Demo of DBSCAN clustering algorithm -=================================== - -DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core -samples in regions of high density and expands clusters from them. This -algorithm is good for data which contains clusters of similar density. - -See the :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` example -for a demo of different clustering algorithms on 2D datasets. - -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -# %% -# Data generation -# --------------- -# -# We use :class:`~sklearn.datasets.make_blobs` to create 3 synthetic clusters. - -from sklearn.datasets import make_blobs -from sklearn.preprocessing import StandardScaler - -centers = [[1, 1], [-1, -1], [1, -1]] -X, labels_true = make_blobs( - n_samples=750, centers=centers, cluster_std=0.4, random_state=0 -) - -X = StandardScaler().fit_transform(X) - -# %% -# We can visualize the resulting data: - -import matplotlib.pyplot as plt - -plt.scatter(X[:, 0], X[:, 1]) -plt.show() - -# %% -# Compute DBSCAN -# -------------- -# -# One can access the labels assigned by :class:`~sklearn.cluster.DBSCAN` using -# the `labels_` attribute. Noisy samples are given the label math:`-1`. - -import numpy as np - -from sklearn import metrics -from sklearn.cluster import DBSCAN - -db = DBSCAN(eps=0.3, min_samples=10).fit(X) -labels = db.labels_ - -# Number of clusters in labels, ignoring noise if present. -n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) -n_noise_ = list(labels).count(-1) - -print("Estimated number of clusters: %d" % n_clusters_) -print("Estimated number of noise points: %d" % n_noise_) - -# %% -# Clustering algorithms are fundamentally unsupervised learning methods. -# However, since :class:`~sklearn.datasets.make_blobs` gives access to the true -# labels of the synthetic clusters, it is possible to use evaluation metrics -# that leverage this "supervised" ground truth information to quantify the -# quality of the resulting clusters. Examples of such metrics are the -# homogeneity, completeness, V-measure, Rand-Index, Adjusted Rand-Index and -# Adjusted Mutual Information (AMI). -# -# If the ground truth labels are not known, evaluation can only be performed -# using the model results itself. In that case, the Silhouette Coefficient comes -# in handy. -# -# For more information, see the -# :ref:`sphx_glr_auto_examples_cluster_plot_adjusted_for_chance_measures.py` -# example or the :ref:`clustering_evaluation` module. - -print(f"Homogeneity: {metrics.homogeneity_score(labels_true, labels):.3f}") -print(f"Completeness: {metrics.completeness_score(labels_true, labels):.3f}") -print(f"V-measure: {metrics.v_measure_score(labels_true, labels):.3f}") -print(f"Adjusted Rand Index: {metrics.adjusted_rand_score(labels_true, labels):.3f}") -print( - "Adjusted Mutual Information:" - f" {metrics.adjusted_mutual_info_score(labels_true, labels):.3f}" -) -print(f"Silhouette Coefficient: {metrics.silhouette_score(X, labels):.3f}") - -# %% -# Plot results -# ------------ -# -# Core samples (large dots) and non-core samples (small dots) are color-coded -# according to the assigned cluster. Samples tagged as noise are represented in -# black. 
-
-unique_labels = set(labels)
-core_samples_mask = np.zeros_like(labels, dtype=bool)
-core_samples_mask[db.core_sample_indices_] = True
-
-colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
-for k, col in zip(unique_labels, colors):
-    if k == -1:
-        # Black used for noise.
-        col = [0, 0, 0, 1]
-
-    class_member_mask = labels == k
-
-    xy = X[class_member_mask & core_samples_mask]
-    plt.plot(
-        xy[:, 0],
-        xy[:, 1],
-        "o",
-        markerfacecolor=tuple(col),
-        markeredgecolor="k",
-        markersize=14,
-    )
-
-    xy = X[class_member_mask & ~core_samples_mask]
-    plt.plot(
-        xy[:, 0],
-        xy[:, 1],
-        "o",
-        markerfacecolor=tuple(col),
-        markeredgecolor="k",
-        markersize=6,
-    )
-
-plt.title(f"Estimated number of clusters: {n_clusters_}")
-plt.show()
diff --git a/examples/cluster/plot_dbscan_hdbscan_optics.py b/examples/cluster/plot_dbscan_hdbscan_optics.py
new file mode 100644
index 0000000000000..c5f301e300634
--- /dev/null
+++ b/examples/cluster/plot_dbscan_hdbscan_optics.py
@@ -0,0 +1,447 @@
+# -*- coding: utf-8 -*-
+"""
+=======================================================
+Demo of DBSCAN, HDBSCAN, OPTICS clustering algorithms
+=======================================================
+.. currentmodule:: sklearn
+
+In this demo we will take a look at the DBSCAN, HDBSCAN, and OPTICS clustering
+algorithms. We will run each algorithm on datasets containing multiple
+clusters of varying densities, and note how the results change as we adjust
+important hyperparameters. We will also show how OPTICS and HDBSCAN can be
+viewed as generalizations of DBSCAN, and how DBSCAN clusterings can be
+efficiently extracted from the results of running these algorithms.
+
+We start by defining helper functions to visualize a dataset and, when
+available, the cluster labels produced by a clustering algorithm. We will use
+these functions throughout this example.
+"""
+
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+# %%
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+def cluster_colours(labels):
+    return [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(set(labels)))]
+
+
+def plot(X, labels, probabilities=None, parameters=None, ground_truth=False, ax=None):
+    if ax is None:
+        _, ax = plt.subplots(figsize=(10, 4))
+    labels = labels if labels is not None else np.ones(X.shape[0])
+    probabilities = probabilities if probabilities is not None else np.ones(X.shape[0])
+    # Black removed and is used for noise instead.
+    unique_labels = set(labels)
+
+    colors = cluster_colours(labels)
+    # The probability of a point belonging to its labeled cluster determines
+    # the size of its marker
+    proba_map = {idx: probabilities[idx] for idx in range(len(labels))}
+    for k, col in zip(unique_labels, colors):
+        if k == -1:
+            # Black used for noise.
+            col = [0, 0, 0, 1]
+
+        class_index = np.where(labels == k)[0]
+        for ci in class_index:
+            ax.plot(
+                X[ci, 0],
+                X[ci, 1],
+                "x" if k == -1 else "o",
+                markerfacecolor=tuple(col),
+                markeredgecolor="black",
+                markersize=4 if k == -1 else 1 + 5 * proba_map[ci],
+            )
+    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
+    preamble = "True" if ground_truth else "Estimated"
+    title = f"{preamble} number of clusters: {n_clusters_}"
+    if parameters is not None:
+        parameters_str = ", ".join(f"{k}={v}" for k, v in parameters.items())
+        title += f" | {parameters_str}"
+    ax.set_title(title)
+    plt.tight_layout()
+
+
+# %%
+# DBSCAN
+# -------
+# DBSCAN stands for Density-Based Spatial Clustering of Applications with Noise.
+# It is a clustering algorithm that takes in a dataset of points and a measure
+# of distance between them.
+#
+# To gain some intuition about how DBSCAN works, we need two concepts:
+# neighborhoods and core points. Generally speaking, when we talk about the
+# neighborhood of a point, we are referring to the collection of points in the
+# dataset that lie within a certain distance of it. The size of the
+# neighborhood depends on the context: a small neighborhood contains only
+# points that are very close to the point in question, while a larger
+# neighborhood also contains points that are further away.
+#
+# Core points are points in the dataset with 'enough' points near them. In
+# other words, a core point is a point whose neighborhood contains a minimum
+# number of points. Clusters in DBSCAN are assigned so that any point in the
+# neighborhood of a core point is considered to be in the same cluster as that
+# core point.
+#
+# To formalize these notions, DBSCAN has two parameters that fully define what
+# makes a sample a core point: `eps` and `min_samples`.
+#
+# `eps` is a positive distance. For a sample to be a core point, there must be
+# at least `min_samples` samples in the dataset within `eps` distance of that
+# sample. In other words, the neighborhood of a core point contains all points
+# that are within `eps` distance of it, and this collection must contain at
+# least `min_samples` points.
+#
+# Let us now try running some examples. First we need to generate a simple
+# ground-truth dataset. We use `make_blobs` to generate a dataset from three
+# two-dimensional isotropic Gaussian distributions. For ease of presentation,
+# our distance metric throughout this document is the Euclidean distance in
+# the two-dimensional plane (all clustering algorithms discussed here accept
+# an optional metric to be used in its place).
+
+from sklearn.datasets import make_blobs
+
+centers = [[-1, -1], [-1, 1], [1, 0]]
+X, labels_true = make_blobs(
+    n_samples=750, centers=centers, cluster_std=[0.2, 0.35, 0.5], random_state=0
+)
+
+plot(X, labels=labels_true, ground_truth=True)
+
+# %%
+# Now let us see how DBSCAN performs on this dataset. We will use `eps=0.2`
+# and `min_samples=10`. Shortly, we will discuss how to choose parameter
+# values suited to the dataset at hand.
+from sklearn.cluster import DBSCAN
+
+db = DBSCAN(eps=0.2, min_samples=10).fit(X)
+plot(X, db.labels_, parameters={"eps": 0.2, "min_samples": 10})
+
+# %%
+# We see that with these settings, DBSCAN correctly computed the number of
+# clusters and labelled most points accurately. Note that many points in the
+# two sparser clusters were labelled as noise, because they were not reachable
+# from any core point (their Euclidean distance to every core point was at
+# least `eps`).
+#
+# Tuning DBSCAN
+# ++++++++++++++
+# How do we choose what parameter values to use? While DBSCAN provides default
+# values for `eps` and `min_samples`, they usually will not give good results,
+# as these parameters are sensitive to the scale and density of the dataset
+# you are working with. Larger values of `min_samples` yield more robustness
+# to noise, but at the risk of grouping small clusters together; smaller
+# values lead to more clusters and less noise. The `eps` parameter behaves
+# similarly: larger values risk merging nearby clusters, while smaller values
+# label more points as noise. Typically `eps` is the more sensitive parameter,
+# so one may tune `eps` after finding a good value for `min_samples`.
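+
+# %%
+# One common heuristic for picking `eps` (shown here as a brief aside, and not
+# used to select the values in the rest of this example) is to look at a
+# "k-distance" plot: for each sample we compute the distance to its
+# `min_samples`-th nearest neighbor and plot these distances in sorted order.
+# A pronounced elbow in the curve suggests a reasonable `eps`, since points to
+# the right of the elbow would need a much larger neighborhood to become core
+# points.
+from sklearn.neighbors import NearestNeighbors
+
+# Distance to the 10th nearest neighbor of each sample (the first "neighbor"
+# returned is the sample itself, which mirrors how DBSCAN counts min_samples).
+k_distances = NearestNeighbors(n_neighbors=10).fit(X).kneighbors(X)[0][:, -1]
+
+_, ax = plt.subplots(figsize=(10, 4))
+ax.plot(np.sort(k_distances))
+ax.set_xlabel("Samples sorted by 10th nearest neighbor distance")
+ax.set_ylabel("10th nearest neighbor distance")
+ax.set_title("k-distance plot (k=10)")
+plt.tight_layout()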
+
+# %%
+# We illustrate this sensitivity by running DBSCAN for `eps` values 0.1, 0.3
+# and 0.4, while leaving `min_samples` fixed at 10. For consistency, we will
+# continue to use these values for `eps` and `min_samples` where applicable.
eps_values = [0.1, 0.3, 0.4]
+
+fig, axes = plt.subplots(len(eps_values), 1, figsize=(10, 12))
+
+for idx, eps in enumerate(eps_values):
+    db = DBSCAN(eps=eps, min_samples=10).fit(X)
+    plot(X, db.labels_, parameters={"eps": eps, "min_samples": 10}, ax=axes[idx])
+
+# %%
+# DBSCAN produces the best results with the parameters of the second plot.
+# However, finding good values of `eps` and `min_samples` is not always this
+# easy; it may require specialized knowledge of the dataset one is working
+# with. While standardizing the dataset with `StandardScaler` may help, great
+# care must still be taken when choosing the value of `eps`.
+#
+# To illustrate these difficulties, we generate another dataset in the same
+# way as before, but with more variability in the clusters.
+
+centers = [[-0.85, -0.85], [-0.85, 0.85], [3, 3], [3, -3]]
+X, labels_true = make_blobs(
+    n_samples=750, centers=centers, cluster_std=[0.2, 0.35, 1.35, 1.35], random_state=0
+)
+plot(X, labels=labels_true, ground_truth=True)
+
+# %%
+# Now we run DBSCAN on this dataset with the same settings as before.
+_, axes = plt.subplots(len(eps_values), 1, figsize=(10, 12))
+
+for idx, eps in enumerate(eps_values):
+    db = DBSCAN(eps=eps, min_samples=10).fit(X)
+    plot(X, db.labels_, parameters={"eps": eps, "min_samples": 10}, ax=axes[idx])
+
+# %%
+# We can see that DBSCAN has a lot more trouble with the multiscale clusters
+# of this dataset. For smaller values of `eps`, too many points are labelled
+# as noise, especially in the sparse clusters, while taking `eps` as large as
+# 0.4 merges the two small dense clusters into one.
+#
+# The difficulty of DBSCAN with multiscale datasets can be attributed to the
+# fixed value of `eps` in the definition of core points, which results in
+# fixed neighborhood sizes when classifying the core points in the dataset. In
+# contrast, HDBSCAN and OPTICS generalize the `eps` parameter of DBSCAN to a
+# range of values, and generally speaking they perform better on multiscale
+# clusters.
+#
+# Before moving on to these algorithms, we first discuss how to evaluate
+# clustering performance. Your dataset might not be as easy to visualize as
+# the one used here; in that case, evaluation metrics are another way to
+# assess the quality of a clustering.
+
+# %%
+# Measuring Clustering Performance
+# -----------------------------------
+# If the ground-truth labels are known, as in our current situation, we can
+# use metrics that quantify the quality of the resulting clusters. Examples
+# include homogeneity, completeness, V-measure, Rand-Index, Adjusted
+# Rand-Index and Adjusted Mutual Information (AMI).
+#
+# If the ground-truth labels are not known, we only have the results of
+# running the clustering algorithm itself. In this case we may use the
+# Silhouette Coefficient. For brevity, we will be satisfied with a simple
+# demonstration of evaluating these metrics on the result of our last DBSCAN
+# run.
+#
+# For more information, see the
+# :ref:`sphx_glr_auto_examples_cluster_plot_adjusted_for_chance_measures.py`
+# example or the :ref:`clustering_evaluation` module.
+from sklearn import metrics
+
+labels = db.labels_
+
+print(f"Homogeneity: {metrics.homogeneity_score(labels_true, labels):.3f}")
+print(f"Completeness: {metrics.completeness_score(labels_true, labels):.3f}")
+print(f"V-measure: {metrics.v_measure_score(labels_true, labels):.3f}")
+print(f"Adjusted Rand Index: {metrics.adjusted_rand_score(labels_true, labels):.3f}")
+print(
+    "Adjusted Mutual Information:"
+    f" {metrics.adjusted_mutual_info_score(labels_true, labels):.3f}"
+)
+
+print(f"Silhouette Coefficient: {metrics.silhouette_score(X, labels):.3f}")
+
+# %%
+# HDBSCAN
+# ---------
+# We now move on to HDBSCAN, or hierarchical DBSCAN. This algorithm can be
+# viewed as an improvement on the ideas in DBSCAN that takes into account
+# different scales in the dataset, using the notions of core distances and
+# reachability distances instead of core points with fixed `eps`-sized
+# neighborhoods. For brevity we leave the full details of this algorithm to
+# the :ref:`User Guide <hdbscan>`.
+#
+# One of the greatest advantages of HDBSCAN over DBSCAN is its out-of-the-box
+# robustness. It's especially remarkable on heterogeneous mixtures of data.
+# For a quick example, let us run it on the previous dataset with its default
+# settings.
+from sklearn.cluster import HDBSCAN
+
+hdb = HDBSCAN().fit(X)
+plot(X, hdb.labels_, hdb.probabilities_)
+
+# %%
+# HDBSCAN is able to adapt to the multi-scale structure of the dataset without
+# requiring any parameter tuning. Also notice that on fit, HDBSCAN exposes a
+# `probabilities_` attribute that indicates, for each sample, the strength
+# with which it is a member of its assigned cluster. Our plotting function
+# uses this attribute to scale the marker size of each sample.
+#
+# Tuning HDBSCAN
+# ++++++++++++++++
+# HDBSCAN still has some hyperparameters, and a more interesting dataset will
+# probably require tuning them. We focus on two of them: `min_cluster_size`
+# and `min_samples`.
+#
+# `min_cluster_size` is the minimum number of samples in a group for that
+# group to be considered a cluster.
+#
+# This is an intuitive parameter to select for: clusters smaller than this
+# size will be left as noise. The default value is 5. This parameter is
+# generally tuned to larger values as needed. Smaller values are likely to
+# lead to results with fewer points labeled as noise. However, values which
+# are too small will lead to false sub-clusters being picked up and preferred.
+# Larger values tend to be more robust with respect to noisy datasets, e.g.
+# high-variance clusters with significant overlap.
+#
+# `min_samples` corresponds, in some sense, to the number of points required
+# in the neighborhood of a point for that point to be considered a core point.
+# It is similar to its counterpart in DBSCAN.
+#
+# A full definition of this parameter requires more context, which the reader
+# can find in the :ref:`User Guide <hdbscan>`. For our purposes, one can
+# interpret this parameter as a measure of how conservative we want the
+# clustering to be. Larger values for `min_samples` increase the model's
+# robustness to noise, but risk ignoring or discarding potentially valid but
+# small clusters.
+#
+# We recommend tuning `min_samples` after finding a good value for
+# `min_cluster_size`. Alternatively, one can set `min_samples=min_cluster_size`
+# and simplify the hyperparameter space. For brevity, this is the option we
+# take in the example plots below, where we show the results of running
+# HDBSCAN with different hyperparameter values.
+
+_, axes = plt.subplots(len(eps_values), 1, figsize=(10, 12))
+hdbscan_param_vals = [5, 15, 25]
+for idx, val in enumerate(hdbscan_param_vals):
+    # the default value for min_samples is min_cluster_size, but
+    # we set it explicitly for clarity
+    hdb = HDBSCAN(min_cluster_size=val, min_samples=val).fit(X)
+    plot(
+        X,
+        hdb.labels_,
+        hdb.probabilities_,
+        parameters={"min_cluster_size": val, "min_samples": val},
+        ax=axes[idx],
+    )
+
+
+# %%
+# Extracting DBSCAN clusters
+# ++++++++++++++++++++++++++
+# During fit, HDBSCAN builds a (single-linkage) tree which encodes a DBSCAN
+# clustering of all points across all values of DBSCAN's `eps` parameter. We
+# can efficiently obtain these DBSCAN-like clusterings without fully
+# recomputing the intermediate values required for an HDBSCAN fit, such as
+# core distances, mutual reachability, and the minimum spanning tree. This is
+# done by specifying a `cut_distance` (equivalent to `eps`) that we want to
+# cluster with. Again, we refer to the :ref:`User Guide <hdbscan>` for more
+# details.
+#
+# We run HDBSCAN with `min_cluster_size` and `min_samples` set to 10 and then
+# use `HDBSCAN.dbscan_clustering(eps)` for various `eps` (the ones we used
+# previously when discussing DBSCAN). Compare the plots generated below with
+# the ones we previously generated by running DBSCAN on the same dataset, and
+# note the similarity.
+
+hdb = HDBSCAN(min_cluster_size=10, min_samples=10)
+hdb.fit(X)
+fig, axes = plt.subplots(len(eps_values), 1, figsize=(10, 12))
+for idx, eps in enumerate(eps_values):
+    params = {"cut_distance": eps}
+    hdb_dbscan_clustering_labels = hdb.dbscan_clustering(eps)
+    plot(
+        X,
+        hdb_dbscan_clustering_labels,
+        hdb.probabilities_,
+        parameters=params,
+        ax=axes[idx],
+    )
+
+# %%
+# OPTICS
+# -------
+# We now discuss OPTICS (Ordering Points To Identify the Clustering Structure).
+# Like HDBSCAN, OPTICS can be viewed as an improvement on DBSCAN: it
+# generalizes the `eps` parameter from DBSCAN to a range of values. To bound
+# this range, the user can set the `max_eps` parameter, which OPTICS uses as
+# an upper limit on the neighborhood sizes it considers. The default value is
+# `np.inf`; smaller values of `max_eps` result in shorter runtimes.
+#
+# It also has parameters `min_samples` and `min_cluster_size`, whose
+# descriptions are similar to their counterparts in HDBSCAN, so we will not
+# repeat them here. It should be noted that values for these parameters that
+# work well for HDBSCAN might not work well for OPTICS, and vice versa.
+#
+# `min_cluster_size` can either be a positive integer or a float between 0 and
+# 1. In the latter case, it is interpreted as a fraction of the total number
+# of samples in the dataset.
+from sklearn.cluster import OPTICS, cluster_optics_dbscan
+
+# min_cluster_size is 10% of len(X)
+optics = OPTICS(min_samples=10, min_cluster_size=0.1).fit(X)
+plot(X, optics.labels_)
+
+# %%
+# A key difference between OPTICS and the previous algorithms is that OPTICS
+# assigns each sample a reachability distance and an ordering. The
+# reachability distance of a sample can be thought of, informally, as the
+# distance to the closest core point. The ordering that OPTICS assigns to the
+# points is such that points which are close together in distance (in the
+# sense of the metric used by OPTICS, which defaults to Euclidean distance)
+# are also close in the ordering. More details can be found in the
+# :ref:`User Guide <optics>`.
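+
+# %%
+# As a quick illustration of these two attributes (a small aside; the exact
+# numbers depend on the fitted dataset), we can print the reachability
+# distances of the first few samples in processing order. The very first
+# sample in the ordering has no predecessor, so its reachability distance is
+# reported as infinite.
+print("First five samples in processing order:", optics.ordering_[:5])
+print(
+    "Their reachability distances:",
+    optics.reachability_[optics.ordering_][:5],
+)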
+# %%
+# These ideas are better visualized by looking at a reachability plot. This is
+# obtained by plotting the ordering of the samples on the horizontal axis
+# against the corresponding reachability distance of each sample on the
+# vertical axis. We will also plot horizontal lines of the form `y=eps` for
+# the values of `eps` used previously; the reason for this will become clear
+# shortly.
+
+_, ax = plt.subplots(figsize=(10, 4))
+
+space = np.arange(len(X))
+reachability = optics.reachability_[optics.ordering_]
+
+labels = optics.labels_[optics.ordering_]
+
+colors = cluster_colours(labels)
+
+for k, col in zip(set(labels), colors):
+    if k == -1:
+        continue
+
+    ax.plot(space[labels == k], reachability[labels == k], color=tuple(col), alpha=0.7)
+
+for eps in eps_values:
+    ax.plot(space, np.full_like(space, eps, dtype=float), "k-.", alpha=0.5)
+
+ax.set_title("Reachability Plot")
+plt.tight_layout()
+
+# %%
+# Observe that points in the two dense clusters have low reachability
+# distances, because points in a dense cluster are close to core points in
+# that cluster. On the other hand, samples in the sparse clusters have higher
+# reachability distances.
+#
+# Clusters can be extracted from this plot in two ways: Xi clustering or
+# DBSCAN clustering. The user chooses the method by setting the
+# `cluster_method` parameter to either `xi` (the default) or `dbscan`.
+#
+# The Xi clustering method uses the steep slopes within the reachability plot
+# to determine which points are in the same cluster. One can control this by
+# setting the `xi` parameter. More details on this parameter may be found in
+# the :ref:`User Guide <optics>`.
+#
+# Extracting DBSCAN clusters
+# ++++++++++++++++++++++++++
+# We can also extract a clustering from the reachability plot that produces
+# results similar to `DBSCAN` for a given value of `eps`. This can be done
+# manually by plotting the horizontal line `y=eps` on the reachability plot
+# and reading the plot left to right: breaks in the graph signify new
+# clusters, and points above the line `y=eps` are treated as noise.
+#
+# We may also do this computationally, either using `cluster_optics_dbscan()`
+# or by setting `cluster_method='dbscan'` and an appropriate value for `eps`
+# (and other hyperparameters) before model fitting. We will discuss only
+# `cluster_optics_dbscan()`. This function takes the reachability, ordering,
+# and core distances produced by running OPTICS, and an `eps` parameter
+# analogous to its counterpart in DBSCAN. It produces a clustering similar to
+# what one would obtain by running DBSCAN on the same dataset with similar
+# settings for `eps` and `min_samples`. The runtime is efficient: it is linear
+# in the number of samples, whereas running DBSCAN from scratch is quadratic
+# in the worst case.
+#
+# We run `cluster_optics_dbscan` over several `eps` values. For the following
+# plots, each corresponding to a specific value of `eps`, observe the
+# similarity to the corresponding clustering we previously made using DBSCAN.
+# Also consider how the clustering can be read off the reachability plot using
+# the threshold `y=eps`.
+ +fig, axes = plt.subplots(len(eps_values), 1, figsize=(10, 12)) + + +def plot_optics_dbscan(optic_clustering, eps, ax): + label_eps = cluster_optics_dbscan( + reachability=optics.reachability_, + core_distances=optics.core_distances_, + ordering=optics.ordering_, + eps=eps, + ) + plot(X, label_eps, parameters={"eps": eps}, ax=ax) + + +for idx, eps in enumerate(eps_values): + plot_optics_dbscan(optics, eps, axes[idx]) diff --git a/examples/cluster/plot_hdbscan.py b/examples/cluster/plot_hdbscan.py deleted file mode 100644 index 64d4936694bf3..0000000000000 --- a/examples/cluster/plot_hdbscan.py +++ /dev/null @@ -1,249 +0,0 @@ -# -*- coding: utf-8 -*- -""" -==================================== -Demo of HDBSCAN clustering algorithm -==================================== -.. currentmodule:: sklearn - -In this demo we will take a look at :class:`cluster.HDBSCAN` from the -perspective of generalizing the :class:`cluster.DBSCAN` algorithm. -We'll compare both algorithms on specific datasets. Finally we'll evaluate -HDBSCAN's sensitivity to certain hyperparameters. - -We first define a couple utility functions for convenience. -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -# %% -import matplotlib.pyplot as plt -import numpy as np - -from sklearn.cluster import DBSCAN, HDBSCAN -from sklearn.datasets import make_blobs - - -def plot(X, labels, probabilities=None, parameters=None, ground_truth=False, ax=None): - if ax is None: - _, ax = plt.subplots(figsize=(10, 4)) - labels = labels if labels is not None else np.ones(X.shape[0]) - probabilities = probabilities if probabilities is not None else np.ones(X.shape[0]) - # Black removed and is used for noise instead. - unique_labels = set(labels) - colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] - # The probability of a point belonging to its labeled cluster determines - # the size of its marker - proba_map = {idx: probabilities[idx] for idx in range(len(labels))} - for k, col in zip(unique_labels, colors): - if k == -1: - # Black used for noise. - col = [0, 0, 0, 1] - - class_index = np.where(labels == k)[0] - for ci in class_index: - ax.plot( - X[ci, 0], - X[ci, 1], - "x" if k == -1 else "o", - markerfacecolor=tuple(col), - markeredgecolor="k", - markersize=4 if k == -1 else 1 + 5 * proba_map[ci], - ) - n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) - preamble = "True" if ground_truth else "Estimated" - title = f"{preamble} number of clusters: {n_clusters_}" - if parameters is not None: - parameters_str = ", ".join(f"{k}={v}" for k, v in parameters.items()) - title += f" | {parameters_str}" - ax.set_title(title) - plt.tight_layout() - - -# %% -# Generate sample data -# -------------------- -# One of the greatest advantages of HDBSCAN over DBSCAN is its out-of-the-box -# robustness. It's especially remarkable on heterogeneous mixtures of data. -# Like DBSCAN, it can model arbitrary shapes and distributions, however unlike -# DBSCAN it does not require specification of an arbitrary and sensitive -# `eps` hyperparameter. -# -# For example, below we generate a dataset from a mixture of three bi-dimensional -# and isotropic Gaussian distributions. 
-centers = [[1, 1], [-1, -1], [1.5, -1.5]] -X, labels_true = make_blobs( - n_samples=750, centers=centers, cluster_std=[0.4, 0.1, 0.75], random_state=0 -) -plot(X, labels=labels_true, ground_truth=True) -# %% -# Scale Invariance -# ----------------- -# It's worth remembering that, while DBSCAN provides a default value for `eps` -# parameter, it hardly has a proper default value and must be tuned for the -# specific dataset at use. -# -# As a simple demonstration, consider the clustering for a `eps` value tuned -# for one dataset, and clustering obtained with the same value but applied to -# rescaled versions of the dataset. -fig, axes = plt.subplots(3, 1, figsize=(10, 12)) -dbs = DBSCAN(eps=0.3) -for idx, scale in enumerate([1, 0.5, 3]): - dbs.fit(X * scale) - plot(X * scale, dbs.labels_, parameters={"scale": scale, "eps": 0.3}, ax=axes[idx]) - -# %% -# Indeed, in order to maintain the same results we would have to scale `eps` by -# the same factor. -fig, axis = plt.subplots(1, 1, figsize=(12, 5)) -dbs = DBSCAN(eps=0.9).fit(3 * X) -plot(3 * X, dbs.labels_, parameters={"scale": 3, "eps": 0.9}, ax=axis) -# %% -# While standardizing data (e.g. using -# :class:`sklearn.preprocessing.StandardScaler`) helps mitigate this problem, -# great care must be taken to select the appropriate value for `eps`. -# -# HDBSCAN is much more robust in this sense: HDBSCAN can be seen as -# clustering over all possible values of `eps` and extracting the best -# clusters from all possible clusters (see :ref:`User Guide `). -# One immediate advantage is that HDBSCAN is scale-invariant. -fig, axes = plt.subplots(3, 1, figsize=(10, 12)) -hdb = HDBSCAN() -for idx, scale in enumerate([1, 0.5, 3]): - hdb.fit(X * scale) - plot( - X * scale, - hdb.labels_, - hdb.probabilities_, - ax=axes[idx], - parameters={"scale": scale}, - ) -# %% -# Multi-Scale Clustering -# ---------------------- -# HDBSCAN is much more than scale invariant though -- it is capable of -# multi-scale clustering, which accounts for clusters with varying density. -# Traditional DBSCAN assumes that any potential clusters are homogeneous in -# density. HDBSCAN is free from such constraints. To demonstrate this we -# consider the following dataset -centers = [[-0.85, -0.85], [-0.85, 0.85], [3, 3], [3, -3]] -X, labels_true = make_blobs( - n_samples=750, centers=centers, cluster_std=[0.2, 0.35, 1.35, 1.35], random_state=0 -) -plot(X, labels=labels_true, ground_truth=True) - -# %% -# This dataset is more difficult for DBSCAN due to the varying densities and -# spatial separation: -# -# - If `eps` is too large then we risk falsely clustering the two dense -# clusters as one since their mutual reachability will extend -# clusters. -# - If `eps` is too small, then we risk fragmenting the sparser clusters -# into many false clusters. -# -# Not to mention this requires manually tuning choices of `eps` until we -# find a tradeoff that we are comfortable with. -fig, axes = plt.subplots(2, 1, figsize=(10, 8)) -params = {"eps": 0.7} -dbs = DBSCAN(**params).fit(X) -plot(X, dbs.labels_, parameters=params, ax=axes[0]) -params = {"eps": 0.3} -dbs = DBSCAN(**params).fit(X) -plot(X, dbs.labels_, parameters=params, ax=axes[1]) - -# %% -# To properly cluster the two dense clusters, we would need a smaller value of -# epsilon, however at `eps=0.3` we are already fragmenting the sparse clusters, -# which would only become more severe as we decrease epsilon. 
Indeed it seems -# that DBSCAN is incapable of simultaneously separating the two dense clusters -# while preventing the sparse clusters from fragmenting. Let's compare with -# HDBSCAN. -hdb = HDBSCAN().fit(X) -plot(X, hdb.labels_, hdb.probabilities_) - -# %% -# HDBSCAN is able to adapt to the multi-scale structure of the dataset without -# requiring parameter tuning. While any sufficiently interesting dataset will -# require tuning, this case demonstrates that HDBSCAN can yield qualitatively -# better classes of clusterings without users' intervention which are -# inaccessible via DBSCAN. - -# %% -# Hyperparameter Robustness -# ------------------------- -# Ultimately tuning will be an important step in any real world application, so -# let's take a look at some of the most important hyperparameters for HDBSCAN. -# While HDBSCAN is free from the `eps` parameter of DBSCAN, it does still have -# some hyperparameters like `min_cluster_size` and `min_samples` which tune its -# results regarding density. We will however see that HDBSCAN is relatively robust -# to various real world examples thanks to those parameters whose clear meaning -# helps tuning them. -# -# `min_cluster_size` -# ^^^^^^^^^^^^^^^^^^ -# `min_cluster_size` is the minimum number of samples in a group for that -# group to be considered a cluster. -# -# Clusters smaller than the ones of this size will be left as noise. -# The default value is 5. This parameter is generally tuned to -# larger values as needed. Smaller values will likely to lead to results with -# fewer points labeled as noise. However values which too small will lead to -# false sub-clusters being picked up and preferred. Larger values tend to be -# more robust with respect to noisy datasets, e.g. high-variance clusters with -# significant overlap. - -PARAM = ({"min_cluster_size": 5}, {"min_cluster_size": 3}, {"min_cluster_size": 25}) -fig, axes = plt.subplots(3, 1, figsize=(10, 12)) -for i, param in enumerate(PARAM): - hdb = HDBSCAN(**param).fit(X) - labels = hdb.labels_ - - plot(X, labels, hdb.probabilities_, param, ax=axes[i]) - -# %% -# `min_samples` -# ^^^^^^^^^^^^^ -# `min_samples` is the number of samples in a neighborhood for a point to -# be considered as a core point, including the point itself. -# `min_samples` defaults to `min_cluster_size`. -# Similarly to `min_cluster_size`, larger values for `min_samples` increase -# the model's robustness to noise, but risks ignoring or discarding -# potentially valid but small clusters. -# `min_samples` better be tuned after finding a good value for `min_cluster_size`. - -PARAM = ( - {"min_cluster_size": 20, "min_samples": 5}, - {"min_cluster_size": 20, "min_samples": 3}, - {"min_cluster_size": 20, "min_samples": 25}, -) -fig, axes = plt.subplots(3, 1, figsize=(10, 12)) -for i, param in enumerate(PARAM): - hdb = HDBSCAN(**param).fit(X) - labels = hdb.labels_ - - plot(X, labels, hdb.probabilities_, param, ax=axes[i]) - -# %% -# `dbscan_clustering` -# ^^^^^^^^^^^^^^^^^^^ -# During `fit`, `HDBSCAN` builds a single-linkage tree which encodes the -# clustering of all points across all values of :class:`~cluster.DBSCAN`'s -# `eps` parameter. -# We can thus plot and evaluate these clusterings efficiently without fully -# recomputing intermediate values such as core-distances, mutual-reachability, -# and the minimum spanning tree. All we need to do is specify the `cut_distance` -# (equivalent to `eps`) we want to cluster with. 
- -PARAM = ( - {"cut_distance": 0.1}, - {"cut_distance": 0.5}, - {"cut_distance": 1.0}, -) -hdb = HDBSCAN() -hdb.fit(X) -fig, axes = plt.subplots(len(PARAM), 1, figsize=(10, 12)) -for i, param in enumerate(PARAM): - labels = hdb.dbscan_clustering(**param) - - plot(X, labels, hdb.probabilities_, param, ax=axes[i]) diff --git a/examples/cluster/plot_optics.py b/examples/cluster/plot_optics.py deleted file mode 100644 index 26218302542d9..0000000000000 --- a/examples/cluster/plot_optics.py +++ /dev/null @@ -1,108 +0,0 @@ -""" -=================================== -Demo of OPTICS clustering algorithm -=================================== - -.. currentmodule:: sklearn - -Finds core samples of high density and expands clusters from them. -This example uses data that is generated so that the clusters have -different densities. - -The :class:`~cluster.OPTICS` is first used with its Xi cluster detection -method, and then setting specific thresholds on the reachability, which -corresponds to :class:`~cluster.DBSCAN`. We can see that the different -clusters of OPTICS's Xi method can be recovered with different choices of -thresholds in DBSCAN. - -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -import matplotlib.gridspec as gridspec -import matplotlib.pyplot as plt -import numpy as np - -from sklearn.cluster import OPTICS, cluster_optics_dbscan - -# Generate sample data - -np.random.seed(0) -n_points_per_cluster = 250 - -C1 = [-5, -2] + 0.8 * np.random.randn(n_points_per_cluster, 2) -C2 = [4, -1] + 0.1 * np.random.randn(n_points_per_cluster, 2) -C3 = [1, -2] + 0.2 * np.random.randn(n_points_per_cluster, 2) -C4 = [-2, 3] + 0.3 * np.random.randn(n_points_per_cluster, 2) -C5 = [3, -2] + 1.6 * np.random.randn(n_points_per_cluster, 2) -C6 = [5, 6] + 2 * np.random.randn(n_points_per_cluster, 2) -X = np.vstack((C1, C2, C3, C4, C5, C6)) - -clust = OPTICS(min_samples=50, xi=0.05, min_cluster_size=0.05) - -# Run the fit -clust.fit(X) - -labels_050 = cluster_optics_dbscan( - reachability=clust.reachability_, - core_distances=clust.core_distances_, - ordering=clust.ordering_, - eps=0.5, -) -labels_200 = cluster_optics_dbscan( - reachability=clust.reachability_, - core_distances=clust.core_distances_, - ordering=clust.ordering_, - eps=2, -) - -space = np.arange(len(X)) -reachability = clust.reachability_[clust.ordering_] -labels = clust.labels_[clust.ordering_] - -plt.figure(figsize=(10, 7)) -G = gridspec.GridSpec(2, 3) -ax1 = plt.subplot(G[0, :]) -ax2 = plt.subplot(G[1, 0]) -ax3 = plt.subplot(G[1, 1]) -ax4 = plt.subplot(G[1, 2]) - -# Reachability plot -colors = ["g.", "r.", "b.", "y.", "c."] -for klass, color in enumerate(colors): - Xk = space[labels == klass] - Rk = reachability[labels == klass] - ax1.plot(Xk, Rk, color, alpha=0.3) -ax1.plot(space[labels == -1], reachability[labels == -1], "k.", alpha=0.3) -ax1.plot(space, np.full_like(space, 2.0, dtype=float), "k-", alpha=0.5) -ax1.plot(space, np.full_like(space, 0.5, dtype=float), "k-.", alpha=0.5) -ax1.set_ylabel("Reachability (epsilon distance)") -ax1.set_title("Reachability Plot") - -# OPTICS -colors = ["g.", "r.", "b.", "y.", "c."] -for klass, color in enumerate(colors): - Xk = X[clust.labels_ == klass] - ax2.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3) -ax2.plot(X[clust.labels_ == -1, 0], X[clust.labels_ == -1, 1], "k+", alpha=0.1) -ax2.set_title("Automatic Clustering\nOPTICS") - -# DBSCAN at 0.5 -colors = ["g.", "r.", "b.", "c."] -for klass, color in enumerate(colors): - Xk = X[labels_050 == klass] - ax3.plot(Xk[:, 
0], Xk[:, 1], color, alpha=0.3) -ax3.plot(X[labels_050 == -1, 0], X[labels_050 == -1, 1], "k+", alpha=0.1) -ax3.set_title("Clustering at 0.5 epsilon cut\nDBSCAN") - -# DBSCAN at 2. -colors = ["g.", "m.", "y.", "c."] -for klass, color in enumerate(colors): - Xk = X[labels_200 == klass] - ax4.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3) -ax4.plot(X[labels_200 == -1, 0], X[labels_200 == -1, 1], "k+", alpha=0.1) -ax4.set_title("Clustering at 2.0 epsilon cut\nDBSCAN") - -plt.tight_layout() -plt.show()