From 042cc392e4754f38e0fded1e9c2a303c4731315d Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Sun, 12 May 2019 16:22:26 +0800 Subject: [PATCH 1/2] DOC Improve OPTICS doc --- sklearn/cluster/optics_.py | 62 +++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index c211b86a30eab..256fbb8753955 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -7,6 +7,7 @@ Authors: Shane Grigsby Adrin Jalali Erich Schubert + Hanmin Qin License: BSD 3 clause """ @@ -23,13 +24,15 @@ class OPTICS(BaseEstimator, ClusterMixin): """Estimate clustering structure from vector array - OPTICS: Ordering Points To Identify the Clustering Structure Closely + OPTICS (Ordering Points To Identify the Clustering Structure), closely related to DBSCAN, finds core sample of high density and expands clusters from them [1]_. Unlike DBSCAN, keeps cluster hierarchy for a variable neighborhood radius. Better suited for usage on large datasets than the current sklearn implementation of DBSCAN. - Clusters are then extracted using a DBSCAN like method [1]_. + Clusters are then extracted using a DBSCAN like method + (cluster_method = 'dbscan') or an automatic + technique proposed in [1]_ (cluster_method = 'xi'). This implementation deviates from the original OPTICS by first performing k-nearest-neighborhood searches on all points to identify core sizes, then @@ -49,22 +52,21 @@ class OPTICS(BaseEstimator, ClusterMixin): 2). max_eps : float, optional (default=np.inf) - The maximum distance between two samples for them to be considered - as in the same neighborhood. Default value of ``np.inf`` will identify - clusters across all scales; reducing ``max_eps`` will result in - shorter run times. + The maximum distance between two samples for one to be considered as + in the neighborhood of the other. 
Default value of ``np.inf`` will + identify clusters across all scales; reducing ``max_eps`` will result + in shorter run times. metric : string or callable, optional (default='minkowski') - metric to use for distance computation. Any metric from scikit-learn + Metric to use for distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used. If metric is a callable function, it is called on each pair of instances (rows) and the resulting value recorded. The callable should take two arrays as input and return one value indicating the distance between them. This works for Scipy's metrics, but is less - efficient than passing the metric name as a string. - - Distance matrices are not supported. + efficient than passing the metric name as a string. If metric is + "precomputed", X is assumed to be a distance matrix and must be square. Valid values for metric are: @@ -94,9 +96,9 @@ class OPTICS(BaseEstimator, ClusterMixin): reachability and ordering. Possible values are "xi" and "dbscan". eps : float, optional (default=None) - The maximum distance between two samples for them to be considered - as in the same neighborhood. By default it assumes the same value as - ``max_eps``. + The maximum distance between two samples for one to be considered as + in the neighborhood of the other. By default it assumes the same value + as ``max_eps``. Used only when ``cluster_method='dbscan'``. xi : float, between 0 and 1, optional (default=0.05) @@ -219,8 +221,10 @@ def fit(self, X, y=None): Parameters ---------- - X : array, shape (n_samples, n_features) - The data. + X : array, shape (n_samples, n_features), or (n_samples, n_samples) \ +if metric='precomputed'. + A feature array, or array of distances between samples if + metric='precomputed'. y : ignored @@ -332,8 +336,10 @@ def compute_optics_graph(X, min_samples, max_eps, metric, p, metric_params, Parameters ---------- - X : array, shape (n_samples, n_features) - The data. 
+ X : array, shape (n_samples, n_features), or (n_samples, n_samples) \ +if metric='precomputed'. + A feature array, or array of distances between samples if + metric='precomputed'. min_samples : int (default=5) The number of samples in a neighborhood for a point to be considered @@ -341,22 +347,21 @@ def compute_optics_graph(X, min_samples, max_eps, metric, p, metric_params, number of samples (rounded to be at least 2). max_eps : float, optional (default=np.inf) - The maximum distance between two samples for them to be considered - as in the same neighborhood. Default value of "np.inf" will identify - clusters across all scales; reducing `max_eps` will result in - shorter run times. + The maximum distance between two samples for one to be considered as + in the neighborhood of the other. Default value of ``np.inf`` will + identify clusters across all scales; reducing ``max_eps`` will result + in shorter run times. metric : string or callable, optional (default='minkowski') - metric to use for distance computation. Any metric from scikit-learn + Metric to use for distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used. If metric is a callable function, it is called on each pair of instances (rows) and the resulting value recorded. The callable should take two arrays as input and return one value indicating the distance between them. This works for Scipy's metrics, but is less - efficient than passing the metric name as a string. - - Distance matrices are not supported. + efficient than passing the metric name as a string. If metric is + "precomputed", X is assumed to be a distance matrix and must be square. Valid values for metric are: @@ -771,8 +776,7 @@ def _xi_cluster(reachability_plot, predecessor_plot, ordering, xi, min_samples, clusters. 
""" - # all indices are inclusive (specially at the end) - # add an inf to the end of reachability plot + # Our implementation adds an inf to the end of reachability plot # this helps to find potential clusters at the end of the # reachability plot even if there's no upward region at the end of it. reachability_plot = np.hstack((reachability_plot, np.inf)) @@ -783,6 +787,10 @@ def _xi_cluster(reachability_plot, predecessor_plot, ordering, xi, min_samples, index = 0 mib = 0. # maximum in between, section 4.3.2 + # Our implementation corrects a mistake in the original + # paper, i.e., in Definition 9 steep downward point, + # r(p) * (1 - xi) <= r(p + 1) should be + # r(p) * (1 - xi) >= r(p + 1) with np.errstate(invalid='ignore'): ratio = reachability_plot[:-1] / reachability_plot[1:] steep_upward = ratio <= xi_complement From c684998bb5827ba85a03c846545f9d78b7a87b85 Mon Sep 17 00:00:00 2001 From: Hanmin Qin Date: Wed, 15 May 2019 10:21:23 +0800 Subject: [PATCH 2/2] Update sklearn/cluster/optics_.py Co-Authored-By: Joel Nothman --- sklearn/cluster/optics_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 256fbb8753955..4f7eb11ab2f72 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -30,7 +30,7 @@ class OPTICS(BaseEstimator, ClusterMixin): neighborhood radius. Better suited for usage on large datasets than the current sklearn implementation of DBSCAN. - Clusters are then extracted using a DBSCAN-like method (cluster_method = 'dbscan') or an automatic technique proposed in [1]_ (cluster_method = 'xi').