DOC Improve OPTICS doc (#13866) · scikit-learn/scikit-learn@fb8721a · GitHub
[go: up one dir, main page]

Skip to content

Commit fb8721a

Browse files
qinhanmin2014jnothman
authored and committed
DOC Improve OPTICS doc (#13866)
1 parent 17eb3c9 commit fb8721a

File tree

1 file changed

+35
-27
lines changed

1 file changed

+35
-27
lines changed

sklearn/cluster/optics_.py

+35-27
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
Authors: Shane Grigsby <refuge@rocktalus.com>
88
Adrin Jalali <adrinjalali@gmail.com>
99
Erich Schubert <erich@debian.org>
10+
Hanmin Qin <qinhanmin2005@sina.com>
1011
License: BSD 3 clause
1112
"""
1213

@@ -23,13 +24,15 @@
2324
class OPTICS(BaseEstimator, ClusterMixin):
2425
"""Estimate clustering structure from vector array
2526
26-
OPTICS: Ordering Points To Identify the Clustering Structure Closely
27+
OPTICS (Ordering Points To Identify the Clustering Structure), closely
2728
related to DBSCAN, finds core samples of high density and expands clusters
2829
from them [1]_. Unlike DBSCAN, keeps cluster hierarchy for a variable
2930
neighborhood radius. Better suited for usage on large datasets than the
3031
current sklearn implementation of DBSCAN.
3132
32-
Clusters are then extracted using a DBSCAN like method [1]_.
33+
Clusters are then extracted using a DBSCAN-like method
34+
(cluster_method = 'dbscan') or an automatic
35+
technique proposed in [1]_ (cluster_method = 'xi').
3336
3437
This implementation deviates from the original OPTICS by first performing
3538
k-nearest-neighborhood searches on all points to identify core sizes, then
@@ -49,22 +52,21 @@ class OPTICS(BaseEstimator, ClusterMixin):
4952
2).
5053
5154
max_eps : float, optional (default=np.inf)
52-
The maximum distance between two samples for them to be considered
53-
as in the same neighborhood. Default value of ``np.inf`` will identify
54-
clusters across all scales; reducing ``max_eps`` will result in
55-
shorter run times.
55+
The maximum distance between two samples for one to be considered as
56+
in the neighborhood of the other. Default value of ``np.inf`` will
57+
identify clusters across all scales; reducing ``max_eps`` will result
58+
in shorter run times.
5659
5760
metric : string or callable, optional (default='minkowski')
58-
metric to use for distance computation. Any metric from scikit-learn
61+
Metric to use for distance computation. Any metric from scikit-learn
5962
or scipy.spatial.distance can be used.
6063
6164
If metric is a callable function, it is called on each
6265
pair of instances (rows) and the resulting value recorded. The callable
6366
should take two arrays as input and return one value indicating the
6467
distance between them. This works for Scipy's metrics, but is less
65-
efficient than passing the metric name as a string.
66-
67-
Distance matrices are not supported.
68+
efficient than passing the metric name as a string. If metric is
69+
"precomputed", X is assumed to be a distance matrix and must be square.
6870
6971
Valid values for metric are:
7072
@@ -94,9 +96,9 @@ class OPTICS(BaseEstimator, ClusterMixin):
9496
reachability and ordering. Possible values are "xi" and "dbscan".
9597
9698
eps : float, optional (default=None)
97-
The maximum distance between two samples for them to be considered
98-
as in the same neighborhood. By default it assumes the same value as
99-
``max_eps``.
99+
The maximum distance between two samples for one to be considered as
100+
in the neighborhood of the other. By default it assumes the same value
101+
as ``max_eps``.
100102
Used only when ``cluster_method='dbscan'``.
101103
102104
xi : float, between 0 and 1, optional (default=0.05)
@@ -219,8 +221,10 @@ def fit(self, X, y=None):
219221
220222
Parameters
221223
----------
222-
X : array, shape (n_samples, n_features)
223-
The data.
224+
X : array, shape (n_samples, n_features), or (n_samples, n_samples) \
225+
if metric='precomputed'.
226+
A feature array, or array of distances between samples if
227+
metric='precomputed'.
224228
225229
y : ignored
226230
@@ -332,31 +336,32 @@ def compute_optics_graph(X, min_samples, max_eps, metric, p, metric_params,
332336
333337
Parameters
334338
----------
335-
X : array, shape (n_samples, n_features)
336-
The data.
339+
X : array, shape (n_samples, n_features), or (n_samples, n_samples) \
340+
if metric='precomputed'.
341+
A feature array, or array of distances between samples if
342+
metric='precomputed'.
337343
338344
min_samples : int (default=5)
339345
The number of samples in a neighborhood for a point to be considered
340346
as a core point. Expressed as an absolute number or a fraction of the
341347
number of samples (rounded to be at least 2).
342348
343349
max_eps : float, optional (default=np.inf)
344-
The maximum distance between two samples for them to be considered
345-
as in the same neighborhood. Default value of "np.inf" will identify
346-
clusters across all scales; reducing `max_eps` will result in
347-
shorter run times.
350+
The maximum distance between two samples for one to be considered as
351+
in the neighborhood of the other. Default value of ``np.inf`` will
352+
identify clusters across all scales; reducing ``max_eps`` will result
353+
in shorter run times.
348354
349355
metric : string or callable, optional (default='minkowski')
350-
metric to use for distance computation. Any metric from scikit-learn
356+
Metric to use for distance computation. Any metric from scikit-learn
351357
or scipy.spatial.distance can be used.
352358
353359
If metric is a callable function, it is called on each
354360
pair of instances (rows) and the resulting value recorded. The callable
355361
should take two arrays as input and return one value indicating the
356362
distance between them. This works for Scipy's metrics, but is less
357-
efficient than passing the metric name as a string.
358-
359-
Distance matrices are not supported.
363+
efficient than passing the metric name as a string. If metric is
364+
"precomputed", X is assumed to be a distance matrix and must be square.
360365
361366
Valid values for metric are:
362367
@@ -771,8 +776,7 @@ def _xi_cluster(reachability_plot, predecessor_plot, ordering, xi, min_samples,
771776
clusters.
772777
"""
773778

774-
# all indices are inclusive (specially at the end)
775-
# add an inf to the end of reachability plot
779+
# Our implementation adds an inf to the end of reachability plot
776780
# this helps to find potential clusters at the end of the
777781
# reachability plot even if there's no upward region at the end of it.
778782
reachability_plot = np.hstack((reachability_plot, np.inf))
@@ -783,6 +787,10 @@ def _xi_cluster(reachability_plot, predecessor_plot, ordering, xi, min_samples,
783787
index = 0
784788
mib = 0. # maximum in between, section 4.3.2
785789

790+
# Our implementation corrects a mistake in the original
791+
# paper, i.e., in Definition 9 steep downward point,
792+
# r(p) * (1 - xi) <= r(p + 1) should be
793+
# r(p) * (1 - xi) >= r(p + 1)
786794
with np.errstate(invalid='ignore'):
787795
ratio = reachability_plot[:-1] / reachability_plot[1:]
788796
steep_upward = ratio <= xi_complement

0 commit comments

Comments
 (0)
0