# -*- coding: utf-8 -*-
- """ Algorithms for clustering : DBSCAN
-
- DBSCAN: (Density-Based Spatial Clustering of Applications with Noise)
"""
- # Author: Robert Layton robertlayton@gmail.com
+ DBSCAN: Density-Based Spatial Clustering of Applications with Noise
+ """
+
+ # Author: Robert Layton <robertlayton@gmail.com>
#
# License: BSD


def dbscan(S, eps=0.5, min_points=5, metric='euclidean',
           index_order=None, verbose=False, is_similarity=None):
-     """Perform DBSCAN Clustering of data
+     """Perform DBSCAN clustering of data.

    Parameters
    ----------
-
    S: array [n_points, n_points] or [n_points, n_features]
-         Matrix of similarities between points, or a feature matrix.
-         If the matrix is square, it is treated as a similarity matrix,
-         otherwise it is treated as a feature matrix. Use is_similarity to
+         Array of similarities between points, or a feature array.
+         If the array is square, it is treated as a similarity array,
+         otherwise it is treated as a feature array. Use is_similarity to
        override this pattern.
    eps: float, optional
        The minimum similarity for two points to be considered
-         in the same neighbourhood.
+         in the same neighborhood.
    min_points: int, optional
-         The number of points in a neighbourhood for a point to be considered
+         The number of points in a neighborhood for a point to be considered
        as a core point.
    metric: string, or callable
        The metric to use when calculating distance between instances in a
-         feature matrix. If metric is a string, it must be one of the options
+         feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded.
@@ -44,23 +43,21 @@ def dbscan(S, eps=0.5, min_points=5, metric='euclidean',
    verbose: boolean, optional
        The verbosity level
    is_similarity: boolean, optional (default=None)
-         Overrides the behaviour of the matrix handling of S.
-         If is_similarity is None, any square matrix is handled as a similarity
-         matrix and any non-square matrix is a feature matrix.
-         If is_similarity is True, any matrix is handled as a similarity matrix,
-         and the procedure will raise a ValueError if the matrix is not square.
-         If is_similarity is False, any matrix will be handled as a feature
-         matrix, including square matrices.
+         Overrides the behaviour of the array handling of S.
+         If is_similarity is None, any square array is handled as a similarity
+         array and any non-square array is a feature array.
+         If is_similarity is True, any array is handled as a similarity array,
+         and the procedure will raise a ValueError if the array is not square.
+         If is_similarity is False, any array will be handled as a feature
+         array, including square matrices.

    Returns
    -------
-
    core_points: array [n_core_points]
-         index of core points
+         Indices of core points.

    labels : array [n_points]
-         cluster labels for each point
-         Noisey points are given the label -1
+         Cluster labels for each point. Noisy points are given the label -1.

    Notes
    -----
@@ -71,9 +68,8 @@ def dbscan(S, eps=0.5, min_points=5, metric='euclidean',
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise”.
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226–231. 1996
-
-
    """
+
    n = S.shape[0]
    # If index order not given, create random order.
    if index_order is None:
@@ -82,19 +78,19 @@ def dbscan(S, eps=0.5, min_points=5, metric='euclidean',
    assert len(index_order) == n, ("Index order must be of length n"
                                   " (%d expected, %d given)"
                                   % (n, len(index_order)))
-     S = calculateSimilarity(S, metric=metric, is_similarity=is_similarity)
-     # Calculate neighbourhood for all points. This leaves the original point
+     S = calculate_similarity(S, metric=metric, is_similarity=is_similarity)
+     # Calculate neighborhood for all points. This leaves the original point
    # in, which needs to be considered later (i.e. point i is the
-     # neighbourhood of point i. While True, its useless information)
-     neighbourhoods = [np.where(x >= eps)[0] for x in S]
+     # neighborhood of point i. While True, its useless information)
+     neighborhoods = [np.where(x >= eps)[0] for x in S]
    # Initially, all points are noise.
-     labels = np.zeros((n,), dtype='int') - 1
+     labels = np.array([-1] * n)
    # A list of all core points found.
    core_points = []
    # Look at all points and determine if they are core.
    # If they are then build a new cluster from them.
    for index in index_order:
-         if labels[index] != -1 or len(neighbourhoods[index]) < min_points:
+         if labels[index] != -1 or len(neighborhoods[index]) < min_points:
            # This point is already classified, or not enough for a core point.
            continue
        core_points.append(index)
@@ -108,41 +104,36 @@ def dbscan(S, eps=0.5, min_points=5, metric='euclidean',
            # A candidate is a core point in the current cluster that has
            # not yet been used to expand the current cluster.
            for c in candidates:
-                 for neighbour in neighbourhoods[c]:
-                     if labels[neighbour] == -1:
-                         # neighbour is part of the current cluster iff
-                         # it is not part of another cluster already.
-                         labels[neighbour] = label_num
-                         # check if its a core point as well
-                         if len(neighbourhoods[neighbour]) >= min_points:
-                             # is new core point
-                             new_candidates.append(neighbour)
-                             core_points.append(neighbour)
+                 noise = np.where(labels[neighborhoods[c]] == -1)[0]
+                 noise = neighborhoods[c][noise]
+                 labels[noise] = label_num
+                 for neighbor in noise:
+                     # check if its a core point as well
+                     if len(neighborhoods[neighbor]) >= min_points:
+                         # is new core point
+                         new_candidates.append(neighbor)
+                         core_points.append(neighbor)
            # Update candidates for next round of cluster expansion.
            candidates = new_candidates
    return core_points, labels


- def calculateSimilarity(S, metric=None, is_similarity=None):
+ def calculate_similarity(S, metric=None, is_similarity=None):
    n, d = S.shape
-     # If the matrix looks square, it may be a similarity matrix.
+     # If the array looks square, it may be a similarity array.
    if n == d:
-         if is_similarity is None or is_similarity:
+         if is_similarity in (None, True):
            return S
-     else:
-         # Matrix is not square, so it cannot be a similarity matrix.
-         if is_similarity:
-             raise ValueError("Matrix not square, "
-                              "cannot be a similarity matrix."
-                              " Size: %d x %d." % (n, d))
-     # In all other cases, the matrix is to be considered as a feature matrix.
+     elif is_similarity:
+         # Array is not square, so it cannot be a similarity array.
+         raise ValueError("Array not square, cannot be a similarity array."
+                          " Shape = %s" % repr((n, d)))
+     # In all other cases, the array is to be considered as a feature array.
    D = distance.squareform(distance.pdist(S, metric=metric))
    S = 1. - (D / np.max(D))
    return S


- ###############################################################################
-
class DBSCAN(BaseEstimator):
    """Perform DBSCAN Clustering of data

@@ -152,15 +143,14 @@ class DBSCAN(BaseEstimator):

    Parameters
    ----------
-
    eps: float, optional
-         The distance for two points to be considered in the same neighbourhood
+         The distance for two points to be considered in the same neighborhood
    min_points: int, optional
-         The number of points in a neighbourhood for a point to be considered
+         The number of points in a neighborhood for a point to be considered
        as a core point.
    metric: string, or callable
        The metric to use when calculating distance between instances in a
-         feature matrix. If metric is a string, it must be one of the options
+         feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded.
@@ -171,30 +161,26 @@ class DBSCAN(BaseEstimator):
    verbose: boolean, optional
        The verbosity level
    is_similarity: boolean, optional (default=None)
-         Overrides the behaviour of the matrix handling of S.
-         If is_similarity is None, any square matrix is handled as a similarity
-         matrix and any non-square matrix is a feature matrix.
-         If is_similarity is True, any matrix is handled as a similarity matrix,
-         and the procedure will raise a ValueError if the matrix is not square.
-         If is_similarity is False, any matrix will be handled as a feature
-         matrix, including square matrices.
+         Overrides the behaviour of the array handling of S.
+         If is_similarity is None, any square array is handled as a similarity
+         array and any non-square array is a feature array.
+         If is_similarity is True, any array is handled as a similarity array,
+         and the procedure will raise a ValueError if the array is not square.
+         If is_similarity is False, any array will be handled as a feature
+         array, including square matrices.

    Methods
    -------
-
    fit:
        Compute the clustering

    Attributes
    ----------
+     core_points: array, shape = [n_core_points]
+         Indices of core points.

-     core_points: array [n_core_points]
-         index of core points
-
-     labels : array [n_points]
-         cluster labels for each point
-         Noisey points are given the label -1
-
+     labels : array, shape = [n_points]
+         Cluster labels for each point. Noisy points are given the label -1.

    Notes
    -----
@@ -219,25 +205,19 @@ def __init__(self, eps=0.5, min_points=5, metric='euclidean',
        self.is_similarity = is_similarity

    def fit(self, S, **params):
-         """Compute DBSCAN labels for points, using similarity matrix S.
+         """Compute DBSCAN labels for points, using similarity array S.

        Parameters
        ----------
-
        S: array [n_points, n_points] or [n_points, n_features]
-             Matrix of similarities between points, or a feature matrix.
-             If the matrix is square, it is treated as a similarity matrix,
-             otherwise it is treated as a feature matrix. Use is_similarity to
+             Array of similarities between points, or a feature array.
+             If the array is square, it is treated as a similarity array,
+             otherwise it is treated as a feature array. Use is_similarity to
            override this pattern.
-         params: Overwrite keywords from __init__
-
+         params: dict
+             Overwrite keywords from __init__.
        """
+
        self._set_params(**params)
-         self.core_points_, self.labels_ = dbscan(S, eps=self.eps,
-                                                  min_points=self.min_points,
-                                                  verbose=self.verbose,
-                                                  metric=self.metric,
-                                                  index_order=self.index_order,
-                                                  is_similarity=\
-                                                      self.is_similarity)
+         self.core_points_, self.labels_ = dbscan(S, **self._get_params())
        return self
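
For reference, a minimal usage sketch of the dbscan function changed above. It assumes dbscan is importable from this module; the data and the eps/min_points values are purely illustrative. Because the input below is a non-square feature array, it is converted internally to similarities (1 - D / max(D)), so eps acts as a minimum similarity rather than a distance.

import numpy as np

# Two well-separated 2-D blobs; the (40, 2) array is not square, so it is
# treated as a feature array and pairwise distances are rescaled to
# similarities in [0, 1] before being thresholded against eps.
rng = np.random.RandomState(0)
X = np.concatenate([rng.normal(0.0, 0.1, size=(20, 2)),
                    rng.normal(5.0, 0.1, size=(20, 2))])

core_points, labels = dbscan(X, eps=0.9, min_points=5)
print(len(core_points), np.unique(labels))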
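
A second sketch exercises the DBSCAN estimator interface from the last hunks, again assuming the class is importable from this module and using illustrative parameter values. The square input is taken as a precomputed similarity array and used as-is.

import numpy as np

# A precomputed 4 x 4 similarity array: three mutually similar points and
# one outlier that is dissimilar to everything but itself.
S = np.array([[1.00, 0.95, 0.90, 0.10],
              [0.95, 1.00, 0.90, 0.10],
              [0.90, 0.90, 1.00, 0.10],
              [0.10, 0.10, 0.10, 1.00]])

db = DBSCAN(eps=0.8, min_points=3).fit(S)
print(db.labels_)        # e.g. [ 0  0  0 -1]; the outlier keeps the noise label
print(db.core_points_)   # indices of the three core points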