Merge pull request #1 from larsmans/dbscan · NelleV/scikit-learn@0d9a3fc · GitHub

Commit 0d9a3fc

Merge pull request #1 from larsmans/dbscan
Minor stuff for DBSCAN
2 parents 470c94b + 760c154 commit 0d9a3fc

File tree

2 files changed: +72 −93 lines changed

scikits/learn/cluster/dbscan_.py

Lines changed: 66 additions & 86 deletions
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
-""" Algorithms for clustering : DBSCAN
-
-DBSCAN: (Density-Based Spatial Clustering of Applications with Noise)
 """
-# Author: Robert Layton robertlayton@gmail.com
+DBSCAN: Density-Based Spatial Clustering of Applications with Noise
+"""
+
+# Author: Robert Layton <robertlayton@gmail.com>
 #
 # License: BSD
 
@@ -15,25 +15,24 @@
 
 def dbscan(S, eps=0.5, min_points=5, metric='euclidean',
            index_order=None, verbose=False, is_similarity=None,):
-    """Perform DBSCAN Clustering of data
+    """Perform DBSCAN clustering of data.
 
     Parameters
     ----------
-
     S: array [n_points, n_points] or [n_points, n_features]
-        Matrix of similarities between points, or a feature matrix.
-        If the matrix is square, it is treated as a similarity matrix,
-        otherwise it is treated as a feature matrix. Use is_similarity to
+        Array of similarities between points, or a feature array.
+        If the array is square, it is treated as a similarity array,
+        otherwise it is treated as a feature array. Use is_similarity to
         override this pattern.
     eps: float, optional
         The minimum similarity for two points to be considered
-        in the same neighbourhood.
+        in the same neighborhood.
     min_points: int, optional
-        The number of points in a neighbourhood for a point to be considered
+        The number of points in a neighborhood for a point to be considered
         as a core point.
     metric: string, or callable
         The metric to use when calculating distance between instances in a
-        feature matrix. If metric is a string, it must be one of the options
+        feature array. If metric is a string, it must be one of the options
         allowed by scipy.spatial.distance.pdist for its metric parameter.
         Alternatively, if metric is a callable function, it is called on each
         pair of instances (rows) and the resulting value recorded.
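For reference, a minimal usage sketch of the function as it stands after this change. The import path and the sample data are assumptions made for illustration; only the signature and return values come from the diff above.

import numpy as np

from scikits.learn.cluster.dbscan_ import dbscan  # assumed import path

# Made-up feature array: 6 points in 2 dimensions (not from the test suite).
X = np.array([[1.0, 1.1], [1.1, 1.0], [0.9, 1.0],
              [5.0, 5.1], [5.1, 4.9], [9.0, 9.0]])

# X is not square, so it is treated as a feature array and converted to a
# similarity array internally before clustering.
core_points, labels = dbscan(X, eps=0.85, min_points=2, metric='euclidean')
print(core_points)  # indices of the core points that were found
print(labels)       # one cluster label per point; -1 marks noise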
@@ -44,23 +43,21 @@ def dbscan(S, eps=0.5, min_points=5, metric='euclidean',
     verbose: boolean, optional
         The verbosity level
     is_similarity: boolean, optional (default=None)
-        Overrides the behaviour of the matrix handling of S.
-        If is_similarity is None, any square matrix is handled as a similarity
-        matrix and any non-square matrix is a feature matrix.
-        If is_similarity is True, any matrix is handled as a similarity matrix,
-        and the procedure will raise a ValueError if the matrix is not square.
-        If is_similarity is False, any matrix will be handled as a feature
-        matrix, including square matrices.
+        Overrides the behaviour of the array handling of S.
+        If is_similarity is None, any square array is handled as a similarity
+        array and any non-square array is a feature array.
+        If is_similarity is True, any array is handled as a similarity array,
+        and the procedure will raise a ValueError if the array is not square.
+        If is_similarity is False, any array will be handled as a feature
+        array, including square matrices.
 
     Returns
     -------
-
     core_points: array [n_core_points]
-        index of core points
+        Indices of core points.
 
     labels : array [n_points]
-        cluster labels for each point
-        Noisey points are given the label -1
+        Cluster labels for each point. Noisy points are given the label -1.
 
     Notes
     -----
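To make the is_similarity precedence concrete, a small sketch; the import path and the values are illustrative assumptions, while the behaviour shown is the one documented in the docstring above.

import numpy as np

from scikits.learn.cluster.dbscan_ import dbscan  # assumed import path

# A square feature array: 3 points with 3 features each (made-up values).
X = np.array([[0.0, 1.0, 2.0],
              [0.1, 1.1, 2.1],
              [5.0, 5.0, 5.0]])

# With the default is_similarity=None, a square array like X would be used
# directly as a similarity array. Passing is_similarity=False forces the
# feature-array path, so similarities are derived from pairwise distances.
core_points, labels = dbscan(X, eps=0.9, min_points=2, is_similarity=False)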
@@ -71,9 +68,8 @@ def dbscan(S, eps=0.5, min_points=5, metric='euclidean',
     Algorithm for Discovering Clusters in Large Spatial Databases with Noise”.
     In: Proceedings of the 2nd International Conference on Knowledge Discovery
     and Data Mining, Portland, OR, AAAI Press, pp. 226–231. 1996
-
-
     """
+
     n = S.shape[0]
     # If index order not given, create random order.
     if index_order is None:
@@ -82,19 +78,19 @@ def dbscan(S, eps=0.5, min_points=5, metric='euclidean',
     assert len(index_order) == n, ("Index order must be of length n"
                                    " (%d expected, %d given)"
                                    % (n, len(index_order)))
-    S = calculateSimilarity(S, metric=metric, is_similarity=is_similarity)
-    # Calculate neighbourhood for all points. This leaves the original point
+    S = calculate_similarity(S, metric=metric, is_similarity=is_similarity)
+    # Calculate neighborhood for all points. This leaves the original point
     # in, which needs to be considered later (i.e. point i is the
-    # neighbourhood of point i. While True, its useless information)
-    neighbourhoods = [np.where(x >= eps)[0] for x in S]
+    # neighborhood of point i. While True, its useless information)
+    neighborhoods = [np.where(x >= eps)[0] for x in S]
     # Initially, all points are noise.
-    labels = np.zeros((n,), dtype='int') - 1
+    labels = np.array([-1] * n)
     # A list of all core points found.
     core_points = []
     # Look at all points and determine if they are core.
     # If they are then build a new cluster from them.
     for index in index_order:
-        if labels[index] != -1 or len(neighbourhoods[index]) < min_points:
+        if labels[index] != -1 or len(neighborhoods[index]) < min_points:
             # This point is already classified, or not enough for a core point.
             continue
         core_points.append(index)
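The neighborhood precomputation above is one np.where call per row of the similarity array. A standalone illustration of that expression with toy numbers (not taken from the file):

import numpy as np

eps = 0.85
# Similarities of point 0 to every point, including itself (made-up values).
row = np.array([1.0, 0.9, 0.3, 0.88, 0.1])

# Indices of all points whose similarity to point 0 is at least eps.
# Point 0 itself (similarity 1.0) is included, as the comment in the diff notes.
neighborhood = np.where(row >= eps)[0]
print(neighborhood)  # -> [0 1 3]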
@@ -108,41 +104,36 @@ def dbscan(S, eps=0.5, min_points=5, metric='euclidean',
             # A candidate is a core point in the current cluster that has
             # not yet been used to expand the current cluster.
             for c in candidates:
-                for neighbour in neighbourhoods[c]:
-                    if labels[neighbour] == -1:
-                        # neighbour is part of the current cluster iff
-                        # it is not part of another cluster already.
-                        labels[neighbour] = label_num
-                        # check if its a core point as well
-                        if len(neighbourhoods[neighbour]) >= min_points:
-                            # is new core point
-                            new_candidates.append(neighbour)
-                            core_points.append(neighbour)
+                noise = np.where(labels[neighborhoods[c]] == -1)[0]
+                noise = neighborhoods[c][noise]
+                labels[noise] = label_num
+                for neighbor in noise:
+                    # check if its a core point as well
+                    if len(neighborhoods[neighbor]) >= min_points:
+                        # is new core point
+                        new_candidates.append(neighbor)
+                        core_points.append(neighbor)
             # Update candidates for next round of cluster expansion.
             candidates = new_candidates
     return core_points, labels
 
 
-def calculateSimilarity(S, metric=None, is_similarity=None):
+def calculate_similarity(S, metric=None, is_similarity=None):
     n, d = S.shape
-    # If the matrix looks square, it may be a similarity matrix.
+    # If the array looks square, it may be a similarity array.
     if n == d:
-        if is_similarity is None or is_similarity:
+        if is_similarity in (None, True):
             return S
-    else:
-        # Matrix is not square, so it cannot be a similarity matrix.
-        if is_similarity:
-            raise ValueError("Matrix not square, "
-                             "cannot be a similarity matrix."
-                             " Size: %d x %d." % (n, d))
-    # In all other cases, the matrix is to be considered as a feature matrix.
+    elif is_similarity:
+        # Array is not square, so it cannot be a similarity array.
+        raise ValueError("Array not square, cannot be a similarity array."
+                         " Shape = %s" % repr((n, d)))
+    # In all other cases, the array is to be considered as a feature array.
     D = distance.squareform(distance.pdist(S, metric=metric))
     S = 1. - (D / np.max(D))
     return S
 
 
-###############################################################################
-
 class DBSCAN(BaseEstimator):
     """Perform DBSCAN Clustering of data
 
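The rewritten expansion step replaces the per-neighbour Python loop with NumPy fancy indexing: the still-unlabelled neighbours are located and labelled in a single assignment, and only those points are then checked for core status. A standalone sketch of the indexing pattern with toy values (not taken from the file):

import numpy as np

labels = np.array([-1, -1, 0, -1, -1])  # -1 means "still noise"
neighborhood = np.array([0, 2, 3])      # neighbours of the current candidate
label_num = 1                           # label of the cluster being grown

# Positions *within the neighborhood* that are still unlabelled...
noise = np.where(labels[neighborhood] == -1)[0]
# ...mapped back to point indices, then labelled in one assignment.
noise = neighborhood[noise]
labels[noise] = label_num

print(noise)   # -> [0 3]
print(labels)  # -> [ 1 -1  0  1 -1]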
@@ -152,15 +143,14 @@ class DBSCAN(BaseEstimator):
 
     Parameters
     ----------
-
     eps: float, optional
-        The distance for two points to be considered in the same neighbourhood
+        The distance for two points to be considered in the same neighborhood
     min_points: int, optional
-        The number of points in a neighbourhood for a point to be considered
+        The number of points in a neighborhood for a point to be considered
         as a core point.
     metric: string, or callable
         The metric to use when calculating distance between instances in a
-        feature matrix. If metric is a string, it must be one of the options
+        feature array. If metric is a string, it must be one of the options
         allowed by scipy.spatial.distance.pdist for its metric parameter.
         Alternatively, if metric is a callable function, it is called on each
         pair of instances (rows) and the resulting value recorded.
@@ -171,30 +161,26 @@ class DBSCAN(BaseEstimator):
     verbose: boolean, optional
         The verbosity level
     is_similarity: boolean, optional (default=None)
-        Overrides the behaviour of the matrix handling of S.
-        If is_similarity is None, any square matrix is handled as a similarity
-        matrix and any non-square matrix is a feature matrix.
-        If is_similarity is True, any matrix is handled as a similarity matrix,
-        and the procedure will raise a ValueError if the matrix is not square.
-        If is_similarity is False, any matrix will be handled as a feature
-        matrix, including square matrices.
+        Overrides the behaviour of the array handling of S.
+        If is_similarity is None, any square array is handled as a similarity
+        array and any non-square array is a feature array.
+        If is_similarity is True, any array is handled as a similarity array,
+        and the procedure will raise a ValueError if the array is not square.
+        If is_similarity is False, any array will be handled as a feature
+        array, including square matrices.
 
     Methods
     -------
-
     fit:
         Compute the clustering
 
     Attributes
     ----------
+    core_points: array, shape = [n_core_points]
+        Indices of core points.
 
-    core_points: array [n_core_points]
-        index of core points
-
-    labels : array [n_points]
-        cluster labels for each point
-        Noisey points are given the label -1
-
+    labels : array, shape = [n_points]
+        Cluster labels for each point. Noisy points are given the label -1.
 
     Notes
     -----
@@ -219,25 +205,19 @@ def __init__(self, eps=0.5, min_points=5, metric='euclidean',
         self.is_similarity = is_similarity
 
     def fit(self, S, **params):
-        """Compute DBSCAN labels for points, using similarity matrix S.
+        """Compute DBSCAN labels for points, using similarity array S.
 
         Parameters
         ----------
-
         S: array [n_points, n_points] or [n_points, n_features]
-            Matrix of similarities between points, or a feature matrix.
-            If the matrix is square, it is treated as a similarity matrix,
-            otherwise it is treated as a feature matrix. Use is_similarity to
+            Array of similarities between points, or a feature array.
+            If the array is square, it is treated as a similarity array,
+            otherwise it is treated as a feature array. Use is_similarity to
             override this pattern.
-        params: Overwrite keywords from __init__
-
+        params: dict
+            Overwrite keywords from __init__.
         """
+
         self._set_params(**params)
-        self.core_points_, self.labels_ = dbscan(S, eps=self.eps,
-                                                 min_points=self.min_points,
-                                                 verbose=self.verbose,
-                                                 metric=self.metric,
-                                                 index_order=self.index_order,
-                                                 is_similarity=\
-                                                     self.is_similarity)
+        self.core_points_, self.labels_ = dbscan(S, **self._get_params())
         return self
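With this change, fit simply forwards the estimator's constructor parameters to the dbscan function through BaseEstimator's _set_params/_get_params machinery instead of spelling every keyword out. A usage sketch of the estimator, mirroring the calls in the tests below; the import path and the data are assumptions:

import numpy as np

from scikits.learn.cluster.dbscan_ import DBSCAN  # assumed import path

# Made-up feature array with two dense groups and one stray point.
X = np.array([[0.0, 0.0], [0.1, 0.0], [0.0, 0.1],
              [4.0, 4.0], [4.1, 4.0], [9.0, 9.0]])

db = DBSCAN()
# Keywords passed to fit() overwrite the values given to __init__ before
# being forwarded as dbscan(S, **self._get_params()).
db.fit(X, eps=0.85, min_points=2)
print(db.core_points_)
print(db.labels_)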

scikits/learn/cluster/tests/test_dbscan.py

Lines changed: 6 additions & 7 deletions
@@ -1,6 +1,5 @@
 """
-Testing for Clustering methods
-
+Tests for DBSCAN clustering algorithm
 """
 
 import numpy as np
@@ -17,7 +16,7 @@
 
 def test_dbscan_similarity():
     """
-    Tests the DBSCAN algorithm with a similarity matrix
+    Tests the DBSCAN algorithm with a similarity array
 
     """
     # Compute similarities
@@ -36,13 +35,13 @@ def test_dbscan_similarity():
     db = DBSCAN()
     labels = db.fit(S, eps=0.85, min_points=10).labels_
 
-    n_clusters_2 = len(set(labels)) - (1 if -1 in labels else 0)
+    n_clusters_2 = len(set(labels)) - int(-1 in labels)
     assert_equal(n_clusters, n_clusters_2)
 
 
 def test_dbscan_feature():
     """
-    Tests the DBSCAN algorithm with a features matrix
+    Tests the DBSCAN algorithm with a feature vector array
 
     """
     metric = 'euclidean'
@@ -52,12 +51,12 @@ def test_dbscan_feature():
                                  eps=0.85, min_points=10)
 
     # number of clusters, ignoring noise if present
-    n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)
+    n_clusters_1 = len(set(labels)) - int(-1 in labels)
     assert_equal(n_clusters, n_clusters_1)
 
     db = DBSCAN()
     labels = db.fit(X, metric=metric,
                     eps=0.85, min_points=10).labels_
 
-    n_clusters_2 = len(set(labels)) - (1 if -1 in labels else 0)
+    n_clusters_2 = len(set(labels)) - int(-1 in labels)
     assert_equal(n_clusters, n_clusters_2)
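The updated counting expression in both tests relies on int(-1 in labels) being 1 exactly when a noise label is present, so the noise "cluster" is excluded from the count. A standalone check with toy labels (not from the test data):

labels = [0, 0, 1, -1, 1, 2, -1]
print(len(set(labels)) - int(-1 in labels))  # -> 3: clusters 0, 1 and 2

labels = [0, 0, 1]
print(len(set(labels)) - int(-1 in labels))  # -> 2: no noise label to subtract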

0 commit comments
