FIX & TST at least min_samples to be considered a core sample · raghavrv/scikit-learn@2e99294 · GitHub
[go: up one dir, main page]

Skip to content

Commit 2e99294

Browse files
committed
FIX & TST at least min_samples to be considered a core sample
1 parent 27d4376 commit 2e99294

File tree

2 files changed

+35
-1
lines changed

2 files changed

+35
-1
lines changed

sklearn/cluster/dbscan_.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski',
135135
labels = -np.ones(X.shape[0], dtype=np.intp)
136136

137137
# A list of all core samples found.
138-
core_samples = np.asarray(n_neighbors > min_samples, dtype=np.uint8)
138+
core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)
139139
dbscan_inner(core_samples, neighborhoods, labels)
140140
return np.where(core_samples)[0], labels
141141

sklearn/cluster/tests/test_dbscan.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,3 +259,37 @@ def test_weighted_dbscan():
259259
assert_array_equal(core1, core5)
260260
assert_array_equal(label1, label5)
261261
assert_array_equal(label1, est.labels_)
262+
263+
264+
def test_dbscan_core_samples_toy():
    """Check core samples and labels on a 1-D toy dataset for growing min_samples.

    The dataset has one denser area (the samples 2, 3 and 4) surrounded by
    isolated samples. With ``eps=1`` fixed, ``min_samples`` is raised from 1
    to 4 and the returned core sample indices and labels are compared with
    the expected values for every neighbors search algorithm.
    """
    X = [[0], [2], [3], [4], [6], [8], [10]]
    n_samples = len(X)

    # Each scenario is (min_samples, expected core sample indices,
    # expected labels); eps is 1 in all of them.
    scenarios = [
        # Degenerate case: every sample is a core sample, either with its
        # own cluster or including other close core samples.
        (1, np.arange(n_samples), [0, 1, 1, 1, 2, 3, 4]),
        # Only the 3 samples from the denser area are core samples. All
        # other points are isolated and considered noise.
        (2, [1, 2, 3], [-1, 0, 0, 0, -1, -1, -1]),
        # Only the sample in the middle of the dense area is core. Its two
        # neighbors are edge samples. Remaining samples are noise.
        (3, [2], [-1, 0, 0, 0, -1, -1, -1]),
        # It's no longer possible to extract core samples with eps=1:
        # everything is noise.
        (4, [], -np.ones(n_samples)),
    ]

    for algorithm in ['brute', 'kd_tree', 'ball_tree']:
        for min_samples, expected_core, expected_labels in scenarios:
            core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
                                          min_samples=min_samples)
            assert_array_equal(core_samples, expected_core)
            assert_array_equal(labels, expected_labels)

0 commit comments

Comments
 (0)
0