From 11f030770f50e40262f315026c25cfbad9f2ccea Mon Sep 17 00:00:00 2001 From: Akshay Date: Mon, 13 Feb 2017 20:39:32 +0530 Subject: [PATCH 1/6] Fixes incorrect output in DBSCAN --- sklearn/cluster/dbscan_.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py index a02db3feafb00..4585c50cfb426 100644 --- a/sklearn/cluster/dbscan_.py +++ b/sklearn/cluster/dbscan_.py @@ -125,6 +125,10 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, X_mask = X.data <= eps masked_indices = astype(X.indices, np.intp, copy=False)[X_mask] masked_indptr = np.cumsum(X_mask)[X.indptr[1:] - 1] + + if X.indptr[0] == X.indptr[1] == 0: # check if first row is all zero + masked_indptr[0] = 0 + # insert the diagonal: a point is its own neighbor, but 0 distance # means absence from sparse matrix data masked_indices = np.insert(masked_indices, masked_indptr, From fe924f96153039fa8d1f1f2357375f397598d6c4 Mon Sep 17 00:00:00 2001 From: Akshay Date: Wed, 15 Feb 2017 17:21:30 +0530 Subject: [PATCH 2/6] Added support for multiple initial row all zero --- sklearn/cluster/dbscan_.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py index 4585c50cfb426..9c3e3317c2502 100644 --- a/sklearn/cluster/dbscan_.py +++ b/sklearn/cluster/dbscan_.py @@ -124,10 +124,8 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, X.sum_duplicates() # XXX: modifies X's internals in-place X_mask = X.data <= eps masked_indices = astype(X.indices, np.intp, copy=False)[X_mask] - masked_indptr = np.cumsum(X_mask)[X.indptr[1:] - 1] - - if X.indptr[0] == X.indptr[1] == 0: # check if first row is all zero - masked_indptr[0] = 0 + masked_indptr = np.concatenate(([0], np.cumsum(X_mask)), + axis=0)[X.indptr[1:]] # insert the diagonal: a point is its own neighbor, but 0 distance # means absence from sparse matrix data From 
4875516794267b4526c7b01f9bf3881d83d81fcf Mon Sep 17 00:00:00 2001 From: Akshay Date: Wed, 15 Feb 2017 18:18:16 +0530 Subject: [PATCH 3/6] Added a test for issue #8306 --- sklearn/cluster/tests/test_dbscan.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py index b4b34dcefb822..1dee674e49af7 100644 --- a/sklearn/cluster/tests/test_dbscan.py +++ b/sklearn/cluster/tests/test_dbscan.py @@ -350,3 +350,20 @@ def test_dbscan_precomputed_metric_with_degenerate_input_arrays(): X = np.zeros((10, 10)) labels = DBSCAN(eps=0.5, metric='precomputed').fit(X).labels_ assert_equal(len(set(labels)), 1) + + +def test_dbscan_precomputed_metric_with_initial_rows_zero(): + # sample matrix with initial two row all zero + ar = np.array([ + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0], + [0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.3], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1], + [0.0, 0.0, 0.0, 0.0, 0.3, 0.1, 0.0] + ]) + matrix = sparse.csr_matrix(ar) + labels = DBSCAN(eps=0.2, metric='precomputed', + min_samples=2).fit(matrix).labels_ + assert_array_equal(labels, [-1, -1, 0, 0, 0, 1, 1]) From 749733b48c4441a78f05cbb60968c25a6c564fec Mon Sep 17 00:00:00 2001 From: Akshay Date: Wed, 15 Feb 2017 18:25:00 +0530 Subject: [PATCH 4/6] Added entry in whats_new.rst for issue #8306 --- doc/whats_new.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index d110b509d6c79..8e739ecc8fa7a 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -149,6 +149,10 @@ Enhancements Bug fixes ......... + - Fixed a bug where :class:`sklearn.cluster.DBSCAN` gives incorrect + result when input is precomputed sparse matrix with initial rows + all zero. 
+ :issue:`8306` by :user:`Akshay Gupta <Akshay0724>` - Fixed a bug where :func:`sklearn.datasets.make_moons` gives an incorrect result when ``n_samples`` is odd. From dcb54339a6e608908cefd4d1a2172ee97ab357ec Mon Sep 17 00:00:00 2001 From: Akshay Date: Wed, 15 Feb 2017 18:53:36 +0530 Subject: [PATCH 5/6] Removed axis=0 --- sklearn/cluster/dbscan_.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py index 9c3e3317c2502..2f3374027d7c2 100644 --- a/sklearn/cluster/dbscan_.py +++ b/sklearn/cluster/dbscan_.py @@ -124,8 +124,7 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, X.sum_duplicates() # XXX: modifies X's internals in-place X_mask = X.data <= eps masked_indices = astype(X.indices, np.intp, copy=False)[X_mask] - masked_indptr = np.concatenate(([0], np.cumsum(X_mask)), - axis=0)[X.indptr[1:]] + masked_indptr = np.concatenate(([0], np.cumsum(X_mask)))[X.indptr[1:]] # insert the diagonal: a point is its own neighbor, but 0 distance # means absence from sparse matrix data From 62ff80f3e0d949125036d5e06f46424a153de5d5 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 23 Feb 2017 21:28:54 +1100 Subject: [PATCH 6/6] Wording in changelog --- doc/whats_new.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 8e739ecc8fa7a..71af69785f943 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -150,8 +150,8 @@ Enhancements Bug fixes ......... - Fixed a bug where :class:`sklearn.cluster.DBSCAN` gives incorrect - result when input is precomputed sparse matrix with initial rows - all zero. + result when input is a precomputed sparse matrix with initial + rows all zero. :issue:`8306` by :user:`Akshay Gupta <Akshay0724>` - Fixed a bug where :func:`sklearn.datasets.make_moons` gives an