From 5f7e6de69e425cfdfeeb82cf692b81fe22589dd5 Mon Sep 17 00:00:00 2001 From: huntzhan Date: Fri, 23 Aug 2019 11:53:31 +0800 Subject: [PATCH 01/28] Try csr support. --- sklearn/cluster/optics_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index ecf5fa6a2bcc0..732c7cfb352fc 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -233,7 +233,7 @@ def fit(self, X, y=None): self : instance of OPTICS The instance. """ - X = check_array(X, dtype=np.float) + X = check_array(X, accept_sparse='csr') if self.cluster_method not in ['dbscan', 'xi']: raise ValueError("cluster_method should be one of" From 18425aaffc785024d8baaa2bb31e4fd52427eed8 Mon Sep 17 00:00:00 2001 From: huntzhan Date: Fri, 23 Aug 2019 16:15:39 +0800 Subject: [PATCH 02/28] Change the default metric of OPTICS to euclidean. --- sklearn/cluster/optics_.py | 4 ++-- sklearn/cluster/tests/test_optics.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 732c7cfb352fc..26734da87d778 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -194,7 +194,7 @@ class OPTICS(BaseEstimator, ClusterMixin): the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329. 
""" - def __init__(self, min_samples=5, max_eps=np.inf, metric='minkowski', p=2, + def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean', p=2, metric_params=None, cluster_method='xi', eps=None, xi=0.05, predecessor_correction=True, min_cluster_size=None, algorithm='auto', leaf_size=30, n_jobs=None): @@ -517,7 +517,7 @@ def _set_reach_dist(core_distances_, reachability_, predecessor_, # the same logic as neighbors, p is ignored if explicitly set # in the dict params _params['p'] = p - dists = pairwise_distances(P, np.take(X, unproc, axis=0), + dists = pairwise_distances(P, X[unproc], metric, n_jobs=None, **_params).ravel() diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 5ae8b3f898fcf..b7e46fe18e921 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -352,7 +352,7 @@ def test_compare_to_ELKI(): # Tests against known extraction array # Does NOT work with metric='euclidean', because sklearn euclidean has # worse numeric precision. 'minkowski' is slower but more accurate. - clust1 = OPTICS(min_samples=5).fit(X) + clust1 = OPTICS(metric='minkowski', min_samples=5).fit(X) assert_array_equal(clust1.ordering_, np.array(o1)) assert_array_equal(clust1.predecessor_[clust1.ordering_], np.array(p1)) @@ -386,7 +386,7 @@ def test_compare_to_ELKI(): 11, 19, 15, 10, 47, -1, 20, 22, 25, 25, 25, 25, 22, 22, 23, -1, 30, 30, 34, 34, 34, 32, 32, 37, 38, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] - clust2 = OPTICS(min_samples=5, max_eps=0.5).fit(X) + clust2 = OPTICS(metric='minkowski', min_samples=5, max_eps=0.5).fit(X) assert_array_equal(clust2.ordering_, np.array(o2)) assert_array_equal(clust2.predecessor_[clust2.ordering_], np.array(p2)) From f712b46baa1ad3ca25d52f85f01162e4e613f1a7 Mon Sep 17 00:00:00 2001 From: huntzhan Date: Mon, 26 Aug 2019 11:14:49 +0800 Subject: [PATCH 03/28] Retain the default metric minkowski. 
--- sklearn/cluster/optics_.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 26734da87d778..d3faed82d8650 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -19,6 +19,7 @@ from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClusterMixin from ..metrics import pairwise_distances +from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS class OPTICS(BaseEstimator, ClusterMixin): @@ -194,7 +195,7 @@ class OPTICS(BaseEstimator, ClusterMixin): the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329. """ - def __init__(self, min_samples=5, max_eps=np.inf, metric='euclidean', p=2, + def __init__(self, min_samples=5, max_eps=np.inf, metric='minkowski', p=2, metric_params=None, cluster_method='xi', eps=None, xi=0.05, predecessor_correction=True, min_cluster_size=None, algorithm='auto', leaf_size=30, n_jobs=None): @@ -222,7 +223,8 @@ def fit(self, X, y=None): Parameters ---------- X : array, shape (n_samples, n_features), or (n_samples, n_samples) \ -if metric=’precomputed’. +if metric=’precomputed’, or sparse matrix \ + if metric in ['cityblock', 'cosine', 'euclidean', 'haversine', 'l2', 'l1', 'manhattan']. A feature array, or array of distances between samples if metric='precomputed'. @@ -233,7 +235,10 @@ def fit(self, X, y=None): self : instance of OPTICS The instance. """ - X = check_array(X, accept_sparse='csr') + if self.metric in PAIRWISE_DISTANCE_FUNCTIONS: + X = check_array(X, accept_sparse='csr') + else: + X = check_array(X) if self.cluster_method not in ['dbscan', 'xi']: raise ValueError("cluster_method should be one of" From 876bb2935599e37d5c827a83fbcee12829101e6d Mon Sep 17 00:00:00 2001 From: huntzhan Date: Mon, 26 Aug 2019 11:17:14 +0800 Subject: [PATCH 04/28] Undo tests. 
--- sklearn/cluster/tests/test_optics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index b7e46fe18e921..5ae8b3f898fcf 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -352,7 +352,7 @@ def test_compare_to_ELKI(): # Tests against known extraction array # Does NOT work with metric='euclidean', because sklearn euclidean has # worse numeric precision. 'minkowski' is slower but more accurate. - clust1 = OPTICS(metric='minkowski', min_samples=5).fit(X) + clust1 = OPTICS(min_samples=5).fit(X) assert_array_equal(clust1.ordering_, np.array(o1)) assert_array_equal(clust1.predecessor_[clust1.ordering_], np.array(p1)) @@ -386,7 +386,7 @@ def test_compare_to_ELKI(): 11, 19, 15, 10, 47, -1, 20, 22, 25, 25, 25, 25, 22, 22, 23, -1, 30, 30, 34, 34, 34, 32, 32, 37, 38, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] - clust2 = OPTICS(metric='minkowski', min_samples=5, max_eps=0.5).fit(X) + clust2 = OPTICS(min_samples=5, max_eps=0.5).fit(X) assert_array_equal(clust2.ordering_, np.array(o2)) assert_array_equal(clust2.predecessor_[clust2.ordering_], np.array(p2)) From 71b5c1e88da2424b2289779c29c20c78040ee861 Mon Sep 17 00:00:00 2001 From: huntzhan Date: Mon, 26 Aug 2019 11:24:18 +0800 Subject: [PATCH 05/28] Fix flake8. --- sklearn/cluster/optics_.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index d3faed82d8650..3864750901509 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -223,8 +223,9 @@ def fit(self, X, y=None): Parameters ---------- X : array, shape (n_samples, n_features), or (n_samples, n_samples) \ -if metric=’precomputed’, or sparse matrix \ - if metric in ['cityblock', 'cosine', 'euclidean', 'haversine', 'l2', 'l1', 'manhattan']. 
+if metric=’precomputed’, or sparse matrix (n_samples, n_features) if metric + in ['cityblock', 'cosine', 'euclidean', 'haversine', 'l2', 'l1', + 'manhattan']. A feature array, or array of distances between samples if metric='precomputed'. From 6f498a9206f8e0b0a7422bb7eeb571918d4f7dd3 Mon Sep 17 00:00:00 2001 From: huntzhan Date: Mon, 26 Aug 2019 12:40:06 +0800 Subject: [PATCH 06/28] Add sparse tests. --- sklearn/cluster/optics_.py | 4 +- sklearn/cluster/tests/test_optics.py | 197 ++++++++++++++++++++------- 2 files changed, 151 insertions(+), 50 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 3864750901509..be417b98cb7ab 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -236,7 +236,9 @@ def fit(self, X, y=None): self : instance of OPTICS The instance. """ - if self.metric in PAIRWISE_DISTANCE_FUNCTIONS: + # TODO: Support the sparse input for metric = 'precopmuted'. + if self.metric != 'precomputed' \ + and self.metric in PAIRWISE_DISTANCE_FUNCTIONS: X = check_array(X, accept_sparse='csr') else: X = check_array(X) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 5ae8b3f898fcf..f251978a2eab4 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from scipy import sparse + from sklearn.datasets.samples_generator import make_blobs from sklearn.cluster.optics_ import (OPTICS, _extend_region, @@ -101,12 +103,22 @@ def test_extract_xi(): xi=0.4).fit(X) assert_array_equal(clust.labels_, expected_labels) + clust = OPTICS(min_samples=3, min_cluster_size=2, + max_eps=20, cluster_method='xi', + xi=0.4, metric='euclidean').fit(sparse.lil_matrix(X)) + assert_array_equal(clust.labels_, expected_labels) + # check float min_samples and min_cluster_size clust = OPTICS(min_samples=0.1, min_cluster_size=0.08, max_eps=20, cluster_method='xi', xi=0.4).fit(X) 
assert_array_equal(clust.labels_, expected_labels) + clust = OPTICS(min_samples=0.1, min_cluster_size=0.08, + max_eps=20, cluster_method='xi', + xi=0.4, metric='euclidean').fit(sparse.lil_matrix(X)) + assert_array_equal(clust.labels_, expected_labels) + X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6)) expected_labels = np.r_[[1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5, -1, -1, [4] * 5] @@ -118,6 +130,11 @@ def test_extract_xi(): # this may fail if the predecessor correction is not at work! assert_array_equal(clust.labels_, expected_labels) + clust = OPTICS(min_samples=3, min_cluster_size=3, + max_eps=20, cluster_method='xi', + xi=0.3, metric='euclidean').fit(sparse.lil_matrix(X)) + assert_array_equal(clust.labels_, expected_labels) + C1 = [[0, 0], [0, 0.1], [0, -.1], [0.1, 0]] C2 = [[10, 10], [10, 9], [10, 11], [9, 10]] C3 = [[100, 100], [100, 90], [100, 110], [90, 100]] @@ -130,6 +147,11 @@ def test_extract_xi(): xi=0.04).fit(X) assert_array_equal(clust.labels_, expected_labels) + clust = OPTICS(min_samples=2, min_cluster_size=2, + max_eps=np.inf, cluster_method='xi', + xi=0.04, metric='euclidean').fit(sparse.lil_matrix(X)) + assert_array_equal(clust.labels_, expected_labels) + def test_cluster_hierarchy_(): rng = np.random.RandomState(0) @@ -144,33 +166,42 @@ def test_cluster_hierarchy_(): diff = np.sum(clusters - np.array([[0, 99], [0, 199]])) assert diff / len(X) < 0.05 + clust = OPTICS(min_samples=20, xi=.1, + metric='euclidean').fit(sparse.lil_matrix(X)) + clusters = clust.cluster_hierarchy_ + assert clusters.shape == (2, 2) + diff = np.sum(clusters - np.array([[0, 99], [0, 199]])) + assert diff / len(X) < 0.05 + def test_correct_number_of_clusters(): # in 'auto' mode n_clusters = 3 X = generate_clustered_data(n_clusters=n_clusters) - # Parameters chosen specifically for this task. 
- # Compute OPTICS - clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=.1) - clust.fit(X) - # number of clusters, ignoring noise if present - n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_) - assert n_clusters_1 == n_clusters - # check attribute types and sizes - assert clust.labels_.shape == (len(X),) - assert clust.labels_.dtype.kind == 'i' + for metric in ['minkowski', 'euclidean']: + # Parameters chosen specifically for this task. + # Compute OPTICS + clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=.1, metric=metric) + clust.fit(X if metric == 'minkowski' else sparse.lil_matrix(X)) + # number of clusters, ignoring noise if present + n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_) + assert n_clusters_1 == n_clusters - assert clust.reachability_.shape == (len(X),) - assert clust.reachability_.dtype.kind == 'f' + # check attribute types and sizes + assert clust.labels_.shape == (len(X),) + assert clust.labels_.dtype.kind == 'i' - assert clust.core_distances_.shape == (len(X),) - assert clust.core_distances_.dtype.kind == 'f' + assert clust.reachability_.shape == (len(X),) + assert clust.reachability_.dtype.kind == 'f' - assert clust.ordering_.shape == (len(X),) - assert clust.ordering_.dtype.kind == 'i' - assert set(clust.ordering_) == set(range(len(X))) + assert clust.core_distances_.shape == (len(X),) + assert clust.core_distances_.dtype.kind == 'f' + + assert clust.ordering_.shape == (len(X),) + assert clust.ordering_.dtype.kind == 'i' + assert set(clust.ordering_) == set(range(len(X))) def test_minimum_number_of_sample_check(): @@ -184,6 +215,14 @@ def test_minimum_number_of_sample_check(): # Run the fit assert_raise_message(ValueError, msg, clust.fit, X) + # Compute OPTICS + X = sparse.lil_matrix([[1, 1]]) + clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1, + metric='euclidean') + + # Run the fit + assert_raise_message(ValueError, msg, clust.fit, X) + def test_bad_extract(): # Test an extraction of 
eps too close to original eps @@ -198,6 +237,13 @@ def test_bad_extract(): eps=0.3, min_samples=10) assert_raise_message(ValueError, msg, clust.fit, X) + # Compute OPTICS + clust = OPTICS(max_eps=5.0 * 0.03, + cluster_method='dbscan', + eps=0.3, min_samples=10, + metric='euclidean') + assert_raise_message(ValueError, msg, clust.fit, sparse.lil_matrix(X)) + def test_bad_reachability(): msg = "All reachability values are inf. Set a larger max_eps." @@ -209,6 +255,11 @@ def test_bad_reachability(): clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015) clust.fit(X) + with pytest.warns(UserWarning, match=msg): + clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015, + metric='euclidean') + clust.fit(sparse.lil_matrix(X)) + def test_close_extract(): # Test extract where extraction eps is close to scaled max_eps @@ -223,32 +274,43 @@ def test_close_extract(): # Cluster ordering starts at 0; max cluster label = 2 is 3 clusters assert max(clust.labels_) == 2 + # Compute OPTICS + clust = OPTICS(max_eps=1.0, cluster_method='dbscan', + eps=0.3, min_samples=10, + metric='euclidean').fit(sparse.lil_matrix(X)) + # Cluster ordering starts at 0; max cluster label = 2 is 3 clusters + assert max(clust.labels_) == 2 + @pytest.mark.parametrize('eps', [0.1, .3, .5]) @pytest.mark.parametrize('min_samples', [3, 10, 20]) def test_dbscan_optics_parity(eps, min_samples): # Test that OPTICS clustering labels are <= 5% difference of DBSCAN - centers = [[1, 1], [-1, -1], [1, -1]] - X, labels_true = make_blobs(n_samples=750, centers=centers, - cluster_std=0.4, random_state=0) + for metric in ['minkowski', 'euclidean']: - # calculate optics with dbscan extract at 0.3 epsilon - op = OPTICS(min_samples=min_samples, cluster_method='dbscan', - eps=eps).fit(X) + centers = [[1, 1], [-1, -1], [1, -1]] + _X, labels_true = make_blobs(n_samples=750, centers=centers, + cluster_std=0.4, random_state=0) + X = _X if metric == 'minkowski' else sparse.lil_matrix(_X) - # calculate dbscan labels - 
db = DBSCAN(eps=eps, min_samples=min_samples).fit(X) + # calculate optics with dbscan extract at 0.3 epsilon + op = OPTICS(min_samples=min_samples, cluster_method='dbscan', + eps=eps, + metric=metric).fit(X) - contingency = contingency_matrix(db.labels_, op.labels_) - agree = min(np.sum(np.max(contingency, axis=0)), - np.sum(np.max(contingency, axis=1))) - disagree = X.shape[0] - agree + # calculate dbscan labels + db = DBSCAN(eps=eps, min_samples=min_samples).fit(X) - percent_mismatch = np.round((disagree - 1) / X.shape[0], 2) + contingency = contingency_matrix(db.labels_, op.labels_) + agree = min(np.sum(np.max(contingency, axis=0)), + np.sum(np.max(contingency, axis=1))) + disagree = X.shape[0] - agree - # verify label mismatch is <= 5% labels - assert percent_mismatch <= 0.05 + percent_mismatch = np.round((disagree - 1) / X.shape[0], 2) + + # verify label mismatch is <= 5% labels + assert percent_mismatch <= 0.05 def test_min_samples_edge_case(): @@ -263,12 +325,24 @@ def test_min_samples_edge_case(): xi=0.04).fit(X) assert_array_equal(clust.labels_, expected_labels) + clust = OPTICS(min_samples=3, + max_eps=7, cluster_method='xi', + xi=0.04, + metric='euclidean').fit(sparse.lil_matrix(X)) + assert_array_equal(clust.labels_, expected_labels) + expected_labels = np.r_[[0] * 3, [1] * 3, [-1] * 3] clust = OPTICS(min_samples=3, max_eps=3, cluster_method='xi', xi=0.04).fit(X) assert_array_equal(clust.labels_, expected_labels) + clust = OPTICS(min_samples=3, + max_eps=3, cluster_method='xi', + xi=0.04, + metric='euclidean').fit(sparse.lil_matrix(X)) + assert_array_equal(clust.labels_, expected_labels) + expected_labels = np.r_[[-1] * 9] with pytest.warns(UserWarning, match="All reachability values"): clust = OPTICS(min_samples=4, @@ -276,20 +350,33 @@ def test_min_samples_edge_case(): xi=0.04).fit(X) assert_array_equal(clust.labels_, expected_labels) + with pytest.warns(UserWarning, match="All reachability values"): + clust = OPTICS(min_samples=4, + max_eps=3, 
cluster_method='xi', + xi=0.04, + metric='euclidean').fit(sparse.lil_matrix(X)) + assert_array_equal(clust.labels_, expected_labels) + # try arbitrary minimum sizes @pytest.mark.parametrize('min_cluster_size', range(2, X.shape[0] // 10, 23)) def test_min_cluster_size(min_cluster_size): - redX = X[::2] # reduce for speed - clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size).fit(redX) - cluster_sizes = np.bincount(clust.labels_[clust.labels_ != -1]) - if cluster_sizes.size: - assert min(cluster_sizes) >= min_cluster_size - # check behaviour is the same when min_cluster_size is a fraction - clust_frac = OPTICS(min_samples=9, - min_cluster_size=min_cluster_size / redX.shape[0]) - clust_frac.fit(redX) - assert_array_equal(clust.labels_, clust_frac.labels_) + _redX = X[::2] # reduce for speed + + for metric in ['minkowski', 'euclidean']: + redX = _redX if metric == 'minkowski' else sparse.lil_matrix(_redX) + + clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size, + metric=metric).fit(redX) + cluster_sizes = np.bincount(clust.labels_[clust.labels_ != -1]) + if cluster_sizes.size: + assert min(cluster_sizes) >= min_cluster_size + # check behaviour is the same when min_cluster_size is a fraction + clust_frac = OPTICS(min_samples=9, + min_cluster_size=min_cluster_size / redX.shape[0], + metric=metric) + clust_frac.fit(redX) + assert_array_equal(clust.labels_, clust_frac.labels_) @pytest.mark.parametrize('min_cluster_size', [0, -1, 1.1, 2.2]) @@ -298,21 +385,33 @@ def test_min_cluster_size_invalid(min_cluster_size): with pytest.raises(ValueError, match="must be a positive integer or a "): clust.fit(X) + clust = OPTICS(min_cluster_size=min_cluster_size, metric='euclidean') + with pytest.raises(ValueError, match="must be a positive integer or a "): + clust.fit(sparse.lil_matrix(X)) + def test_min_cluster_size_invalid2(): clust = OPTICS(min_cluster_size=len(X) + 1) with pytest.raises(ValueError, match="must be no greater than the "): clust.fit(X) + clust 
= OPTICS(min_cluster_size=len(X) + 1, metric='euclidean') + with pytest.raises(ValueError, match="must be no greater than the "): + clust.fit(sparse.lil_matrix(X)) + def test_processing_order(): - # Ensure that we consider all unprocessed points, - # not only direct neighbors. when picking the next point. - Y = [[0], [10], [-10], [25]] - clust = OPTICS(min_samples=3, max_eps=15).fit(Y) - assert_array_equal(clust.reachability_, [np.inf, 10, 10, 15]) - assert_array_equal(clust.core_distances_, [10, 15, np.inf, np.inf]) - assert_array_equal(clust.ordering_, [0, 1, 2, 3]) + for metric in ['minkowski', 'euclidean']: + + # Ensure that we consider all unprocessed points, + # not only direct neighbors. when picking the next point. + _Y = [[0], [10], [-10], [25]] + Y = _Y if metric == 'minkowski' else sparse.lil_matrix(_Y) + + clust = OPTICS(min_samples=3, max_eps=15, metric=metric).fit(Y) + assert_array_equal(clust.reachability_, [np.inf, 10, 10, 15]) + assert_array_equal(clust.core_distances_, [10, 15, np.inf, np.inf]) + assert_array_equal(clust.ordering_, [0, 1, 2, 3]) def test_compare_to_ELKI(): From 0f832d4ffdf3a161f118ea865d6bb6dc8bba3eaa Mon Sep 17 00:00:00 2001 From: Clickedbigfoot Date: Tue, 17 Aug 2021 17:45:02 -0500 Subject: [PATCH 07/28] Change assert_raise_message to pytest.raises --- sklearn/cluster/tests/test_optics.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 775872e3f0c9a..0775455e9180b 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -225,7 +225,8 @@ def test_minimum_number_of_sample_check(): metric='euclidean') # Run the fit - assert_raise_message(ValueError, msg, clust.fit, X) + with pytest.raises(ValueError, match=msg): + clust.fit(X) def test_bad_extract(): @@ -246,7 +247,8 @@ def test_bad_extract(): cluster_method='dbscan', eps=0.3, min_samples=10, metric='euclidean') - 
assert_raise_message(ValueError, msg, clust.fit, sparse.lil_matrix(X)) + with pytest.raises(ValueError, match=msg): + clust.fit(X) def test_bad_reachability(): From b237b05e091630a2a6b1249cbba52cd4c1f6b78d Mon Sep 17 00:00:00 2001 From: Clickedbigfoot Date: Wed, 18 Aug 2021 17:21:16 -0500 Subject: [PATCH 08/28] Parametrized tests --- sklearn/cluster/tests/test_optics.py | 308 ++++++++++++--------------- 1 file changed, 140 insertions(+), 168 deletions(-) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 0775455e9180b..532038b4dc7b0 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -82,8 +82,8 @@ def test_the_extract_xi_labels(ordering, clusters, expected): assert_array_equal(labels, expected) - -def test_extract_xi(): +@pytest.mark.parametrize("metric", ['minkowski', 'euclidean']) +def test_extract_xi(metric): # small and easy test (no clusters around other clusters) # but with a clear noise data. 
rng = np.random.RandomState(0) @@ -99,26 +99,17 @@ def test_extract_xi(): X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]]), C6)) expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5, -1, [4] * 5] X, expected_labels = shuffle(X, expected_labels, random_state=rng) + X = sparse.lil_matrix(X) if metric == 'euclidean' else X clust = OPTICS( - min_samples=3, min_cluster_size=2, max_eps=20, cluster_method="xi", xi=0.4 - ).fit(X) - assert_array_equal(clust.labels_, expected_labels) - - clust = OPTICS(min_samples=3, min_cluster_size=2, - max_eps=20, cluster_method='xi', - xi=0.4, metric='euclidean').fit(sparse.lil_matrix(X)) + min_samples=3, min_cluster_size=2, max_eps=20, cluster_method="xi", xi=0.4, + metric=metric).fit(X) assert_array_equal(clust.labels_, expected_labels) # check float min_samples and min_cluster_size clust = OPTICS( - min_samples=0.1, min_cluster_size=0.08, max_eps=20, cluster_method="xi", xi=0.4 - ).fit(X) - assert_array_equal(clust.labels_, expected_labels) - - clust = OPTICS(min_samples=0.1, min_cluster_size=0.08, - max_eps=20, cluster_method='xi', - xi=0.4, metric='euclidean').fit(sparse.lil_matrix(X)) + min_samples=0.1, min_cluster_size=0.08, max_eps=20, cluster_method="xi", xi=0.4, + metric=metric).fit(X) assert_array_equal(clust.labels_, expected_labels) X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6)) @@ -126,147 +117,126 @@ def test_extract_xi(): [1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5, -1, -1, [4] * 5 ] X, expected_labels = shuffle(X, expected_labels, random_state=rng) + X = sparse.lil_matrix(X) if metric == 'euclidean' else X clust = OPTICS( - min_samples=3, min_cluster_size=3, max_eps=20, cluster_method="xi", xi=0.3 - ).fit(X) + min_samples=3, min_cluster_size=3, max_eps=20, cluster_method="xi", xi=0.3, + metric=metric).fit(X) # this may fail if the predecessor correction is not at work! 
assert_array_equal(clust.labels_, expected_labels) - clust = OPTICS(min_samples=3, min_cluster_size=3, - max_eps=20, cluster_method='xi', - xi=0.3, metric='euclidean').fit(sparse.lil_matrix(X)) - assert_array_equal(clust.labels_, expected_labels) - C1 = [[0, 0], [0, 0.1], [0, -0.1], [0.1, 0]] C2 = [[10, 10], [10, 9], [10, 11], [9, 10]] C3 = [[100, 100], [100, 90], [100, 110], [90, 100]] X = np.vstack((C1, C2, C3)) expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4] X, expected_labels = shuffle(X, expected_labels, random_state=rng) + X = sparse.lil_matrix(X) if metric == 'euclidean' else X clust = OPTICS( - min_samples=2, min_cluster_size=2, max_eps=np.inf, cluster_method="xi", xi=0.04 - ).fit(X) - assert_array_equal(clust.labels_, expected_labels) - - clust = OPTICS(min_samples=2, min_cluster_size=2, - max_eps=np.inf, cluster_method='xi', - xi=0.04, metric='euclidean').fit(sparse.lil_matrix(X)) + min_samples=2, min_cluster_size=2, max_eps=np.inf, cluster_method="xi", xi=0.04, + metric=metric).fit(X) assert_array_equal(clust.labels_, expected_labels) -def test_cluster_hierarchy_(): +@pytest.mark.parametrize("metric", ['minkowski', 'euclidean']) +def test_cluster_hierarchy_(metric): rng = np.random.RandomState(0) n_points_per_cluster = 100 C1 = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2) C2 = [0, 0] + 50 * rng.randn(n_points_per_cluster, 2) X = np.vstack((C1, C2)) X = shuffle(X, random_state=0) + X = sparse.lil_matrix(X) if metric == 'euclidean' else X - clusters = OPTICS(min_samples=20, xi=0.1).fit(X).cluster_hierarchy_ - assert clusters.shape == (2, 2) - diff = np.sum(clusters - np.array([[0, 99], [0, 199]])) - assert diff / len(X) < 0.05 - - clust = OPTICS(min_samples=20, xi=.1, - metric='euclidean').fit(sparse.lil_matrix(X)) - clusters = clust.cluster_hierarchy_ + clusters = OPTICS(min_samples=20, xi=0.1, metric=metric).fit(X).cluster_hierarchy_ assert clusters.shape == (2, 2) diff = np.sum(clusters - np.array([[0, 99], [0, 199]])) - assert diff / len(X) < 
0.05 + X_len = X.getnnz(axis=0)[0] if metric == 'euclidean' else len(X) + assert diff / X_len < 0.05 -def test_correct_number_of_clusters(): +@pytest.mark.parametrize( + "metric, is_sparse", + [ + ['minkowski', False], + ['euclidean', False], + ['euclidean', True] + ] +) +def test_correct_number_of_clusters(metric, is_sparse): # in 'auto' mode n_clusters = 3 X = generate_clustered_data(n_clusters=n_clusters) - for metric in ['minkowski', 'euclidean']: - # Parameters chosen specifically for this task. - # Compute OPTICS - clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=0.1, metric=metric) - clust.fit(X if metric == 'minkowski' else sparse.lil_matrix(X)) - # number of clusters, ignoring noise if present - n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_) - assert n_clusters_1 == n_clusters + # Parameters chosen specifically for this task. + # Compute OPTICS + clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=0.1, metric=metric) + clust.fit(sparse.lil_matrix(X) if is_sparse else X) + # number of clusters, ignoring noise if present + n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_) + assert n_clusters_1 == n_clusters - # check attribute types and sizes - assert clust.labels_.shape == (len(X),) - assert clust.labels_.dtype.kind == 'i' + # check attribute types and sizes + assert clust.labels_.shape == (len(X),) + assert clust.labels_.dtype.kind == 'i' - assert clust.reachability_.shape == (len(X),) - assert clust.reachability_.dtype.kind == 'f' + assert clust.reachability_.shape == (len(X),) + assert clust.reachability_.dtype.kind == 'f' - assert clust.core_distances_.shape == (len(X),) - assert clust.core_distances_.dtype.kind == 'f' + assert clust.core_distances_.shape == (len(X),) + assert clust.core_distances_.dtype.kind == 'f' - assert clust.ordering_.shape == (len(X),) - assert clust.ordering_.dtype.kind == 'i' - assert set(clust.ordering_) == set(range(len(X))) + assert clust.ordering_.shape == (len(X),) + assert 
clust.ordering_.dtype.kind == 'i' + assert set(clust.ordering_) == set(range(len(X))) -def test_minimum_number_of_sample_check(): +@pytest.mark.parametrize("metric", ['minkowski', 'euclidean']) +def test_minimum_number_of_sample_check(metric): # test that we check a minimum number of samples msg = "min_samples must be no greater than" # Compute OPTICS X = [[1, 1]] - clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1) - - # Run the fit - with pytest.raises(ValueError, match=msg): - clust.fit(X) - - # Compute OPTICS - X = sparse.lil_matrix([[1, 1]]) - clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1, - metric='euclidean') + X = sparse.lil_matrix(X) if metric == 'euclidean' else X + clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1, metric=metric) # Run the fit with pytest.raises(ValueError, match=msg): clust.fit(X) -def test_bad_extract(): +@pytest.mark.parametrize("metric", ['minkowski', 'euclidean']) +def test_bad_extract(metric): # Test an extraction of eps too close to original eps msg = "Specify an epsilon smaller than 0.15. Got 0.3." centers = [[1, 1], [-1, -1], [1, -1]] X, labels_true = make_blobs( n_samples=750, centers=centers, cluster_std=0.4, random_state=0 ) + X = sparse.lil_matrix(X) if metric == 'euclidean' else X # Compute OPTICS - clust = OPTICS(max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10) - with pytest.raises(ValueError, match=msg): - clust.fit(X) - - # Compute OPTICS - clust = OPTICS(max_eps=5.0 * 0.03, - cluster_method='dbscan', - eps=0.3, min_samples=10, - metric='euclidean') + clust = OPTICS(max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10, metric=metric) with pytest.raises(ValueError, match=msg): clust.fit(X) -def test_bad_reachability(): +@pytest.mark.parametrize("metric", ['minkowski', 'euclidean']) +def test_bad_reachability(metric): msg = "All reachability values are inf. Set a larger max_eps." 
centers = [[1, 1], [-1, -1], [1, -1]] X, labels_true = make_blobs( n_samples=750, centers=centers, cluster_std=0.4, random_state=0 ) + X = sparse.lil_matrix(X) if metric == 'euclidean' else X with pytest.warns(UserWarning, match=msg): - clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015) + clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015, metric=metric) clust.fit(X) - with pytest.warns(UserWarning, match=msg): - clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015, - metric='euclidean') - clust.fit(sparse.lil_matrix(X)) - def test_nowarn_if_metric_bool_data_bool(): # make sure no warning is raised if metric and data are both boolean @@ -310,117 +280,114 @@ def test_nowarn_if_metric_no_bool(): OPTICS(metric=pairwise_metric).fit(X_num) assert len(warn_record) == 0 - -def test_close_extract(): +@pytest.mark.parametrize("metric", ['minkowski', 'euclidean']) +def test_close_extract(metric): # Test extract where extraction eps is close to scaled max_eps centers = [[1, 1], [-1, -1], [1, -1]] X, labels_true = make_blobs( n_samples=750, centers=centers, cluster_std=0.4, random_state=0 ) + X = sparse.lil_matrix(X) if metric == 'euclidean' else X # Compute OPTICS - clust = OPTICS(max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10).fit(X) - # Cluster ordering starts at 0; max cluster label = 2 is 3 clusters - assert max(clust.labels_) == 2 - - # Compute OPTICS - clust = OPTICS(max_eps=1.0, cluster_method='dbscan', - eps=0.3, min_samples=10, - metric='euclidean').fit(sparse.lil_matrix(X)) + clust = OPTICS(max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10, metric=metric).fit(X) # Cluster ordering starts at 0; max cluster label = 2 is 3 clusters assert max(clust.labels_) == 2 @pytest.mark.parametrize("eps", [0.1, 0.3, 0.5]) @pytest.mark.parametrize("min_samples", [3, 10, 20]) -def test_dbscan_optics_parity(eps, min_samples): +@pytest.mark.parametrize( + "metric, is_sparse", + [ + ['minkowski', False], + ['euclidean', False], 
+ ['euclidean', True] + ] +) +def test_dbscan_optics_parity(eps, min_samples, metric, is_sparse): # Test that OPTICS clustering labels are <= 5% difference of DBSCAN @TODO modified - for metric in ['minkowski', 'euclidean']: - - centers = [[1, 1], [-1, -1], [1, -1]] - _X, labels_true = make_blobs(n_samples=750, centers=centers, - cluster_std=0.4, random_state=0) - X = _X if metric == 'minkowski' else sparse.lil_matrix(_X) + centers = [[1, 1], [-1, -1], [1, -1]] + X, labels_true = make_blobs(n_samples=750, centers=centers, + cluster_std=0.4, random_state=0) + X = sparse.lil_matrix(X) if is_sparse else X - # calculate optics with dbscan extract at 0.3 epsilon - op = OPTICS(min_samples=min_samples, cluster_method='dbscan', - eps=eps, - metric=metric).fit(X) + # calculate optics with dbscan extract at 0.3 epsilon + op = OPTICS(min_samples=min_samples, cluster_method='dbscan', + eps=eps, + metric=metric).fit(X) - # calculate dbscan labels - db = DBSCAN(eps=eps, min_samples=min_samples).fit(X) + # calculate dbscan labels + db = DBSCAN(eps=eps, min_samples=min_samples).fit(X) - contingency = contingency_matrix(db.labels_, op.labels_) - agree = min(np.sum(np.max(contingency, axis=0)), - np.sum(np.max(contingency, axis=1))) - disagree = X.shape[0] - agree + contingency = contingency_matrix(db.labels_, op.labels_) + agree = min(np.sum(np.max(contingency, axis=0)), + np.sum(np.max(contingency, axis=1))) + disagree = X.shape[0] - agree - percent_mismatch = np.round((disagree - 1) / X.shape[0], 2) + percent_mismatch = np.round((disagree - 1) / X.shape[0], 2) - # verify label mismatch is <= 5% labels - assert percent_mismatch <= 0.05 + # verify label mismatch is <= 5% labels + assert percent_mismatch <= 0.05 -def test_min_samples_edge_case(): #@TODO modified for sparse +@pytest.mark.parametrize( + "metric, is_sparse", + [ + ['minkowski', False], + ['euclidean', False], + ['euclidean', True] + ] +) +def test_min_samples_edge_case(metric, is_sparse): C1 = [[0, 0], [0, 0.1], [0, 
-0.1]] C2 = [[10, 10], [10, 9], [10, 11]] C3 = [[100, 100], [100, 96], [100, 106]] X = np.vstack((C1, C2, C3)) + X = sparse.lil_matrix(X) if is_sparse else X expected_labels = np.r_[[0] * 3, [1] * 3, [2] * 3] - clust = OPTICS(min_samples=3, max_eps=7, cluster_method="xi", xi=0.04).fit(X) - assert_array_equal(clust.labels_, expected_labels) - - clust = OPTICS(min_samples=3, - max_eps=7, cluster_method='xi', - xi=0.04, - metric='euclidean').fit(sparse.lil_matrix(X)) + clust = OPTICS(min_samples=3, max_eps=7, cluster_method="xi", xi=0.04, metric=metric).fit(X) assert_array_equal(clust.labels_, expected_labels) expected_labels = np.r_[[0] * 3, [1] * 3, [-1] * 3] - clust = OPTICS(min_samples=3, max_eps=3, cluster_method="xi", xi=0.04).fit(X) - assert_array_equal(clust.labels_, expected_labels) - - clust = OPTICS(min_samples=3, - max_eps=3, cluster_method='xi', - xi=0.04, - metric='euclidean').fit(sparse.lil_matrix(X)) + clust = OPTICS(min_samples=3, max_eps=3, cluster_method="xi", xi=0.04, metric=metric).fit(X) assert_array_equal(clust.labels_, expected_labels) expected_labels = np.r_[[-1] * 9] with pytest.warns(UserWarning, match="All reachability values"): - clust = OPTICS(min_samples=4, max_eps=3, cluster_method="xi", xi=0.04).fit(X) - assert_array_equal(clust.labels_, expected_labels) - - with pytest.warns(UserWarning, match="All reachability values"): - clust = OPTICS(min_samples=4, - max_eps=3, cluster_method='xi', - xi=0.04, - metric='euclidean').fit(sparse.lil_matrix(X)) + clust = OPTICS(min_samples=4, max_eps=3, cluster_method="xi", xi=0.04, metric=metric).fit(X) assert_array_equal(clust.labels_, expected_labels) # try arbitrary minimum sizes @pytest.mark.parametrize("min_cluster_size", range(2, X.shape[0] // 10, 23)) -def test_min_cluster_size(min_cluster_size): - _redX = X[::2] # reduce for speed @TODO modified for sparse +@pytest.mark.parametrize( + "metric, is_sparse", + [ + ['minkowski', False], + ['euclidean', False], + ['euclidean', True] + ] +) +def 
test_min_cluster_size(min_cluster_size, metric, is_sparse): + redX = X[::2] # reduce for speed - for metric in ['minkowski', 'euclidean']: - redX = _redX if metric == 'minkowski' else sparse.lil_matrix(_redX) + redX = sparse.lil_matrix(redX) if is_sparse else redX - clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size, - metric=metric).fit(redX) - cluster_sizes = np.bincount(clust.labels_[clust.labels_ != -1]) - if cluster_sizes.size: - assert min(cluster_sizes) >= min_cluster_size - # check behaviour is the same when min_cluster_size is a fraction - clust_frac = OPTICS(min_samples=9, - min_cluster_size=min_cluster_size / redX.shape[0], - metric=metric) - clust_frac.fit(redX) - assert_array_equal(clust.labels_, clust_frac.labels_) + clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size, + metric=metric).fit(redX) + cluster_sizes = np.bincount(clust.labels_[clust.labels_ != -1]) + if cluster_sizes.size: + assert min(cluster_sizes) >= min_cluster_size + # check behaviour is the same when min_cluster_size is a fraction + clust_frac = OPTICS(min_samples=9, + min_cluster_size=min_cluster_size / redX.shape[0], + metric=metric) + clust_frac.fit(redX) + assert_array_equal(clust.labels_, clust_frac.labels_) @pytest.mark.parametrize("min_cluster_size", [0, -1, 1.1, 2.2]) @@ -443,19 +410,24 @@ def test_min_cluster_size_invalid2(): with pytest.raises(ValueError, match="must be no greater than the "): clust.fit(sparse.lil_matrix(X)) - -def test_processing_order(): - for metric in ['minkowski', 'euclidean']: - - # Ensure that we consider all unprocessed points, - # not only direct neighbors. when picking the next point. 
- _Y = [[0], [10], [-10], [25]] - Y = _Y if metric == 'minkowski' else sparse.lil_matrix(_Y) - - clust = OPTICS(min_samples=3, max_eps=15, metric=metric).fit(Y) - assert_array_equal(clust.reachability_, [np.inf, 10, 10, 15]) - assert_array_equal(clust.core_distances_, [10, 15, np.inf, np.inf]) - assert_array_equal(clust.ordering_, [0, 1, 2, 3]) +@pytest.mark.parametrize( + "metric, is_sparse", + [ + ['minkowski', False], + ['euclidean', False], + ['euclidean', True] + ] +) +def test_processing_order(metric, is_sparse): + # Ensure that we consider all unprocessed points, + # not only direct neighbors. when picking the next point. + Y = [[0], [10], [-10], [25]] + Y = sparse.lil_matrix(Y) if is_sparse else Y + + clust = OPTICS(min_samples=3, max_eps=15, metric=metric).fit(Y) + assert_array_equal(clust.reachability_, [np.inf, 10, 10, 15]) + assert_array_equal(clust.core_distances_, [10, 15, np.inf, np.inf]) + assert_array_equal(clust.ordering_, [0, 1, 2, 3]) def test_compare_to_ELKI(): From a00301027716583f5293d7954281d62d4c645d2f Mon Sep 17 00:00:00 2001 From: Clickedbigfoot Date: Wed, 18 Aug 2021 17:26:38 -0500 Subject: [PATCH 09/28] Fix flake8 test_optics.py --- sklearn/cluster/tests/test_optics.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 532038b4dc7b0..72f59f29f70b9 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -82,6 +82,7 @@ def test_the_extract_xi_labels(ordering, clusters, expected): assert_array_equal(labels, expected) + @pytest.mark.parametrize("metric", ['minkowski', 'euclidean']) def test_extract_xi(metric): # small and easy test (no clusters around other clusters) @@ -103,13 +104,13 @@ def test_extract_xi(metric): clust = OPTICS( min_samples=3, min_cluster_size=2, max_eps=20, cluster_method="xi", xi=0.4, - metric=metric).fit(X) + metric=metric).fit(X) 
assert_array_equal(clust.labels_, expected_labels) # check float min_samples and min_cluster_size clust = OPTICS( min_samples=0.1, min_cluster_size=0.08, max_eps=20, cluster_method="xi", xi=0.4, - metric=metric).fit(X) + metric=metric).fit(X) assert_array_equal(clust.labels_, expected_labels) X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6)) @@ -121,7 +122,7 @@ def test_extract_xi(metric): clust = OPTICS( min_samples=3, min_cluster_size=3, max_eps=20, cluster_method="xi", xi=0.3, - metric=metric).fit(X) + metric=metric).fit(X) # this may fail if the predecessor correction is not at work! assert_array_equal(clust.labels_, expected_labels) @@ -135,7 +136,7 @@ def test_extract_xi(metric): clust = OPTICS( min_samples=2, min_cluster_size=2, max_eps=np.inf, cluster_method="xi", xi=0.04, - metric=metric).fit(X) + metric=metric).fit(X) assert_array_equal(clust.labels_, expected_labels) @@ -219,7 +220,8 @@ def test_bad_extract(metric): X = sparse.lil_matrix(X) if metric == 'euclidean' else X # Compute OPTICS - clust = OPTICS(max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10, metric=metric) + clust = OPTICS(max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10, + metric=metric) with pytest.raises(ValueError, match=msg): clust.fit(X) @@ -280,6 +282,7 @@ def test_nowarn_if_metric_no_bool(): OPTICS(metric=pairwise_metric).fit(X_num) assert len(warn_record) == 0 + @pytest.mark.parametrize("metric", ['minkowski', 'euclidean']) def test_close_extract(metric): # Test extract where extraction eps is close to scaled max_eps @@ -291,7 +294,8 @@ def test_close_extract(metric): X = sparse.lil_matrix(X) if metric == 'euclidean' else X # Compute OPTICS - clust = OPTICS(max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10, metric=metric).fit(X) + clust = OPTICS(max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10, + metric=metric).fit(X) # Cluster ordering starts at 0; max cluster label = 2 is 3 clusters assert 
max(clust.labels_) == 2 @@ -311,7 +315,7 @@ def test_dbscan_optics_parity(eps, min_samples, metric, is_sparse): centers = [[1, 1], [-1, -1], [1, -1]] X, labels_true = make_blobs(n_samples=750, centers=centers, - cluster_std=0.4, random_state=0) + cluster_std=0.4, random_state=0) X = sparse.lil_matrix(X) if is_sparse else X # calculate optics with dbscan extract at 0.3 epsilon @@ -349,16 +353,19 @@ def test_min_samples_edge_case(metric, is_sparse): X = sparse.lil_matrix(X) if is_sparse else X expected_labels = np.r_[[0] * 3, [1] * 3, [2] * 3] - clust = OPTICS(min_samples=3, max_eps=7, cluster_method="xi", xi=0.04, metric=metric).fit(X) + clust = OPTICS(min_samples=3, max_eps=7, cluster_method="xi", xi=0.04, + metric=metric).fit(X) assert_array_equal(clust.labels_, expected_labels) expected_labels = np.r_[[0] * 3, [1] * 3, [-1] * 3] - clust = OPTICS(min_samples=3, max_eps=3, cluster_method="xi", xi=0.04, metric=metric).fit(X) + clust = OPTICS(min_samples=3, max_eps=3, cluster_method="xi", xi=0.04, + metric=metric).fit(X) assert_array_equal(clust.labels_, expected_labels) expected_labels = np.r_[[-1] * 9] with pytest.warns(UserWarning, match="All reachability values"): - clust = OPTICS(min_samples=4, max_eps=3, cluster_method="xi", xi=0.04, metric=metric).fit(X) + clust = OPTICS(min_samples=4, max_eps=3, cluster_method="xi", xi=0.04, + metric=metric).fit(X) assert_array_equal(clust.labels_, expected_labels) @@ -410,6 +417,7 @@ def test_min_cluster_size_invalid2(): with pytest.raises(ValueError, match="must be no greater than the "): clust.fit(sparse.lil_matrix(X)) + @pytest.mark.parametrize( "metric, is_sparse", [ From 2188e5b097be462414be095d7033bd8e71304d46 Mon Sep 17 00:00:00 2001 From: Clickedbigfoot Date: Wed, 18 Aug 2021 17:31:25 -0500 Subject: [PATCH 10/28] Fix flake8 _optics.py --- sklearn/cluster/_optics.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 
3dd733081f0f4..bef57b3aa4dfe 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -21,7 +21,6 @@ from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClusterMixin from ..metrics import pairwise_distances -from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS class OPTICS(ClusterMixin, BaseEstimator): @@ -281,7 +280,7 @@ def fit(self, X, y=None): ) warnings.warn(msg, DataConversionWarning) - X = self._validate_data(X, dtype=dtype, accept_sparse='csr') #@TODO original condition was metric != 'precomputed' and metric in PAIRWISE_DISTANCE_FUNCTIONS + X = self._validate_data(X, dtype=dtype, accept_sparse='csr') memory = check_memory(self.memory) if self.cluster_method not in ["dbscan", "xi"]: @@ -607,7 +606,7 @@ def _set_reach_dist( _params['p'] = p dists = pairwise_distances(P, X[unproc], metric, n_jobs=None, - **_params).ravel() #@TODO Check if axis matters. Original X[unproc] was np.take(X,unproc,axis=0) + **_params).ravel() rdists = np.maximum(dists, core_distances_[point_index]) np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists) From f8a43e4f98fb5ae65fff94aa89b5c7bf1695a3b7 Mon Sep 17 00:00:00 2001 From: Clickedbigfoot Date: Sat, 21 Aug 2021 12:29:41 -0500 Subject: [PATCH 11/28] Add sparse precomputed test case --- sklearn/cluster/tests/test_optics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 72f59f29f70b9..78dd0578467d6 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -861,9 +861,11 @@ def test_extract_dbscan(): assert_array_equal(np.sort(np.unique(clust.labels_)), [0, 1, 2, 3]) -def test_precomputed_dists(): +@pytest.mark.parametrize("is_sparse", [False, True]) +def test_precomputed_dists(is_sparse): redX = X[::2] dists = pairwise_distances(redX, metric="euclidean") + dists = sparse.lil_matrix(dists).tocsr() if is_sparse else dists clust1 
= OPTICS(min_samples=10, algorithm="brute", metric="precomputed").fit(dists) clust2 = OPTICS(min_samples=10, algorithm="brute", metric="euclidean").fit(redX) From 52fdf934d5175ad6b6ce7221466006751283cad4 Mon Sep 17 00:00:00 2001 From: Clickedbigfoot Date: Sat, 21 Aug 2021 12:54:16 -0500 Subject: [PATCH 12/28] Black test_optics.py --- sklearn/cluster/tests/test_optics.py | 164 ++++++++++++++------------- 1 file changed, 88 insertions(+), 76 deletions(-) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 78dd0578467d6..feaaa9c3b08ee 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -83,7 +83,7 @@ def test_the_extract_xi_labels(ordering, clusters, expected): assert_array_equal(labels, expected) -@pytest.mark.parametrize("metric", ['minkowski', 'euclidean']) +@pytest.mark.parametrize("metric", ["minkowski", "euclidean"]) def test_extract_xi(metric): # small and easy test (no clusters around other clusters) # but with a clear noise data. 
@@ -100,17 +100,27 @@ def test_extract_xi(metric): X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]]), C6)) expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5, -1, [4] * 5] X, expected_labels = shuffle(X, expected_labels, random_state=rng) - X = sparse.lil_matrix(X) if metric == 'euclidean' else X + X = sparse.lil_matrix(X) if metric == "euclidean" else X clust = OPTICS( - min_samples=3, min_cluster_size=2, max_eps=20, cluster_method="xi", xi=0.4, - metric=metric).fit(X) + min_samples=3, + min_cluster_size=2, + max_eps=20, + cluster_method="xi", + xi=0.4, + metric=metric, + ).fit(X) assert_array_equal(clust.labels_, expected_labels) # check float min_samples and min_cluster_size clust = OPTICS( - min_samples=0.1, min_cluster_size=0.08, max_eps=20, cluster_method="xi", xi=0.4, - metric=metric).fit(X) + min_samples=0.1, + min_cluster_size=0.08, + max_eps=20, + cluster_method="xi", + xi=0.4, + metric=metric, + ).fit(X) assert_array_equal(clust.labels_, expected_labels) X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6)) @@ -118,11 +128,16 @@ def test_extract_xi(metric): [1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5, -1, -1, [4] * 5 ] X, expected_labels = shuffle(X, expected_labels, random_state=rng) - X = sparse.lil_matrix(X) if metric == 'euclidean' else X + X = sparse.lil_matrix(X) if metric == "euclidean" else X clust = OPTICS( - min_samples=3, min_cluster_size=3, max_eps=20, cluster_method="xi", xi=0.3, - metric=metric).fit(X) + min_samples=3, + min_cluster_size=3, + max_eps=20, + cluster_method="xi", + xi=0.3, + metric=metric, + ).fit(X) # this may fail if the predecessor correction is not at work! 
assert_array_equal(clust.labels_, expected_labels) @@ -132,15 +147,20 @@ def test_extract_xi(metric): X = np.vstack((C1, C2, C3)) expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4] X, expected_labels = shuffle(X, expected_labels, random_state=rng) - X = sparse.lil_matrix(X) if metric == 'euclidean' else X + X = sparse.lil_matrix(X) if metric == "euclidean" else X clust = OPTICS( - min_samples=2, min_cluster_size=2, max_eps=np.inf, cluster_method="xi", xi=0.04, - metric=metric).fit(X) + min_samples=2, + min_cluster_size=2, + max_eps=np.inf, + cluster_method="xi", + xi=0.04, + metric=metric, + ).fit(X) assert_array_equal(clust.labels_, expected_labels) -@pytest.mark.parametrize("metric", ['minkowski', 'euclidean']) +@pytest.mark.parametrize("metric", ["minkowski", "euclidean"]) def test_cluster_hierarchy_(metric): rng = np.random.RandomState(0) n_points_per_cluster = 100 @@ -148,22 +168,18 @@ def test_cluster_hierarchy_(metric): C2 = [0, 0] + 50 * rng.randn(n_points_per_cluster, 2) X = np.vstack((C1, C2)) X = shuffle(X, random_state=0) - X = sparse.lil_matrix(X) if metric == 'euclidean' else X + X = sparse.lil_matrix(X) if metric == "euclidean" else X clusters = OPTICS(min_samples=20, xi=0.1, metric=metric).fit(X).cluster_hierarchy_ assert clusters.shape == (2, 2) diff = np.sum(clusters - np.array([[0, 99], [0, 199]])) - X_len = X.getnnz(axis=0)[0] if metric == 'euclidean' else len(X) + X_len = X.getnnz(axis=0)[0] if metric == "euclidean" else len(X) assert diff / X_len < 0.05 @pytest.mark.parametrize( "metric, is_sparse", - [ - ['minkowski', False], - ['euclidean', False], - ['euclidean', True] - ] + [["minkowski", False], ["euclidean", False], ["euclidean", True]], ) def test_correct_number_of_clusters(metric, is_sparse): # in 'auto' mode @@ -181,27 +197,27 @@ def test_correct_number_of_clusters(metric, is_sparse): # check attribute types and sizes assert clust.labels_.shape == (len(X),) - assert clust.labels_.dtype.kind == 'i' + assert clust.labels_.dtype.kind == 
"i" assert clust.reachability_.shape == (len(X),) - assert clust.reachability_.dtype.kind == 'f' + assert clust.reachability_.dtype.kind == "f" assert clust.core_distances_.shape == (len(X),) - assert clust.core_distances_.dtype.kind == 'f' + assert clust.core_distances_.dtype.kind == "f" assert clust.ordering_.shape == (len(X),) - assert clust.ordering_.dtype.kind == 'i' + assert clust.ordering_.dtype.kind == "i" assert set(clust.ordering_) == set(range(len(X))) -@pytest.mark.parametrize("metric", ['minkowski', 'euclidean']) +@pytest.mark.parametrize("metric", ["minkowski", "euclidean"]) def test_minimum_number_of_sample_check(metric): # test that we check a minimum number of samples msg = "min_samples must be no greater than" # Compute OPTICS X = [[1, 1]] - X = sparse.lil_matrix(X) if metric == 'euclidean' else X + X = sparse.lil_matrix(X) if metric == "euclidean" else X clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1, metric=metric) # Run the fit @@ -209,7 +225,7 @@ def test_minimum_number_of_sample_check(metric): clust.fit(X) -@pytest.mark.parametrize("metric", ['minkowski', 'euclidean']) +@pytest.mark.parametrize("metric", ["minkowski", "euclidean"]) def test_bad_extract(metric): # Test an extraction of eps too close to original eps msg = "Specify an epsilon smaller than 0.15. Got 0.3." 
@@ -217,23 +233,28 @@ def test_bad_extract(metric): X, labels_true = make_blobs( n_samples=750, centers=centers, cluster_std=0.4, random_state=0 ) - X = sparse.lil_matrix(X) if metric == 'euclidean' else X + X = sparse.lil_matrix(X) if metric == "euclidean" else X # Compute OPTICS - clust = OPTICS(max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10, - metric=metric) + clust = OPTICS( + max_eps=5.0 * 0.03, + cluster_method="dbscan", + eps=0.3, + min_samples=10, + metric=metric, + ) with pytest.raises(ValueError, match=msg): clust.fit(X) -@pytest.mark.parametrize("metric", ['minkowski', 'euclidean']) +@pytest.mark.parametrize("metric", ["minkowski", "euclidean"]) def test_bad_reachability(metric): msg = "All reachability values are inf. Set a larger max_eps." centers = [[1, 1], [-1, -1], [1, -1]] X, labels_true = make_blobs( n_samples=750, centers=centers, cluster_std=0.4, random_state=0 ) - X = sparse.lil_matrix(X) if metric == 'euclidean' else X + X = sparse.lil_matrix(X) if metric == "euclidean" else X with pytest.warns(UserWarning, match=msg): clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015, metric=metric) @@ -283,7 +304,7 @@ def test_nowarn_if_metric_no_bool(): assert len(warn_record) == 0 -@pytest.mark.parametrize("metric", ['minkowski', 'euclidean']) +@pytest.mark.parametrize("metric", ["minkowski", "euclidean"]) def test_close_extract(metric): # Test extract where extraction eps is close to scaled max_eps @@ -291,11 +312,12 @@ def test_close_extract(metric): X, labels_true = make_blobs( n_samples=750, centers=centers, cluster_std=0.4, random_state=0 ) - X = sparse.lil_matrix(X) if metric == 'euclidean' else X + X = sparse.lil_matrix(X) if metric == "euclidean" else X # Compute OPTICS - clust = OPTICS(max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10, - metric=metric).fit(X) + clust = OPTICS( + max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10, metric=metric + ).fit(X) # Cluster ordering starts at 0; max 
cluster label = 2 is 3 clusters assert max(clust.labels_) == 2 @@ -304,31 +326,29 @@ def test_close_extract(metric): @pytest.mark.parametrize("min_samples", [3, 10, 20]) @pytest.mark.parametrize( "metric, is_sparse", - [ - ['minkowski', False], - ['euclidean', False], - ['euclidean', True] - ] + [["minkowski", False], ["euclidean", False], ["euclidean", True]], ) def test_dbscan_optics_parity(eps, min_samples, metric, is_sparse): # Test that OPTICS clustering labels are <= 5% difference of DBSCAN @TODO modified centers = [[1, 1], [-1, -1], [1, -1]] - X, labels_true = make_blobs(n_samples=750, centers=centers, - cluster_std=0.4, random_state=0) + X, labels_true = make_blobs( + n_samples=750, centers=centers, cluster_std=0.4, random_state=0 + ) X = sparse.lil_matrix(X) if is_sparse else X # calculate optics with dbscan extract at 0.3 epsilon - op = OPTICS(min_samples=min_samples, cluster_method='dbscan', - eps=eps, - metric=metric).fit(X) + op = OPTICS( + min_samples=min_samples, cluster_method="dbscan", eps=eps, metric=metric + ).fit(X) # calculate dbscan labels db = DBSCAN(eps=eps, min_samples=min_samples).fit(X) contingency = contingency_matrix(db.labels_, op.labels_) - agree = min(np.sum(np.max(contingency, axis=0)), - np.sum(np.max(contingency, axis=1))) + agree = min( + np.sum(np.max(contingency, axis=0)), np.sum(np.max(contingency, axis=1)) + ) disagree = X.shape[0] - agree percent_mismatch = np.round((disagree - 1) / X.shape[0], 2) @@ -339,11 +359,7 @@ def test_dbscan_optics_parity(eps, min_samples, metric, is_sparse): @pytest.mark.parametrize( "metric, is_sparse", - [ - ['minkowski', False], - ['euclidean', False], - ['euclidean', True] - ] + [["minkowski", False], ["euclidean", False], ["euclidean", True]], ) def test_min_samples_edge_case(metric, is_sparse): C1 = [[0, 0], [0, 0.1], [0, -0.1]] @@ -353,19 +369,22 @@ def test_min_samples_edge_case(metric, is_sparse): X = sparse.lil_matrix(X) if is_sparse else X expected_labels = np.r_[[0] * 3, [1] * 3, [2] * 
3] - clust = OPTICS(min_samples=3, max_eps=7, cluster_method="xi", xi=0.04, - metric=metric).fit(X) + clust = OPTICS( + min_samples=3, max_eps=7, cluster_method="xi", xi=0.04, metric=metric + ).fit(X) assert_array_equal(clust.labels_, expected_labels) expected_labels = np.r_[[0] * 3, [1] * 3, [-1] * 3] - clust = OPTICS(min_samples=3, max_eps=3, cluster_method="xi", xi=0.04, - metric=metric).fit(X) + clust = OPTICS( + min_samples=3, max_eps=3, cluster_method="xi", xi=0.04, metric=metric + ).fit(X) assert_array_equal(clust.labels_, expected_labels) expected_labels = np.r_[[-1] * 9] with pytest.warns(UserWarning, match="All reachability values"): - clust = OPTICS(min_samples=4, max_eps=3, cluster_method="xi", xi=0.04, - metric=metric).fit(X) + clust = OPTICS( + min_samples=4, max_eps=3, cluster_method="xi", xi=0.04, metric=metric + ).fit(X) assert_array_equal(clust.labels_, expected_labels) @@ -373,26 +392,23 @@ def test_min_samples_edge_case(metric, is_sparse): @pytest.mark.parametrize("min_cluster_size", range(2, X.shape[0] // 10, 23)) @pytest.mark.parametrize( "metric, is_sparse", - [ - ['minkowski', False], - ['euclidean', False], - ['euclidean', True] - ] + [["minkowski", False], ["euclidean", False], ["euclidean", True]], ) def test_min_cluster_size(min_cluster_size, metric, is_sparse): redX = X[::2] # reduce for speed redX = sparse.lil_matrix(redX) if is_sparse else redX - clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size, - metric=metric).fit(redX) + clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size, metric=metric).fit( + redX + ) cluster_sizes = np.bincount(clust.labels_[clust.labels_ != -1]) if cluster_sizes.size: assert min(cluster_sizes) >= min_cluster_size # check behaviour is the same when min_cluster_size is a fraction - clust_frac = OPTICS(min_samples=9, - min_cluster_size=min_cluster_size / redX.shape[0], - metric=metric) + clust_frac = OPTICS( + min_samples=9, min_cluster_size=min_cluster_size / redX.shape[0], 
metric=metric + ) clust_frac.fit(redX) assert_array_equal(clust.labels_, clust_frac.labels_) @@ -403,7 +419,7 @@ def test_min_cluster_size_invalid(min_cluster_size): with pytest.raises(ValueError, match="must be a positive integer or a "): clust.fit(X) - clust = OPTICS(min_cluster_size=min_cluster_size, metric='euclidean') + clust = OPTICS(min_cluster_size=min_cluster_size, metric="euclidean") with pytest.raises(ValueError, match="must be a positive integer or a "): clust.fit(sparse.lil_matrix(X)) @@ -413,18 +429,14 @@ def test_min_cluster_size_invalid2(): with pytest.raises(ValueError, match="must be no greater than the "): clust.fit(X) - clust = OPTICS(min_cluster_size=len(X) + 1, metric='euclidean') + clust = OPTICS(min_cluster_size=len(X) + 1, metric="euclidean") with pytest.raises(ValueError, match="must be no greater than the "): clust.fit(sparse.lil_matrix(X)) @pytest.mark.parametrize( "metric, is_sparse", - [ - ['minkowski', False], - ['euclidean', False], - ['euclidean', True] - ] + [["minkowski", False], ["euclidean", False], ["euclidean", True]], ) def test_processing_order(metric, is_sparse): # Ensure that we consider all unprocessed points, From 8992c870808cfbe3709f02f6bc3c66db268fa570 Mon Sep 17 00:00:00 2001 From: Clickedbigfoot Date: Sat, 21 Aug 2021 13:33:32 -0500 Subject: [PATCH 13/28] Black _optics.py --- sklearn/cluster/_optics.py | 53 ++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index bef57b3aa4dfe..73b028b64ac14 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -14,13 +14,14 @@ import warnings import numpy as np -from ..exceptions import DataConversionWarning +from ..exceptions import DataConversionWarning, EfficiencyWarning from ..metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS from ..utils import gen_batches, get_chunk_n_rows from ..utils.validation import check_memory from ..neighbors import 
NearestNeighbors from ..base import BaseEstimator, ClusterMixin from ..metrics import pairwise_distances +from scipy.sparse import issparse, SparseEfficiencyWarning class OPTICS(ClusterMixin, BaseEstimator): @@ -280,7 +281,12 @@ def fit(self, X, y=None): ) warnings.warn(msg, DataConversionWarning) - X = self._validate_data(X, dtype=dtype, accept_sparse='csr') + X = self._validate_data(X, dtype=dtype, accept_sparse="csr") + if self.metric == "precomputed" and issparse(X): + # Set each diagonal to an explicit value so each point is its own neighbor + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=SparseEfficiencyWarning) + X.setdiag(X.diagonal()) memory = check_memory(self.memory) if self.cluster_method not in ["dbscan", "xi"]: @@ -518,13 +524,16 @@ def compute_optics_graph( n_jobs=n_jobs, ) - nbrs.fit(X) - # Here we first do a kNN query for each point, this differs from - # the original OPTICS that only used epsilon range queries. - # TODO: handle working_memory somehow? - core_distances_ = _compute_core_distances_( - X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=EfficiencyWarning) + # Efficiency warning appears when using sparse precomputed matrices + nbrs.fit(X) + # Here we first do a kNN query for each point, this differs from + # the original OPTICS that only used epsilon range queries. + # TODO: handle working_memory somehow? + core_distances_ = _compute_core_distances_( + X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None + ) # OPTICS puts an upper limit on these, use inf for undefined. core_distances_[core_distances_ > max_eps] = np.inf np.around( @@ -587,7 +596,10 @@ def _set_reach_dist( # Assume that radius_neighbors is faster without distances # and we don't need all distances, nevertheless, this means # we may be doing some work twice. 
- indices = nbrs.radius_neighbors(P, radius=max_eps, return_distance=False)[0] + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=EfficiencyWarning) + # Efficiency warning appears when using sparse precomputed matrices + indices = nbrs.radius_neighbors(P, radius=max_eps, return_distance=False)[0] # Getting indices of neighbors that have not been processed unproc = np.compress(~np.take(processed, indices), indices) @@ -603,13 +615,20 @@ def _set_reach_dist( if metric == "minkowski" and "p" not in _params: # the same logic as neighbors, p is ignored if explicitly set # in the dict params - _params['p'] = p - dists = pairwise_distances(P, X[unproc], - metric, n_jobs=None, - **_params).ravel() - - rdists = np.maximum(dists, core_distances_[point_index]) - np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists) + _params["p"] = p + dists = pairwise_distances(P, X[unproc], metric, n_jobs=None, **_params).ravel() + + if issparse(dists): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=SparseEfficiencyWarning) + rdists = dists.maximum(core_distances_[point_index]) + np.around( + rdists.data, decimals=np.finfo(rdists.dtype).precision, out=rdists.data + ) + rdists = np.array(rdists.todense())[0] + else: + rdists = np.maximum(dists, core_distances_[point_index]) + np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists) improved = np.where(rdists < np.take(reachability_, unproc)) reachability_[unproc[improved]] = rdists[improved] predecessor_[unproc[improved]] = point_index From 91fd5edbe7f4fe52485b0811b2322d76c1cb38d7 Mon Sep 17 00:00:00 2001 From: Clicked Date: Sat, 21 Aug 2021 16:37:52 -0500 Subject: [PATCH 14/28] Add sparse matrix support to _optics.py Original commit was pushed, but wasn't reflected on github for some reason --- sklearn/cluster/_optics.py | 53 ++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 17 deletions(-) diff --git 
a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index bef57b3aa4dfe..73b028b64ac14 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -14,13 +14,14 @@ import warnings import numpy as np -from ..exceptions import DataConversionWarning +from ..exceptions import DataConversionWarning, EfficiencyWarning from ..metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS from ..utils import gen_batches, get_chunk_n_rows from ..utils.validation import check_memory from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClusterMixin from ..metrics import pairwise_distances +from scipy.sparse import issparse, SparseEfficiencyWarning class OPTICS(ClusterMixin, BaseEstimator): @@ -280,7 +281,12 @@ def fit(self, X, y=None): ) warnings.warn(msg, DataConversionWarning) - X = self._validate_data(X, dtype=dtype, accept_sparse='csr') + X = self._validate_data(X, dtype=dtype, accept_sparse="csr") + if self.metric == "precomputed" and issparse(X): + # Set each diagonal to an explicit value so each point is its own neighbor + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=SparseEfficiencyWarning) + X.setdiag(X.diagonal()) memory = check_memory(self.memory) if self.cluster_method not in ["dbscan", "xi"]: @@ -518,13 +524,16 @@ def compute_optics_graph( n_jobs=n_jobs, ) - nbrs.fit(X) - # Here we first do a kNN query for each point, this differs from - # the original OPTICS that only used epsilon range queries. - # TODO: handle working_memory somehow? - core_distances_ = _compute_core_distances_( - X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=EfficiencyWarning) + # Efficiency warning appears when using sparse precomputed matrices + nbrs.fit(X) + # Here we first do a kNN query for each point, this differs from + # the original OPTICS that only used epsilon range queries. + # TODO: handle working_memory somehow? 
+ core_distances_ = _compute_core_distances_( + X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None + ) # OPTICS puts an upper limit on these, use inf for undefined. core_distances_[core_distances_ > max_eps] = np.inf np.around( @@ -587,7 +596,10 @@ def _set_reach_dist( # Assume that radius_neighbors is faster without distances # and we don't need all distances, nevertheless, this means # we may be doing some work twice. - indices = nbrs.radius_neighbors(P, radius=max_eps, return_distance=False)[0] + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=EfficiencyWarning) + # Efficiency warning appears when using sparse precomputed matrices + indices = nbrs.radius_neighbors(P, radius=max_eps, return_distance=False)[0] # Getting indices of neighbors that have not been processed unproc = np.compress(~np.take(processed, indices), indices) @@ -603,13 +615,20 @@ def _set_reach_dist( if metric == "minkowski" and "p" not in _params: # the same logic as neighbors, p is ignored if explicitly set # in the dict params - _params['p'] = p - dists = pairwise_distances(P, X[unproc], - metric, n_jobs=None, - **_params).ravel() - - rdists = np.maximum(dists, core_distances_[point_index]) - np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists) + _params["p"] = p + dists = pairwise_distances(P, X[unproc], metric, n_jobs=None, **_params).ravel() + + if issparse(dists): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=SparseEfficiencyWarning) + rdists = dists.maximum(core_distances_[point_index]) + np.around( + rdists.data, decimals=np.finfo(rdists.dtype).precision, out=rdists.data + ) + rdists = np.array(rdists.todense())[0] + else: + rdists = np.maximum(dists, core_distances_[point_index]) + np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists) improved = np.where(rdists < np.take(reachability_, unproc)) reachability_[unproc[improved]] = rdists[improved] predecessor_[unproc[improved]] 
= point_index From 53474d2438e81bbe764af24abcefaa88d71d934a Mon Sep 17 00:00:00 2001 From: Clickedbigfoot Date: Sat, 21 Aug 2021 17:49:19 -0500 Subject: [PATCH 15/28] Added changelog entry --- doc/whats_new/v1.0.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 7d8175a3b5046..5dea3b5eb06e9 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -210,6 +210,11 @@ Changelog of connected components is greater than 1. :pr:`20597` by `Thomas Fan`_. +- |Enhancement| The `predict` and `fit_predict` methods of + :class:`cluster.OPTICS` now accept sparse data type for input + data. + :pr:`20802` by :user:`Brandon Pokorny ` + :mod:`sklearn.compose` ...................... From f15c0193083b532e4400d3980e041657ac203afb Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sat, 26 Mar 2022 20:46:50 -0400 Subject: [PATCH 16/28] Removed extra changelog entry --- doc/whats_new/v1.0.rst | 5 ----- 1 file changed, 5 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 51ecabe21e0e2..fbb9299745191 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -499,11 +499,6 @@ Changelog - |API| :func:`cluster.spectral_clustering` raises an improved error when passed a `np.matrix`. :pr:`20560` by `Thomas Fan`_. -- |Fix| :class:`cluster.AgglomerativeClustering` correctly connects components - when connectivity and affinity are both precomputed and the number - of connected components is greater than 1. :pr:`20597` by - `Thomas Fan`_. - - |Enhancement| The `predict` and `fit_predict` methods of :class:`cluster.OPTICS` now accept sparse data type for input data. 
From 011ab6d4cbb594d6ab3e4b02c18809c7863e3ef8 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sat, 26 Mar 2022 20:57:38 -0400 Subject: [PATCH 17/28] Updated changelog entry --- doc/whats_new/v1.0.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index fbb9299745191..7e859248952b6 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -501,8 +501,9 @@ Changelog - |Enhancement| The `predict` and `fit_predict` methods of :class:`cluster.OPTICS` now accept sparse data type for input - data. - :pr:`20802` by :user:`Brandon Pokorny <Clickedbigfoot>` + data. :pr:`14736` by :user:`Hunt Zhan <huntzhan>`, + :pr:`20802` by :user:`Brandon Pokorny <Clickedbigfoot>`, + and :pr:`22965` by :user:`Meekail Zain <micky774>` :mod:`sklearn.compose` ...................... From 367b14361e05f606849207752dd9c7de96eb1f6f Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 1 Apr 2022 13:57:45 -0400 Subject: [PATCH 18/28] Update changelog --- doc/whats_new/v1.0.rst | 6 ------ doc/whats_new/v1.1.rst | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 7e859248952b6..1418bcbdf37f0 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -499,12 +499,6 @@ Changelog - |API| :func:`cluster.spectral_clustering` raises an improved error when passed a `np.matrix`. :pr:`20560` by `Thomas Fan`_. -- |Enhancement| The `predict` and `fit_predict` methods of - :class:`cluster.OPTICS` now accept sparse data type for input - data. :pr:`14736` by :user:`Hunt Zhan <huntzhan>`, - :pr:`20802` by :user:`Brandon Pokorny <Clickedbigfoot>`, - and :pr:`22965` by :user:`Meekail Zain <micky774>` - :mod:`sklearn.compose` ...................... diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index fab4ec850f63d..65b3522826022 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -186,6 +186,12 @@ Changelog `-1` and the original warning message is shown. :pr:`22217` by :user:`Meekail Zain <micky774>`.
+- |Enhancement| The `predict` and `fit_predict` methods of + :class:`cluster.OPTICS` now accept sparse data type for input + data. :pr:`14736` by :user:`Hunt Zhan `, + :pr:`20802` by :user:`Brandon Pokorny `, + and :pr:`22965` by :user:`Meekail Zain `. + :mod:`sklearn.compose` ...................... From d3ffd5246dd2ea0773a3e8e5d784ed0c507719ba Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 1 Apr 2022 14:13:31 -0400 Subject: [PATCH 19/28] Streamlined tests --- sklearn/cluster/_optics.py | 34 ++++++++--------------- sklearn/cluster/tests/test_optics.py | 41 +++++++++++----------------- 2 files changed, 28 insertions(+), 47 deletions(-) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 72fb2865d9dbf..86b87c01690b5 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -13,14 +13,14 @@ import warnings import numpy as np -from ..exceptions import DataConversionWarning, EfficiencyWarning +from ..exceptions import DataConversionWarning from ..metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS from ..utils import gen_batches, get_chunk_n_rows from ..utils.validation import check_memory from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClusterMixin from ..metrics import pairwise_distances -from scipy.sparse import issparse, SparseEfficiencyWarning +from scipy.sparse import issparse class OPTICS(ClusterMixin, BaseEstimator): @@ -290,9 +290,7 @@ def fit(self, X, y=None): X = self._validate_data(X, dtype=dtype, accept_sparse="csr") if self.metric == "precomputed" and issparse(X): # Set each diagonal to an explicit value so each point is its own neighbor - with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=SparseEfficiencyWarning) - X.setdiag(X.diagonal()) + X.setdiag(X.diagonal()) memory = check_memory(self.memory) if self.cluster_method not in ["dbscan", "xi"]: @@ -530,16 +528,13 @@ def compute_optics_graph( n_jobs=n_jobs, ) - with warnings.catch_warnings(): - 
warnings.simplefilter("ignore", category=EfficiencyWarning) - # Efficiency warning appears when using sparse precomputed matrices - nbrs.fit(X) - # Here we first do a kNN query for each point, this differs from - # the original OPTICS that only used epsilon range queries. - # TODO: handle working_memory somehow? - core_distances_ = _compute_core_distances_( - X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None - ) + nbrs.fit(X) + # Here we first do a kNN query for each point, this differs from + # the original OPTICS that only used epsilon range queries. + # TODO: handle working_memory somehow? + core_distances_ = _compute_core_distances_( + X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None + ) # OPTICS puts an upper limit on these, use inf for undefined. core_distances_[core_distances_ > max_eps] = np.inf np.around( @@ -602,10 +597,7 @@ def _set_reach_dist( # Assume that radius_neighbors is faster without distances # and we don't need all distances, nevertheless, this means # we may be doing some work twice. 
- with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=EfficiencyWarning) - # Efficiency warning appears when using sparse precomputed matrices - indices = nbrs.radius_neighbors(P, radius=max_eps, return_distance=False)[0] + indices = nbrs.radius_neighbors(P, radius=max_eps, return_distance=False)[0] # Getting indices of neighbors that have not been processed unproc = np.compress(~np.take(processed, indices), indices) @@ -625,9 +617,7 @@ def _set_reach_dist( dists = pairwise_distances(P, X[unproc], metric, n_jobs=None, **_params).ravel() if issparse(dists): - with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=SparseEfficiencyWarning) - rdists = dists.maximum(core_distances_[point_index]) + rdists = dists.maximum(core_distances_[point_index]) np.around( rdists.data, decimals=np.finfo(rdists.dtype).precision, out=rdists.data ) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 69c354c4658a4..e0cdf9993eef9 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -101,7 +101,6 @@ def test_extract_xi(metric): X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]]), C6)) expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5, -1, [4] * 5] X, expected_labels = shuffle(X, expected_labels, random_state=rng) - X = sparse.lil_matrix(X) if metric == "euclidean" else X clust = OPTICS( min_samples=3, @@ -129,7 +128,6 @@ def test_extract_xi(metric): [1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5, -1, -1, [4] * 5 ] X, expected_labels = shuffle(X, expected_labels, random_state=rng) - X = sparse.lil_matrix(X) if metric == "euclidean" else X clust = OPTICS( min_samples=3, @@ -148,7 +146,6 @@ def test_extract_xi(metric): X = np.vstack((C1, C2, C3)) expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4] X, expected_labels = shuffle(X, expected_labels, random_state=rng) - X = sparse.lil_matrix(X) if metric == "euclidean" else X clust = OPTICS( 
min_samples=2, @@ -169,7 +166,7 @@ def test_cluster_hierarchy_(metric): C2 = [0, 0] + 50 * rng.randn(n_points_per_cluster, 2) X = np.vstack((C1, C2)) X = shuffle(X, random_state=0) - X = sparse.lil_matrix(X) if metric == "euclidean" else X + X = sparse.csr_matrix(X) if metric == "euclidean" else X clusters = OPTICS(min_samples=20, xi=0.1, metric=metric).fit(X).cluster_hierarchy_ assert clusters.shape == (2, 2) @@ -191,7 +188,7 @@ def test_correct_number_of_clusters(metric, is_sparse): # Parameters chosen specifically for this task. # Compute OPTICS clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=0.1, metric=metric) - clust.fit(sparse.lil_matrix(X) if is_sparse else X) + clust.fit(sparse.csr_matrix(X) if is_sparse else X) # number of clusters, ignoring noise if present n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_) assert n_clusters_1 == n_clusters @@ -218,7 +215,7 @@ def test_minimum_number_of_sample_check(metric): # Compute OPTICS X = [[1, 1]] - X = sparse.lil_matrix(X) if metric == "euclidean" else X + X = sparse.csr_matrix(X) if metric == "euclidean" else X clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1, metric=metric) # Run the fit @@ -234,7 +231,7 @@ def test_bad_extract(metric): X, labels_true = make_blobs( n_samples=750, centers=centers, cluster_std=0.4, random_state=0 ) - X = sparse.lil_matrix(X) if metric == "euclidean" else X + X = sparse.csr_matrix(X) if metric == "euclidean" else X # Compute OPTICS clust = OPTICS( @@ -255,7 +252,7 @@ def test_bad_reachability(metric): X, labels_true = make_blobs( n_samples=750, centers=centers, cluster_std=0.4, random_state=0 ) - X = sparse.lil_matrix(X) if metric == "euclidean" else X + X = sparse.csr_matrix(X) if metric == "euclidean" else X with pytest.warns(UserWarning, match=msg): clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015, metric=metric) @@ -315,7 +312,7 @@ def test_close_extract(metric): X, labels_true = make_blobs( n_samples=750, centers=centers, 
cluster_std=0.4, random_state=0 ) - X = sparse.lil_matrix(X) if metric == "euclidean" else X + X = sparse.csr_matrix(X) if metric == "euclidean" else X # Compute OPTICS clust = OPTICS( @@ -338,7 +335,7 @@ def test_dbscan_optics_parity(eps, min_samples, metric, is_sparse): X, labels_true = make_blobs( n_samples=750, centers=centers, cluster_std=0.4, random_state=0 ) - X = sparse.lil_matrix(X) if is_sparse else X + X = sparse.csr_matrix(X) if is_sparse else X # calculate optics with dbscan extract at 0.3 epsilon op = OPTICS( @@ -369,7 +366,7 @@ def test_min_samples_edge_case(metric, is_sparse): C2 = [[10, 10], [10, 9], [10, 11]] C3 = [[100, 100], [100, 96], [100, 106]] X = np.vstack((C1, C2, C3)) - X = sparse.lil_matrix(X) if is_sparse else X + X = sparse.csr_matrix(X) if is_sparse else X expected_labels = np.r_[[0] * 3, [1] * 3, [2] * 3] clust = OPTICS( @@ -393,15 +390,10 @@ def test_min_samples_edge_case(metric, is_sparse): # try arbitrary minimum sizes @pytest.mark.parametrize("min_cluster_size", range(2, X.shape[0] // 10, 23)) -@pytest.mark.parametrize( - "metric, is_sparse", - [["minkowski", False], ["euclidean", False], ["euclidean", True]], -) -def test_min_cluster_size(min_cluster_size, metric, is_sparse): +@pytest.mark.parametrize("metric", ["minkowski", "euclidean"]) +def test_min_cluster_size(min_cluster_size, metric): redX = X[::2] # reduce for speed - redX = sparse.lil_matrix(redX) if is_sparse else redX - clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size, metric=metric).fit( redX ) @@ -424,7 +416,7 @@ def test_min_cluster_size_invalid(min_cluster_size): clust = OPTICS(min_cluster_size=min_cluster_size, metric="euclidean") with pytest.raises(ValueError, match="must be a positive integer or a "): - clust.fit(sparse.lil_matrix(X)) + clust.fit(sparse.csr_matrix(X)) def test_min_cluster_size_invalid2(): @@ -434,18 +426,17 @@ def test_min_cluster_size_invalid2(): clust = OPTICS(min_cluster_size=len(X) + 1, metric="euclidean") with 
pytest.raises(ValueError, match="must be no greater than the "): - clust.fit(sparse.lil_matrix(X)) + clust.fit(sparse.csr_matrix(X)) @pytest.mark.parametrize( - "metric, is_sparse", - [["minkowski", False], ["euclidean", False], ["euclidean", True]], + "metric", + ["minkowski", "euclidean"], ) -def test_processing_order(metric, is_sparse): +def test_processing_order(metric): # Ensure that we consider all unprocessed points, # not only direct neighbors. when picking the next point. Y = [[0], [10], [-10], [25]] - Y = sparse.lil_matrix(Y) if is_sparse else Y clust = OPTICS(min_samples=3, max_eps=15, metric=metric).fit(Y) assert_array_equal(clust.reachability_, [np.inf, 10, 10, 15]) @@ -880,7 +871,7 @@ def test_extract_dbscan(): def test_precomputed_dists(is_sparse): redX = X[::2] dists = pairwise_distances(redX, metric="euclidean") - dists = sparse.lil_matrix(dists).tocsr() if is_sparse else dists + dists = sparse.csr_matrix(dists) if is_sparse else dists clust1 = OPTICS(min_samples=10, algorithm="brute", metric="precomputed").fit(dists) clust2 = OPTICS(min_samples=10, algorithm="brute", metric="euclidean").fit(redX) From 01236930e1366bc58bb4d805576545c886cfc6c1 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 1 Apr 2022 14:20:13 -0400 Subject: [PATCH 20/28] Added disclaimer on sparse matrix support for metrics --- sklearn/cluster/_optics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 86b87c01690b5..5ebd5f7c60794 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -82,6 +82,7 @@ class OPTICS(ClusterMixin, BaseEstimator): 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] + Sparse matrices are only supported by scikit-learn metrics. See the documentation for scipy.spatial.distance for details on these metrics. 
From ca0529e99a697e4b14f094c63e974e91ea7ad560 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 1 Apr 2022 14:24:10 -0400 Subject: [PATCH 21/28] Removed sparse parametrization for algorithm edge-case test --- sklearn/cluster/tests/test_optics.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index e0cdf9993eef9..156214e0fae11 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -357,16 +357,12 @@ def test_dbscan_optics_parity(eps, min_samples, metric, is_sparse): assert percent_mismatch <= 0.05 -@pytest.mark.parametrize( - "metric, is_sparse", - [["minkowski", False], ["euclidean", False], ["euclidean", True]], -) +@pytest.mark.parametrize("metric", ["minkowski", "euclidean"]) def test_min_samples_edge_case(metric, is_sparse): C1 = [[0, 0], [0, 0.1], [0, -0.1]] C2 = [[10, 10], [10, 9], [10, 11]] C3 = [[100, 100], [100, 96], [100, 106]] X = np.vstack((C1, C2, C3)) - X = sparse.csr_matrix(X) if is_sparse else X expected_labels = np.r_[[0] * 3, [1] * 3, [2] * 3] clust = OPTICS( From 7980f41ff3844f93f8977950ea43499d1b20cf93 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 1 Apr 2022 14:43:58 -0400 Subject: [PATCH 22/28] Remove unused metric argument --- sklearn/cluster/tests/test_optics.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index cac87f4d80fa0..8513f89e97956 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -192,20 +192,20 @@ def test_correct_number_of_clusters(metric, is_sparse): assert set(clust.ordering_) == set(range(len(X))) -def test_minimum_number_of_sample_check(metric): +def test_minimum_number_of_sample_check(): # test that we check a minimum number of samples msg = "min_samples must be no greater than" # Compute OPTICS X = [[1, 1]] - clust = 
OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1, metric=metric) + clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1) # Run the fit with pytest.raises(ValueError, match=msg): clust.fit(X) -def test_bad_extract(metric): +def test_bad_extract(): # Test an extraction of eps too close to original eps msg = "Specify an epsilon smaller than 0.15. Got 0.3." centers = [[1, 1], [-1, -1], [1, -1]] @@ -219,13 +219,12 @@ def test_bad_extract(metric): cluster_method="dbscan", eps=0.3, min_samples=10, - metric=metric, ) with pytest.raises(ValueError, match=msg): clust.fit(X) -def test_bad_reachability(metric): +def test_bad_reachability(): msg = "All reachability values are inf. Set a larger max_eps." centers = [[1, 1], [-1, -1], [1, -1]] X, labels_true = make_blobs( @@ -233,7 +232,7 @@ def test_bad_reachability(metric): ) with pytest.warns(UserWarning, match=msg): - clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015, metric=metric) + clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015) clust.fit(X) @@ -282,7 +281,7 @@ def test_nowarn_if_metric_no_bool(): OPTICS(metric=pairwise_metric).fit(X_num) -def test_close_extract(metric): +def test_close_extract(): # Test extract where extraction eps is close to scaled max_eps centers = [[1, 1], [-1, -1], [1, -1]] @@ -291,9 +290,7 @@ def test_close_extract(metric): ) # Compute OPTICS - clust = OPTICS( - max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10, metric=metric - ).fit(X) + clust = OPTICS(max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10).fit(X) # Cluster ordering starts at 0; max cluster label = 2 is 3 clusters assert max(clust.labels_) == 2 From c9e699ee4aff5a9a927211d2a2e10bab866e4190 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 1 Apr 2022 14:46:02 -0400 Subject: [PATCH 23/28] Cleanup to minimize diff --- sklearn/cluster/tests/test_optics.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/sklearn/cluster/tests/test_optics.py 
b/sklearn/cluster/tests/test_optics.py index 8513f89e97956..d7ea791b2d5e6 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -168,7 +168,6 @@ def test_correct_number_of_clusters(metric, is_sparse): n_clusters = 3 X = generate_clustered_data(n_clusters=n_clusters) - # Parameters chosen specifically for this task. # Compute OPTICS clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=0.1, metric=metric) @@ -214,12 +213,7 @@ def test_bad_extract(): ) # Compute OPTICS - clust = OPTICS( - max_eps=5.0 * 0.03, - cluster_method="dbscan", - eps=0.3, - min_samples=10, - ) + clust = OPTICS(max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10) with pytest.raises(ValueError, match=msg): clust.fit(X) From cf206bfcdfbb2e482e1b30499f91388c163796e9 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Sat, 16 Apr 2022 16:57:53 -0400 Subject: [PATCH 24/28] Update sklearn/cluster/_optics.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- sklearn/cluster/_optics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 5ebd5f7c60794..6a86dc299e783 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -269,7 +269,7 @@ def fit(self, X, y=None): (n_samples, n_samples) if metric=’precomputed’ A feature array, or array of distances between samples if metric='precomputed'. If a sparse matrix is provided, it will be - converted into a sparse ``csr_matrix``. + converted into CSR format. y : Ignored Not used, present for API consistency by convention. From 4a5ca67ecbbf61fbf22730020d0e2723b8e813d2 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sat, 16 Apr 2022 18:31:46 -0400 Subject: [PATCH 25/28] Added explicit dense cast before evaluating maximal distances. 
--- sklearn/cluster/_optics.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 6a86dc299e783..5f33f3f5fb444 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -609,6 +609,8 @@ def _set_reach_dist( # Only compute distances to unprocessed neighbors: if metric == "precomputed": dists = X[point_index, unproc] + if issparse(dists): + dists = dists.toarray().ravel() else: _params = dict() if metric_params is None else metric_params.copy() if metric == "minkowski" and "p" not in _params: @@ -617,15 +619,8 @@ def _set_reach_dist( _params["p"] = p dists = pairwise_distances(P, X[unproc], metric, n_jobs=None, **_params).ravel() - if issparse(dists): - rdists = dists.maximum(core_distances_[point_index]) - np.around( - rdists.data, decimals=np.finfo(rdists.dtype).precision, out=rdists.data - ) - rdists = np.array(rdists.todense())[0] - else: - rdists = np.maximum(dists, core_distances_[point_index]) - np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists) + rdists = np.maximum(dists, core_distances_[point_index]) + np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists) improved = np.where(rdists < np.take(reachability_, unproc)) reachability_[unproc[improved]] = rdists[improved] predecessor_[unproc[improved]] = point_index From 4d03c54f6a1dc31adc3bbfc2c2d1e7522dde88fd Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 29 Apr 2022 18:38:45 -0400 Subject: [PATCH 26/28] Surpressed unnecessary warnings --- sklearn/cluster/_optics.py | 9 ++++++--- sklearn/cluster/tests/test_optics.py | 8 ++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 5f33f3f5fb444..d6ae1d8c63394 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -20,7 +20,7 @@ from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClusterMixin from 
..metrics import pairwise_distances -from scipy.sparse import issparse +from scipy.sparse import issparse, SparseEfficiencyWarning class OPTICS(ClusterMixin, BaseEstimator): @@ -290,8 +290,11 @@ def fit(self, X, y=None): X = self._validate_data(X, dtype=dtype, accept_sparse="csr") if self.metric == "precomputed" and issparse(X): - # Set each diagonal to an explicit value so each point is its own neighbor - X.setdiag(X.diagonal()) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", SparseEfficiencyWarning) + # Set each diagonal to an explicit value so each point is its + # own neighbor + X.setdiag(X.diagonal()) memory = check_memory(self.memory) if self.cluster_method not in ["dbscan", "xi"]: diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index d7ea791b2d5e6..6de9e9c656e22 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -16,7 +16,7 @@ from sklearn.utils import shuffle from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_allclose - +from sklearn.exceptions import EfficiencyWarning from sklearn.cluster.tests.common import generate_clustered_data @@ -823,7 +823,11 @@ def test_precomputed_dists(is_sparse, global_dtype): redX = X[::2].astype(global_dtype, copy=False) dists = pairwise_distances(redX, metric="euclidean") dists = sparse.csr_matrix(dists) if is_sparse else dists - clust1 = OPTICS(min_samples=10, algorithm="brute", metric="precomputed").fit(dists) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", EfficiencyWarning) + clust1 = OPTICS(min_samples=10, algorithm="brute", metric="precomputed").fit( + dists + ) clust2 = OPTICS(min_samples=10, algorithm="brute", metric="euclidean").fit(redX) assert_allclose(clust1.reachability_, clust2.reachability_) From 27f3edd7a6fc97a79fe061089705b9ebe9ec33e3 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Sat, 30 Apr 
2022 20:11:09 -0400 Subject: [PATCH 27/28] Update sklearn/cluster/_optics.py Co-authored-by: Thomas J. Fan --- sklearn/cluster/_optics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index dad48b05b32d3..a6b159ef5c5a0 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -613,7 +613,8 @@ def _set_reach_dist( if metric == "precomputed": dists = X[point_index, unproc] if issparse(dists): - dists = dists.toarray().ravel() + dists.sort_indices() + dists = dists.data else: _params = dict() if metric_params is None else metric_params.copy() if metric == "minkowski" and "p" not in _params: From c22d638b9e78c8e4332c8f7b2d806fa581f39fce Mon Sep 17 00:00:00 2001 From: jeremiedbb Date: Mon, 2 May 2022 10:51:28 +0200 Subject: [PATCH 28/28] fix position in changelog --- doc/whats_new/v1.2.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index fe7b2b27c6e79..e3c5d46bc8e2a 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -33,6 +33,14 @@ Changelog :pr:`123456` by :user:`Joe Bloggs `. where 123456 is the *pull request* number, not the issue number. +:mod:`sklearn.cluster` +...................... + +- |Enhancement| The `predict` and `fit_predict` methods of :class:`cluster.OPTICS` now + accept sparse data type for input data. :pr:`14736` by :user:`Hunt Zhan `, + :pr:`20802` by :user:`Brandon Pokorny `, + and :pr:`22965` by :user:`Meekail Zain `. + Code and Documentation Contributors ----------------------------------- @@ -40,11 +48,3 @@ Thanks to everyone who has contributed to the maintenance and improvement of the project since version 1.1, including: TODO: update at the time of the release. - -:mod:`sklearn.cluster` -...................... -- |Enhancement| The `predict` and `fit_predict` methods of - :class:`cluster.OPTICS` now accept sparse data type for input - data. 
:pr:`14736` by :user:`Hunt Zhan <huntzhan>`, - :pr:`20802` by :user:`Brandon Pokorny <Clickedbigfoot>`, - and :pr:`22965` by :user:`Meekail Zain <micky774>`.