Second code review changes · scikit-learn/scikit-learn@cd52612 · GitHub

Commit cd52612

Second code review changes
- Fix incorrectly parsed documentation block
- Fix references indentation
- Refactor test assertions
1 parent a588458 commit cd52612

File tree

3 files changed: +56 −56 lines

3 files changed

+56
-56
lines changed

doc/modules/clustering.rst

Lines changed: 16 additions & 18 deletions
@@ -1621,14 +1621,15 @@ Then the DB index is defined as:
 .. math::

    DB = \frac{1}{k} \sum_{i=1}^k \max_{i \neq j} R_{ij}

+A lower index value is indicative of a better clustering partition.
+
+In normal usage, the Davies-Bouldin index is applied to the results of a
+cluster analysis as follows:

   >>> from sklearn import datasets
   >>> iris = datasets.load_iris()
   >>> X = iris.data

-In normal usage, the Davies-Bouldin index is applied to the results of a
-cluster analysis.
-
   >>> from sklearn.cluster import KMeans
   >>> from sklearn.metrics import davies_bouldin_index
   >>> kmeans = KMeans(n_clusters=3, random_state=1).fit(X)
@@ -1642,9 +1643,6 @@ Advantages

 - The computation of Davies-Bouldin is simpler than that of Silhouette scores.

-- Contrary to Calinski-Harabaz, Davies-Bouldin is bounded to values between 0
-  and 1.
-
 - The index is computed using only quantities and features inherent to the dataset.

 Drawbacks
@@ -1660,20 +1658,20 @@ Drawbacks

 .. topic:: References

-   * Davies, David L.; Bouldin, Donald W. (1979).
-     "A Cluster Separation Measure"
-     IEEE Transactions on Pattern Analysis and Machine Intelligence.
-     PAMI-1 (2): 224-227.
-     `doi:10.1109/TPAMI.1979.4766909 <http://dx.doi.org/10.1109/TPAMI.1979.4766909>`_.
+ * Davies, David L.; Bouldin, Donald W. (1979).
+   "A Cluster Separation Measure"
+   IEEE Transactions on Pattern Analysis and Machine Intelligence.
+   PAMI-1 (2): 224-227.
+   `doi:10.1109/TPAMI.1979.4766909 <http://dx.doi.org/10.1109/TPAMI.1979.4766909>`_.
+
+ * Halkidi, Maria; Batistakis, Yannis; Vazirgiannis, Michalis (2001).
+   "On Clustering Validation Techniques"
+   Journal of Intelligent Information Systems, 17(2-3), 107-145.
+   `doi:10.1023/A:1012801612483 <http://dx.doi.org/10.1023/A:1012801612483>`_.

-   * Halkidi, Maria; Batistakis, Yannis; Vazirgiannis, Michalis (2001).
-     "On Clustering Validation Techniques"
-     Journal of Intelligent Information Systems, 17(2-3), 107-145.
-     `doi:10.1023/A:1012801612483 <http://dx.doi.org/10.1023/A:1012801612483>`_.
+ * `Wikipedia entry for Davies-Bouldin index
+   <https://en.wikipedia.org/wiki/Davies–Bouldin_index>`_.

-   * `Wikipedia entry for Davies-Bouldin index
-     <https://en.wikipedia.org/wiki/Davies–Bouldin_index>`_.
-

 .. _contingency_matrix:
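For context on the formula in this hunk: each R_{ij} term follows the standard Davies-Bouldin definition, R_ij = (s_i + s_j) / d_ij, where s_i is the mean distance of cluster i's points to their centroid and d_ij is the distance between the centroids of clusters i and j. A minimal NumPy sketch of the formula (an illustration only, not the implementation this PR adds; the name db_index_sketch is hypothetical):

import numpy as np

def db_index_sketch(X, labels):
    # Sketch of DB = (1/k) * sum_i max_{j != i} (s_i + s_j) / d_ij;
    # assumes at least two clusters with distinct centroids.
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)
    clusters = np.unique(labels)
    k = len(clusters)
    centroids = np.array([X[labels == c].mean(axis=0) for c in clusters])
    # s_i: mean distance of cluster i's members to its centroid
    s = np.array([np.linalg.norm(X[labels == c] - centroids[i], axis=1).mean()
                  for i, c in enumerate(clusters)])
    total = 0.0
    for i in range(k):
        # R_ij, maximized over all j != i
        total += max((s[i] + s[j]) / np.linalg.norm(centroids[i] - centroids[j])
                     for j in range(k) if j != i)
    return total / k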

doc/whats_new/v0.20.rst

Lines changed: 3 additions & 0 deletions
@@ -99,6 +99,9 @@ Preprocessing

 Model evaluation

+- Added the :func:`metrics.cluster.davies_bouldin_index` metric for unsupervised
+  evaluation of clustering models. :issue:`10827` by :user:`Luis Osa <logc>`.
+
 - Added the :func:`metrics.balanced_accuracy_score` metric and a corresponding
   ``'balanced_accuracy'`` scorer for binary classification.
   :issue:`8066` by :user:`xyguo` and :user:`Aman Dalmia <dalmia>`.
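The new metric takes the data and the predicted cluster labels, as the tests below and the clustering.rst doctest above show. A minimal usage sketch mirroring that doctest (assuming the import path this commit exposes):

from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_index  # added by this PR

# Cluster the iris data and score the resulting partition; lower is better.
X = datasets.load_iris().data
kmeans = KMeans(n_clusters=3, random_state=1).fit(X)
print(davies_bouldin_index(X, kmeans.labels_))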

sklearn/metrics/cluster/tests/test_unsupervised.py

Lines changed: 37 additions & 38 deletions
@@ -1,10 +1,10 @@
 import numpy as np
 import scipy.sparse as sp
+import pytest
 from scipy.sparse import csr_matrix

 from sklearn import datasets
 from sklearn.utils.testing import assert_false
-from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_raises_regexp
@@ -34,13 +34,13 @@ def test_silhouette():
         assert_greater(score_precomputed, 0)
         # Test without calculating D
         score_euclidean = silhouette_score(X, y, metric='euclidean')
-        assert_almost_equal(score_precomputed, score_euclidean)
+        assert score_precomputed == pytest.approx(score_euclidean)

         if X is X_dense:
             score_dense_without_sampling = score_precomputed
         else:
-            assert_almost_equal(score_euclidean,
-                                score_dense_without_sampling)
+            assert score_euclidean == pytest.approx(
+                score_dense_without_sampling)

         # Test with sampling
         score_precomputed = silhouette_score(D, y, metric='precomputed',
@@ -51,12 +51,12 @@ def test_silhouette():
                                              random_state=0)
         assert_greater(score_precomputed, 0)
         assert_greater(score_euclidean, 0)
-        assert_almost_equal(score_euclidean, score_precomputed)
+        assert score_euclidean == pytest.approx(score_precomputed)

         if X is X_dense:
             score_dense_with_sampling = score_precomputed
         else:
-            assert_almost_equal(score_euclidean, score_dense_with_sampling)
+            assert score_euclidean == pytest.approx(score_dense_with_sampling)


 def test_cluster_size_1():
@@ -121,12 +121,14 @@ def test_silhouette_paper_example():
                                 (labels2, expected2, score2)]:
         expected = [expected[name] for name in names]
         # we check to 2dp because that's what's in the paper
-        assert_almost_equal(expected, silhouette_samples(D, np.array(labels),
-                                                         metric='precomputed'),
-                            decimal=2)
-        assert_almost_equal(score, silhouette_score(D, np.array(labels),
-                                                    metric='precomputed'),
-                            decimal=2)
+        assert silhouette_samples(D, np.array(labels),
+                                  metric='precomputed') == pytest.approx(
+                                      expected, abs=1e-2)
+        assert silhouette_score(D, np.array(labels),
+                                metric='precomputed') == pytest.approx(
+                                    score, abs=1e-2)


 def test_correct_labelsize():
@@ -167,19 +169,27 @@ def test_non_numpy_labels():
         silhouette_score(list(X), list(y)), silhouette_score(X, y))


-def test_calinski_harabaz_score():
+def assert_raises_on_only_one_label(func):
+    """Assert message when there is only one label"""
     rng = np.random.RandomState(seed=0)
-
-    # Assert message when there is only one label
     assert_raise_message(ValueError, "Number of labels is",
-                         calinski_harabaz_score,
+                         func,
                          rng.rand(10, 2), np.zeros(10))

-    # Assert message when all points are in different clusters
+
+def assert_raises_on_all_points_different_clusters(func):
+    """Assert message when all points are in different clusters"""
+    rng = np.random.RandomState(seed=0)
     assert_raise_message(ValueError, "Number of labels is",
-                         calinski_harabaz_score,
+                         func,
                          rng.rand(10, 2), np.arange(10))

+
+def test_calinski_harabaz_score():
+    assert_raises_on_only_one_label(calinski_harabaz_score)
+
+    assert_raises_on_all_points_different_clusters(calinski_harabaz_score)
+
     # Assert the value is 1. when all samples are equal
     assert_equal(1., calinski_harabaz_score(np.ones((10, 2)),
                                             [0] * 5 + [1] * 5))
@@ -192,40 +202,29 @@ def test_calinski_harabaz_score():
     X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
          [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
     labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
-    assert_almost_equal(calinski_harabaz_score(X, labels),
-                        45 * (40 - 4) / (5 * (4 - 1)))
+    assert calinski_harabaz_score(X, labels) == pytest.approx(
+        45 * (40 - 4) / (5 * (4 - 1)))


 def test_davies_bouldin_index():
-    rng = np.random.RandomState(seed=0)
-
-    # Assert message when there is only one label
-    assert_raise_message(ValueError, "Number of labels is",
-                         davies_bouldin_index,
-                         rng.rand(10, 2), np.zeros(10))
-
-    # Assert message when all points are in different clusters
-    assert_raise_message(ValueError, "Number of labels is",
-                         davies_bouldin_index,
-                         rng.rand(10, 2), np.arange(10))
+    assert_raises_on_only_one_label(davies_bouldin_index)
+    assert_raises_on_all_points_different_clusters(davies_bouldin_index)

     # Assert the value is 0. when all samples are equal
-    assert 0. == davies_bouldin_index(np.ones((10, 2)),
-                                      [0] * 5 + [1] * 5)
+    assert davies_bouldin_index(np.ones((10, 2)),
+                                [0] * 5 + [1] * 5) == pytest.approx(0.0)

     # Assert the value is 0. when all cluster means are equal
-    assert 0. == davies_bouldin_index([[-1, -1], [1, 1]] * 10,
-                                      [0] * 10 + [1] * 10)
+    assert davies_bouldin_index([[-1, -1], [1, 1]] * 10,
+                                [0] * 10 + [1] * 10) == pytest.approx(0.0)

     # General case (with non numpy arrays)
     X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
          [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
     labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
-    assert_almost_equal(davies_bouldin_index(X, labels),
-                        2 * np.sqrt(0.5) / 3)
+    assert davies_bouldin_index(X, labels) == pytest.approx(2 * np.sqrt(0.5) / 3)

     # General case - some clusters have one sample
     X = ([[0, 0], [2, 2], [3, 3], [5, 5]])
     labels = [0, 0, 1, 2]
-    assert_almost_equal(davies_bouldin_index(X, labels),
-                        (5. / 4) / 3)
+    assert davies_bouldin_index(X, labels) == pytest.approx((5. / 4) / 3)
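One note on the assertion style used throughout the refactored tests above: pytest.approx only performs a comparison when used with ==. A bare pytest.approx(a, b) call merely builds an approx object and asserts nothing, which is why each check is written as assert actual == pytest.approx(expected). A small self-contained illustration:

import pytest

# pytest.approx takes effect only inside a comparison; a bare call
# builds an approx object and asserts nothing.
assert 0.1 + 0.2 == pytest.approx(0.3)

# Sequences (and numpy arrays) are compared element-wise.
assert [0.1 + 0.2, 0.3] == pytest.approx([0.3, 0.3])

# abs= sets an absolute tolerance, e.g. a two-decimal-place check
# like the decimal=2 assertions these tests replaced.
assert 0.301 == pytest.approx(0.3, abs=1e-2)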

0 commit comments

Comments
 (0)
0