reverted the comment · scikit-learn/scikit-learn@c098aa5 · GitHub
[go: up one dir, main page]

Skip to content

Commit c098aa5

Browse files
committed
reverted the comment
1 parent 1de3e15 commit c098aa5

File tree

2 files changed

+46
-3
lines changed

2 files changed

+46
-3
lines changed

sklearn/metrics/cluster/tests/test_unsupervised.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from scipy.sparse import csr_matrix
33

44
from sklearn import datasets
5-
from sklearn.metrics.cluster.unsupervised import silhouette_score, silhouette_samples
5+
from sklearn.metrics.cluster.unsupervised import silhouette_score
66
from sklearn.metrics import pairwise_distances
77
from sklearn.utils.testing import assert_false
88
from sklearn.utils.testing import assert_almost_equal
@@ -53,6 +53,7 @@ def test_no_nan():
5353
silhouette_sample = silhouette_samples([[3],[3]], np.array([2,4]))
5454
assert_false(np.isnan(silhouette_sample).any())
5555

56+
5657
def test_correct_labelsize():
5758
# Assert 1 < n_labels < n_samples
5859
dataset = datasets.load_iris()
@@ -86,4 +87,4 @@ def test_non_numpy_labels():
8687
X = dataset.data
8788
y = dataset.target
8889
assert_equal(
89-
silhouette_score(list(X), list(y)), silhouette_score(X, y))
90+
silhouette_score(list(X), list(y)), silhouette_score(X, y))

sklearn/metrics/cluster/unsupervised.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,8 +161,50 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds):
161161
<http://en.wikipedia.org/wiki/Silhouette_(clustering)>`_
162162
163163
"""
164+
<<<<<<< HEAD
164165
le = LabelEncoder()
165166
labels = le.fit_transform(labels)
167+
=======
168+
distances = pairwise_distances(X, metric=metric, **kwds)
169+
n = labels.shape[0]
170+
A = np.array([_intra_cluster_distance(distances[i], labels, i)
171+
for i in range(n)])
172+
B = np.array([_nearest_cluster_distance(distances[i], labels, i)
173+
for i in range(n)])
174+
sil_samples = (B - A) / np.maximum(A, B)
175+
# nan values are for clusters of size 1, and should be 0
176+
return np.nan_to_num(sil_samples)
177+
178+
179+
def _intra_cluster_distance(distances_row, labels, i):
180+
"""Calculate the mean intra-cluster distance for sample i.
181+
182+
Parameters
183+
----------
184+
distances_row : array, shape = [n_samples]
185+
Pairwise distance matrix between sample i and each sample.
186+
187+
labels : array, shape = [n_samples]
188+
label values for each sample
189+
190+
i : int
191+
Sample index being calculated. It is excluded from calculation and
192+
used to determine the current label
193+
194+
Returns
195+
-------
196+
a : float
197+
Mean intra-cluster distance for sample i
198+
"""
199+
mask = labels == labels[i]
200+
mask[i] = False
201+
if not np.any(mask):
202+
# cluster of size 1
203+
return 0
204+
a = np.mean(distances_row[mask])
205+
return a
206+
207+
>>>>>>> reverted the comment
166208

167209
distances = pairwise_distances(X, metric=metric, **kwds)
168210
unique_labels = le.classes_
@@ -200,4 +242,4 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds):
200242

201243
sil_samples = inter_clust_dists - intra_clust_dists
202244
sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
203-
return np.nan_to_num(sil_samples)
245+
return np.nan_to_num(sil_samples)

0 commit comments

Comments
 (0)
0