Resolved merge conflicts · scikit-learn/scikit-learn@209e6d9 · GitHub
[go: up one dir, main page]

Skip to content

Commit 209e6d9

Browse files
committed
Resolved merge conflicts
1 parent 0d31aaf commit 209e6d9

File tree

1 file changed

+46
-62
lines changed

1 file changed

+46
-62
lines changed

sklearn/metrics/cluster/unsupervised.py

Lines changed: 46 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@
77
import numpy as np
88

99
from ...utils import check_random_state
10+
from ...utils import check_X_y
1011
from ..pairwise import pairwise_distances
12+
from ...preprocessing import LabelEncoder
1113

1214

1315
def silhouette_score(X, labels, metric='euclidean', sample_size=None,
@@ -79,8 +81,12 @@ def silhouette_score(X, labels, metric='euclidean', sample_size=None,
7981
<http://en.wikipedia.org/wiki/Silhouette_(clustering)>`_
8082
8183
"""
84+
X, labels = check_X_y(X, labels)
85+
le = LabelEncoder()
86+
labels = le.fit_transform(labels)
8287
n_labels = len(np.unique(labels))
8388
n_samples = X.shape[0]
89+
8490
if not 1 < n_labels < n_samples:
8591
raise ValueError("Number of labels is %d. Valid values are 2 "
8692
"to n_samples - 1 (inclusive)" % n_labels)
@@ -155,67 +161,45 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds):
155161
<http://en.wikipedia.org/wiki/Silhouette_(clustering)>`_
156162
157163
"""
158-
distances = pairwise_distances(X, metric=metric, **kwds)
159-
n = labels.shape[0]
160-
A = np.array([_intra_cluster_distance(distances[i], labels, i)
161-
for i in range(n)])
162-
B = np.array([_nearest_cluster_distance(distances[i], labels, i)
163-
for i in range(n)])
164-
sil_samples = (B - A) / np.maximum(A, B)
165-
# nan values are for clusters of size 1, and should be 0
166-
return np.nan_to_num(sil_samples)
167-
168-
169-
def _intra_cluster_distance(distances_row, labels, i):
170-
"""Calculate the mean intra-cluster distance for sample i.
171-
172-
Parameters
173-
----------
174-
distances_row : array, shape = [n_samples]
175-
Pairwise distance matrix between sample i and each sample.
176-
177-
labels : array, shape = [n_samples]
178-
label values for each sample
179-
180-
i : int
181-
Sample index being calculated. It is excluded from calculation and
182-
used to determine the current label
164+
le = LabelEncoder()
165+
labels = le.fit_transform(labels)
183166

184-
Returns
185-
-------
186-
a : float
187-
Mean intra-cluster distance for sample i
188-
"""
189-
mask = labels == labels[i]
190-
mask[i] = False
191-
if not np.any(mask):
192-
# cluster of size 1
193-
return 0
194-
a = np.mean(distances_row[mask])
195-
return a
196-
197-
198-
def _nearest_cluster_distance(distances_row, labels, i):
199-
"""Calculate the mean nearest-cluster distance for sample i.
200-
201-
Parameters
202-
----------
203-
distances_row : array, shape = [n_samples]
204-
Pairwise distance matrix between sample i and each sample.
205-
206-
labels : array, shape = [n_samples]
207-
label values for each sample
208-
209-
i : int
210-
Sample index being calculated. It is used to determine the current
211-
label.
167+
distances = pairwise_distances(X, metric=metric, **kwds)
168+
unique_labels = le.classes_
169+
170+
# For sample i, store the mean distance to the other samples of the
171+
# cluster to which it belongs in intra_clust_dists[i]
172+
intra_clust_dists = np.ones(distances.shape[0], dtype=distances.dtype)
173+
174+
# For sample i, store the mean distance to the nearest cluster that
175+
# it does not belong to in inter_clust_dists[i]
176+
inter_clust_dists = np.inf * intra_clust_dists
177+
178+
for curr_label in unique_labels:
179+
180+
# Find intra_clust_dists for all samples belonging to the same
181+
# label.
182+
mask = labels == curr_label
183+
current_distances = distances[mask]
184+
185+
# Leave out current sample.
186+
n_samples_curr_lab = np.sum(mask) - 1
187+
if n_samples_curr_lab != 0:
188+
intra_clust_dists[mask] = np.sum(
189+
current_distances[:, mask], axis=1) / n_samples_curr_lab
190+
191+
# Now iterate over all other labels, finding the mean
192+
# cluster distance that is closest to every sample.
193+
for other_label in unique_labels:
194+
if other_label != curr_label:
195+
other_mask = labels == other_label
196+
other_distances = np.mean(
197+
current_distances[:, other_mask], axis=1)
198+
inter_clust_dists[mask] = np.minimum(
199+
inter_clust_dists[mask], other_distances)
200+
201+
sil_samples = inter_clust_dists - intra_clust_dists
202+
sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
212203

213-
Returns
214-
-------
215-
b : float
216-
Mean nearest-cluster distance for sample i
217-
"""
218-
label = labels[i]
219-
b = np.min([np.mean(distances_row[labels == cur_label])
220-
for cur_label in set(labels) if not cur_label == label])
221-
return b
204+
# nan values are for clusters of size 1, and should be 0
205+
return np.nan_to_num(sil_samples)

0 commit comments

Comments
 (0)
0