8000 add stability · scikit-learn/scikit-learn@fb0f911 · GitHub
[go: up one dir, main page]

Skip to content

Commit fb0f911

Browse files
author
Arnaud Fouchet
committed
add stability
1 parent 0ab6894 commit fb0f911

File tree

2 files changed

+165
-0
lines changed

2 files changed

+165
-0
lines changed

sklearn/metrics/cluster/stability.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
from __future__ import division
2+
from collections import defaultdict
3+
4+
from math import pow, sqrt, log
5+
6+
import numpy as np
7+
from scipy.spatial.distance import cosine
8+
9+
from ...utils import check_random_state
10+
11+
12+
def stability(X, cluster_estimator, k_max=None, nb_draw=100, prop_subset=.8,
13+
random_state=None):
14+
"""Stability algorithm.
15+
For k from 2 to k_max, compute stability of cluster estimator to produce k
16+
clusters. Stability measures if the estimator produces the same clusters
17+
given small variations in the input data. It draws two overlapping subsets
18+
A and B of input data. For points in the two subsets, we compute the
19+
connectivity matrix M_A and M_B for the clustering done on subsets A and B.
20+
The stability of cluster_estimator with k cluster is the expectation of
21+
<M_A, M_B> / ||M_A|| * ||M_B||
22+
23+
Ref: Ben-Hur, Elisseeff, Guyon: a stability based method for discovering
24+
structure in clusterd data, 2002
25+
Overview of stability: Luxburg: clustering stability: an overview
26+
27+
Parameters
28+
----------
29+
X : array-like or sparse matrix, shape (n_samples, n_features)
30+
The observations to cluster.
31+
cluster_estimator: ClusterMixing estimator object.
32+
need parameter n_clusters
33+
need method fit_predict: X -> labels
34+
k_max: int: maximum number of clusters (default = n_samples / 2)
35+
nb_draw: number of draws to estimate expectation of
36+
<M_A, M_B> / ||M_A|| * ||M_B||
37+
38+
prop_subset: 0 < float < 1: proportion of input data taken in each subset
39+
40+
Return
41+
------
42+
k: int
43+
"""
44+
rng = check_random_state(random_state)
45+
46+
n_samples, n_features = X.shape
47+
if not k_max:
48+
k_max = n_samples / 2
49+
50+
best_stab, best_k = 0, 0
51+
for k in range(2, k_max):
52+
cluster_estimator.set_params(n_clusters=k)
53+
this_score = sum(
54+
_one_stability_measure(cluster_estimator, X, prop_subset)
55+
for _ in range(nb_draw)) / nb_draw
56+
57+
if this_score >= best_stab:
58+
best_stab = this_score
59+
best_k = k
60+
61+
return best_k
62+
63+
64+
def adjacency_matrix(cluster_assignement):
65+
"""
66+
Parameter
67+
---------
68+
cluster_assignement: vector (n_samples) of int i, 0 <= i < k
69+
70+
Return
71+
------
72+
adj_matrix: matrix (n_samples, n_samples)
73+
adji_matrix[i, j] = cluster_assignement[i] == cluster_assignement[j]
74+
"""
75+
n_samples = len(cluster_assignement)
76+
adj_matrix = np.zeros((n_samples, n_samples))
77+
for i, val in enumerate(cluster_assignement):
78+
for j in range(i, n_samples):
79+
linked = val == cluster_assignement[j]
80+
adj_matrix[i, j] = linked
81+
adj_matrix[j, i] = linked
82+
return adj_matrix
83+
84+
85+
def _one_stability_measure(cluster_estimator, X, prop_sample,
86+
random_state=None):
87+
"""
88+
Draws two subsets A and B from X, apply clustering and return
89+
<M_A, M_B> / ||M_A|| * ||M_B||
90+
91+
Parameter
92+
---------
93+
X: array of size n_samples, n_features
94+
cluster_estimator: ClusterMixing estimator object.
95+
need parameter n_clusters
96+
need method fit_predict: X -> labels
97+
prop_sample: 0 < float < 1, proportion of X taken in each subset
98+
"""
99+
rng = check_random_state(random_state)
100+
101+
n_sample = X.shape[0]
102+
set_1 = rng.uniform(size=n_sample) < prop_sample
103+
set_2 = rng.uniform(size=n_sample) < prop_sample
104+
nb_points_1, nb_points_2 = 0, 0
105+
points_1, points_2 = [], []
106+
common_points_1, common_points_2 = [], []
107+
for i, (is_1, is_2) in enumerate(zip(set_1, set_2)):
108+
if is_1 and is_2:
109+
common_points_1.append(nb_points_1)
110+
common_points_2.append(nb_points_2)
111+
if is_1:
112+
points_1.append(i)
113+
nb_points_1 += 1
114+
if is_2:
115+
points_2.append(i)
116+
nb_points_2 += 1
117+
118+
assi_1 = cluster_estimator.fit_predict(X[np.ix_(points_1)])
119+
assi_2 = cluster_estimator.fit_predict(X[np.ix_(points_2)])
120+
121+
adj_mat_1 = adjacency_matrix(assi_1)[np.ix_(common_points_1,
122+
common_points_1)]
123+
adj_mat_2 = adjacency_matrix(assi_2)[np.ix_(common_points_2,
124+
common_points_2)]
125+
return 1 - cosine(adj_mat_1.flatten(), adj_mat_2.flatten())
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import numpy as np
2+
3+
from sklearn.utils.testing import (
4+
assert_almost_equal, assert_equal, assert_array_equal)
5+
6+
from sklearn.cluster.k_means_ import KMeans
7+
from sklearn.metrics.cluster.stability import (
8+
adjacency_matrix, _one_stability_measure, stability)
9+
from sklearn.datasets import make_blobs
10+
11+
12+
def test_adjacency_matrix():
13+
assi = [0, 0, 1, 1]
14+
adj_matrix = np.array([[1, 1, 0, 0], [1, 1, 0, 0],
15+
[0, 0, 1, 1], [0, 0, 1, 1]])
16+
found_adj = adjacency_matrix(assi)
17+
assert_array_equal(found_adj, adj_matrix)
18+
19+
20+
def test_one_stability_measure():
21+
X = np.arange(10) < 5
22+
X.reshape((10, 1))
23+
24+
# test perfect clustering has 1 stability
25+
class SameCluster(object):
26+
def set_params(self, *args, **kwargs):
27+
pass
28+
29+
def fit_predict(self, X):
30+
return X
31+
same_clusters = SameCluster()
32+
assert_almost_equal(_one_stability_measure(same_clusters, X, .8), 1)
33+
34+
35+
def test_stability():
36+
X, _ = make_blobs(90, centers=np.array([[-2, -2], [2, 0], [-2, 2]]),
37+
random_state=0)
38+
cluster_estimator = KMeans()
39+
assert_equal(stability(X, cluster_estimator, k_max=6,
40+
nb_draw=10, random_state=0), 3)

0 commit comments

Comments
 (0)
0