from __future__ import division

import numpy as np
from scipy.spatial.distance import cosine

from ...utils import check_random_state


def stability(X, cluster_estimator, k_max=None, nb_draw=100, prop_subset=.8,
              random_state=None):
    """Stability algorithm.

    For k from 2 to k_max, compute the stability of cluster_estimator when
    it is asked to produce k clusters. Stability measures whether the
    estimator finds the same clusters under small variations of the input
    data: two overlapping subsets A and B of the input are drawn and, for
    the points common to both subsets, the connectivity matrices M_A and
    M_B of the clusterings obtained on A and B are compared. The stability
    of cluster_estimator with k clusters is the expectation of

        <M_A, M_B> / (||M_A|| * ||M_B||)

    Ref: A. Ben-Hur, A. Elisseeff and I. Guyon, "A stability based method
    for discovering structure in clustered data", Pacific Symposium on
    Biocomputing, 2002.
    Overview of stability: U. von Luxburg, "Clustering stability: an
    overview", 2010.
    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        The observations to cluster.
    cluster_estimator : ClusterMixin estimator object
        Needs the parameter ``n_clusters`` and the method
        ``fit_predict: X -> labels``.
    k_max : int, optional
        Maximum number of clusters tried (default: n_samples // 2).
    nb_draw : int
        Number of draws used to estimate the expectation of
        <M_A, M_B> / (||M_A|| * ||M_B||).
    prop_subset : float, 0 < prop_subset < 1
        Proportion of the input data taken in each subset.
    random_state : int, RandomState instance or None, optional
        Seed or generator controlling the random draws.

    Returns
    -------
    best_k : int
        The number of clusters with the highest stability.
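
    Examples
    --------
    A minimal usage sketch on toy two-blob data. KMeans is only one
    possible choice here: any estimator exposing ``set_params(n_clusters=...)``
    and ``fit_predict`` works. The score is stochastic, hence the skipped
    doctest.

    >>> import numpy as np
    >>> from sklearn.cluster import KMeans
    >>> rng = np.random.RandomState(0)
    >>> X = np.vstack([rng.randn(20, 2), rng.randn(20, 2) + 10])
    >>> stability(X, KMeans(), k_max=6, nb_draw=10,
    ...           random_state=0)  # doctest: +SKIP
    2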
    """
    rng = check_random_state(random_state)

    n_samples, n_features = X.shape
    if k_max is None:
        k_max = n_samples // 2

    best_stab, best_k = 0, 0
    for k in range(2, k_max + 1):
        cluster_estimator.set_params(n_clusters=k)
        # Estimate the expected stability for k clusters by averaging the
        # agreement score over nb_draw pairs of random subsets.
        this_score = sum(
            _one_stability_measure(cluster_estimator, X, prop_subset,
                                   random_state=rng)
            for _ in range(nb_draw)) / nb_draw

        if this_score >= best_stab:
            best_stab = this_score
            best_k = k

    return best_k


def adjacency_matrix(cluster_assignment):
    """Build the connectivity matrix of a cluster assignment.

    Parameters
    ----------
    cluster_assignment : vector (n_samples) of int i, 0 <= i < k

    Returns
    -------
    adj_matrix : matrix (n_samples, n_samples)
        adj_matrix[i, j] = (cluster_assignment[i] == cluster_assignment[j])
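
    Examples
    --------
    A small sanity check; the cast to int is only there to keep the
    doctest output independent of the numpy printing style:

    >>> adjacency_matrix([0, 0, 1]).astype(int)
    array([[1, 1, 0],
           [1, 1, 0],
           [0, 0, 1]])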
    """
    cluster_assignment = np.asarray(cluster_assignment)
    # Two points are linked iff they belong to the same cluster; a
    # broadcast comparison builds the whole matrix at once.
    return (cluster_assignment[:, np.newaxis] ==
            cluster_assignment[np.newaxis, :]).astype(float)


def _one_stability_measure(cluster_estimator, X, prop_sample,
                           random_state=None):
    """Draw two overlapping subsets A and B of X, cluster each of them and
    return the agreement of the two clusterings:

        <M_A, M_B> / (||M_A|| * ||M_B||)

    where M_A and M_B are the connectivity matrices of the clusterings of
    A and B, restricted to the points common to both subsets.

    Parameters
    ----------
    cluster_estimator : ClusterMixin estimator object
        Needs the parameter ``n_clusters`` and the method
        ``fit_predict: X -> labels``.
    X : array of size (n_samples, n_features)
    prop_sample : float, 0 < prop_sample < 1
        Proportion of X taken in each subset.
    random_state : int, RandomState instance or None, optional
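
    Examples
    --------
    A hypothetical call on random data (the score is stochastic, hence the
    skipped doctest); KMeans stands in for any compatible estimator:

    >>> from sklearn.cluster import KMeans
    >>> X = np.random.randn(100, 2)
    >>> score = _one_stability_measure(KMeans(n_clusters=2),
    ...                                X, .8)  # doctest: +SKIP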
    """
    rng = check_random_state(random_state)

    n_sample = X.shape[0]
    # Each point joins a subset independently with probability prop_sample,
    # so the two subsets overlap.
    set_1 = rng.uniform(size=n_sample) < prop_sample
    set_2 = rng.uniform(size=n_sample) < prop_sample
    nb_points_1, nb_points_2 = 0, 0
    points_1, points_2 = [], []
    common_points_1, common_points_2 = [], []
    for i, (is_1, is_2) in enumerate(zip(set_1, set_2)):
        if is_1 and is_2:
            # Record where the shared point lands inside each subset so the
            # two connectivity matrices can be compared entry by entry.
            common_points_1.append(nb_points_1)
            common_points_2.append(nb_points_2)
        if is_1:
            points_1.append(i)
            nb_points_1 += 1
        if is_2:
            points_2.append(i)
            nb_points_2 += 1

    assi_1 = cluster_estimator.fit_predict(X[points_1])
    assi_2 = cluster_estimator.fit_predict(X[points_2])

    # Connectivity matrices restricted to the points common to both subsets.
    adj_mat_1 = adjacency_matrix(assi_1)[np.ix_(common_points_1,
                                                common_points_1)]
    adj_mat_2 = adjacency_matrix(assi_2)[np.ix_(common_points_2,
                                                common_points_2)]
    # The cosine similarity of the flattened matrices is exactly
    # <M_A, M_B> / (||M_A|| * ||M_B||).
    return 1 - cosine(adj_mat_1.flatten(), adj_mat_2.flatten())