From de60297500c84a7db32b8138d632899c4c42f2ee Mon Sep 17 00:00:00 2001 From: MechCoder Date: Mon, 18 Apr 2016 19:35:19 -0400 Subject: [PATCH] Remove add_row_csr --- sklearn/cluster/_k_means.pyx | 28 ++++++++++++++-------------- sklearn/utils/sparsefuncs_fast.pxd | 8 -------- sklearn/utils/sparsefuncs_fast.pyx | 17 ----------------- 3 files changed, 14 insertions(+), 39 deletions(-) delete mode 100644 sklearn/utils/sparsefuncs_fast.pxd diff --git a/sklearn/cluster/_k_means.pyx b/sklearn/cluster/_k_means.pyx index ec26543b5609b..0e8b729cc679e 100644 --- a/sklearn/cluster/_k_means.pyx +++ b/sklearn/cluster/_k_means.pyx @@ -15,7 +15,7 @@ cimport numpy as np cimport cython from ..utils.extmath import norm -from sklearn.utils.sparsefuncs_fast cimport add_row_csr +from sklearn.utils.sparsefuncs_fast import assign_rows_csr from sklearn.utils.fixes import bincount ctypedef np.float64_t DOUBLE @@ -323,9 +323,8 @@ def _centers_sparse(X, np.ndarray[INT, ndim=1] labels, n_clusters, centers: array, shape (n_clusters, n_features) The resulting centers """ - n_features = X.shape[1] - - cdef np.npy_intp cluster_id + cdef int n_features = X.shape[1] + cdef int curr_label cdef np.ndarray[DOUBLE, ndim=1] data = X.data cdef np.ndarray[int, ndim=1] indices = X.indices @@ -338,24 +337,25 @@ def _centers_sparse(X, np.ndarray[INT, ndim=1] labels, n_clusters, bincount(labels, minlength=n_clusters) cdef np.ndarray[np.npy_intp, ndim=1, mode="c"] empty_clusters = \ np.where(n_samples_in_cluster == 0)[0] + cdef int n_empty_clusters = empty_clusters.shape[0] # maybe also relocate small clusters? - if empty_clusters.shape[0] > 0: + if n_empty_clusters > 0: # find points to reassign empty clusters to - far_from_centers = distances.argsort()[::-1] + far_from_centers = distances.argsort()[::-1][:n_empty_clusters] - for i in range(empty_clusters.shape[0]): - cluster_id = empty_clusters[i] + # XXX two relocated clusters could be close to each other + assign_rows_csr(X, far_from_centers, empty_clusters, centers) - # XXX two relocated clusters could be close to each other - centers[cluster_id] = 0. - add_row_csr(data, indices, indptr, far_from_centers[i], - centers[cluster_id]) - n_samples_in_cluster[cluster_id] = 1 + for i in range(n_empty_clusters): + n_samples_in_cluster[empty_clusters[i]] = 1 for i in range(labels.shape[0]): - add_row_csr(data, indices, indptr, i, centers[labels[i]]) + curr_label = labels[i] + for ind in range(indptr[i], indptr[i + 1]): + j = indices[ind] + centers[curr_label, j] += data[ind] centers /= n_samples_in_cluster[:, np.newaxis] diff --git a/sklearn/utils/sparsefuncs_fast.pxd b/sklearn/utils/sparsefuncs_fast.pxd deleted file mode 100644 index f1a5455f64255..0000000000000 --- a/sklearn/utils/sparsefuncs_fast.pxd +++ /dev/null @@ -1,8 +0,0 @@ -cimport numpy as np - - -cdef void add_row_csr(np.ndarray[np.float64_t, ndim=1], - np.ndarray[int, ndim=1], - np.ndarray[int, ndim=1], - int i, np.ndarray[np.float64_t, ndim=1, mode="c"]) - diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index 10c143ed5078a..554f2836d24af 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -382,21 +382,6 @@ def _inplace_csr_row_normalize_l2(np.ndarray[floating, ndim=1] X_data, X_data[j] /= sum_ -cdef void add_row_csr(np.ndarray[np.float64_t, ndim=1] data, - np.ndarray[int, ndim=1] indices, - np.ndarray[int, ndim=1] indptr, - int i, np.ndarray[np.float64_t, ndim=1, mode="c"] out): - """Add row i of CSR matrix (data, indices, indptr) to array out. - - Equivalent to out += X[i].toarray(). Returns None. - """ - cdef int ind, j - - for ind in range(indptr[i], indptr[i + 1]): - j = indices[ind] - out[j] += data[ind] - - def assign_rows_csr(X, np.ndarray[np.npy_intp, ndim=1] X_rows, np.ndarray[np.npy_intp, ndim=1] out_rows, @@ -427,8 +412,6 @@ def assign_rows_csr(X, out[out_rows] = 0. for i in range(X_rows.shape[0]): - # XXX we could reuse add_row_csr here, but the array slice - # is not optimized away. rX = X_rows[i] for ind in range(indptr[rX], indptr[rX + 1]): j = indices[ind]