[MRG+1] Allows KMeans/MiniBatchKMeans to use float32 internally by using cython fused types by ssaeger · Pull Request #6430 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[MRG+1] Allows KMeans/MiniBatchKMeans to use float32 internally by using cython fused types #6430

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion doc/whats_new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,12 @@ Enhancements
- Added ``inverse_transform`` function to :class:`decomposition.nmf` to compute
data matrix of original shape. By `Anish Shah`_.

- :class:`cluster.KMeans` and :class:`cluster.MiniBatchKMeans` now work
with ``np.float32`` and ``np.float64`` input data without converting it.
This allows reducing the memory consumption by using ``np.float32``.
(`#6430 <https://github.com/scikit-learn/scikit-learn/pull/6430>`_)
By `Sebastian Säger`_.

Bug fixes
.........

Expand Down Expand Up @@ -1647,7 +1653,7 @@ List of contributors for release 0.15 by number of commits.
* 4 Alexis Metaireau
* 4 Ignacio Rossi
* 4 Virgile Fritsch
* 4 Sebastian Saeger
* 4 Sebastian Säger
* 4 Ilambharathi Kanniah
* 4 sdenton4
* 4 Robert Layton
Expand Down Expand Up @@ -4127,3 +4133,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
.. _Anish Shah: https://github.com/AnishShah

.. _Ryad Zenine: https://github.com/ryadzenine

.. _Sebastian Säger: https://github.com/ssaeger
106 changes: 73 additions & 33 deletions sklearn/cluster/_k_means.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import numpy as np
import scipy.sparse as sp
cimport numpy as np
cimport cython
from cython cimport floating

from ..utils.extmath import norm
from sklearn.utils.sparsefuncs_fast cimport add_row_csr
Expand All @@ -23,18 +24,19 @@ ctypedef np.int32_t INT

cdef extern from "cblas.h":
double ddot "cblas_ddot"(int N, double *X, int incX, double *Y, int incY)
float sdot "cblas_sdot"(int N, float *X, int incX, float *Y, int incY)

np.import_array()


@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cpdef DOUBLE _assign_labels_array(np.ndarray[DOUBLE, ndim=2] X,
np.ndarray[DOUBLE, ndim=1] x_squared_norms,
np.ndarray[DOUBLE, ndim=2] centers,
cpdef DOUBLE _assign_labels_array(np.ndarray[floating, ndim=2] X,
np.ndarray[floating, ndim=1] x_squared_norms,
np.ndarray[floating, ndim=2] centers,
np.ndarray[INT, ndim=1] labels,
np.ndarray[DOUBLE, ndim=1] distances):
np.ndarray[floating, ndim=1] distances):
"""Compute label assignment and inertia for a dense array

Return the inertia (sum of squared distances to the centers).
Expand All @@ -43,33 +45,52 @@ cpdef DOUBLE _assign_labels_array(np.ndarray[DOUBLE, ndim=2] X,
unsigned int n_clusters = centers.shape[0]
unsigned int n_features = centers.shape[1]
unsigned int n_samples = X.shape[0]
unsigned int x_stride = X.strides[1] / sizeof(DOUBLE)
unsigned int center_stride = centers.strides[1] / sizeof(DOUBLE)
unsigned int x_stride
unsigned int center_stride
unsigned int sample_idx, center_idx, feature_idx
unsigned int store_distances = 0
unsigned int k
np.ndarray[floating, ndim=1] center_squared_norms
# the following variables are always double because making them floating
# does not save any memory, but makes the code much bigger
DOUBLE inertia = 0.0
DOUBLE min_dist
DOUBLE dist
np.ndarray[DOUBLE, ndim=1] center_squared_norms = np.zeros(
n_clusters, dtype=np.float64)

if floating is float:
center_squared_norms = np.zeros(n_clusters, dtype=np.float32)
x_stride = X.strides[1] / sizeof(float)
center_stride = centers.strides[1] / sizeof(float)
else:
center_squared_norms = np.zeros(n_clusters, dtype=np.float64)
x_stride = X.strides[1] / sizeof(DOUBLE)
center_stride = centers.strides[1] / sizeof(DOUBLE)

if n_samples == distances.shape[0]:
store_distances = 1

for center_idx in range(n_clusters):
center_squared_norms[center_idx] = ddot(
n_features, &centers[center_idx, 0], center_stride,
&centers[center_idx, 0], center_stride)
if floating is float:
center_squared_norms[center_idx] = sdot(
n_features, &centers[center_idx, 0], center_stride,
&centers[center_idx, 0], center_stride)
else:
center_squared_norms[center_idx] = ddot(
n_features, &centers[center_idx, 0], center_stride,
&centers[center_idx, 0], center_stride)

for sample_idx in range(n_samples):
min_dist = -1
for center_idx in range(n_clusters):
dist = 0.0
# hardcoded: minimize euclidean distance to cluster center:
# ||a - b||^2 = ||a||^2 + ||b||^2 -2 <a, b>
dist += ddot(n_features, &X[sample_idx, 0], x_stride,
&centers[center_idx, 0], center_stride)
if floating is float:
dist += sdot(n_features, &X[sample_idx, 0], x_stride,
&centers[center_idx, 0], center_stride)
else:
dist += ddot(n_features, &X[sample_idx, 0], x_stride,
&centers[center_idx, 0], center_stride)
dist *= -2
dist += center_squared_norms[center_idx]
dist += x_squared_norms[sample_idx]
Expand All @@ -87,16 +108,16 @@ cpdef DOUBLE _assign_labels_array(np.ndarray[DOUBLE, ndim=2] X,
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cpdef DOUBLE _assign_labels_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
np.ndarray[DOUBLE, ndim=2] centers,
cpdef DOUBLE _assign_labels_csr(X, np.ndarray[floating, ndim=1] x_squared_norms,
np.ndarray[floating, ndim=2] centers,
np.ndarray[INT, ndim=1] labels,
np.ndarray[DOUBLE, ndim=1] distances):
np.ndarray[floating, ndim=1] distances):
"""Compute label assignment and inertia for a CSR input

Return the inertia (sum of squared distances to the centers).
"""
cdef:
np.ndarray[DOUBLE, ndim=1] X_data = X.data
np.ndarray[floating, ndim=1] X_data = X.data
np.ndarray[INT, ndim=1] X_indices = X.indices
np.ndarray[INT, ndim=1] X_indptr = X.indptr
unsigned int n_clusters = centers.shape[0]
Expand All @@ -105,18 +126,28 @@ cpdef DOUBLE _assign_labels_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
unsigned int store_distances = 0
unsigned int sample_idx, center_idx, feature_idx
unsigned int k
np.ndarray[floating, ndim=1] center_squared_norms
# the following variables are always double because making them floating
# does not save any memory, but makes the code much bigger
DOUBLE inertia = 0.0
DOUBLE min_dist
DOUBLE dist
np.ndarray[DOUBLE, ndim=1] center_squared_norms = np.zeros(
n_clusters, dtype=np.float64)

if floating is float:
center_squared_norms = np.zeros(n_clusters, dtype=np.float32)
else:
center_squared_norms = np.zeros(n_clusters, dtype=np.float64)

if n_samples == distances.shape[0]:
store_distances = 1

for center_idx in range(n_clusters):
center_squared_norms[center_idx] = ddot(
n_features, &centers[center_idx, 0], 1, &centers[center_idx, 0], 1)
if floating is float:
center_squared_norms[center_idx] = sdot(
n_features, &centers[center_idx, 0], 1, &centers[center_idx, 0], 1)
else:
center_squared_norms[center_idx] = ddot(
n_features, &centers[center_idx, 0], 1, &centers[center_idx, 0], 1)

for sample_idx in range(n_samples):
min_dist = -1
Expand All @@ -142,18 +173,18 @@ cpdef DOUBLE _assign_labels_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
def _mini_batch_update_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
np.ndarray[DOUBLE, ndim=2] centers,
def _mini_batch_update_csr(X, np.ndarray[floating, ndim=1] x_squared_norms,
np.ndarray[floating, ndim=2] centers,
np.ndarray[INT, ndim=1] counts,
np.ndarray[INT, ndim=1] nearest_center,
np.ndarray[DOUBLE, ndim=1] old_center,
np.ndarray[floating, ndim=1] old_center,
int compute_squared_diff):
"""Incremental update of the centers for sparse MiniBatchKMeans.

Parameters
----------

X: CSR matrix, dtype float64
X: CSR matrix, dtype float
The complete (pre allocated) training set as a CSR matrix.

centers: array, shape (n_clusters, n_features)
Expand All @@ -179,7 +210,7 @@ def _mini_batch_update_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
of the algorithm.
"""
cdef:
np.ndarray[DOUBLE, ndim=1] X_data = X.data
np.ndarray[floating, ndim=1] X_data = X.data
np.ndarray[int, ndim=1] X_indices = X.indices
np.ndarray[int, ndim=1] X_indptr = X.indptr
unsigned int n_samples = X.shape[0]
Expand Down Expand Up @@ -245,9 +276,9 @@ def _mini_batch_update_csr(X, np.ndarray[DOUBLE, ndim=1] x_squared_norms,
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
def _centers_dense(np.ndarray[DOUBLE, ndim=2] X,
def _centers_dense(np.ndarray[floating, ndim=2] X,
np.ndarray[INT, ndim=1] labels, int n_clusters,
np.ndarray[DOUBLE, ndim=1] distances):
np.ndarray[floating, ndim=1] distances):
"""M step of the K-means EM algorithm

Computation of cluster centers / means.
Expand Down Expand Up @@ -275,7 +306,12 @@ def _centers_dense(np.ndarray[DOUBLE, ndim=2] X,
n_samples = X.shape[0]
n_features = X.shape[1]
cdef int i, j, c
cdef np.ndarray[DOUBLE, ndim=2] centers = np.zeros((n_clusters, n_features))
cdef np.ndarray[floating, ndim=2] centers
if floating is float:
centers = np.zeros((n_clusters, n_features), dtype=np.float32)
else:
centers = np.zeros((n_clusters, n_features), dtype=np.float64)

n_samples_in_cluster = bincount(labels, minlength=n_clusters)
empty_clusters = np.where(n_samples_in_cluster == 0)[0]
# maybe also relocate small clusters?
Expand All @@ -300,7 +336,7 @@ def _centers_dense(np.ndarray[DOUBLE, ndim=2] X,


def _centers_sparse(X, np.ndarray[INT, ndim=1] labels, n_clusters,
np.ndarray[DOUBLE, ndim=1] distances):
np.ndarray[floating, ndim=1] distances):
"""M step of the K-means EM algorithm

Computation of cluster centers / means.
Expand All @@ -327,18 +363,22 @@ def _centers_sparse(X, np.ndarray[INT, ndim=1] labels, n_clusters,

cdef np.npy_intp cluster_id

cdef np.ndarray[DOUBLE, ndim=1] data = X.data
cdef np.ndarray[floating, ndim=1] data = X.data
cdef np.ndarray[int, ndim=1] indices = X.indices
cdef np.ndarray[int, ndim=1] indptr = X.indptr

cdef np.ndarray[DOUBLE, ndim=2, mode="c"] centers = \
np.zeros((n_clusters, n_features))
cdef np.ndarray[floating, ndim=2, mode="c"] centers
cdef np.ndarray[np.npy_intp, ndim=1] far_from_centers
cdef np.ndarray[np.npy_intp, ndim=1, mode="c"] n_samples_in_cluster = \
bincount(labels, minlength=n_clusters)
cdef np.ndarray[np.npy_intp, ndim=1, mode="c"] empty_clusters = \
np.where(n_samples_in_cluster == 0)[0]

if floating is float:
centers = np.zeros((n_clusters, n_features), dtype=np.float32)
else:
centers = np.zeros((n_clusters, n_features), dtype=np.float64)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could be further simplified as:

centers = np.zeros((n_clusters, n_features), dtype=X.dtype)

no?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried that and it works on my machine with Ubuntu, but leads to failing appveyor builds.
I don't know exactly why this happens.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is really weird. It would be worth investigating with a debugger under windows. This might be a bug in Cython / MSVC or even numpy. But we can leave it as it is for now in this PR.


# maybe also relocate small clusters?

if empty_clusters.shape[0] > 0:
Expand Down
Loading
0