Adresses several comments · scikit-learn/scikit-learn@cebb687 · GitHub
[go: up one dir, main page]

Skip to content

Commit cebb687

Browse files
author
Sebastian Saeger
committed
Adresses several comments
1 parent 5e0e925 commit cebb687

File tree

2 files changed

+18
-38
lines changed
<div id="diff-content-parent" tabindex="-1">

2 files changed

+18
-38
lines changed

doc/whats_new.rst

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,12 @@ Enhancements
105105
- Added ``inverse_transform`` function to :class:`decomposition.nmf` to compute
106106
data matrix of original shape. By `Anish Shah`_.
107107

108+
- :class:`cluster.KMeans` and :class:`cluster.MiniBatchKMeans` now work
109+
with ``np.float32`` and ``np.float64`` input data without converting it.
110+
This allows reducing memory consumption by using ``np.float32``.
111+
(`#6430 <https://github.com/scikit-learn/scikit-learn/pull/6430>`_)
112+
By `Sebastian Säger`_.
113+
108114
Bug fixes
109115
.........
110116

@@ -1615,7 +1621,7 @@ List of contributors for release 0.15 by number of commits.
16151621
* 4 Alexis Metaireau
16161622
* 4 Ignacio Rossi
16171623
* 4 Virgile Fritsch
1618-
* 4 Sebastian Saeger
1624+
* 4 Sebastian Säger
16191625
* 4 Ilambharathi Kanniah
16201626
* 4 sdenton4
16211627
* 4 Robert Layton
@@ -4093,3 +4099,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
40934099
.. _Jonathan Arfa: https://github.com/jarfa
40944100

40954101
.. _Anish Shah: https://github.com/AnishShah
4102+
4103+
.. _Sebastian Säger: https://github.com/ssaeger

sklearn/cluster/_k_means.pyx

Lines changed: 9 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -57,16 +57,9 @@ cpdef DOUBLE _assign_labels_array(np.ndarray[floating, ndim=2] X,
5757
DOUBLE min_dist
5858
DOUBLE dist
5959

60-
if floating is float:
61-
center_squared_norms = np.zeros(n_clusters, dtype=np.float32)
62-
x_stride = X.strides[1] / sizeof(float)
63-
center_stride = centers.strides[1] / sizeof(float)
64-
elif floating is double:
65-
center_squared_norms = np.zeros(n_clusters, dtype=np.float64)
66-
x_stride = X.strides[1] / sizeof(DOUBLE)
67-
center_stride = centers.strides[1] / sizeof(DOUBLE)
68-
else:
69-
raise ValueError("Unknown floating type.")
60+
center_squared_norms = np.zeros(n_clusters, dtype=X.dtype)
61+
x_stride = X.strides[1] / sizeof(X.dtype)
62+
center_stride = centers.strides[1] / sizeof(X.dtype)
7063

7164
if n_samples == distances.shape[0]:
7265
store_distances = 1
@@ -76,12 +69,10 @@ cpdef DOUBLE _assign_labels_array(np.ndarray[floating, ndim=2] X,
7669
center_squared_norms[center_idx] = sdot(
7770
n_features, &centers[center_idx, 0], center_stride,
7871
&centers[center_idx, 0], center_stride)
79-
elif floating is double:
72+
else:
8073
center_squared_norms[center_idx] = ddot(
8174
n_features, &centers[center_idx, 0], center_stride,
8275
&centers[center_idx, 0], center_stride)
83-
else:
84-
raise ValueError("Unknown floating type.")
8576

8677
for sample_idx in range(n_samples):
8778
min_dist = -1
@@ -92,11 +83,9 @@ cpdef DOUBLE _assign_labels_array(np.ndarray[floating, ndim=2] X,
9283
if floating is float:
9384
dist += sdot(n_features, &X[sample_idx, 0], x_stride,
9485
&centers[center_idx, 0], center_stride)
95-
elif floating is double:
86+
else:
9687
dist += ddot(n_features, &X[sample_idx, 0], x_stride,
9788
&centers[center_idx, 0], center_stride)
98-
else:
99-
raise ValueError("Unknown floating type.")
10089
dist *= -2
10190
dist += center_squared_norms[center_idx]
10291
dist += x_squared_norms[sample_idx]
@@ -139,12 +128,7 @@ cpdef DOUBLE _assign_labels_csr(X, np.ndarray[floating, ndim=1] x_squared_norms,
139128
DOUBLE min_dist
140129
DOUBLE dist
141130

142-
if floating is float:
143-
center_squared_norms = np.zeros(n_clusters, dtype=np.float32)
144-
elif floating is double:
145-
center_squared_norms = np.zeros(n_clusters, dtype=np.float64)
146-
else:
147-
raise ValueError("Unknown floating type.")
131+
center_squared_norms = np.zeros(n_clusters, dtype=X.dtype)
148132

149133
if n_samples == distances.shape[0]:
150134
store_distances = 1
@@ -153,11 +137,9 @@ cpdef DOUBLE _assign_labels_csr(X, np.ndarray[floating, ndim=1] x_squared_norms,
153137
if floating is float:
154138
center_squared_norms[center_idx] = sdot(
155139
n_features, &centers[center_idx, 0], 1, &centers[center_idx, 0], 1)
156-
elif floating is double:
140+
else:
157141
center_squared_norms[center_idx] = ddot(
158142
n_features, &centers[center_idx, 0], 1, &centers[center_idx, 0], 1)
159-
else:
160-
raise ValueError("Unknown floating type.")
161143

162144
for sample_idx in range(n_samples):
163145
min_dist = -1
@@ -317,12 +299,7 @@ def _centers_dense(np.ndarray[floating, ndim=2] X,
317299
n_features = X.shape[1]
318300
cdef int i, j, c
319301
cdef np.ndarray[floating, ndim=2] centers
320-
if floating is float:
321-
centers = np.zeros((n_clusters, n_features), dtype=np.float32)
322-
elif floating is double:
323-
centers = np.zeros((n_clusters, n_features), dtype=np.float64)
324-
else:
325-
raise ValueError("Unknown floating type.")
302+
centers = np.zeros((n_clusters, n_features), dtype=X.dtype)
326303

327304
n_samples_in_cluster = bincount(labels, minlength=n_clusters)
328305
empty_clusters = np.where(n_samples_in_cluster == 0)[0]
@@ -386,12 +363,7 @@ def _centers_sparse(X, np.ndarray[INT, ndim=1] labels, n_clusters,
386363
cdef np.ndarray[np.npy_intp, ndim=1, mode="c"] empty_clusters = \
387364
np.where(n_samples_in_cluster == 0)[0]
388365

389-
if floating is float:
390-
centers = np.zeros((n_clusters, n_features), dtype=np.float32)
391-
elif floating is double:
392-
centers = np.zeros((n_clusters, n_features), dtype=np.float64)
393-
else:
394-
raise ValueError("Unknown floating type.")
366+
centers = np.zeros((n_clusters, n_features), dtype=X.dtype)
395367

396368
# maybe also relocate small clusters?
397369

0 commit comments

Comments
 (0)
0