8000 FEA add ReadonlyArrayWrapper (#20903) · scikit-learn/scikit-learn@30f3de2 · GitHub
[go: up one dir, main page]

Skip to content

Commit 30f3de2

Browse files
lorentzenchrjeremiedbb
authored and committed
FEA add ReadonlyArrayWrapper (#20903)
Co-authored-by: jeremie du boisberranger <jeremiedbb@yahoo.fr>
1 parent 4ae5f83 commit 30f3de2

9 files changed

+175
-64
lines changed

sklearn/cluster/_k_means_common.pxd

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ cdef floating _euclidean_sparse_dense(floating[::1], int[::1], floating[::1],
1111
floating, bint) nogil
1212

1313
cpdef void _relocate_empty_clusters_dense(
14-
np.ndarray[floating, ndim=2, mode='c'], floating[::1], floating[:, ::1],
14+
floating[:, ::1], floating[::1], floating[:, ::1],
1515
floating[:, ::1], floating[::1], int[::1])
1616

1717
cpdef void _relocate_empty_clusters_sparse(

sklearn/cluster/_k_means_common.pyx

+10-10
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,10 @@ def _euclidean_sparse_dense_wrapper(
9797

9898

9999
cpdef floating _inertia_dense(
100-
np.ndarray[floating, ndim=2, mode='c'] X, # IN
101-
floating[::1] sample_weight, # IN
102-
floating[:, ::1] centers, # IN
103-
int[::1] labels, # IN
100+
floating[:, ::1] X, # IN READ-ONLY
101+
floating[::1] sample_weight, # IN READ-ONLY
102+
floating[:, ::1] centers, # IN
103+
int[::1] labels, # IN
104104
int n_threads):
105105
"""Compute inertia for dense input data
106106
@@ -161,12 +161,12 @@ cpdef floating _inertia_sparse(
161161

162162

163163
cpdef void _relocate_empty_clusters_dense(
164-
np.ndarray[floating, ndim=2, mode='c'] X, # IN
165-
floating[::1] sample_weight, # IN
166-
floating[:, ::1] centers_old, # IN
167-
floating[:, ::1] centers_new, # INOUT
168-
floating[::1] weight_in_clusters, # INOUT
169-
int[::1] labels): # IN
164+
floating[:, ::1] X, # IN READ-ONLY
165+
floating[::1] sample_weight, # IN READ-ONLY
166+
floating[:, ::1] centers_old, # IN
167+
floating[:, ::1] centers_new, # INOUT
168+
floating[::1] weight_in_clusters, # INOUT
169+
int[::1] labels): # IN
170170
"""Relocate centers which have no sample assigned to them."""
171171
cdef:
172172
int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)

sklearn/cluster/_k_means_elkan.pyx

+23-26
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,12 @@ np.import_array()
3131

3232

3333
def init_bounds_dense(
34-
np.ndarray[floating, ndim=2, mode='c'] X, # IN
35-
floating[:, ::1] centers, # IN
36-
floating[:, ::1] center_half_distances, # IN
37-
int[::1] labels, # OUT
38-
floating[::1] upper_bounds, # OUT
39-
floating[:, ::1] lower_bounds): # OUT
34+
floating[:, ::1] X, # IN READ-ONLY
35+
floating[:, ::1] centers, # IN
36+
floating[:, ::1] center_half_distances, # IN
37+
int[::1] labels, # OUT
38+
floating[::1] upper_bounds, # OUT
39+
floating[:, ::1] lower_bounds): # OUT
4040
"""Initialize upper and lower bounds for each sample for dense input data.
4141
4242
Given X, centers and the pairwise distances divided by 2.0 between the
@@ -182,17 +182,17 @@ def init_bounds_sparse(
182182

183183

184184
def elkan_iter_chunked_dense(
185-
np.ndarray[floating, ndim=2, mode='c'] X, # IN
186-
floating[::1] sample_weight, # IN
187-
floating[:, ::1] centers_old, # IN
188-
floating[:, ::1] centers_new, # OUT
189-
floating[::1] weight_in_clusters, # OUT
190-
floating[:, ::1] center_half_distances, # IN
191-
floating[::1] distance_next_center, # IN
192-
floating[::1] upper_bounds, # INOUT
193-
floating[:, ::1] lower_bounds, # INOUT
194-
int[::1] labels, # INOUT
195-
floating[::1] center_shift, # OUT
185+
floating[:, ::1] X, # IN READ-ONLY
186+
floating[::1] sample_weight, # IN READ-ONLY
187+
floating[:, ::1] centers_old, # IN
188+
floating[:, ::1] centers_new, # OUT
189+
floating[::1] weight_in_clusters, # OUT
190+
floating[:, ::1] center_half_distances, # IN
191+
floating[::1] distance_next_center, # IN
192+
floating[::1] upper_bounds, # INOUT
193+
floating[:, ::1] lower_bounds, # INOUT
194+
int[::1] labels, # INOUT
195+
floating[::1] center_shift, # OUT
196196
int n_threads,
197197
bint update_centers=True):
198198
"""Single iteration of K-means Elkan algorithm with dense input.
@@ -292,7 +292,7 @@ def elkan_iter_chunked_dense(
292292
end = start + n_samples_chunk
293293

294294
_update_chunk_dense(
295-
&X[start, 0],
295+
X[start: end],
296296
sample_weight[start: end],
297297
centers_old,
298298
center_half_distances,
@@ -334,11 +334,8 @@ def elkan_iter_chunked_dense(
334334

335335

336336
cdef void _update_chunk_dense(
337-
floating *X, # IN
338-
# expecting C alinged 2D array. XXX: Can be
339-
# replaced by const memoryview when cython min
340-
# version is >= 0.3
341-
floating[::1] sample_weight, # IN
337+
floating[:, ::1] X, # IN READ-ONLY
338+
floating[::1] sample_weight, # IN READ-ONLY
342339
floating[:, ::1] centers_old, # IN
343340
floating[:, ::1] center_half_distances, # IN
344341
floating[::1] distance_next_center, # IN
@@ -383,7 +380,7 @@ cdef void _update_chunk_dense(
383380
# between the sample and its current assigned center.
384381
if not bounds_tight:
385382
upper_bound = _euclidean_dense_dense(
386-
X + i * n_features, &centers_old[label, 0], n_features, False)
383+
&X[i, 0], &centers_old[label, 0], n_features, False)
387384
lower_bounds[i, label] = upper_bound
388385
bounds_tight = 1
389386

@@ -394,7 +391,7 @@ cdef void _update_chunk_dense(
394391
or (upper_bound > center_half_distances[label, j])):
395392

396393
distance = _euclidean_dense_dense(
397-
X + i * n_features, &centers_old[j, 0], n_features, False)
394+
&X[i, 0], &centers_old[j, 0], n_features, False)
398395
lower_bounds[i, j] = distance
399396
if distance < upper_bound:
400397
label = j
@@ -406,7 +403,7 @@ cdef void _update_chunk_dense(
406403
if update_centers:
407404
weight_in_clusters[label] += sample_weight[i]
408405
for k in range(n_features):
409-
centers_new[label * n_features + k] += X[i * n_features + k] * sample_weight[i]
406+
centers_new[label * n_features + k] += X[i, k] * sample_weight[i]
410407

411408

412409
def elkan_iter_chunked_sparse(

sklearn/cluster/_k_means_lloyd.pyx

+13-16
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,14 @@ np.import_array()
2727

2828

2929
def lloyd_iter_chunked_dense(
30-
np.ndarray[floating, ndim=2, mode='c'] X, # IN
31-
floating[::1] sample_weight, # IN
32-
floating[::1] x_squared_norms, # IN
33-
floating[:, ::1] centers_old, # IN
34-
floating[:, ::1] centers_new, # OUT
35-
floating[::1] weight_in_clusters, # OUT
36-
int[::1] labels, # OUT
37-
floating[::1] center_shift, # OUT
30+
floating[:, ::1] X, # IN READ-ONLY
31+
floating[::1] sample_weight, # IN READ-ONLY
32+
floating[::1] x_squared_norms, # IN
33+
floating[:, ::1] centers_old, # IN
34+
floating[:, ::1] centers_new, # OUT
35+
floating[::1] weight_in_clusters, # OUT
36+
int[::1] labels, # OUT
37+
floating[::1] center_shift, # OUT
3838
int n_threads,
3939
bint update_centers=True):
4040
"""Single iteration of K-means lloyd algorithm with dense input.
@@ -129,7 +129,7 @@ def lloyd_iter_chunked_dense(
129129
end = start + n_samples_chunk
130130

131131
_update_chunk_dense(
132-
&X[start, 0],
132+
X[start: end],
133133
sample_weight[start: end],
134134
x_squared_norms[start: end],
135135
centers_old,
@@ -162,11 +162,8 @@ def lloyd_iter_chunked_dense(
162162

163163

164164
cdef void _update_chunk_dense(
165-
floating *X, # IN
166-
# expecting C alinged 2D array. XXX: Can be
167-
# replaced by const memoryview when cython min
168-
# version is >= 0.3
169-
floating[::1] sample_weight, # IN
165+
floating[:, ::1] X, # IN READ-ONLY
166+
floating[::1] sample_weight, # IN READ-ONLY
170167
floating[::1] x_squared_norms, # IN
171168
floating[:, ::1] centers_old, # IN
172169
floating[::1] centers_squared_norms, # IN
@@ -199,7 +196,7 @@ cdef void _update_chunk_dense(
199196

200197
# pairwise_distances += -2 * X.dot(C.T)
201198
_gemm(RowMajor, NoTrans, Trans, n_samples, n_clusters, n_features,
202-
-2.0, X, n_features, &centers_old[0, 0], n_features,
199+
-2.0, &X[0, 0], n_features, &centers_old[0, 0], n_features,
203200
1.0, pairwise_distances, n_clusters)
204201

205202
for i in range(n_samples):
@@ -215,7 +212,7 @@ cdef void _update_chunk_dense(
215212
if update_centers:
216213
weight_in_clusters[label] += sample_weight[i]
217214
for k in range(n_features):
218-
centers_new[label * n_features + k] += X[i * n_features + k] * sample_weight[i]
215+
centers_new[label * n_features + k] += X[i, k] * sample_weight[i]
219216

220217

221218
def lloyd_iter_chunked_sparse(

sklearn/cluster/_k_means_minibatch.pyx

+10-10
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,12 @@ np.import_array()
1414

1515

1616
def _minibatch_update_dense(
17-
np.ndarray[floating, ndim=2, mode="c"] X, # IN
18-
floating[::1] sample_weight, # IN
19-
floating[:, ::1] centers_old, # IN
20-
floating[:, ::1] centers_new, # OUT
21-
floating[::1] weight_sums, # INOUT
22-
int[::1] labels, # IN
17+
floating[:, ::1] X, # IN READ-ONLY
18+
floating[::1] sample_weight, # IN READ-ONLY
19+
floating[:, ::1] centers_old, # IN
20+
floating[:, ::1] centers_new, # OUT
21+
floating[::1] weight_sums, # INOUT
22+
int[::1] labels, # IN
2323
int n_threads):
2424
"""Update of the centers for dense MiniBatchKMeans.
2525
@@ -59,7 +59,7 @@ def _minibatch_update_dense(
5959
indices = <int*> malloc(n_samples * sizeof(int))
6060

6161
for cluster_idx in prange(n_clusters, schedule="static"):
62-
update_center_dense(cluster_idx, &X[0, 0], sample_weight,
62+
update_center_dense(cluster_idx, X, sample_weight,
6363
centers_old, centers_new, weight_sums, labels,
6464
indices)
6565

@@ -68,8 +68,8 @@ def _minibatch_update_dense(
6868

6969
cdef void update_center_dense(
7070
int cluster_idx,
71-
floating *X, # IN
72-
floating[::1] sample_weight, # IN
71+
floating[:, ::1] X, # IN READ-ONLY
72+
floating[::1] sample_weight, # IN READ-ONLY
7373
floating[:, ::1] centers_old, # IN
7474
floating[:, ::1] centers_new, # OUT
7575
floating[::1] weight_sums, # INOUT
@@ -103,7 +103,7 @@ cdef void update_center_dense(
103103
for k in range(n_indices):
104104
sample_idx = indices[k]
105105
for feature_idx in range(n_features):
106-
centers_new[cluster_idx, feature_idx] += X[sample_idx * n_features + feature_idx] * sample_weight[sample_idx]
106+
centers_new[cluster_idx, feature_idx] += X[sample_idx, feature_idx] * sample_weight[sample_idx]
107107

108108
# Update the count statistics for this center
109109
weight_sums[cluster_idx] += wsum

sklearn/cluster/_kmeans.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from ..utils import deprecated
3030
from ..utils.validation import check_is_fitted, _check_sample_weight
3131
from ..utils._openmp_helpers import _openmp_effective_n_threads
32+
from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper
3233
from ..exceptions import ConvergenceWarning
3334
from ._k_means_common import CHUNK_SIZE
3435
from ._k_means_common import _inertia_dense
@@ -729,6 +730,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=1):
729730
else:
730731
_labels = lloyd_iter_chunked_dense
731732
_inertia = _inertia_dense
733+
X = ReadonlyArrayWrapper(X)
732734

733735
_labels(
734736
X,
@@ -1449,7 +1451,13 @@ def _mini_batch_step(
14491451
)
14501452
else:
14511453
_minibatch_update_dense(
1452-
X, sample_weight, centers, centers_new, weight_sums, labels, n_threads
1454+
ReadonlyArrayWrapper(X),
1455+
sample_weight,
1456+
centers,
1457+
centers_new,
1458+
weight_sums,
1459+
labels,
1460+
n_threads,
14531461
)
14541462

14551463
# Reassign clusters that have very low weight
+68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
"""
2+
ReadonlyArrayWrapper implements the buffer protocol to make the wrapped buffer behave as if
3+
writeable, even for readonly buffers. This way, even readonly arrays can be passed as
4+
argument of type (non const) memoryview.
5+
This is a workaround for the missing support for const fused-typed memoryviews in
6+
Cython < 3.0.
7+
8+
Note: All it does is LIE about the readonly attribute: tell it's false!
9+
This way, we can use it on arrays that we don't touch.
10+
!!! USE CAREFULLY !!!
11+
"""
12+
# TODO: Remove with Cython >= 3.0 which supports const memoryviews for fused types.
13+
14+
from cpython cimport Py_buffer
15+
from cpython.buffer cimport PyObject_GetBuffer, PyBuffer_Release, PyBUF_WRITABLE
16+
17+
import numpy as np
18+
cimport numpy as np
19+
20+
21+
np.import_array()
22+
23+
24+
ctypedef fused NUM_TYPES:
25+
np.npy_float64
26+
np.npy_float32
27+
np.npy_int64
28+
np.npy_int32
29+
30+
31+
cdef class ReadonlyArrayWrapper:
32+
cdef object wraps
33+
34+
def __init__(self, wraps):
35+
self.wraps = wraps
36+
37+
def __getbuffer__(self, Py_buffer *buffer, int flags):
38+
request_for_writeable = False
39+
if flags & PyBUF_WRITABLE:
40+
flags ^= PyBUF_WRITABLE
41+
request_for_writeable = True
42+
PyObject_GetBuffer(self.wraps, buffer, flags)
43+
if request_for_writeable:
44+
# The following is a lie when self.wraps is readonly!
45+
buffer.readonly = False
46+
47+
def __releasebuffer__(self, Py_buffer *buffer):
48+
PyBuffer_Release(buffer)
49+
50+
51+
def _test_sum(NUM_TYPES[:] x):
52+
"""This function is for testing only.
53+
54+
As this function does not modify x, we would like to define it as
55+
56+
_test_sum(const NUM_TYPES[:] x)
57+
58+
which is not possible as fused typed const memoryviews aren't
59+
supported in Cython<3.0.
60+
"""
61+
cdef:
62+
int i
63+
int n = x.shape[0]
64+
NUM_TYPES sum = 0
65+
66+
for i in range(n):
67+
sum += x[i]
68+
return sum

sklearn/utils/setup.py

+6
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,12 @@ def configuration(parent_package="", top_path=None):
8282
libraries=libraries,
8383
)
8484

85+
config.add_extension(
86+
"_readonly_array_wrapper",
87+
sources=["_readonly_array_wrapper.pyx"],
88+
libraries=libraries,
89+
)
90+
8591
config.add_subpackage("tests")
8692

8793
return config

0 commit comments

Comments
 (0)
0