diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst
index 9ab9eab1cf5ed..d7bc0a5164d7a 100644
--- a/doc/whats_new/v1.2.rst
+++ b/doc/whats_new/v1.2.rst
@@ -98,7 +98,8 @@ Changes impacting all modules
   - :func:`sklearn.manifold.trustworthiness`
 
   :pr:`23604` and :pr:`23585` by :user:`Julien Jerphanion <jjerphan>`,
-  :user:`Olivier Grisel <ogrisel>`, and `Thomas Fan`_.
+  :user:`Olivier Grisel <ogrisel>`, and `Thomas Fan`_,
+  :pr:`24556` by :user:`Vincent Maladière <Vincent-Maladiere>`.
 
 - |Fix| Systematically check the sha256 digest of dataset tarballs used in code
   examples in the documentation.
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
index f67df8333fcec..eec2e2aabdd06 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp
@@ -63,9 +63,10 @@ cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}):
         """
         if (
             metric in ("euclidean", "sqeuclidean")
-            and not (issparse(X) or issparse(Y))
+            and not (issparse(X) ^ issparse(Y))  # "^" is the XOR operator
         ):
-            # Specialized implementation of ArgKmin for the Euclidean distance.
+            # Specialized implementation of ArgKmin for the Euclidean distance
+            # for the dense-dense and sparse-sparse cases.
             # This implementation computes the distances by chunk using
             # a decomposition of the Squared Euclidean distance.
             # This specialisation has an improved arithmetic intensity for both
@@ -492,7 +493,6 @@ cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}):
             DTYPE_t * heaps_r_distances = self.heaps_r_distances_chunks[thread_num]
             ITYPE_t * heaps_indices = self.heaps_indices_chunks[thread_num]
 
-
         # Pushing the distances and their associated indices on heaps
         # which keep track of the argkmin.
         for i in range(n_X):
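The guard above swaps `or` for `^` so that only mixed sparse-dense input pairs skip the specialized path: dense-dense and sparse-sparse now both qualify. A minimal pure-Python sketch of the resulting truth table; the helper name is hypothetical, for illustration only:

    import numpy as np
    from scipy.sparse import csr_matrix, issparse

    def _takes_euclidean_specialisation(X, Y, metric="euclidean"):
        # Mirrors the guard above: the specialized path is skipped only
        # when exactly one of X and Y is sparse (XOR), not when both are.
        return metric in ("euclidean", "sqeuclidean") and not (issparse(X) ^ issparse(Y))

    X = np.random.rand(5, 3)
    X_csr = csr_matrix(X)
    assert _takes_euclidean_specialisation(X, X)          # dense-dense
    assert _takes_euclidean_specialisation(X_csr, X_csr)  # sparse-sparse
    assert not _takes_euclidean_specialisation(X, X_csr)  # mixed: generic fallback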
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
index 35c8184d25a6c..be44f3a98a263 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp
@@ -2,7 +2,7 @@ cimport numpy as cnp
 
 from cython cimport final
 
-from ...utils._typedefs cimport ITYPE_t, DTYPE_t
+from ...utils._typedefs cimport ITYPE_t, DTYPE_t, SPARSE_INDEX_TYPE_t
 
 cnp.import_array()
 
@@ -12,7 +12,7 @@ from ._datasets_pair cimport DatasetsPair{{name_suffix}}
 
 
 cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}(
-    const {{INPUT_DTYPE_t}}[:, ::1] X,
+    X,
     ITYPE_t num_threads,
 )
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
index c46d21aa25ed3..1b2a8a31fb679 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp
@@ -26,11 +26,12 @@ from ...utils._typedefs cimport ITYPE_t, DTYPE_t
 
 import numpy as np
+from scipy.sparse import issparse
 from numbers import Integral
 from sklearn import get_config
 from sklearn.utils import check_scalar
 
 from ...utils._openmp_helpers import _openmp_effective_n_threads
-from ...utils._typedefs import DTYPE
+from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE
 
 cnp.import_array()
 
@@ -102,16 +103,40 @@ cdef DTYPE_t[::1] _sqeuclidean_row_norms32_dense(
     return squared_row_norms
 
 
+cdef DTYPE_t[::1] _sqeuclidean_row_norms64_sparse(
+    const DTYPE_t[:] X_data,
+    const SPARSE_INDEX_TYPE_t[:] X_indptr,
+    ITYPE_t num_threads,
+):
+    cdef:
+        ITYPE_t n = X_indptr.shape[0] - 1
+        SPARSE_INDEX_TYPE_t X_i_ptr, idx = 0
+        DTYPE_t[::1] squared_row_norms = np.zeros(n, dtype=DTYPE)
+
+    for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads):
+        for X_i_ptr in range(X_indptr[idx], X_indptr[idx+1]):
+            squared_row_norms[idx] += X_data[X_i_ptr] * X_data[X_i_ptr]
+
+    return squared_row_norms
+
+
 {{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
 from ._datasets_pair cimport DatasetsPair{{name_suffix}}
 
 
 cpdef DTYPE_t[::1] _sqeuclidean_row_norms{{name_suffix}}(
-    const {{INPUT_DTYPE_t}}[:, ::1] X,
+    X,
     ITYPE_t num_threads,
 ):
-    return _sqeuclidean_row_norms{{name_suffix}}_dense(X, num_threads)
+    if issparse(X):
+        # TODO: remove this cast, which is only needed in the float32 case,
+        # by moving the squared row norms computation into MiddleTermComputer.
+        X_data = np.asarray(X.data, dtype=DTYPE)
+        X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
+        return _sqeuclidean_row_norms64_sparse(X_data, X_indptr, num_threads)
+    else:
+        return _sqeuclidean_row_norms{{name_suffix}}_dense(X, num_threads)
 
 
 cdef class BaseDistancesReduction{{name_suffix}}:
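`_sqeuclidean_row_norms64_sparse` above accumulates, for each row `i`, the squares of the stored values in `X_data[X_indptr[i]:X_indptr[i+1]]`, parallelized over rows with `prange`. A serial NumPy sketch of the same computation, checked against a SciPy reference (helper name illustrative):

    import numpy as np
    from scipy.sparse import csr_matrix

    def sqeuclidean_row_norms_sparse(X_data, X_indptr):
        # Serial equivalent of the kernel above: for each CSR row i, sum the
        # squares of its stored values; implicit zeros contribute nothing.
        n = X_indptr.shape[0] - 1
        norms = np.zeros(n, dtype=np.float64)
        for i in range(n):
            row = X_data[X_indptr[i]:X_indptr[i + 1]]
            norms[i] = np.dot(row, row)
        return norms

    X_csr = csr_matrix(np.random.rand(4, 6))
    expected = np.asarray(X_csr.multiply(X_csr).sum(axis=1)).ravel()
    assert np.allclose(sqeuclidean_row_norms_sparse(X_csr.data, X_csr.indptr), expected)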
@@ -131,7 +156,7 @@ cdef class BaseDistancesReduction{{name_suffix}}:
         strategy=None,
     ):
         cdef:
-            ITYPE_t n_samples_chunk, X_n_full_chunks, Y_n_full_chunks
+            ITYPE_t X_n_full_chunks, Y_n_full_chunks
 
         if chunk_size is None:
             chunk_size = get_config().get("pairwise_dist_chunk_size", 256)
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
index b5e77ba61eddd..62403d1c334f0 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
+++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -8,10 +8,7 @@
 
 from .._dist_metrics import BOOL_METRICS, METRIC_MAPPING
 
-from ._base import (
-    _sqeuclidean_row_norms64,
-    _sqeuclidean_row_norms32,
-)
+from ._base import _sqeuclidean_row_norms32, _sqeuclidean_row_norms64
 from ._argkmin import (
     ArgKmin64,
     ArgKmin32,
@@ -133,8 +130,10 @@ def is_valid_sparse_matrix(X):
         # See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669  # noqa
         # TODO: implement specialisation for (sq)euclidean on fused sparse-dense
         # using sparse-dense routines for matrix-vector multiplications.
+        # Currently, only dense-dense and sparse-sparse are optimized for
+        # the Euclidean case.
         fused_sparse_dense_euclidean_case_guard = not (
-            (is_valid_sparse_matrix(X) or is_valid_sparse_matrix(Y))
+            (is_valid_sparse_matrix(X) ^ is_valid_sparse_matrix(Y))  # "^" is XOR
             and isinstance(metric, str)
             and "euclidean" in metric
         )
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
index c49787188a05c..e6ef5de2727b5 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp
@@ -17,7 +17,22 @@ cimport numpy as cnp
 
 from libcpp.vector cimport vector
 
-from ...utils._typedefs cimport DTYPE_t, ITYPE_t
+from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
+
+
+cdef void _middle_term_sparse_sparse_64(
+    const DTYPE_t[:] X_data,
+    const SPARSE_INDEX_TYPE_t[:] X_indices,
+    const SPARSE_INDEX_TYPE_t[:] X_indptr,
+    ITYPE_t X_start,
+    ITYPE_t X_end,
+    const DTYPE_t[:] Y_data,
+    const SPARSE_INDEX_TYPE_t[:] Y_indices,
+    const SPARSE_INDEX_TYPE_t[:] Y_indptr,
+    ITYPE_t Y_start,
+    ITYPE_t Y_end,
+    DTYPE_t * D,
+) nogil
 
 
 {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
@@ -133,4 +148,42 @@ cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_
     ) nogil
 
 
+cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
+    cdef:
+        const DTYPE_t[:] X_data
+        const SPARSE_INDEX_TYPE_t[:] X_indices
+        const SPARSE_INDEX_TYPE_t[:] X_indptr
+
+        const DTYPE_t[:] Y_data
+        const SPARSE_INDEX_TYPE_t[:] Y_indices
+        const SPARSE_INDEX_TYPE_t[:] Y_indptr
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num
+    ) nogil
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil
+
+
 {{endfor}}
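The `-2 X_c_i.Y_c_j^T` quantity declared above is the middle term of the Squared Euclidean decomposition `||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2`: the row norms are computed once by `_sqeuclidean_row_norms*`, and only the middle term is recomputed per chunk. A dense NumPy sketch of how the three terms recombine:

    import numpy as np
    from scipy.spatial.distance import cdist

    rng = np.random.RandomState(0)
    X, Y = rng.rand(10, 5), rng.rand(8, 5)

    # The chunked Euclidean specializations only recompute this term per
    # chunk; the squared row norms of X and Y are precomputed once.
    middle_term = -2.0 * (X @ Y.T)
    sq_dist = (X ** 2).sum(axis=1)[:, None] + middle_term + (Y ** 2).sum(axis=1)[None, :]

    assert np.allclose(sq_dist, cdist(X, Y, metric="sqeuclidean"))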
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
index 692652b3e8d5a..3363eb9524263 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp
@@ -25,16 +25,65 @@ from ...utils._cython_blas cimport (
     Trans,
     _gemm,
 )
-from ...utils._typedefs cimport DTYPE_t, ITYPE_t
-
-from scipy.sparse import issparse
+from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
+
+# TODO: change for `libcpp.algorithm.fill` once Cython 3 is used
+# Introduction in Cython:
+#
+# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L50 #noqa
+cdef extern from "<algorithm>" namespace "std" nogil:
+    void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa
+
+import numpy as np
+from scipy.sparse import issparse, csr_matrix
+from ...utils._typedefs import DTYPE, SPARSE_INDEX_TYPE
+
+# TODO: If possible optimize this routine to efficiently treat cases where
+# `n_samples_X << n_samples_Y`, as met in practice when X_test consists of a
+# few samples, and thus when there's a single chunk of X whose number of
+# samples is less than the default chunk size.
+
+# TODO: compare this routine with the similar ones in SciPy, especially
+# `csr_matmat` which might implement a better algorithm.
+# See: https://github.com/scipy/scipy/blob/e58292e066ba2cb2f3d1e0563ca9314ff1f4f311/scipy/sparse/sparsetools/csr.h#L603-L669  # noqa
+cdef void _middle_term_sparse_sparse_64(
+    const DTYPE_t[:] X_data,
+    const SPARSE_INDEX_TYPE_t[:] X_indices,
+    const SPARSE_INDEX_TYPE_t[:] X_indptr,
+    ITYPE_t X_start,
+    ITYPE_t X_end,
+    const DTYPE_t[:] Y_data,
+    const SPARSE_INDEX_TYPE_t[:] Y_indices,
+    const SPARSE_INDEX_TYPE_t[:] Y_indptr,
+    ITYPE_t Y_start,
+    ITYPE_t Y_end,
+    DTYPE_t * D,
+) nogil:
+    # This routine assumes that D points to the first element of a
+    # zeroed buffer of length at least equal to n_X × n_Y, conceptually
+    # representing a 2-d C-ordered array.
+    cdef:
+        ITYPE_t i, j, k
+        ITYPE_t n_X = X_end - X_start
+        ITYPE_t n_Y = Y_end - Y_start
+        ITYPE_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr
+
+    for i in range(n_X):
+        for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]):
+            X_i_col_idx = X_indices[X_i_ptr]
+            for j in range(n_Y):
+                k = i * n_Y + j
+                for Y_j_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]):
+                    Y_j_col_idx = Y_indices[Y_j_ptr]
+                    if X_i_col_idx == Y_j_col_idx:
+                        D[k] += -2 * X_data[X_i_ptr] * Y_data[Y_j_ptr]
 
 
 {{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
 
 cdef class MiddleTermComputer{{name_suffix}}:
-    """Helper class to compute a Euclidean distance matrix in chunks.
+    """Helper class to compute a Euclidean distance matrix in chunks.
+
+    This is an abstract base class that is further specialized depending
+    on the type of data (dense or sparse).
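A pure-Python rendition of the `_middle_term_sparse_sparse_64` loop above, checked against a SciPy sparse product; the function name and chunk bounds are illustrative:

    import numpy as np
    from scipy.sparse import random as sparse_random

    def middle_term_sparse_sparse(X, Y, X_start, X_end, Y_start, Y_end):
        # For every stored entry of row i of the X chunk and row j of the
        # Y chunk, accumulate -2 * x * y whenever their column indices
        # match; D is the flattened (n_X, n_Y) C-ordered buffer.
        n_X, n_Y = X_end - X_start, Y_end - Y_start
        D = np.zeros(n_X * n_Y)
        for i in range(n_X):
            for x_ptr in range(X.indptr[X_start + i], X.indptr[X_start + i + 1]):
                x_col = X.indices[x_ptr]
                for j in range(n_Y):
                    for y_ptr in range(Y.indptr[Y_start + j], Y.indptr[Y_start + j + 1]):
                        if Y.indices[y_ptr] == x_col:
                            D[i * n_Y + j] += -2 * X.data[x_ptr] * Y.data[y_ptr]
        return D

    X = sparse_random(6, 12, density=0.3, format="csr", random_state=0)
    Y = sparse_random(9, 12, density=0.3, format="csr", random_state=1)
    expected = -2.0 * (X[1:4] @ Y[2:7].T).toarray().ravel()
    assert np.allclose(middle_term_sparse_sparse(X, Y, 1, 4, 2, 7), expected)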
@@ -92,8 +141,29 @@ cdef class MiddleTermComputer{{name_suffix}}:
                 n_features,
                 chunk_size,
             )
+        if X_is_sparse and Y_is_sparse:
+            return SparseSparseMiddleTermComputer{{name_suffix}}(
+                X,
+                Y,
+                effective_n_threads,
+                chunks_n_threads,
+                dist_middle_terms_chunks_size,
+                n_features,
+                chunk_size,
+            )
 
-        raise NotImplementedError("X and Y must be both dense")
+        raise NotImplementedError(
+            "X and Y must be both CSR sparse matrices or both numpy arrays."
+        )
+
+    @classmethod
+    def unpack_csr_matrix(cls, X: csr_matrix):
+        """Ensure that the CSR matrix is indexed with SPARSE_INDEX_TYPE."""
+        X_data = np.asarray(X.data, dtype=DTYPE)
+        X_indices = np.asarray(X.indices, dtype=SPARSE_INDEX_TYPE)
+        X_indptr = np.asarray(X.indptr, dtype=SPARSE_INDEX_TYPE)
+        return X_data, X_indices, X_indptr
 
     def __init__(
         self,
@@ -334,4 +404,97 @@ cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_
 
         return dist_middle_terms
 
 
+cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}):
+    """Middle term of the Euclidean distance between two chunked CSR matrices.
+
+    The result is returned as a contiguous array.
+
+        dist_middle_terms = - 2 X_c_i.Y_c_j^T
+
+    The logic of the computation is wrapped in the routine
+    _middle_term_sparse_sparse_64. This routine iterates over the data,
+    indices and indptr arrays of the sparse matrices without densifying them.
+    """
+
+    def __init__(
+        self,
+        X,
+        Y,
+        ITYPE_t effective_n_threads,
+        ITYPE_t chunks_n_threads,
+        ITYPE_t dist_middle_terms_chunks_size,
+        ITYPE_t n_features,
+        ITYPE_t chunk_size,
+    ):
+        super().__init__(
+            effective_n_threads,
+            chunks_n_threads,
+            dist_middle_terms_chunks_size,
+            n_features,
+            chunk_size,
+        )
+        self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
+        self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)
+
+    cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        # Flush the thread dist_middle_terms_chunks to 0.0
+        fill(
+            self.dist_middle_terms_chunks[thread_num].begin(),
+            self.dist_middle_terms_chunks[thread_num].end(),
+            0.0,
+        )
+
+    cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        # Flush the thread dist_middle_terms_chunks to 0.0
+        fill(
+            self.dist_middle_terms_chunks[thread_num].begin(),
+            self.dist_middle_terms_chunks[thread_num].end(),
+            0.0,
+        )
+
+    cdef DTYPE_t * _compute_dist_middle_terms(
+        self,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            DTYPE_t *dist_middle_terms = (
+                self.dist_middle_terms_chunks[thread_num].data()
+            )
+
+        _middle_term_sparse_sparse_64(
+            self.X_data,
+            self.X_indices,
+            self.X_indptr,
+            X_start,
+            X_end,
+            self.Y_data,
+            self.Y_indices,
+            self.Y_indptr,
+            Y_start,
+            Y_end,
+            dist_middle_terms,
+        )
+
+        return dist_middle_terms
+
+
 {{endfor}}
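`unpack_csr_matrix` above coerces the three CSR buffers to the dtypes the nogil kernels expect. A standalone sketch, assuming `np.float64`/`np.int32` stand in for `DTYPE`/`SPARSE_INDEX_TYPE` (SciPy's default CSR index dtype is already int32, in which case `np.asarray` is a no-op):

    import numpy as np
    from scipy.sparse import csr_matrix

    def unpack_csr(X: csr_matrix):
        # Coerce the three CSR buffers so typed memoryviews can be taken
        # over them; copies happen only when the dtypes actually differ.
        data = np.asarray(X.data, dtype=np.float64)
        indices = np.asarray(X.indices, dtype=np.int32)
        indptr = np.asarray(X.indptr, dtype=np.int32)
        return data, indices, indptr

    X = csr_matrix(np.eye(3, dtype=np.float32))
    data, indices, indptr = unpack_csr(X)
    assert data.dtype == np.float64 and indptr.dtype == np.int32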
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
index aec943448be3f..0fdc3bb50203f 100644
--- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
+++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp
@@ -84,11 +84,10 @@ cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}})
         """
         if (
             metric in ("euclidean", "sqeuclidean")
-            and not issparse(X)
-            and not issparse(Y)
+            and not (issparse(X) ^ issparse(Y))  # "^" is XOR
         ):
             # Specialized implementation of RadiusNeighbors for the Euclidean
-            # distance.
+            # distance for the dense-dense and sparse-sparse cases.
             # This implementation computes the distances by chunk using
             # a decomposition of the Squared Euclidean distance.
             # This specialisation has an improved arithmetic intensity for both
diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
index f929a55105509..c334087c65448 100644
--- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py
+++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
@@ -561,9 +561,12 @@ def test_pairwise_distances_reduction_is_usable_for():
     assert not BaseDistancesReductionDispatcher.is_usable_for(
         X_csr, Y, metric="euclidean"
     )
-    assert not BaseDistancesReductionDispatcher.is_usable_for(
+    assert BaseDistancesReductionDispatcher.is_usable_for(
         X_csr, Y_csr, metric="sqeuclidean"
     )
+    assert BaseDistancesReductionDispatcher.is_usable_for(
+        X_csr, Y_csr, metric="euclidean"
+    )
 
     # CSR matrices without non-zero elements aren't currently supported
     # TODO: support CSR matrices without non-zero elements
@@ -974,6 +977,9 @@ def test_pairwise_distances_argkmin(
     X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
     Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
 
+    X_csr = csr_matrix(X)
+    Y_csr = csr_matrix(Y)
+
     # Haversine distance only accepts 2D data
     if metric == "haversine":
         X = np.ascontiguousarray(X[:, :2])
@@ -996,24 +1002,25 @@ def test_pairwise_distances_argkmin(
             row_idx, argkmin_indices_ref[row_idx]
         ]
 
-    argkmin_distances, argkmin_indices = ArgKmin.compute(
-        X,
-        Y,
-        k,
-        metric=metric,
-        metric_kwargs=metric_kwargs,
-        return_distance=True,
-        # So as to have more than a chunk, forcing parallelism.
-        chunk_size=n_samples // 4,
-        strategy=strategy,
-    )
+    for _X, _Y in [(X, Y), (X_csr, Y_csr)]:
+        argkmin_distances, argkmin_indices = ArgKmin.compute(
+            _X,
+            _Y,
+            k,
+            metric=metric,
+            metric_kwargs=metric_kwargs,
+            return_distance=True,
+            # So as to have more than a chunk, forcing parallelism.
+            chunk_size=n_samples // 4,
+            strategy=strategy,
+        )
 
-    ASSERT_RESULT[(ArgKmin, dtype)](
-        argkmin_distances,
-        argkmin_distances_ref,
-        argkmin_indices,
-        argkmin_indices_ref,
-    )
+        ASSERT_RESULT[(ArgKmin, dtype)](
+            argkmin_distances,
+            argkmin_distances_ref,
+            argkmin_indices,
+            argkmin_indices_ref,
+        )
 
 
 # TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@@ -1148,10 +1155,15 @@ def test_sqeuclidean_row_norms(
     spread = 100
     X = rng.rand(n_samples, n_features).astype(dtype) * spread
 
+    X_csr = csr_matrix(X)
+
     sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2
-    sq_row_norm = np.asarray(sqeuclidean_row_norms(X, num_threads=num_threads))
+    sq_row_norm = sqeuclidean_row_norms(X, num_threads=num_threads)
+
+    sq_row_norm_csr = sqeuclidean_row_norms(X_csr, num_threads=num_threads)
 
     assert_allclose(sq_row_norm_reference, sq_row_norm)
+    assert_allclose(sq_row_norm_reference, sq_row_norm_csr)
 
     with pytest.raises(ValueError):
         X = np.asfortranarray(X)
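End to end, the tests above mean a CSR-CSR pair can now take the specialized Euclidean path. A minimal usage sketch through the public neighbors API, which dispatches to this backend when `is_usable_for` returns True; whether the specialized path is actually taken depends on the installed version:

    import numpy as np
    from scipy.sparse import csr_matrix
    from sklearn.neighbors import NearestNeighbors

    rng = np.random.RandomState(0)
    X = csr_matrix(rng.rand(100, 20))
    Y = csr_matrix(rng.rand(30, 20))

    # Brute-force Euclidean k-NN on a sparse-sparse pair; with this change
    # the specialized ArgKmin path can be used instead of the generic fallback.
    nn = NearestNeighbors(n_neighbors=5, algorithm="brute", metric="euclidean").fit(X)
    dist, ind = nn.kneighbors(Y)
    assert dist.shape == (30, 5)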