8000 [WIP] ENH Implement sparse support by mbatoul · Pull Request #4 · jjerphan/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[WIP] ENH Implement sparse support #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 27 additions & 8 deletions sklearn/metrics/_dist_metrics.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -61,16 +61,35 @@ cdef class DistanceMetric:
cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2,
ITYPE_t size) nogil except -1

cdef DTYPE_t sparse_dist(self, const DTYPE_t[:] x1_data,
const ITYPE_t[:] x1_indices,
const DTYPE_t[:] x2_data,
const ITYPE_t[:] x2_indices,
cdef DTYPE_t csr_cdist(self, const DTYPE_t[:] x1_csr_data,
const ITYPE_t[:] x1_csr_indices,
const ITYPE_t[:] x1_csr_indptr,
const DTYPE_t[:] x2_csr_data,
const ITYPE_t[:] x2_csr_indices,
const ITYPE_t[:] x2_csr_indptr,
DTYPE_t[:, ::1] D,
) nogil except -1

cdef DTYPE_t sparse_rdist(self, const DTYPE_t[:] x1_data,
const ITYPE_t[:] x1_indices,
const DTYPE_t[:] x2_data,
const ITYPE_t[:] x2_indices,
cdef DTYPE_t csr_pdist(self, const DTYPE_t[:] x1_csr_data,
const ITYPE_t[:] x1_csr_indices,
const ITYPE_t[:] x1_csr_indptr,
DTYPE_t[:, ::1] D
) nogil except -1

cdef DTYPE_t csr_rdist(self, const DTYPE_t[:] x1_csr_data,
const ITYPE_t[:] x1_csr_indices,
const ITYPE_t[:] x1_csr_indptr,
const DTYPE_t[:] x2_csr_data,
const ITYPE_t[:] x2_csr_indices,
const ITYPE_t[:] x2_csr_indptr,
DTYPE_t[:, ::1] D
) nogil except -1

cdef DTYPE_t csr_dense_cdist(self, const DTYPE_t[:] x1_csr_data,
const ITYPE_t[:] x1_csr_indices,
const ITYPE_t[:] x1_csr_indptr,
DTYPE_t[:, ::1] x2_dense,
DTYPE_t[:, ::1] D
) nogil except -1

cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1
Expand Down
196 changes: 176 additions & 20 deletions sklearn/metrics/_dist_metrics.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import numpy as np
cimport numpy as np
from cython cimport final
from scipy.sparse import issparse

np.import_array() # required in order to use C-API

Expand Down Expand Up @@ -316,22 +317,39 @@ cdef class DistanceMetric:
"""
return self.dist(x1, x2, size)

cdef DTYPE_t sparse_dist(self, const DTYPE_t[:] x1_data,
const ITYPE_t[:] x1_indices,
const DTYPE_t[:] x2_data,
const ITYPE_t[:] x2_indices,
cdef DTYPE_t csr_pdist(self, const DTYPE_t[:] x1_csr_data,
const ITYPE_t[:] x1_csr_indices,
const ITYPE_t[:] x1_csr_indptr,
DTYPE_t[:, ::1] D,
) nogil except -1:
"""Compute the reduced distance between vectors x1 and x2
given non null coordinates and their corresponding indices.
"""
return self.csr_cdist(x1_csr_data, x1_csr_indices, x1_csr_indptr, x1_csr_data, x1_csr_indices, x1_csr_indptr, D)

cdef DTYPE_t csr_cdist(self, const DTYPE_t[:] x1_csr_data,
const ITYPE_t[:] x1_csr_indices,
const ITYPE_t[:] x1_csr_indptr,
const DTYPE_t[:] x2_csr_data,
const ITYPE_t[:] x2_csr_indices,
const ITYPE_t[:] x2_csr_indptr,
DTYPE_t[:, ::1] D,
) nogil except -1:
"""Compute the reduced distance between vectors x1 and x2
given non null coordinates and their corresponding indices.

This should be overridden in a base class.
"""

return -999

cdef DTYPE_t sparse_rdist(self, const DTYPE_t[:] x1_data,
const ITYPE_t[:] x1_indices,
const DTYPE_t[:] x2_data,
const ITYPE_t[:] x2_indices,
cdef DTYPE_t csr_rdist(self, const DTYPE_t[:] x1_csr_data,
const ITYPE_t[:] x1_csr_indices,
const ITYPE_t[:] x1_csr_indptr,
const DTYPE_t[:] x2_csr_data,
const ITYPE_t[:] x2_csr_indices,
const ITYPE_t[:] x2_csr_indptr,
DTYPE_t[:, ::1] D,
) nogil except -1:
"""Compute the reduced distance between vectors x1 and x2
given non null coordinates and their corresponding indices.
Expand All @@ -343,7 +361,21 @@ cdef class DistanceMetric:
Euclidean metric, the reduced distance is the squared-euclidean
distance.
"""
return self.sparse_dist(x1_data, x1_indices, x2_data, x2_indices)
return self.csr_cdist(x1_csr_data, x1_csr_indices, x1_csr_indptr, x2_csr_data, x2_csr_indices, x2_csr_indptr, D)

cdef DTYPE_t csr_dense_cdist(self, const DTYPE_t[:] x1_csr_data,
const ITYPE_t[:] x1_csr_indices,
const ITYPE_t[:] x1_csr_indptr,
DTYPE_t[:, ::1] x2_dense,
DTYPE_t[:, ::1] D,
) nogil except -1:
"""Compute the reduced distance between a CSR sparse vector x1 and a dense
vector x2 given non null coordinates and their corresponding indices.

This should be overridden in a base class.
"""

return -999

cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1:
"""compute the pairwise distances between points in X"""
Expand Down Expand Up @@ -436,23 +468,80 @@ cdef class DistanceMetric:
"""
cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Xarr
cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Yarr

cdef np.ndarray[DTYPE_t, ndim=1, mode='c'] X_data
cdef np.ndarray[ITYPE_t, ndim=1, mode='c'] X_indices
cdef np.ndarray[ITYPE_t, ndim=1, mode='c'] X_indptr

cdef np.ndarray[DTYPE_t, ndim=1, mode='c'] Y_data
cdef np.ndarray[ITYPE_t, ndim=1, mode='c'] Y_indices
cdef np.ndarray[ITYPE_t, ndim=1, mode='c'] Y_indptr

cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Darr

Xarr = np.asarray(X, dtype=DTYPE, order='C')
self._validate_data(Xarr)
if Y is None:
Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]),
dtype=DTYPE, order='C')
self.pdist(Xarr, Darr)
Darr = np.zeros((X.shape[0], X.shape[0]), dtype=DTYPE, order="C")

if issparse(X):
self.csr_pdist(X.data, X.indices, X.indptr, Darr)
else:
Xarr_dense = np.asarray(X, dtype=DTYPE, order='C')
self._validate_data(Xarr_dense)

self.pdist(Xarr_dense, Darr)
else:
Yarr = np.asarray(Y, dtype=DTYPE, order='C')
self._validate_data(Yarr)
Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]),
dtype=DTYPE, order='C')
self.cdist(Xarr, Yarr, Darr)
return Darr
Darr = np.zeros((X.shape[0], Y.shape[0]), dtype=DTYPE, order="C")

if issparse(X) and issparse(Y):

X_data = np.asarray(X.data, dtype=DTYPE, order="C")
X_indices = np.asarray(X.indices, dtype=ITYPE, order="C")
X_indptr = np.asarray(X.indptr, dtype=ITYPE, order="C")

Y_data = np.asarray(Y.data, dtype=DTYPE, order="C")
Y_indices = np.asarray(Y.indices, dtype=ITYPE, order="C")
Y_indptr = np.asarray(Y.indptr, dtype=ITYPE, order="C")

self.csr_cdist(
X_data, X_indices, X_indptr,
Y_data, Y_indices, Y_indptr,
Darr)
elif issparse(X):
X_data = np.asarray(X.data, dtype=DTYPE, order="C")
X_indices = np.asarray(X.indices, dtype=ITYPE, order="C")
X_indptr = np.asarray(X.indptr, dtype=ITYPE, order="C")

Yarr_dense = np.asarray(Y, dtype=DTYPE, order="C")
self._validate_data(Yarr_dense)

self.csr_dense_cdist(
X_data, X_indices, X_indptr,
Yarr_dense,
Darr
)
elif issparse(Y):
Y_data = np.asarray(Y.data, dtype=DTYPE, order="C")
Y_indices = np.asarray(Y.indices, dtype=ITYPE, order="C")
Y_indptr = np.asarray(Y.indptr, dtype=ITYPE, order="C")

Xarr_dense = np.asarray(X, dtype=DTYPE, order='C')
self._validate_data(Xarr_dense)

self.csr_dense_cdist(
Y_data, Y_indices, Y_indptr,
Xarr_dense,
Darr
)
else:
Xarr_dense = np.asarray(X, dtype=DTYPE, order='C')
self._validate_data(Xarr_dense)

Yarr_dense = np.asarray(Y, dtype=DTYPE, order='C')
self._validate_data(Yarr_dense)

self.cdist(Xarr_dense, Yarr_dense, Darr)

return Darr
#------------------------------------------------------------
# Euclidean Distance
# d = sqrt(sum(x_i^2 - y_i^2))
Expand All @@ -473,6 +562,73 @@ cdef class EuclideanDistance(DistanceMetric):
ITYPE_t size) nogil except -1:
return euclidean_rdist(x1, x2, size)

cdef DTYPE_t csr_cdist(self, const DTYPE_t[:] x1_csr_data,
const ITYPE_t[:] x1_csr_indices,
const ITYPE_t[:] x1_csr_indptr,
const DTYPE_t[:] x2_csr_data,
const ITYPE_t[:] x2_csr_indices,
const ITYPE_t[:] x2_csr_indptr,
DTYPE_t[:, ::1] D,
) nogil except -1:
"""Compute the reduced distance between vectors x1 and x2
given non null coordinates and their corresponding indices.
"""
cdef np.npy_intp px, py, i, j, ix, iy
cdef double d = 0.0

cdef int m = D.shape[0]
cdef int n = D.shape[1]

cdef int x1_csr_indptr_end = 0
cdef int x2_csr_indptr_end = 0

for px in range(m):
x1_csr_indptr_end = x1_csr_indptr[px + 1]
for py in range(n):
x2_csr_indptr_end = x2_csr_indptr[py + 1]
i = x1_csr_indptr[px]
j = x2_csr_indptr[py]
d = 0.0
while i < x1_csr_indptr_end and j < x2_csr_indptr_end:
ix = x1_csr_indices[i]
iy = x2_csr_indices[j]

if ix == iy:
d = d + (x1_csr_data[i] - x2_csr_data[j]) ** 2
i = i + 1
j = j + 1
elif ix < iy:
d = d + (x1_csr_data[i]) ** 2
i = i + 1
else:
d = d + (x2_csr_data[j]) ** 2
j = j + 1

if i == x1_csr_indptr_end:
while j < x2_csr_indptr_end:
d = d + (x2_csr_data[j]) ** 2
j = j + 1
else:
while i < x1_csr_indptr_end:
d = d + (x1_csr_data[i]) ** 2
i = i + 1

D[px, py] = sqrt(d)

return 0

cdef DTYPE_t csr_dense_cdist(self, const DTYPE_t[:] x1_csr_data,
const ITYPE_t[:] x1_csr_indices,
const ITYPE_t[:] x1_csr_indptr,
DTYPE_t[:, ::1] x2_dense,
DTYPE_t[:, ::1] D
) nogil except -1:
"""Compute the reduced distance between a CSR sparse vector x1 and a dense
vector x2 given non null coordinates and their corresponding indices.
"""

return 0

cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1:
return sqrt(rdist)

Expand Down
36 changes: 35 additions & 1 deletion sklearn/metrics/tests/test_dist_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pickle

import numpy as np
from numpy.testing import assert_array_almost_equal
from numpy.testing import assert_array_almost_equal, assert_array_equal

import pytest

Expand All @@ -12,6 +12,7 @@
from sklearn.utils import check_random_state
from sklearn.utils._testing import create_memmap_backed_data
from sklearn.utils.fixes import sp_version, parse_version
from scipy.sparse import csr_matrix


def dist_func(x1, x2, p):
Expand All @@ -25,6 +26,14 @@ def dist_func(x1, x2, p):
X1 = rng.random_sample((n1, d)).astype("float64", copy=False)
X2 = rng.random_sample((n2, d)).astype("float64", copy=False)

X1_sparse = X1.copy()
X1_sparse[X1_sparse < 0.5] = 0
X1_csr = csr_matrix(X1_sparse)

X2_sparse = X2.copy()
X2_sparse[X2_sparse < 0.5] = 0
X2_csr = csr_matrix(X2_sparse)

[X1_mmap, X2_mmap] = create_memmap_backed_data([X1, X2])

# make boolean arrays: ones and zeros
Expand Down Expand Up @@ -243,3 +252,28 @@ def custom_metric(x, y):
pyfunc = DistanceMetric.get_metric("pyfunc", func=custom_metric)
eucl = DistanceMetric.get_metric("euclidean")
assert_array_almost_equal(pyfunc.pairwise(X), eucl.pairwise(X) ** 2)


def test_csr_csr_pdist_consistency():
metric = "euclidean"

dm = DistanceMetric.get_metric(metric)

expected_dist = dm.pairwise(X1, X2)
actual_dist = dm.pairwise(X1_csr, X2_csr)

assert_array_equal(expected_dist, actual_dist)


def test_csr_dense_pdist_consistency():
metric = "euclidean"

dm = DistanceMetric.get_metric(metric)

expected_dist = dm.pairwise(X1, X2)

actual_dist = dm.pairwise(X1, X2_csr)
assert_array_equal(expected_dist, actual_dist)

actual_dist_2 = dm.pairwise(X1_csr, X2)
assert_array_equal(expected_dist, actual_dist_2)
0