Remove DatasetsPair used for sparse datasets · scikit-learn/scikit-learn@5678666 · GitHub
[go: up one dir, main page]

Skip to content

Commit 5678666

Browse files
committed
Remove DatasetsPair used for sparse datasets
This is done to make #20254 smaller. The removed hunks will be reintroduced in a subsequent PR.
1 parent fa424a4 commit 5678666

File tree

2 files changed

+18
-210
lines changed

2 files changed

+18
-210
lines changed

sklearn/metrics/_dist_metrics.pyx

Lines changed: 4 additions & 195 deletions
Original file line numberDiff line numberDiff line change
@@ -1275,13 +1275,10 @@ cdef class DatasetsPair:
12751275
distance_metric._validate_data(X)
12761276
distance_metric._validate_data(Y)
12771277

1278-
if not issparse(X) and not issparse(Y):
1279-
return DenseDenseDatasetsPair(X, Y, distance_metric)
1280-
if issparse(X) and not issparse(Y):
1281-
return SparseDenseDatasetsPair(X, Y, distance_metric)
1282-
if not issparse(X) and issparse(Y):
1283-
return DenseSparseDatasetsPair(X, Y, distance_metric)
1284-
return SparseSparseDatasetsPair(X, Y, distance_metric)
1278+
if issparse(X) or issparse(Y):
1279+
raise ValueError("Only dense datasets are supported for X and Y.")
1280+
1281+
return DenseDenseDatasetsPair(X, Y, distance_metric)
12851282

12861283
@classmethod
12871284
def unpack_csr_matrix(cls, X: csr_matrix):
@@ -1351,191 +1348,3 @@ cdef class DenseDenseDatasetsPair(DatasetsPair):
13511348
return self.distance_metric.dist(&self.X[i, 0],
13521349
&self.Y[j, 0],
13531350
self.d)
1354-
1355-
@final
1356-
cdef class SparseSparseDatasetsPair(DatasetsPair):
1357-
"""Compute distances between vectors of two CSR matrices.
1358-
1359-
Parameters
1360-
----------
1361-
X: sparse matrix of shape (n_samples_X, n_features)
1362-
Rows represent vectors. Must be in CSR format.
1363-
1364-
Y: sparse matrix of shape (n_samples_Y, n_features)
1365-
Rows represent vectors. Must be in CSR format.
1366-
1367-
distance_metric: DistanceMetric
1368-
The distance metric responsible for computing distances
1369-
between two vectors of (X, Y).
1370-
"""
1371-
cdef:
1372-
const DTYPE_t[:] X_data
1373-
const ITYPE_t[:] X_indices,
1374-
const ITYPE_t[:] X_indptr,
1375-
1376-
const DTYPE_t[:] Y_data
1377-
const ITYPE_t[:] Y_indices
1378-
const ITYPE_t[:] Y_indptr
1379-
1380-
1381-
def __init__(self, X, Y, DistanceMetric distance_metric):
1382-
DatasetsPair.__init__(self, distance_metric)
1383-
1384-
self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
1385-
self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y)
1386-
1387-
@final
1388-
cdef ITYPE_t n_samples_X(self) nogil:
1389-
return self.X_indptr.shape[0] - 1
1390-
1391-
@final
1392-
cdef ITYPE_t n_samples_Y(self) nogil:
1393-
return self.Y_indptr.shape[0] -1
1394-
1395-
@final
1396-
cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
1397-
cdef:
1398-
ITYPE_t xi_start = self.X_indptr[i]
1399-
ITYPE_t xi_end = self.X_indptr[i + 1]
1400-
ITYPE_t yj_start = self.Y_indptr[j]
1401-
ITYPE_t yj_end = self.Y_indptr[j + 1]
1402-
1403-
return self.distance_metric.csr_rdist(
1404-
self.X_data[xi_start:xi_end],
1405-
self.X_indices[xi_start:xi_end],
1406-
self.Y_data[yj_start:yj_end],
1407-
self.Y_indices[yj_start:yj_end],
1408-
)
1409-
1410-
@final
1411-
cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
1412-
cdef:
1413-
ITYPE_t xi_start = self.X_indptr[i]
1414-
ITYPE_t xi_end = self.X_indptr[i + 1]
1415-
ITYPE_t yj_start = self.Y_indptr[j]
1416-
ITYPE_t yj_end = self.Y_indptr[j + 1]
1417-
1418-
return self.distance_metric.csr_dist(
1419-
self.X_data[xi_start:xi_end],
1420-
self.X_indices[xi_start:xi_end],
1421-
self.Y_data[yj_start:yj_end],
1422-
self.Y_indices[yj_start:yj_end]
1423-
)
1424-
1425-
@final
1426-
cdef class SparseDenseDatasetsPair(DatasetsPair):
1427-
"""Compute distances between vectors of a CSR matrix and a dense array.
1428-
1429-
Parameters
1430-
----------
1431-
X: sparse matrix of shape (n_samples_X, n_features)
1432-
Rows represent vectors. Must be in CSR format.
1433-
1434-
Y: ndarray of shape (n_samples_Y, n_features)
1435-
Rows represent vectors. Must be C-contiguous.
1436-
1437-
distance_metric: DistanceMetric
1438-
The distance metric responsible for computing distances
1439-
between two vectors of (X, Y).
1440-
"""
1441-
cdef:
1442-
const DTYPE_t[:] X_data
1443-
const ITYPE_t[:] X_indices,
1444-
const ITYPE_t[:] X_indptr,
1445-
1446-
const DTYPE_t[:, ::1] Y
1447-
const ITYPE_t[:] Y_indices
1448-
1449-
def __init__(self, X, Y, DistanceMetric distance_metric):
1450-
super().__init__(distance_metric)
1451-
1452-
self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X)
1453-
1454-
# This array already has been checked here
1455-
self.Y = Y
1456-
self.Y_indices = np.arange(self.Y.shape[1], dtype=ITYPE)
1457-
1458-
@final
1459-
cdef ITYPE_t n_samples_X(self) nogil:
1460-
return self.X_indptr.shape[0] - 1
1461-
1462-
@final
1463-
cdef ITYPE_t n_samples_Y(self) nogil:
1464-
return self.Y.shape[0]
1465-
1466-
@final
1467-
cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
1468-
cdef:
1469-
ITYPE_t xi_start = self.X_indptr[i]
1470-
ITYPE_t xi_end = self.X_indptr[i + 1]
1471-
1472-
# TODO: the 2D to 1D memory-view conversion might make computation slower, see:
1473-
# https://github.com/scikit-learn/scikit-learn/issues/17299
1474-
# Alternatively, we could pass pointers and indices and access elements
1475-
# then in distance_metric.dist. This works but would complexify this API.
1476-
return self.distance_metric.csr_rdist(
1477-
self.X_data[xi_start:xi_end],
1478-
self.X_indices[xi_start:xi_end],
1479-
self.Y[j, :],
1480-
self.Y_indices
1481-
)
1482-
1483-
@final
1484-
cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
1485-
cdef:
1486-
ITYPE_t xi_start = self.X_indptr[i]
1487-
ITYPE_t xi_end = self.X_indptr[i + 1]
1488-
1489-
# TODO: same as previous comment
1490-
return self.distance_metric.csr_dist(
1491-
self.X_data[xi_start:xi_end],
1492-
self.X_indices[xi_start:xi_end],
1493-
self.Y[j, :],
1494-
self.Y_indices
1495-
)
1496-
1497-
@final
1498-
cdef class DenseSparseDatasetsPair(DatasetsPair):
1499-
"""Compute distances between vectors of a dense array and a CSR matrix.
1500-
1501-
Parameters
1502-
----------
1503-
X: ndarray of shape (n_samples_X, n_features)
1504-
Rows represent vectors. Must be C-contiguous.
1505-
1506-
Y: sparse matrix of shape (n_samples_Y, n_features)
1507-
Rows represent vectors. Must be in CSR format.
1508-
1509-
distance_metric: DistanceMetric
1510-
The distance metric responsible for computing distances
1511-
between two vectors of (X, Y).
1512-
"""
1513-
cdef:
1514-
# As distance metrics are symmetric functions, we can
1515-
# simply rely on the SparseDenseDatasetsPair and swap arguments.
1516-
DatasetsPair datasets_pair
1517-
1518-
def __init__(self, X, Y, DistanceMetric distance_metric):
1519-
super().__init__(distance_metric)
1520-
# Swapping arguments on the constructor
1521-
self.datasets_pair = SparseDenseDatasetsPair(Y, X, distance_metric)
1522-
1523-
@final
1524-
cdef ITYPE_t n_samples_X(self) nogil:
1525-
# Swapping interface
1526-
return self.datasets_pair.n_samples_Y()
1527-
1528-
@final
1529-
cdef ITYPE_t n_samples_Y(self) nogil:
1530-
# Swapping interface
1531-
return self.datasets_pair.n_samples_X()
1532-
1533-
@final
1534-
cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
1535-
# Swapping arguments on the same interface
1536-
return self.datasets_pair.surrogate_dist(j, i)
1537-
1538-
@final
1539-
cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
1540-
# Swapping arguments on the same interface
1541-
return self.datasets_pair.dist(j, i)

sklearn/metrics/tests/test_pairwise_distances_reduction.py

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,6 @@
55

66
from sklearn.metrics._dist_metrics import (
77
DenseDenseDatasetsPair,
8-
DenseSparseDatasetsPair,
9-
SparseDenseDatasetsPair,
10-
SparseSparseDatasetsPair,
118
)
129

1310
from sklearn.metrics._pairwise_distances_reduction import (
@@ -165,20 +162,22 @@ def test_pairwise_distances_reduction_factory_method(
165162
dense_dense_instance = PairwiseDistancesReduction.get_for(X, Y, dummy_arg, metric)
166163
assert isinstance(dense_dense_instance.datasets_pair, DenseDenseDatasetsPair)
167164

168-
sparse_sparse_instance = PairwiseDistancesReduction.get_for(
169-
csr_matrix(X), csr_matrix(Y), dummy_arg, metric
170-
)
171-
assert isinstance(sparse_sparse_instance.datasets_pair, SparseSparseDatasetsPair)
165+
with pytest.raises(
166+
ValueError, match="Only dense datasets are supported for X and Y."
167+
):
168+
PairwiseDistancesReduction.get_for(
169+
csr_matrix(X), csr_matrix(Y), dummy_arg, metric
170+
)
172171

173-
dense_sparse_instance = PairwiseDistancesReduction.get_for(
174-
X, csr_matrix(Y), dummy_arg, metric=metric
175-
)
176-
assert isinstance(dense_sparse_instance.datasets_pair, DenseSparseDatasetsPair)
172+
with pytest.raises(
173+
ValueError, match="Only dense datasets are supported for X and Y."
174+
):
175+
PairwiseDistancesReduction.get_for(X, csr_matrix(Y), dummy_arg, metric=metric)
177176

178-
sparse_dense_instance = PairwiseDistancesReduction.get_for(
179-
csr_matrix(X), Y, dummy_arg, metric=metric
180-
)
181-
assert isinstance(sparse_dense_instance.datasets_pair, SparseDenseDatasetsPair)
177+
with pytest.raises(
178+
ValueError, match="Only dense datasets are supported for X and Y."
179+
):
180+
PairwiseDistancesReduction.get_for(csr_matrix(X), Y, dummy_arg, metric=metric)
182181

183182
# Test specialisations creation
184183
fast_euclidean_instance = PairwiseDistancesReduction.get_for(

0 commit comments

Comments
 (0)
0