8000 [MRG] Fast PolynomialFeatures on CSR matrices (#12197) · xhluca/scikit-learn@49aab3d · GitHub
[go: up one dir, main page]

Skip to content

Commit 49aab3d

Browse files
AWNystromXing
authored andcommitted
[MRG] Fast PolynomialFeatures on CSR matrices (scikit-learn#12197)
1 parent 32bffd8 commit 49aab3d

File tree

6 files changed

+420
-30
lines changed

6 files changed

+420
-30
lines changed
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import matplotlib.pyplot as plt
2+
import numpy as np
3+
import scipy.sparse as sparse
4+
from sklearn.preprocessing import PolynomialFeatures
5+
from time import time
6+
7+
degree = 2
8+
trials = 3
9+
num_rows = 1000
10+
dimensionalities = np.array([1, 2, 8, 16, 32, 64])
11+
densities = np.array([0.01, 0.1, 1.0])
12+
csr_times = {d: np.zeros(len(dimensionalities)) for d in densities}
13+
dense_times = {d: np.zeros(len(dimensionalities)) for d in densities}
14+
transform = PolynomialFeatures(degree=degree, include_bias=False,
15+
interaction_only=False)
16+
17+
for trial in range(trials):
18+
for density in densities:
19+
for dim_index, dim in enumerate(dimensionalities):
20+
print(trial, density, dim)
21+
X_csr = sparse.random(num_rows, dim, density).tocsr()
22+
X_dense = X_csr.toarray()
23+
# CSR
24+
t0 = time()
25+
transform.fit_transform(X_csr)
26+
csr_times[density][dim_index] += time() - t0
27+
# Dense
28+
t0 = time()
29+
transform.fit_transform(X_dense)
30+
dense_times[density][dim_index] += time() - t0
31+
32+
csr_linestyle = (0, (3, 1, 1, 1, 1, 1)) # densely dashdotdotted
33+
dense_linestyle = (0, ()) # solid
34+
35+
fig, axes = plt.subplots(nrows=len(densities), ncols=1, figsize=(8, 10))
36+
for density, ax in zip(densities, axes):
37+
38+
ax.plot(dimensionalities, csr_times[density] / trials,
39+
label='csr', linestyle=csr_linestyle)
40+
ax.plot(dimensionalities, dense_times[density] / trials,
41+
label='dense', linestyle=dense_linestyle)
42+
ax.set_title("density %0.2f, degree=%d, n_samples=%d" %
43+
(density, degree, num_rows))
44+
ax.legend()
45+
ax.set_xlabel('Dimensionality')
46+
ax.set_ylabel('Time (seconds)')
47+
48+
plt.tight_layout()
49+
plt.show()

doc/whats_new/v0.21.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,12 @@ Support for Python 3.4 and below has been officially dropped.
5555< 28BE /td>
of calculating it every time on the fly.
5656
:issue:`12116` by :user:`Ekaterina Krivich <kiote>` and `Joel Nothman`_.
5757

58+
- |Efficiency| :class:`preprocessing.PolynomialFeatures` now supports compressed
59+
sparse row (CSR) matrices as input for degrees 2 and 3. This is typically much
60+
faster than the dense case as it scales with matrix density and expansion degree
61+
(on the order of density^degree), and is much, much faster than the compressed
62+
sparse column (CSC) case. :issue:`12197` by :user:`Andrew Nystrom <awnystrom>`.
63+
5864
- |Efficiency| |API| Speed improvement in :class:`preprocessing.PolynomialFeatures`,
5965
in the dense case. Also added a new parameter ``order`` which controls output
6066
order for further speed performances. :issue:`12251` by `Tom Dupre la Tour`_.
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
# cython: cdivision=True
2+
# cython: boundscheck=False
3+
# cython: wraparound=False
4+
5+
# Author: Andrew nystrom <awnystrom@gmail.com>
6+
7+
from scipy.sparse import csr_matrix
8+
from numpy cimport ndarray
9+
cimport numpy as np
10+
11+
ctypedef np.int32_t INDEX_T
12+
13+
ctypedef fused DATA_T:
14+
np.float32_t
15+
np.float64_t
16+
np.int32_t
17+
np.int64_t
18+
19+
20+
cdef inline INDEX_T _deg2_column(INDEX_T d, INDEX_T i, INDEX_T j,
21+
INDEX_T interaction_only) nogil:
22+
"""Compute the index of the column for a degree 2 expansion
23+
24+
d is the dimensionality of the input data, i and j are the indices
25+
for the columns involved in the expansion.
26+
"""
27+
if interaction_only:
28+
return d * i - (i**2 + 3 * i) / 2 - 1 + j
29+
else:
30+
return d * i - (i**2 + i) / 2 + j
31+
32+
33+
cdef inline INDEX_T _deg3_column(INDEX_T d, INDEX_T i, INDEX_T j, INDEX_T k,
34+
INDEX_T interaction_only) nogil:
35+
"""Compute the index of the column for a degree 3 expansion
36+
37+
d is the dimensionality of the input data, i, j and k are the indices
38+
for the columns involved in the expansion.
39+
"""
40+
if interaction_only:
41+
return ((3 * d**2 * i - 3 * d * i**2 + i**3
42+
+ 11 * i - 3 * j**2 - 9 * j) / 6
43+
+ i**2 - 2 * d * i + d * j - d + k)
44+
else:
45+
return ((3 * d**2 * i - 3 * d * i**2 + i ** 3 - i
46+
- 3 * j**2 - 3 * j) / 6
47+
+ d * j + k)
48+
49+
50+
def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
51+
ndarray[INDEX_T, ndim=1] indices,
52+
ndarray[INDEX_T, ndim=1] indptr,
53+
INDEX_T d, INDEX_T interaction_only,
54+
INDEX_T degree):
55+
"""
56+
Perform a second-degree polynomial or interaction expansion on a scipy
57+
compressed sparse row (CSR) matrix. The method used only takes products of
58+
non-zero features. For a matrix with density d, this results in a speedup
59+
on the order of d^k where k is the degree of the expansion, assuming all
60+
rows are of similar density.
61+
62+
Parameters
63+
----------
64+
data : nd-array
65+
The "data" attribute of the input CSR matrix.
66+
67+
indices : nd-array
68+
The "indices" attribute of the input CSR matrix.
69+
70+
indptr : nd-array
71+
The "indptr" attribute of the input CSR matrix.
72+
73+
d : int
74+
The dimensionality of the input CSR matrix.
75+
76+
interaction_only : int
77+
0 for a polynomial expansion, 1 for an interaction expansion.
78+
79+
degree : int
80+
The degree of the expansion. This must be either 2 or 3.
81+
82+
References
83+
----------
84+
"Leveraging Sparsity to Speed Up Polynomial Feature Expansions of CSR
85+
Matrices Using K-Simplex Numbers" by Andrew Nystrom and John Hughes.
86+
"""
87+
88+
assert degree in (2, 3)
89+
90+
if degree == 2:
91+
expanded_dimensionality = int((d**2 + d) / 2 - interaction_only*d)
92+
else:
93+
expanded_dimensionality = int((d**3 + 3*d**2 + 2*d) / 6
94+
- interaction_only*d**2)
95+
if expanded_dimensionality == 0:
96+
return None
97+
assert expanded_dimensionality > 0
98+
99+
cdef INDEX_T total_nnz = 0, row_i, nnz
100+
101+
# Count how many nonzero elements the expanded matrix will contain.
102+
for row_i in range(indptr.shape[0]-1):
103+
# nnz is the number of nonzero elements in this row.
104+
nnz = indptr[row_i + 1] - indptr[row_i]
105+
if degree == 2:
106+
total_nnz += (nnz ** 2 + nnz) / 2 - interaction_only * nnz
107+
else:
108+
total_nnz += ((nnz ** 3 + 3 * nnz ** 2 + 2 * nnz) / 6
109+
- interaction_only * nnz ** 2)
110+
111+
# Make the arrays that will form the CSR matrix of the expansion.
112+
cdef ndarray[DATA_T, ndim=1] expanded_data = ndarray(
113+
shape=total_nnz, dtype=data.dtype)
114+
cdef ndarray[INDEX_T, ndim=1] expanded_indices = ndarray(
115+
shape=total_nnz, dtype=indices.dtype)
116+
cdef INDEX_T num_rows = indptr.shape[0] - 1
117+
cdef ndarray[INDEX_T, ndim=1] expanded_indptr = ndarray(
118+
shape=num_rows + 1, dtype=indptr.dtype)
119+
120+
cdef INDEX_T expanded_index = 0, row_starts, row_ends, i, j, k, \
121+
i_ptr, j_ptr, k_ptr, num_cols_in_row, \
122+
expanded_column
123+
124+
with nogil:
125+
expanded_indptr[0] = indptr[0]
126+
for row_i in range(indptr.shape[0]-1):
127+
row_starts = indptr[row_i]
128+
row_ends = indptr[row_i + 1]
129+
num_cols_in_row = 0
130+
for i_ptr in range(row_starts, row_ends):
131+
i = indices[i_ptr]
132+
for j_ptr in range(i_ptr + interaction_only, row_ends):
133+
j = indices[j_ptr]
134+
if degree == 2:
135+
col = _deg2_column(d, i, j, interaction_only)
136+
expanded_indices[expanded_index] = col
137+
expanded_data[expanded_index] = (
138+
data[i_ptr] * data[j_ptr])
139+
expanded_index += 1
140+
num_cols_in_row += 1
141+
else:
142+
# degree == 3
143+
for k_ptr in range(j_ptr + interaction_only,
144+
row_ends):
145+
k = indices[k_ptr]
146+
col = _deg3_column(d, i, j, k, interaction_only)
147+
expanded_indices[expanded_index] = col
148+
expanded_data[expanded_index] = (
149+
data[i_ptr] * data[j_ptr] * data[k_ptr])
150+
expanded_index += 1
151+
num_cols_in_row += 1
152+
153+
expanded_indptr[row_i+1] = expanded_indptr[row_i] + num_cols_in_row
154+
155+
return csr_matrix((expanded_data, expanded_indices, expanded_indptr),
156+
shape=(num_rows, expanded_dimensionality))

sklearn/preprocessing/data.py

Lines changed: 53 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,9 @@
3333
from ..utils.validation import (check_is_fitted, check_random_state,
3434
FLOAT_DTYPES)
3535

36-
from ._encoders import OneHotEncoder
36+
from ._csr_polynomial_expansion import _csr_polynomial_expansion
3737

38+
from ._encoders import OneHotEncoder
3839

3940
BOUNDS_THRESHOLD = 1e-7
4041

@@ -1443,41 +1444,71 @@ def transform(self, X):
14431444
----------
14441445
X : array-like or sparse matrix, shape [n_samples, n_features]
14451446
The data to transform, row by row.
1446-
Sparse input should preferably be in CSC format.
1447+
Sparse input should preferably be in CSR format (for speed),
1448+
but must be in CSC format if the degree is 4 or higher.
1449+
1450+
If the input matrix is in CSR format and the expansion is of
1451+
degree 2 or 3, the method described in the work "Leveraging
1452+
Sparsity to Speed Up Polynomial Feature Expansions of CSR
1453+
Matrices Using K-Simplex Numbers" by Andrew Nystrom and
1454+
John Hughes is used, which is much faster than the method
1455+
used on CSC input.
14471456
14481457
Returns
14491458
-------
1450-
XP : np.ndarray or CSC sparse matrix, shape [n_samples, NP]
1459+
XP : np.ndarray or CSR/CSC sparse matrix, shape [n_samples, NP]
14511460
The matrix of features, where NP is the number of polynomial
14521461
features generated from the combination of inputs.
14531462
"""
14541463
check_is_fitted(self, ['n_input_features_', 'n_output_features_'])
14551464

1456-
X = check_array(X, order='F', dtype=FLOAT_DTYPES, accept_sparse='csc')
1465+
X = check_array(X, order='F', dtype=FLOAT_DTYPES,
1466+
accept_sparse=('csr', 'csc'))
1467+
14571468
n_samples, n_features = X.shape
14581469

14591470
if n_features != self.n_input_features_:
14601471
raise ValueError("X shape does not match training shape")
14611472

1462-
combinations = self._combinations(n_features, self.degree,
1463-
self.interaction_only,
1464-
self.include_bias)
1465-
if sparse.isspmatrix(X):
1466-
columns = []
1467-
for comb in combinations:
1468-
if comb:
1469-
out_col = 1
1470-
for col_idx in comb:
1471-
out_col = X[:, col_idx].multiply(out_col)
1472-
columns.append(out_col)
1473-
else:
1474-
columns.append(sparse.csc_matrix(np.ones((X.shape[0], 1))))
1475-
XP = sparse.hstack(columns, dtype=X.dtype).tocsc()
1473+
if sparse.isspmatrix_csr(X):
1474+
if self.degree > 3:
1475+
return self.transform(X.tocsc()).tocsr()
1476+
to_stack = []
1477+
if self.include_bias:
1478+
to_stack.append(np.ones(shape=(n_samples, 1), dtype=X.dtype))
1479+
to_stack.append(X)
1480+
for deg in range(2, self.degree+1):
1481+
Xp_next = _csr_polynomial_expansion(X.data, X.indices,
1482+
X.indptr, X.shape[1],
1483+
self.interaction_only,
1484+
deg)
1485+
if Xp_next is None:
1486+
break
1487+
to_stack.append(Xp_next)
1488+
XP = sparse.hstack(to_stack, format='csr')
1489+
elif sparse.isspmatrix_csc(X) and self.degree < 4:
1490+
return self.transform(X.tocsr()).tocsc()
14761491
else:
1477-
XP = np.empty((n_samples, self.n_output_features_), dtype=X.dtype,
1478-
order=self.order)
1479-
for i, comb in enumerate(combinations):
1480-
XP[:, i] = X[:, comb].prod(1)
1492+
combinations = self._combinations(n_features, self.degree,
1493+
self.interaction_only,
1494+
self.include_bias)
1495+
if sparse.isspmatrix(X):
1496+
columns = []
1497+
for comb in combinations:
1498+
if comb:
1499+
out_col = 1
1500+
for col_idx in comb:
1501+
out_col = X[:, col_idx].multiply(out_col)
1502+
columns.append(out_col)
1503+
else:
1504+
bias = sparse.csc_matrix(np.ones((X.shape[0], 1)))
1505+
columns.append(bias)
1506+
XP = sparse.hstack(columns, dtype=X.dtype).tocsc()
1507+
else:
1508+
XP = np.empty((n_samples, self.n_output_features_),
1509+
dtype=X.dtype, order=self.order)
1510+
for i, comb in enumerate(combinations):
1511+
XP[:, i] = X[:, comb].prod(1)
14811512

14821513
return XP
14831514

sklearn/preprocessing/setup.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import os
2+
3+
4+
def configuration(parent_package='', top_path=None):
5+
import numpy
6+
from numpy.distutils.misc_util import Configuration
7+
8+
config = Configuration('preprocessing', parent_package, top_path)
9+
libraries = []
10+
if os.name == 'posix':
11+
libraries.append('m')
12+
13+
config.add_extension('_csr_polynomial_expansion',
14+
sources=['_csr_polynomial_expansion.pyx'],
15+
include_dirs=[numpy.get_include()],
16+
libraries=libraries)
17+
18+
config.add_subpackage('tests')
19+
20+
return config

0 commit comments

Comments
 (0)
0