xhluca
diff --git a/‎benchmarks/bench_feature_expansions.py
Lines changed: 49 additions & 0 deletions b/‎benchmarks/bench_feature_expansions.py
Lines changed: 49 additions & 0 deletions
diff --git a/‎doc/whats_new/v0.21.rst
Lines changed: 6 additions & 0 deletions b/‎doc/whats_new/v0.21.rst
Lines changed: 6 additions & 0 deletions
diff --git a/‎sklearn/preprocessing/_csr_polynomial_expansion.pyx
Lines changed: 156 additions & 0 deletions b/‎sklearn/preprocessing/_csr_polynomial_expansion.pyx
Lines changed: 156 additions & 0 deletions
diff --git a/‎sklearn/preprocessing/data.py
Lines changed: 53 additions & 22 deletions b/‎sklearn/preprocessing/data.py
Lines changed: 53 additions & 22 deletions
diff --git a/‎sklearn/preprocessing/setup.py
Lines changed: 20 additions & 0 deletions b/‎sklearn/preprocessing/setup.py
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,49 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import scipy.sparse as sparse
+from sklearn.preprocessing import PolynomialFeatures
+from time import time
+
+degree = 2
+trials = 3
+num_rows = 1000
+dimensionalities = np.array([1, 2, 8, 16, 32, 64])
+densities = np.array([0.01, 0.1, 1.0])
+csr_times = {d: np.zeros(len(dimensionalities)) for d in densities}
+dense_times = {d: np.zeros(len(dimensionalities)) for d in densities}
+transform = PolynomialFeatures(degree=degree, include_bias=False,
+                               interaction_only=False)
+
+for trial in range(trials):
+    for density in densities:
+        for dim_index, dim in enumerate(dimensionalities):
+            print(trial, density, dim)
+            X_csr = sparse.random(num_rows, dim, density).tocsr()
+            X_dense = X_csr.toarray()
+            # CSR
+            t0 = time()
+            transform.fit_transform(X_csr)
+            csr_times[density][dim_index] += time() - t0
+            # Dense
+            t0 = time()
+            transform.fit_transform(X_dense)
+            dense_times[density][dim_index] += time() - t0
+
+csr_linestyle = (0, (3, 1, 1, 1, 1, 1))  # densely dashdotdotted
+dense_linestyle = (0, ())  # solid
+
+fig, axes = plt.subplots(nrows=len(densities), ncols=1, figsize=(8, 10))
+for density, ax in zip(densities, axes):
+
+    ax.plot(dimensionalities, csr_times[density] / trials,
+            label='csr', linestyle=csr_linestyle)
+    ax.plot(dimensionalities, dense_times[density] / trials,
+            label='dense', linestyle=dense_linestyle)
+    ax.set_title("density %0.2f, degree=%d, n_samples=%d" %
+                 (density, degree, num_rows))
+    ax.legend()
+    ax.set_xlabel('Dimensionality')
+    ax.set_ylabel('Time (seconds)')
+
+plt.tight_layout()
+plt.show()
@@ -55,6 +55,12 @@ Support for Python 3.4 and below has been officially dropped.
   of calculating it every time on the fly.
   :issue:`12116` by :user:`Ekaterina Krivich <kiote>` and `Joel Nothman`_.
 
+- |Efficiency| :class:`preprocessing.PolynomialFeatures` now supports compressed
+  sparse row (CSR) matrices as input for degrees 2 and 3. This is typically much
+  faster than the dense case, because the cost scales with the matrix density
+  and the expansion degree (on the order of density^degree), and it is
+  substantially faster than the compressed sparse column (CSC) case.
+  :issue:`12197` by :user:`Andrew Nystrom <awnystrom>`.
+
 - |Efficiency| |API| Speed improvement in :class:`preprocessing.PolynomialFeatures`,
   in the dense case. Also added a new parameter ``order`` which controls output
   order for further speed performances. :issue:`12251` by `Tom Dupre la Tour`_.
 
@@ -0,0 +1,156 @@
+# cython: cdivision=True
+# cython: boundscheck=False
+# cython: wraparound=False
+
+# Author: Andrew Nystrom <awnystrom@gmail.com>
+
+from scipy.sparse import csr_matrix
+from numpy cimport ndarray
+cimport numpy as np
+
+# 32-bit integer type used for the CSR ``indices`` and ``indptr`` buffers and
+# for all column-index arithmetic in this module.
+ctypedef np.int32_t INDEX_T
+
+# Element types accepted for the CSR ``data`` buffer; Cython specialises the
+# expansion routine for each member of this fused type.
+ctypedef fused DATA_T:
+    np.float32_t
+    np.float64_t
+    np.int32_t
+    np.int64_t
+
+
+cdef inline INDEX_T _deg2_column(INDEX_T d, INDEX_T i, INDEX_T j,
+                                 INDEX_T interaction_only) nogil:
+    """Compute the index of the column for a degree 2 expansion
+
+    d is the dimensionality of the input data, i and j are the indices
+    for the columns involved in the expansion.
+
+    interaction_only selects the formula: non-zero when squared terms
+    (i == j) are left out of the expansion, zero for the full polynomial
+    expansion. The returned index is zero-based within the block of
+    degree 2 terms (i = j = 0 maps to 0 in the polynomial case, and
+    i = 0, j = 1 maps to 0 in the interaction case).
+    """
+    # Both numerators below, i * (i + 3) and i * (i + 1), are always even,
+    # so the division by 2 never discards a remainder.
+    if interaction_only:
+        return d * i - (i**2 + 3 * i) / 2 - 1 + j
+    else:
+        return d * i - (i**2 + i) / 2 + j
+
+
+cdef inline INDEX_T _deg3_column(INDEX_T d, INDEX_T i, INDEX_T j, INDEX_T k,
+                                 INDEX_T interaction_only) nogil:
+    """Compute the index of the column for a degree 3 expansion
+
+    d is the dimensionality of the input data, i, j and k are the indices
+    for the columns involved in the expansion.
+
+    interaction_only selects the formula: non-zero when terms with a
+    repeated column are left out of the expansion, zero for the full
+    polynomial expansion. The returned index is zero-based within the
+    block of degree 3 terms.
+    """
+    # In both branches the numerator that is divided by 6 is a multiple
+    # of 6 for any integers d, i and j, so the division is exact.
+    if interaction_only:
+        return ((3 * d**2 * i - 3 * d * i**2 + i**3
+                 + 11 * i - 3 * j**2 - 9 * j) / 6
+                + i**2 - 2 * d * i + d * j - d + k)
+    else:
+        return ((3 * d**2 * i - 3 * d * i**2 + i ** 3 - i
+                 - 3 * j**2 - 3 * j) / 6
+                + d * j + k)
+
+
+def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
+                              ndarray[INDEX_T, ndim=1] indices,
+                              ndarray[INDEX_T, ndim=1] indptr,
+                              INDEX_T d, INDEX_T interaction_only,
+                              INDEX_T degree):
+    """
+    Perform a second- or third-degree polynomial or interaction expansion on
+    a scipy compressed sparse row (CSR) matrix. Only the terms of exactly the
+    requested degree are produced. The method used only takes products of
+    non-zero features. For a matrix with density d, this results in a speedup
+    on the order of d^k where k is the degree of the expansion, assuming all
+    rows are of similar density.
+
+    Parameters
+    ----------
+    data : nd-array
+        The "data" attribute of the input CSR matrix.
+
+    indices : nd-array
+        The "indices" attribute of the input CSR matrix.
+
+    indptr : nd-array
+        The "indptr" attribute of the input CSR matrix.
+
+    d : int
+        The dimensionality of the input CSR matrix.
+
+    interaction_only : int
+        0 for a polynomial expansion, 1 for an interaction expansion.
+
+    degree : int
+        The degree of the expansion. This must be either 2 or 3.
+
+    Returns
+    -------
+    expanded : scipy.sparse.csr_matrix or None
+        The terms of the requested degree, with one row per input row.
+        None is returned when the expansion has no columns at all (for
+        example an interaction expansion of a single feature).
+
+    References
+    ----------
+    "Leveraging Sparsity to Speed Up Polynomial Feature Expansions of CSR
+    Matrices Using K-Simplex Numbers" by Andrew Nystrom and John Hughes.
+    """
+
+    assert degree in (2, 3)
+
+    # Number of columns of the expanded matrix for d input features.
+    if degree == 2:
+        expanded_dimensionality = int((d**2 + d) / 2 - interaction_only*d)
+    else:
+        expanded_dimensionality = int((d**3 + 3*d**2 + 2*d) / 6
+                                      - interaction_only*d**2)
+    if expanded_dimensionality == 0:
+        return None
+    assert expanded_dimensionality > 0
+
+    cdef INDEX_T num_rows = indptr.shape[0] - 1
+    # NOTE(review): total_nnz is an INDEX_T (32-bit) accumulator, so the
+    # count can overflow for very large expansions.
+    cdef INDEX_T total_nnz = 0, row_i, nnz
+
+    # Count how many nonzero elements the expanded matrix will contain.
+    for row_i in range(num_rows):
+        # nnz is the number of nonzero elements in this row.
+        nnz = indptr[row_i + 1] - indptr[row_i]
+        if degree == 2:
+            total_nnz += (nnz ** 2 + nnz) / 2 - interaction_only * nnz
+        else:
+            total_nnz += ((nnz ** 3 + 3 * nnz ** 2 + 2 * nnz) / 6
+                          - interaction_only * nnz ** 2)
+
+    # Make the arrays that will form the CSR matrix of the expansion.
+    cdef ndarray[DATA_T, ndim=1] expanded_data = ndarray(
+        shape=total_nnz, dtype=data.dtype)
+    cdef ndarray[INDEX_T, ndim=1] expanded_indices = ndarray(
+        shape=total_nnz, dtype=indices.dtype)
+    cdef ndarray[INDEX_T, ndim=1] expanded_indptr = ndarray(
+        shape=num_rows + 1, dtype=indptr.dtype)
+
+    cdef INDEX_T expanded_index = 0, row_starts, row_ends, i, j, k, \
+                 i_ptr, j_ptr, k_ptr, num_cols_in_row,  \
+                 expanded_column
+
+    with nogil:
+        expanded_indptr[0] = indptr[0]
+        for row_i in range(num_rows):
+            row_starts = indptr[row_i]
+            row_ends = indptr[row_i + 1]
+            num_cols_in_row = 0
+            for i_ptr in range(row_starts, row_ends):
+                i = indices[i_ptr]
+                # Starting at i_ptr + interaction_only skips the product of
+                # a feature with itself when only interactions are wanted.
+                for j_ptr in range(i_ptr + interaction_only, row_ends):
+                    j = indices[j_ptr]
+                    if degree == 2:
+                        expanded_column = _deg2_column(d, i, j,
+                                                       interaction_only)
+                        expanded_indices[expanded_index] = expanded_column
+                        expanded_data[expanded_index] = (
+                            data[i_ptr] * data[j_ptr])
+                        expanded_index += 1
+                        num_cols_in_row += 1
+                    else:
+                        # degree == 3
+                        for k_ptr in range(j_ptr + interaction_only,
+                                            row_ends):
+                            k = indices[k_ptr]
+                            expanded_column = _deg3_column(d, i, j, k,
+                                                           interaction_only)
+                            expanded_indices[expanded_index] = expanded_column
+                            expanded_data[expanded_index] = (
+                                data[i_ptr] * data[j_ptr] * data[k_ptr])
+                            expanded_index += 1
+                            num_cols_in_row += 1
+
+            # Each row pointer is the previous one plus the number of
+            # entries written for this row.
+            expanded_indptr[row_i+1] = expanded_indptr[row_i] + num_cols_in_row
+
+    return csr_matrix((expanded_data, expanded_indices, expanded_indptr),
+                      shape=(num_rows, expanded_dimensionality))
@@ -33,8 +33,9 @@
 from ..utils.validation import (check_is_fitted, check_random_state,
                                 FLOAT_DTYPES)
 
-from ._encoders import OneHotEncoder
+from ._csr_polynomial_expansion import _csr_polynomial_expansion
 
+from ._encoders import OneHotEncoder
 
 BOUNDS_THRESHOLD = 1e-7
 
@@ -1443,41 +1444,71 @@ def transform(self, X):
         ----------
         X : array-like or sparse matrix, shape [n_samples, n_features]
             The data to transform, row by row.
-            Sparse input should preferably be in CSC format.
+            Sparse input should preferably be in CSR format (for speed),
+            but must be in CSC format if the degree is 4 or higher.
+
+            If the input matrix is in CSR format and the expansion is of
+            degree 2 or 3, the method described in the work "Leveraging
+            Sparsity to Speed Up Polynomial Feature Expansions of CSR
+            Matrices Using K-Simplex Numbers" by Andrew Nystrom and
+            John Hughes is used, which is much faster than the method
+            used on CSC input.
 
         Returns
         -------
-        XP : np.ndarray or CSC sparse matrix, shape [n_samples, NP]
+        XP : np.ndarray or CSR/CSC sparse matrix, shape [n_samples, NP]
             The matrix of features, where NP is the number of polynomial
             features generated from the combination of inputs.
         """
         check_is_fitted(self, ['n_input_features_', 'n_output_features_'])
 
-        X = check_array(X, order='F', dtype=FLOAT_DTYPES, accept_sparse='csc')
+        X = check_array(X, order='F', dtype=FLOAT_DTYPES,
+                        accept_sparse=('csr', 'csc'))
+
         n_samples, n_features = X.shape
 
         if n_features != self.n_input_features_:
             raise ValueError("X shape does not match training shape")
 
-        combinations = self._combinations(n_features, self.degree,
-                                          self.interaction_only,
-                                          self.include_bias)
-        if sparse.isspmatrix(X):
-            columns = []
-            for comb in combinations:
-                if comb:
-                    out_col = 1
-                    for col_idx in comb:
-                        out_col = X[:, col_idx].multiply(out_col)
-                    columns.append(out_col)
-                else:
-                    columns.append(sparse.csc_matrix(np.ones((X.shape[0], 1))))
-            XP = sparse.hstack(columns, dtype=X.dtype).tocsc()
+        if sparse.isspmatrix_csr(X):
+            if self.degree > 3:
+                return self.transform(X.tocsc()).tocsr()
+            to_stack = []
+            if self.include_bias:
+                to_stack.append(np.ones(shape=(n_samples, 1), dtype=X.dtype))
+            to_stack.append(X)
+            for deg in range(2, self.degree+1):
+                Xp_next = _csr_polynomial_expansion(X.data, X.indices,
+                                                    X.indptr, X.shape[1],
+                                                    self.interaction_only,
+                                                    deg)
+                if Xp_next is None:
+                    break
+                to_stack.append(Xp_next)
+            XP = sparse.hstack(to_stack, format='csr')
+        elif sparse.isspmatrix_csc(X) and self.degree < 4:
+            return self.transform(X.tocsr()).tocsc()
         else:
-            XP = np.empty((n_samples, self.n_output_features_), dtype=X.dtype,
-                          order=self.order)
-            for i, comb in enumerate(combinations):
-                XP[:, i] = X[:, comb].prod(1)
+            combinations = self._combinations(n_features, self.degree,
+                                              self.interaction_only,
+                                              self.include_bias)
+            if sparse.isspmatrix(X):
+                columns = []
+                for comb in combinations:
+                    if comb:
+                        out_col = 1
+                        for col_idx in comb:
+                            out_col = X[:, col_idx].multiply(out_col)
+                        columns.append(out_col)
+                    else:
+                        bias = sparse.csc_matrix(np.ones((X.shape[0], 1)))
+                        columns.append(bias)
+                XP = sparse.hstack(columns, dtype=X.dtype).tocsc()
+            else:
+                XP = np.empty((n_samples, self.n_output_features_),
+                              dtype=X.dtype, order=self.order)
+                for i, comb in enumerate(combinations):
+                    XP[:, i] = X[:, comb].prod(1)
 
         return XP
 
 
@@ -0,0 +1,20 @@
+import os
+
+
+def configuration(parent_package='', top_path=None):
+    import numpy
+    from numpy.distutils.misc_util import Configuration
+
+    config = Configuration('preprocessing', parent_package, top_path)
+    libraries = []
+    if os.name == 'posix':
+        libraries.append('m')
+
+    config.add_extension('_csr_polynomial_expansion',
+                         sources=['_csr_polynomial_expansion.pyx'],
+                         include_dirs=[numpy.get_include()],
+                         libraries=libraries)
+
+    config.add_subpackage('tests')
+
+    return config