10000 Directly compute polynomial features. · raghavrv/scikit-learn@ed47b5b · GitHub
[go: up one dir, main page]

Skip to content

Commit ed47b5b

Browse files
eamartinjnothman
authored and committed
Directly compute polynomial features.
Polynomial features are computed by iterating over all combinations of features. For each combination of features, the product of the columns indexed by the combination is computed. The fit method is now a no-op, and the transform method works with any number of features (regardless of what fit was called with).
1 parent d69f2d6 commit ed47b5b

File tree

3 files changed

+45
-16
lines changed

3 files changed

+45
-16
lines changed

doc/whats_new.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,8 @@ Enhancements
226226
- The outcome of :func:`manifold.spectral_embedding` was made deterministic
227227
by flipping the sign of eigenvectors. By `Hasil Sharma`_.
228228

229+
- Significant performance and memory usage improvements in
230+
:class:`preprocessing.PolynomialFeatures`. By `Eric Martin`_.
229231

230232
Documentation improvements
231233
..........................
@@ -3372,3 +3374,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
33723374
.. _Eric Schubert: https://github.com/kno10
33733375

33743376
.. _Dan Blanchard: https://github.com/dan-blanchard
3377+
3378+
.. _Eric Martin: http://ericmart.in

sklearn/preprocessing/data.py

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# Mathieu Blondel <mathieu@mblondel.org>
33
# Olivier Grisel <olivier.grisel@ensta.org>
44
# Andreas Mueller <amueller@ais.uni-bonn.de>
5+
# Eric Martin <eric@ericmart.in>
56
# License: BSD 3 clause
67

78
from itertools import chain, combinations
@@ -442,9 +443,16 @@ class PolynomialFeatures(BaseEstimator, TransformerMixin):
442443
443444
Attributes
444445
----------
446+
powers_ : array, shape (n_output_features, n_input_features)
447+
powers_[i, j] is the exponent of the jth input in the ith output.
445448
446-
powers_ :
447-
powers_[i, j] is the exponent of the jth input in the ith output.
449+
n_input_features_ : int
450+
The total number of input features.
451+
452+
n_output_features_ : int
453+
The total number of polynomial output features. The number of output
454+
features is computed by iterating over all suitably sized combinations
455+
of input features.
448456
449457
Notes
450458
-----
@@ -461,23 +469,32 @@ def __init__(self, degree=2, interaction_only=False, include_bias=True):
461469
self.include_bias = include_bias
462470

463471
@staticmethod
464-
def _power_matrix(n_features, degree, interaction_only, include_bias):
465-
"""Compute the matrix of polynomial powers"""
472+
def _combinations(n_features, degree, interaction_only, include_bias):
466473
comb = (combinations if interaction_only else combinations_w_r)
467474
start = int(not include_bias)
468-
combn = chain.from_iterable(comb(range(n_features), i)
469-
for i in range(start, degree + 1))
470-
powers = np.vstack(bincount(c, minlength=n_features) for c in combn)
471-
return powers
475+
return chain.from_iterable(comb(range(n_features), i)
476+
for i in range(start, degree + 1))
477+
478+
@property
479+
def powers_(self):
480+
check_is_fitted(self, 'n_input_features_')
481+
482+
combinations = self._combinations(self.n_input_features_, self.degree,
483+
self.interaction_only,
484+
self.include_bias)
485+
return np.vstack(np.bincount(c, minlength=self.n_input_features_)
486+
for c in combinations)
472487

473488
def fit(self, X, y=None):
474489
"""
475-
Compute the polynomial feature combinations
490+
Compute number of output features.
476491
"""
477492
n_samples, n_features = check_array(X).shape
478-
self.powers_ = self._power_matrix(n_features, self.degree,
493+
combinations = self._combinations(n_features, self.degree,
479494
self.interaction_only,
480495
self.include_bias)
496+
self.n_input_features_ = n_features
497+
self.n_output_features_ = sum(1 for _ in combinations)
481498
return self
482499

483500
def transform(self, X, y=None):
@@ -494,15 +511,24 @@ def transform(self, X, y=None):
494511
The matrix of features, where NP is the number of polynomial
495512
features generated from the combination of inputs.
496513
"""
497-
check_is_fitted(self, 'powers_')
514+
check_is_fitted(self, ['n_input_features_', 'n_output_features_'])
498515

499516
X = check_array(X)
500517
n_samples, n_features = X.shape
501518

502-
if n_features != self.powers_.shape[1]:
519+
if n_features != self.n_input_features_:
503520
raise ValueError("X shape does not match training shape")
504521

505-
return (X[:, None, :] ** self.powers_).prod(-1)
522+
# allocate output data
523+
XP = np.empty((n_samples, self.n_output_features_), dtype=X.dtype)
524+
525+
combinations = self._combinations(n_features, self.degree,
526+
self.interaction_only,
527+
self.include_bias)
528+
for i, c in enumerate(combinations):
529+
XP[:, i] = X[:, c].prod(1)
530+
531+
return XP
506532

507533

508534
def normalize(X, norm='l2', axis=1, copy=True):
@@ -1083,7 +1109,8 @@ def _transform(self, X):
10831109
# We use only those categorical features of X that are known using fit.
10841110
# i.e. less than n_values_ using mask.
10851111
# This means, if self.handle_unknown is "ignore", the row_indices and
1086-
# col_indices corresponding to the unknown categorical feature are ignored.
1112+
# col_indices corresponding to the unknown categorical feature are
1113+
# ignored.
10871114
mask = (X < self.n_values_).ravel()
10881115
if np.any(~mask):
10891116
if self.handle_unknown not in ['error', 'ignore']:

sklearn/preprocessing/tests/test_data.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,6 @@ def test_polynomial_features():
6868
X_poly = interact.fit_transform(X)
6969
assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]])
7070

71-
assert_raises(ValueError, interact.transform, X[:, 1:])
72-
7371

7472
def test_scaler_1d():
7573
"""Test scaling of dataset along single axis"""

0 commit comments

Comments
 (0)
0