|
15 | 15 | from .utils import check_arrays
|
16 | 16 | from .utils import array2d
|
17 | 17 | from .utils import atleast2d_or_csr
|
| 18 | +from .utils import atleast2d_or_csc |
18 | 19 | from .utils import safe_asarray
|
19 | 20 | from .utils import warn_if_not_float
|
20 | 21 | from .utils.fixes import unique
|
@@ -629,6 +630,53 @@ def transform(self, X, y=None, copy=None):
|
629 | 630 | return binarize(X, threshold=self.threshold, copy=copy)
|
630 | 631 |
|
631 | 632 |
|
| 633 | +def _transform_selected(X, transform, selected): |
| 634 | + """Apply a transform function to portion of selected features |
| 635 | +
|
| 636 | + Parameters |
| 637 | + ---------- |
| 638 | + X : array-like or sparse matrix, shape=(n_samples, n_features) |
| 639 | + Dense array or sparse matrix. |
| 640 | +
|
| 641 | + transform : callable |
| 642 | + A callable transform(X) -> X_transformed |
| 643 | +
|
| 644 | + selected: "all" or array of indices or mask |
| 645 | + Specify what features to apply the transform to. |
| 646 | +
|
| 647 | + Returns |
| 648 | + ------- |
| 649 | + X : array or sparse matrix, shape=(n_samples, n_features_new) |
| 650 | + """ |
| 651 | + if len(selected) == 0: |
| 652 | + return X |
| 653 | + elif selected == "all": |
| 654 | + return transform(X) |
| 655 | + else: |
| 656 | + X = atleast2d_or_csc(X) |
| 657 | + n_features = X.shape[1] |
| 658 | + ind = np.arange(n_features) |
| 659 | + sel = np.zeros(n_features, dtype=bool) |
| 660 | + sel[np.array(selected)] = True |
| 661 | + not_sel = np.logical_not(sel) |
| 662 | + n_selected = np.sum(sel) |
| 663 | + |
| 664 | + if n_selected == 0: |
| 665 | + # No features selected. |
| 666 | + return X |
| 667 | + elif n_selected == n_features: |
| 668 | + # All features selected. |
| 669 | + return transform(X) |
| 670 | + else: |
| 671 | + X_sel = transform(X[:, ind[sel]]) |
| 672 | + X_not_sel = X[:, ind[not_sel]] |
| 673 | + |
| 674 | + if sp.issparse(X_sel) or sp.issparse(X_not_sel): |
| 675 | + return sp.hstack((X_sel, X_not_sel)) |
| 676 | + else: |
| 677 | + return np.hstack((X_sel, X_not_sel)) |
| 678 | + |
| 679 | + |
632 | 680 | class OneHotEncoder(BaseEstimator, TransformerMixin):
|
633 | 681 | """Encode categorical integer features using a one-hot aka one-of-K scheme.
|
634 | 682 |
|
@@ -722,29 +770,6 @@ def fit(self, X, y=None):
|
722 | 770 | self.fit_transform(X)
|
723 | 771 | return self
|
724 | 772 |
|
725 |
| - def _apply_transform(self, X, transform): |
726 |
| - if self.categorical_features == "all": |
727 |
| - return transform(X) |
728 |
| - else: |
729 |
| - X = check_arrays(X, sparse_format='dense')[0] |
730 |
| - n_features = X.shape[1] |
731 |
| - ind = np.arange(n_features) |
732 |
| - categorical = np.zeros(n_features, dtype=bool) |
733 |
| - categorical[np.array(self.categorical_features)] = True |
734 |
| - not_categorical = np.logical_not(categorical) |
735 |
| - n_categorical = np.sum(categorical) |
736 |
| - |
737 |
| - if n_categorical == 0: |
738 |
| - # No categorical variables. |
739 |
| - return X |
740 |
| - elif n_categorical == n_features: |
741 |
| - # All categorical variables. |
742 |
| - return transform(X) |
743 |
| - else: |
744 |
| - X_cat = transform(X[:, categorical]) |
745 |
| - X_not_cat = X[:, not_categorical] |
746 |
| - return sp.hstack((X_cat, X_not_cat)) |
747 |
| - |
748 | 773 | def _fit_transform(self, X):
|
749 | 774 | """Asssumes X contains only categorical features."""
|
750 | 775 | X = check_arrays(X, sparse_format='dense', dtype=np.int)[0]
|
@@ -793,7 +818,8 @@ def fit_transform(self, X, y=None):
|
793 | 818 | Equivalent to self.fit(X).transform(X), but more convenient and more
|
794 | 819 | efficient. See fit for the parameters, transform for the return value.
|
795 | 820 | """
|
796 |
| - return self._apply_transform(X, self._fit_transform) |
| 821 | + return _transform_selected(X, self._fit_transform, |
| 822 | + self.categorical_features) |
797 | 823 |
|
798 | 824 | def _transform(self, X):
|
799 | 825 | """Asssumes X contains only categorical features."""
|
@@ -836,7 +862,8 @@ def transform(self, X):
|
836 | 862 | X_out : sparse matrix, dtype=int
|
837 | 863 | Transformed input.
|
838 | 864 | """
|
839 |
| - return self._apply_transform(X, self._transform) |
| 865 | + return _transform_selected(X, self._transform, |
| 866 | + self.categorical_features) |
840 | 867 |
|
841 | 868 |
|
842 | 869 | class LabelEncoder(BaseEstimator, TransformerMixin):
|
|
0 commit comments