 from .base import BaseEstimator, TransformerMixin
 from .externals.six import string_types
-from .utils import check_arrays, array2d, atleast2d_or_csr, safe_asarray
+from .utils import check_arrays
+from .utils import array2d
+from .utils import atleast2d_or_csr
+from .utils import atleast2d_or_csc
+from .utils import safe_asarray
 from .utils import warn_if_not_float
 from .utils.fixes import unique

     'LabelEncoder',
     'MinMaxScaler',
     'Normalizer',
+    'OneHotEncoder',
     'StandardScaler',
     'binarize',
     'normalize',
@@ -632,6 +637,53 @@ def transform(self, X, y=None, copy=None):
         return binarize(X, threshold=self.threshold, copy=copy)


+def _transform_selected(X, transform, selected="all"):
+    """Apply a transform function to the selected features of X.
+
+    Parameters
+    ----------
+    X : array-like or sparse matrix, shape=(n_samples, n_features)
+        Dense array or sparse matrix.
+
+    transform : callable
+        A callable transform(X) -> X_transformed
+
+    selected : "all" or array of indices or mask
+        Specify which features to apply the transform to.
+
+    Returns
+    -------
+    X : array or sparse matrix, shape=(n_samples, n_features_new)
+    """
+    if selected == "all":
+        return transform(X)
+    elif len(selected) == 0:
+        return X
+    else:
+        X = atleast2d_or_csc(X)
+        n_features = X.shape[1]
+        ind = np.arange(n_features)
+        sel = np.zeros(n_features, dtype=bool)
+        sel[np.array(selected)] = True
+        not_sel = np.logical_not(sel)
+        n_selected = np.sum(sel)
+
+        if n_selected == 0:
+            # No features selected.
+            return X
+        elif n_selected == n_features:
+            # All features selected.
+            return transform(X)
+        else:
+            X_sel = transform(X[:, ind[sel]])
+            X_not_sel = X[:, ind[not_sel]]
+
+            if sp.issparse(X_sel) or sp.issparse(X_not_sel):
+                return sp.hstack((X_sel, X_not_sel))
+            else:
+                return np.hstack((X_sel, X_not_sel))
+
+
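For context, a minimal standalone sketch (not part of the patch; the toy doubling transform and example values are assumptions) of the column-splitting behaviour that `_transform_selected` implements: the transform is applied to the selected columns only, and the untouched columns are stacked back on the right.

    import numpy as np

    X = np.array([[0, 10, 1],
                  [1, 20, 0]])
    selected = [0, 2]                       # transform only columns 0 and 2

    sel = np.zeros(X.shape[1], dtype=bool)  # boolean mask of selected columns
    sel[selected] = True

    X_sel = X[:, sel] * 2                   # toy transform(X) -> 2 * X
    X_not_sel = X[:, ~sel]                  # columns left untouched

    out = np.hstack((X_sel, X_not_sel))     # transformed columns first, rest on the right
    # out == [[ 0,  2, 10],
    #         [ 2,  0, 20]]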
 class OneHotEncoder(BaseEstimator, TransformerMixin):
     """Encode categorical integer features using a one-hot aka one-of-K scheme.

@@ -646,11 +698,21 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):

     Parameters
     ----------
-    n_values : 'auto', int or array of int
+    n_values : 'auto', int or array of ints
         Number of values per feature.
-        'auto' : determine value range from training data.
-        int : maximum value for all features.
-        array : maximum value per feature.
+
+        - 'auto' : determine value range from training data.
+        - int : maximum value for all features.
+        - array : maximum value per feature.
+
+    categorical_features : "all" or array of indices or mask
+        Specify which features are treated as categorical.
+
+        - 'all' (default): All features are treated as categorical.
+        - array of indices: Array of categorical feature indices.
+        - mask: Array of length n_features with dtype=bool.
+
+        Non-categorical features are always stacked to the right of the matrix.

     dtype : number type, default=np.float
         Desired dtype of output.
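A hedged usage sketch of the new categorical_features parameter (assumes this patched scikit-learn; the example data is made up): only the listed columns are one-hot encoded, and the remaining columns are passed through and stacked to the right of the encoded block.

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array([[0, 3, 1.5],
                  [1, 0, 2.5]])

    # Columns 0 and 1 are categorical; column 2 is passed through unchanged.
    enc = OneHotEncoder(categorical_features=[0, 1])
    Xt = enc.fit_transform(X)      # sparse: encoded columns first, then column 2

    # An equivalent boolean mask of length n_features:
    enc_mask = OneHotEncoder(categorical_features=np.array([True, True, False]))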
@@ -680,7 +742,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     >>> enc = OneHotEncoder()
     >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \
 [1, 0, 2]])  # doctest: +ELLIPSIS
-    OneHotEncoder(dtype=<... 'float'>, n_values='auto')
+    OneHotEncoder(categorical_features='all', dtype=<... 'float'>,
+           n_values='auto')
     >>> enc.n_values_
     array([2, 3, 4])
     >>> enc.feature_indices_
@@ -690,12 +753,13 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):

     See also
     --------
-    LabelEncoder : performs a one-hot encoding on arbitrary class labels.
     sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
         dictionary items (also handles string-valued features).
     """
-    def __init__(self, n_values="auto", dtype=np.float):
+    def __init__(self, n_values="auto", categorical_features="all",
+                 dtype=np.float):
         self.n_values = n_values
+        self.categorical_features = categorical_features
         self.dtype = dtype

     def fit(self, X, y=None):
@@ -713,12 +777,8 @@ def fit(self, X, y=None):
         self.fit_transform(X)
         return self

-    def fit_transform(self, X, y=None):
-        """Fit OneHotEncoder to X, then transform X.
-
-        Equivalent to self.fit(X).transform(X), but more convenient and more
-        efficient. See fit for the parameters, transform for the return value.
-        """
+    def _fit_transform(self, X):
+        """Assumes X contains only categorical features."""
         X = check_arrays(X, sparse_format='dense', dtype=np.int)[0]
         if np.any(X < 0):
             raise ValueError("X needs to contain only non-negative integers.")
@@ -759,19 +819,17 @@ def fit_transform(self, X, y=None):

         return out

-    def transform(self, X):
-        """Transform X using one-hot encoding.
-
-        Parameters
-        ----------
-        X : array-like, shape=(n_samples, feature_indices_[-1])
-            Input array of type int.
+    def fit_transform(self, X, y=None):
+        """Fit OneHotEncoder to X, then transform X.

-        Returns
-        -------
-        X_out : sparse matrix, dtype=int
-            Transformed input.
+        Equivalent to self.fit(X).transform(X), but more convenient and more
+        efficient. See fit for the parameters, transform for the return value.
         """
+        return _transform_selected(X, self._fit_transform,
+                                   self.categorical_features)
+
+    def _transform(self, X):
+        """Assumes X contains only categorical features."""
         X = check_arrays(X, sparse_format='dense', dtype=np.int)[0]
         if np.any(X < 0):
             raise ValueError("X needs to contain only non-negative integers.")
@@ -798,6 +856,22 @@ def transform(self, X):
         out = out[:, self.active_features_]
         return out

+    def transform(self, X):
+        """Transform X using one-hot encoding.
+
+        Parameters
+        ----------
+        X : array-like, shape=(n_samples, n_features)
+            Input array of type int.
+
+        Returns
+        -------
+        X_out : sparse matrix, dtype=int
+            Transformed input.
+        """
+        return _transform_selected(X, self._transform,
+                                   self.categorical_features)
+
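And a short hedged sketch of the reworked transform path (example values assumed): after fitting, transform also routes through _transform_selected, so only the categorical columns of new data are encoded and the result is a sparse matrix.

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    enc = OneHotEncoder(categorical_features=[0, 1])
    enc.fit(np.array([[0, 3, 1.5],
                      [1, 0, 2.5]]))

    Xt = enc.transform(np.array([[1, 3, 7.0]]))  # encodes columns 0 and 1, appends 7.0
    Xt_dense = Xt.toarray()                      # densify the sparse output if needed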

 class LabelEncoder(BaseEstimator, TransformerMixin):
     """Encode labels with value between 0 and n_classes-1.