@@ -30,8 +30,6 @@
 from ..utils.validation import (check_is_fitted, check_random_state,
                                 FLOAT_DTYPES)
 from .label import LabelEncoder
-from ..utils.fixes import np_version
-from ..utils.deprecation import deprecated


 BOUNDS_THRESHOLD = 1e-7
@@ -1682,27 +1680,23 @@ def add_dummy_feature(X, value=1.0):
        return np.hstack((np.ones((n_samples, 1)) * value, X))


-def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
-                    return_val=True):
-    """Apply a function to portion of selected features
+def _transform_selected(X, transform, selected="all", copy=True):
+    """Apply a transform function to portion of selected features
    Parameters
    ----------
-    X : {array, sparse matrix}, shape [n_samples, n_features]
+    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        Dense array or sparse matrix.
    transform : callable
        A callable transform(X) -> X_transformed
    copy : boolean, optional
        Copy X even if it could be avoided.
    selected: "all" or array of indices or mask
        Specify which features to apply the transform to.
-    return_val : boolean, optional
-        Whether to return the transformed matrix. If not set `None` is
-        returned.
    Returns
    -------
-    X : array or sparse matrix, shape=(n_samples, n_features_new)
+    X : array or sparse matrix, shape=(n_samples, n_features_new)
    """
-    X = check_array(X, accept_sparse='csc', copy=copy, dtype=None)
+    X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)

    if isinstance(selected, six.string_types) and selected == "all":
        return transform(X)
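As a reviewer aid (not part of the diff): the recombination contract of `_transform_selected` can be checked in isolation. The sketch below is a simplified, hypothetical model — dense input only, `selected` given as an index array, no validation — of what the helper does with the selected and pass-through columns.

```python
import numpy as np
from scipy import sparse


def demo_transform_selected(X, transform, selected):
    """Simplified model of _transform_selected: apply `transform` to the
    columns listed in `selected` and pass the remaining columns through."""
    X = np.asarray(X, dtype=np.float64)
    sel = np.asarray(selected)
    not_sel = np.setdiff1d(np.arange(X.shape[1]), sel)
    X_sel = transform(X[:, sel])
    X_not_sel = X[:, not_sel]
    # Mirrors the tail of the real helper: sparse.hstack if either block
    # is sparse, np.hstack otherwise.
    if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
        return sparse.hstack((X_sel, X_not_sel))
    return np.hstack((X_sel, X_not_sel))


# Double only column 0; column 1 is passed through unchanged.
print(demo_transform_selected([[1., 2.], [3., 4.]], lambda Z: Z * 2, [0]))
# [[ 2.  2.]
#  [ 6.  4.]]
```

Note the column order: transformed features come first in the output and pass-through features after them, regardless of their original positions.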
@@ -1725,24 +1719,23 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
        return transform(X)
    else:
        X_sel = transform(X[:, ind[sel]])
-        X_not_sel = X[:, ind[not_sel]].astype(dtype)
+        X_not_sel = X[:, ind[not_sel]]

-        if return_val:
-            if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
-                return sparse.hstack((X_sel, X_not_sel))
-            else:
-                return np.hstack((X_sel, X_not_sel))
+        if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
+            return sparse.hstack((X_sel, X_not_sel))
+        else:
+            return np.hstack((X_sel, X_not_sel))


-@deprecated('`OneHotEncoder` is deprecated, use `CategoricalEncoder` instead.')
class OneHotEncoder(BaseEstimator, TransformerMixin):
-    """Encode categorical integer features using a one-hot aka one-of-K scheme.
+    """Encode ordinal integer features using a one-hot aka one-of-K scheme.

    The input to this transformer should be a matrix of integers, denoting
    the values taken on by categorical (discrete) features. The output will be
    a sparse matrix where each column corresponds to one possible value of one
    feature. It is assumed that input features take on values in the range
-    [0, n_values).
+    [0, n_values). For an encoder based on the unique values of the input
+    features, see the :class:`sklearn.preprocessing.CategoricalEncoder`.

    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.
@@ -1819,6 +1812,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):

    See also
    --------
+    sklearn.preprocessing.CategoricalEncoder : performs a one-hot encoding of
+        all features (also handles string-valued features). This encoder
+        derives the categories based on the unique values in the features.
    sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
        dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
@@ -1908,8 +1904,8 @@ def fit_transform(self, X, y=None):
        Equivalent to self.fit(X).transform(X), but more convenient and more
        efficient. See fit for the parameters, transform for the return value.
        """
-        return _apply_selected(X, self._fit_transform, dtype=self.dtype,
-                               selected=self.categorical_features, copy=True)
+        return _transform_selected(X, self._fit_transform,
+                                   self.categorical_features, copy=True)

    def _transform(self, X):
        """Assumes X contains only categorical features."""
@@ -1964,8 +1960,8 @@ def transform(self, X):
        X_out : sparse matrix if sparse=True else a 2-d array, dtype=int
            Transformed input.
        """
-        return _apply_selected(X, self._transform, dtype=self.dtype,
-                               selected=self.categorical_features, copy=True)
+        return _transform_selected(X, self._transform,
+                                   self.categorical_features, copy=True)


class QuantileTransformer(BaseEstimator, TransformerMixin):
@@ -2465,7 +2461,7 @@ class CategoricalEncoder(BaseEstimator, TransformerMixin):
    categories : 'auto' or a list of lists/arrays of values.
        Values per feature.

-        - 'auto' : Determine classes automatically from the training data.
+        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column.

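To make the `categories` option concrete, here is a sketch (made-up data; behavior read off the `fit`/`transform` code later in this diff) of passing an explicit category list per column:

```python
import numpy as np
from sklearn.preprocessing import CategoricalEncoder

# Explicit categories fix the output layout up front: the value 3 gets a
# column even though it never appears in the training data.
enc = CategoricalEncoder(categories=[['female', 'male'], [1, 2, 3]],
                         sparse=False)
enc.fit(np.array([['male', 1], ['female', 3]], dtype=object))
print(enc.transform(np.array([['female', 2]], dtype=object)))
# [[ 1.  0.  0.  1.  0.]]
```

Under the default `handle_unknown='error'`, a value outside the listed categories raises a ValueError, per the checks in `fit` and `transform` below.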
@@ -2484,17 +2480,21 @@ class CategoricalEncoder(BaseEstimator, TransformerMixin):
    Given a dataset with three features and four samples, we let the encoder
    find the unique values per feature and transform the data to a binary
    one-hot encoding.
+
    >>> from sklearn.preprocessing import CategoricalEncoder
    >>> enc = CategoricalEncoder()
-    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \
-        [1, 0, 2]])  # doctest: +ELLIPSIS
+    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
+    ... # doctest: +ELLIPSIS
    CategoricalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
              handle_unknown='error', sparse=True)
    >>> enc.transform([[0, 1, 1]]).toarray()
    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.]])

    See also
    --------
+    sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
+        integer ordinal features. This transformer assumes that input features
+        take on values in the range [0, max(feature)].
    sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
        dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
@@ -2529,45 +2529,27 @@ def fit(self, X, y=None):
        X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape

-        self._fit(X)
-        return self
-
-    def _fit(self, X):
-        "Assumes `X` contains only cetergorical features."
-
-        X = check_array(X, dtype=np.object)
-        n_samples, n_features = X.shape
-
-        self._label_encoders_ = [LabelEncoder() for i in range(n_features)]
+        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
+            Xi = X[:, i]
            if self.categories == 'auto':
-                le.fit(X[:, i])
+                le.fit(Xi)
            else:
-                if not np.all(np.in1d(X[:, i], self.categories[i])):
+                if not np.all(np.in1d(Xi, self.categories[i])):
                    if self.handle_unknown == 'error':
-                        diff = np.setdiff1d(X[:, i], self.categories[i])
+                        diff = np.setdiff1d(Xi, self.categories[i])
                        msg = 'Unknown feature(s) %s in column %d' % (diff, i)
                        raise ValueError(msg)
                le.classes_ = np.array(np.sort(self.categories[i]))

-    @staticmethod
-    def _check_unknown_categories(values, categories):
-        """Returns False if not all categories in the values are known"""
-        valid_mask = np.in1d(values, categories)
-        return np.all(valid_mask)
+        return self

    def transform(self, X, y=None):
        """Encode the selected categorical features using the one-hot scheme.
        """
-        X = check_array(X, dtype=np.object, copy=True)
-        return self._transform(X)
-
-    def _transform(self, X):
-        "Assumes `X` contains only categorical features."
-
-        X = check_array(X, accept_sparse='csc', dtype=np.object)
+        X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)
        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=np.int)
        X_mask = np.ones_like(X, dtype=np.bool)
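The loop in `fit` above boils down to one LabelEncoder per column, and `transform` then reuses those encoders to build the integer code matrix `X_int`. A standalone sketch with made-up data:

```python
import numpy as np
from sklearn.preprocessing import LabelEncoder

X = np.array([['a', 1], ['b', 3], ['a', 2]], dtype=object)

# One LabelEncoder per column, as in fit():
encoders = [LabelEncoder().fit(X[:, i]) for i in range(X.shape[1])]
print([list(le.classes_) for le in encoders])
# [['a', 'b'], [1, 2, 3]]

# Integer codes per column, as transform() builds X_int:
X_int = np.column_stack([encoders[i].transform(X[:, i])
                         for i in range(X.shape[1])])
print(X_int)
# [[0 0]
#  [1 2]
#  [0 1]]
```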
@@ -2593,7 +2575,6 @@ def _transform(self, X):
        n_values = [le.classes_.shape[0] for le in self._label_encoders_]
        n_values = np.hstack([[0], n_values])
        indices = np.cumsum(n_values)
-        self.feature_indices_ = indices

        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
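The index arithmetic in this last hunk can be verified in isolation. A sketch assuming every value is known (so the `mask` filter keeps everything) and reusing the codes from the previous sketch; `feature_indices_` is no longer stored on the estimator, but the local `indices` plays the same role:

```python
import numpy as np
from scipy import sparse

X_int = np.array([[0, 0], [1, 2], [0, 1]])       # codes from the sketch above
n_values = np.array([2, 3])                      # categories per column
indices = np.cumsum(np.hstack([[0], n_values]))  # [0, 2, 5]

n_samples, n_features = X_int.shape
# Offset each column's codes into its own block of output columns.
column_indices = (X_int + indices[:-1]).ravel()
row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), n_features)
data = np.ones(n_samples * n_features)

out = sparse.coo_matrix((data, (row_indices, column_indices)),
                        shape=(n_samples, indices[-1])).tocsr()
print(out.toarray())
# [[ 1.  0.  1.  0.  0.]
#  [ 0.  1.  0.  0.  1.]
#  [ 1.  0.  0.  1.  0.]]
```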