further clean-up + tests · scikit-learn/scikit-learn@6c764e0 · GitHub
[go: up one dir, main page]

Skip to content
8000

Commit 6c764e0

Browse files
further clean-up + tests
- check that it works on pandas frames
- fix doctests
- un-deprecate OneHotEncoder
- undo changes in _transform_selected (as we no longer need those changes for CategoricalEncoder)
- add see also to OneHotEncoder and vice versa
- for now remove the self.feature_indices_ attribute
1 parent fda6d27 commit 6c764e0

File tree

4 files changed

+53
-60
lines changed

4 files changed

+53
-60
lines changed

doc/modules/classes.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1197,6 +1197,7 @@ See the :ref:`metrics` section of the user guide for further details.
11971197
preprocessing.MaxAbsScaler
11981198
preprocessing.MinMaxScaler
11991199
preprocessing.Normalizer
1200+
preprocessing.OneHotEncoder
12001201
preprocessing.CategoricalEncoder
12011202
preprocessing.PolynomialFeatures
12021203
preprocessing.QuantileTransformer

doc/modules/preprocessing.rst

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -470,9 +470,8 @@ Continuing the example above::
470470
>>> enc = preprocessing.CategoricalEncoder()
471471
>>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
472472
>>> enc.fit(X) # doctest: +ELLIPSIS
473-
CategoricalEncoder(categorical_features='all', classes='auto',
474-
dtype=<... 'numpy.float64'>, handle_unknown='error',
475-
sparse=True)
473+
CategoricalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
474+
handle_unknown='error', sparse=True)
476475
>>> enc.transform([['female', 'from US', 'uses Safari']]).toarray()
477476
array([[ 1., 0., 0., 1., 0., 1.]])
478477

@@ -488,18 +487,17 @@ features, one has to explicitly set ``classes``. For example,
488487
>>> genders = ['male', 'female']
489488
>>> locations = ['from Europe', 'from US', 'from Africa', 'from Asia']
490489
>>> browsers = ['uses Safari', 'uses Firefox', 'uses IE', 'uses Chrome']
491-
>>> enc = preprocessing.CategoricalEncoder(classes=[genders, locations, browsers])
490+
>>> enc = preprocessing.CategoricalEncoder(categories=[genders, locations, browsers])
492491
>>> # Note that there are missing categorical values for the 2nd and 3rd
493492
>>> # feature
494493
>>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
495494
>>> enc.fit(X) # doctest: +ELLIPSIS
496-
CategoricalEncoder(categorical_features='all',
497-
classes=[...],
495+
CategoricalEncoder(categories=[...],
498496
dtype=<... 'numpy.float64'>, handle_unknown='error',
499497
sparse=True)
500498

501499
>>> enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray()
502-
array([[ 1., 0., 0., 0., 0., 1., 1., 0., 0., 0.]])
500+
array([[ 1., 0., 0., 1., 0., 0., 1., 0., 0., 0.]])
503501

504502
See :ref:`dict_feature_extraction` for categorical features that are represented
505503
as a dict, not as integers.

sklearn/preprocessing/data.py

Lines changed: 34 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,6 @@
3030
from ..utils.validation import (check_is_fitted, check_random_state,
3131
FLOAT_DTYPES)
3232
from .label import LabelEncoder
33-
from ..utils.fixes import np_version
34-
from ..utils.deprecation import deprecated
3533

3634

3735
BOUNDS_THRESHOLD = 1e-7
@@ -1682,27 +1680,23 @@ def add_dummy_feature(X, value=1.0):
16821680
return np.hstack((np.ones((n_samples, 1)) * value, X))
16831681

16841682

1685-
def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
1686-
return_val=True):
1687-
"""Apply a function to portion of selected features
1683+
def _transform_selected(X, transform, selected="all", copy=True):
1684+
"""Apply a transform function to portion of selected features
16881685
Parameters
16891686
----------
1690-
X : {array, sparse matrix}, shape [n_samples, n_features]
1687+
X : {array-like, sparse matrix}, shape [n_samples, n_features]
16911688
Dense array or sparse matrix.
16921689
transform : callable
16931690
A callable transform(X) -> X_transformed
16941691
copy : boolean, optional
16951692
Copy X even if it could be avoided.
16961693
selected: "all" or array of indices or mask
16971694
Specify which features to apply the transform to.
1698-
return_val : boolean, optional
1699-
Whether to return the transformed matrix. If not set `None` is
1700-
returned.
17011695
Returns
17021696
-------
1703-
X : array or sparse matrix, shape=(n_samples, n_features_new)
1697+
X : array or sparse matrix, shape=(n_samples, n_features_new)
17041698
"""
1705-
X = check_array(X, accept_sparse='csc', copy=copy, dtype=None)
1699+
X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)
17061700

17071701
if isinstance(selected, six.string_types) and selected == "all":
17081702
return transform(X)
@@ -1725,24 +1719,23 @@ def _apply_selected(X, transform, selected="all", dtype=np.float, copy=True,
17251719
return transform(X)
17261720
else:
17271721
X_sel = transform(X[:, ind[sel]])
1728-
X_not_sel = X[:, ind[not_sel]].astype(dtype)
1722+
X_not_sel = X[:, ind[not_sel]]
17291723

1730-
if return_val:
1731-
if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
1732-
return sparse.hstack((X_sel, X_not_sel))
1733-
else:
1734-
return np.hstack((X_sel, X_not_sel))
1724+
if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
1725+
return sparse.hstack((X_sel, X_not_sel))
1726+
else:
1727+
return np.hstack((X_sel, X_not_sel))
17351728

17361729

1737-
@deprecated('`OneHotEncoder` is deprecated, use `CategoricalEncoder` instead.')
17381730
class OneHotEncoder(BaseEstimator, TransformerMixin):
1739-
"""Encode categorical integer features using a one-hot aka one-of-K scheme.
1731+
"""Encode ordinal integer features using a one-hot aka one-of-K scheme.
17401732
17411733
The input to this transformer should be a matrix of integers, denoting
17421734
the values taken on by categorical (discrete) features. The output will be
17431735
a sparse matrix where each column corresponds to one possible value of one
17441736
feature. It is assumed that input features take on values in the range
1745-
[0, n_values).
1737+
[0, n_values). For an encoder based on the unique values of the input
1738+
features, see the :class:`sklearn.preprocessing.CategoricalEncoder`.
17461739
17471740
This encoding is needed for feeding categorical data to many scikit-learn
17481741
estimators, notably linear models and SVMs with the standard kernels.
@@ -1819,6 +1812,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
18191812
18201813
See also
18211814
--------
1815+
sklearn.preprocessing.CategoricalEncoder : performs a one-hot encoding of
1816+
all features (also handles string-valued features). This encoder
1817+
derives the categories based on the unique values in the features.
18221818
sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
18231819
dictionary items (also handles string-valued features).
18241820
sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
@@ -1908,8 +1904,8 @@ def fit_transform(self, X, y=None):
19081904
Equivalent to self.fit(X).transform(X), but more convenient and more
19091905
efficient. See fit for the parameters, transform for the return value.
19101906
"""
1911-
return _apply_selected(X, self._fit_transform, dtype=self.dtype,
1912-
selected=self.categorical_features, copy=True)
1907+
return _transform_selected(X, self._fit_transform,
1908+
self.categorical_features, copy=True)
19131909

19141910
def _transform(self, X):
19151911
"""Assumes X contains only categorical features."""
@@ -1964,8 +1960,8 @@ def transform(self, X):
19641960
X_out : sparse matrix if sparse=True else a 2-d array, dtype=int
19651961
Transformed input.
19661962
"""
1967-
return _apply_selected(X, self._transform, dtype=self.dtype,
1968-
selected=self.categorical_features, copy=True)
1963+
return _transform_selected(X, self._transform,
1964+
self.categorical_features, copy=True)
19691965

19701966

19711967
class QuantileTransformer(BaseEstimator, TransformerMixin):
@@ -2465,7 +2461,7 @@ class CategoricalEncoder(BaseEstimator, TransformerMixin):
24652461
categories : 'auto' or a list of lists/arrays of values.
24662462
Values per feature.
24672463
2468-
- 'auto' : Determine classes automatically from the training data.
2464+
- 'auto' : Determine categories automatically from the training data.
24692465
- list : ``categories[i]`` holds the categories expected in the ith
24702466
column.
24712467
@@ -2484,17 +2480,21 @@ class CategoricalEncoder(BaseEstimator, TransformerMixin):
24842480
Given a dataset with three features and two samples, we let the encoder
24852481
find the maximum value per feature and transform the data to a binary
24862482
one-hot encoding.
2483+
24872484
>>> from sklearn.preprocessing import CategoricalEncoder
24882485
>>> enc = CategoricalEncoder()
2489-
>>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \
2490-
[1, 0, 2]]) # doctest: +ELLIPSIS
2486+
>>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
2487+
... # doctest: +ELLIPSIS
24912488
CategoricalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
24922489
handle_unknown='error', sparse=True)
24932490
>>> enc.transform([[0, 1, 1]]).toarray()
24942491
array([[ 1., 0., 0., 1., 0., 0., 1., 0., 0.]])
24952492
24962493
See also
24972494
--------
2495+
sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
2496+
integer ordinal features. This transformer assumes that input features
2497+
take on values in the range [0, max(feature)].
24982498
sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
24992499
dictionary items (also handles string-valued features).
25002500
sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
@@ -2529,45 +2529,27 @@ def fit(self, X, y=None):
25292529
X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)
25302530
n_samples, n_features = X.shape
25312531

2532-
self._fit(X)
2533-
return self
2534-
2535-
def _fit(self, X):
2536-
"Assumes `X` contains only cetergorical features."
2537-
2538-
X = check_array(X, dtype=np.object)
2539-
n_samples, n_features = X.shape
2540-
2541-
self._label_encoders_ = [LabelEncoder() for i in range(n_features)]
2532+
self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]
25422533

25432534
for i in range(n_features):
25442535
le = self._label_encoders_[i]
2536+
Xi = X[:, i]
25452537
if self.categories == 'auto':
2546-
le.fit(X[:, i])
2538+
le.fit(Xi)
25472539
else:
2548-
if not np.all(np.in1d(X[:, i], self.categories[i])):
2540+
if not np.all(np.in1d(Xi, self.categories[i])):
25492541
if self.handle_unknown == 'error':
2550-
diff = np.setdiff1d(X[:, i], self.categories[i])
2542+
diff = np.setdiff1d(Xi, self.categories[i])
25512543
msg = 'Unknown feature(s) %s in column %d' % (diff, i)
25522544
raise ValueError(msg)
25532545
le.classes_ = np.array(np.sort(self.categories[i]))
25542546

2555-
@staticmethod
2556-
def _check_unknown_categories(values, categories):
2557-
"""Returns False if not all categories in the values are known"""
2558-
valid_mask = np.in1d(values, categories)
2559-
return np.all(valid_mask)
2547+
return self
25602548

25612549
def transform(self, X, y=None):
25622550
"""Encode the selected categorical features using the one-hot scheme.
25632551
"""
2564-
X = check_array(X, dtype=np.object, copy=True)
2565-
return self._transform(X)
2566-
2567-
def _transform(self, X):
2568-
"Assumes `X` contains only categorical features."
2569-
2570-
X = check_array(X, accept_sparse='csc', dtype=np.object)
2552+
X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)
25712553
n_samples, n_features = X.shape
25722554
X_int = np.zeros_like(X, dtype=np.int)
25732555
X_mask = np.ones_like(X, dtype=np.bool)
@@ -2593,7 +2575,6 @@ def _transform(self, X):
25932575
n_values = [le.classes_.shape[0] for le in self._label_encoders_]
25942576
n_values = np.hstack([[0], n_values])
25952577
indices = np.cumsum(n_values)
2596-
self.feature_indices_ = indices
25972578

25982579
column_indices = (X_int + indices[:-1]).ravel()[mask]
25992580
row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),

sklearn/preprocessing/tests/test_data.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2033,6 +2033,19 @@ def test_categorical_encoder_specified_categories():
20332033
enc.fit(X)
20342034

20352035

2036+
def test_categorical_encoder_pandas():
2037+
2038+
try:
2039+
import pandas as pd
2040+
except ImportError:
2041+
raise SkipTest("pandas is not installed")
2042+
2043+
X_df = pd.DataFrame({'A': ['a', 'b'], 'B': ['c', 'd']})
2044+
2045+
Xtr = check_categorical(X_df)
2046+
assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]])
2047+
2048+
20362049
def test_fit_cold_start():
20372050
X = iris.data
20382051
X_2d = X[:, :2]

0 commit comments

Comments
 (0)
0