Rename _apply_transform to _transform_selected and make it a function · scikit-learn/scikit-learn@017afab · GitHub
[go: up one dir, main page]

Skip to content

Commit 017afab

Browse files
committed
Rename _apply_transform to _transform_selected and make it a function
rather than a method.
1 parent 6e2aee2 commit 017afab

File tree

3 files changed

+84
-30
lines changed

3 files changed

+84
-30
lines changed

doc/modules/preprocessing.rst

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -333,15 +333,16 @@ Continuing the example above::
333333

334334
>>> enc = preprocessing.OneHotEncoder()
335335
>>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
336-
OneHotEncoder(dtype=<type 'float'>, n_values='auto')
336+
OneHotEncoder(categorical_features='all', dtype=<type 'float'>,
337+
n_values='auto')
337338
>>> enc.transform([[0, 1, 3]]).toarray()
338339
array([[ 1., 0., 0., 1., 0., 0., 0., 0., 1.]])
339340

340341
By default, how many values each feature can take is inferred automatically from the dataset.
341-
It is possible to specify this explicitly using the parameter ``n_values``.
342+
It is possible to specify this explicitly using the parameter ``n_values``.
342343
There are two genders, three possible continents and four web browsers in our
343344
dataset.
344-
Then we fit the estimator, and transform a data point.
345+
Then we fit the estimator, and transform a data point.
345346
In the result, the first two numbers encode the gender, the next set of three
346347
numbers the continent and the last four the web browser.
347348

sklearn/preprocessing.py

Lines changed: 52 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from .utils import check_arrays
1616
from .utils import array2d
1717
from .utils import atleast2d_or_csr
18+
from .utils import atleast2d_or_csc
1819
from .utils import safe_asarray
1920
from .utils import warn_if_not_float
2021
from .utils.fixes import unique
@@ -629,6 +630,53 @@ def transform(self, X, y=None, copy=None):
629630
return binarize(X, threshold=self.threshold, copy=copy)
630631

631632

633+
def _transform_selected(X, transform, selected):
634+
"""Apply a transform function to portion of selected features
635+
636+
Parameters
637+
----------
638+
X : array-like or sparse matrix, shape=(n_samples, n_features)
639+
Dense array or sparse matrix.
640+
641+
transform : callable
642+
A callable transform(X) -> X_transformed
643+
644+
selected: "all" or array of indices or mask
645+
Specify what features to apply the transform to.
646+
647+
Returns
648+
-------
649+
X : array or sparse matrix, shape=(n_samples, n_features_new)
650+
"""
651+
if len(selected) == 0:
652+
return X
653+
elif selected == "all":
654+
return transform(X)
655+
else:
656+
X = atleast2d_or_csc(X)
657+
n_features = X.shape[1]
658+
ind = np.arange(n_features)
659+
sel = np.zeros(n_features, dtype=bool)
660+
sel[np.array(selected)] = True
661+
not_sel = np.logical_not(sel)
662+
n_selected = np.sum(sel)
663+
664+
if n_selected == 0:
665+
# No features selected.
666+
return X
667+
elif n_selected == n_features:
668+
# All features selected.
669+
return transform(X)
670+
else:
671+
X_sel = transform(X[:, ind[sel]])
672+
X_not_sel = X[:, ind[not_sel]]
673+
674+
if sp.issparse(X_sel) or sp.issparse(X_not_sel):
675+
return sp.hstack((X_sel, X_not_sel))
676+
else:
677+
return np.hstack((X_sel, X_not_sel))
678+
679+
632680
class OneHotEncoder(BaseEstimator, TransformerMixin):
633681
"""Encode categorical integer features using a one-hot aka one-of-K scheme.
634682
@@ -722,29 +770,6 @@ def fit(self, X, y=None):
722770
self.fit_transform(X)
723771
return self
724772

725-
def _apply_transform(self, X, transform):
726-
if self.categorical_features == "all":
727-
return transform(X)
728-
else:
729-
X = check_arrays(X, sparse_format='dense')[0]
730-
n_features = X.shape[1]
731-
ind = np.arange(n_features)
732-
categorical = np.zeros(n_features, dtype=bool)
733-
categorical[np.array(self.categorical_features)] = True
734-
not_categorical = np.logical_not(categorical)
735-
n_categorical = np.sum(categorical)
736-
737-
if n_categorical == 0:
738-
# No categorical variables.
739-
return X
740-
elif n_categorical == n_features:
741-
# All categorical variables.
742-
return transform(X)
743-
else:
744-
X_cat = transform(X[:, categorical])
745-
X_not_cat = X[:, not_categorical]
746-
return sp.hstack((X_cat, X_not_cat))
747-
748773
def _fit_transform(self, X):
749774
"""Asssumes X contains only categorical features."""
750775
X = check_arrays(X, sparse_format='dense', dtype=np.int)[0]
@@ -793,7 +818,8 @@ def fit_transform(self, X, y=None):
793818
Equivalent to self.fit(X).transform(X), but more convenient and more
794819
efficient. See fit for the parameters, transform for the return value.
795820
"""
796-
return self._apply_transform(X, self._fit_transform)
821+
return _transform_selected(X, self._fit_transform,
822+
self.categorical_features)
797823

798824
def _transform(self, X):
799825
"""Asssumes X contains only categorical features."""
@@ -836,7 +862,8 @@ def transform(self, X):
836862
X_out : sparse matrix, dtype=int
837863
Transformed input.
838864
"""
839-
return self._apply_transform(X, self._transform)
865+
return _transform_selected(X, self._transform,
866+
self.categorical_features)
840867

841868

842869
class LabelEncoder(BaseEstimator, TransformerMixin):

sklearn/tests/test_preprocessing.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from sklearn.preprocessing import Binarizer
1616
from sklearn.preprocessing import KernelCenterer
1717
from sklearn.preprocessing import LabelBinarizer
18+
from sklearn.preprocessing import _transform_selected
1819
from sklearn.preprocessing import OneHotEncoder
1920
from sklearn.preprocessing import LabelEncoder
2021
from sklearn.preprocessing import Normalizer
@@ -599,6 +600,27 @@ def test_one_hot_encoder():
599600
assert_raises(ValueError, enc.transform, [[0], [-1]])
600601

601602

603+
def _check_transform_selected(X, Xexpected, sel):
    """Check _transform_selected on X given as-is and as a CSR matrix.

    A Binarizer transform is applied to the features picked by ``sel`` and
    the densified result is compared with ``Xexpected``.
    """
    as_given_and_csr = (X, sp.csr_matrix(X))
    for candidate in as_given_and_csr:
        binarize = Binarizer().transform
        result = _transform_selected(candidate, binarize, sel)
        assert_array_equal(toarray(result), Xexpected)
607+
608+
609+
def test_transform_selected():
    """Exercise _transform_selected with index and mask selections.

    Covers a partial selection, a full selection and an empty selection,
    each expressed once as indices and once as a boolean mask.
    """
    X = [[3, 2, 1], [0, 1, 1]]
    first_binarized = [[1, 2, 1], [0, 1, 1]]
    all_binarized = [[1, 1, 1], [0, 1, 1]]

    # (selection, expected output), checked in this order.
    cases = (
        ([0], first_binarized),
        ([True, False, False], first_binarized),
        ([0, 1, 2], all_binarized),
        ([True, True, True], all_binarized),
        ([], X),
        ([False, False, False], X),
    )
    for sel, expected in cases:
        _check_transform_selected(X, expected, sel)
622+
623+
602624
def _run_one_hot(X, X2, cat):
603625
enc = OneHotEncoder(categorical_features=cat)
604626
Xtr = enc.fit_transform(X)
@@ -608,18 +630,22 @@ def _run_one_hot(X, X2, cat):
608630

609631
def _check_one_hot(X, X2, cat, n_features):
    """Check that a mask and the equivalent indices encode identically.

    ``_run_one_hot`` is called once with ``cat`` as given (the mask) and
    once with the positions of its truthy entries. Both pairs of outputs
    must have ``n_features`` columns (2 rows for the first output, 1 row
    for the second) and must match each other.
    """
    indices = np.where(cat)[0]
    first_mask, second_mask = _run_one_hot(X, X2, cat)
    first_idx, second_idx = _run_one_hot(X, X2, indices)

    # Shapes, checked in the order: mask outputs, then index outputs.
    shape_checks = (
        (first_mask, 2),
        (second_mask, 1),
        (first_idx, 2),
        (second_idx, 1),
    )
    for output, n_rows in shape_checks:
        assert_equal(output.shape, (n_rows, n_features))

    # Mask and indices must give the same results.
    for from_mask, from_idx in ((first_mask, first_idx),
                                (second_mask, second_idx)):
        assert_array_equal(toarray(from_mask), toarray(from_idx))
619645

620646
def test_one_hot_encoder_categorical_features():
621-
X = [[3, 2, 1], [0, 1, 1]]
622-
X2 = [[1, 1, 1]]
647+
X = np.array([[3, 2, 1], [0, 1, 1]])
648+
X2 = np.array([[1, 1, 1]])
623649

624650
cat = [True, False, False]
625651
_check_one_hot(X, X2, cat, 4)

0 commit comments

Comments
 (0)
0