scikit-learn
diff --git a/‎doc/modules/preprocessing.rst
Lines changed: 4 additions & 3 deletions b/‎doc/modules/preprocessing.rst
Lines changed: 4 additions & 3 deletions
diff --git a/‎sklearn/preprocessing.py
Lines changed: 52 additions & 25 deletions b/‎sklearn/preprocessing.py
Lines changed: 52 additions & 25 deletions
diff --git a/‎sklearn/tests/test_preprocessing.py
Lines changed: 28 additions & 2 deletions b/‎sklearn/tests/test_preprocessing.py
Lines changed: 28 additions & 2 deletions
@@ -333,15 +333,16 @@ Continuing the example above::
 
   >>> enc = preprocessing.OneHotEncoder()
   >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
-  OneHotEncoder(dtype=<type 'float'>, n_values='auto')
+  OneHotEncoder(categorical_features='all', dtype=<type 'float'>,
+         n_values='auto')
   >>> enc.transform([[0, 1, 3]]).toarray()
   array([[ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])
 
 By default, how many values each feature can take is inferred automatically from the dataset.
-It is possible to specify this explicitly using the parameter ``n_values``. 
+It is possible to specify this explicitly using the parameter ``n_values``.
 There are two genders, three possible continents and four web browsers in our
 dataset.
-Then we fit the estimator, and transform a data point. 
+Then we fit the estimator, and transform a data point.
 In the result, the first two numbers encode the gender, the next set of three
 numbers the continent and the last four the web browser.
 
 
@@ -15,6 +15,7 @@
 from .utils import check_arrays
 from .utils import array2d
 from .utils import atleast2d_or_csr
+from .utils import atleast2d_or_csc
 from .utils import safe_asarray
 from .utils import warn_if_not_float
 from .utils.fixes import unique
@@ -629,6 +630,53 @@ def transform(self, X, y=None, copy=None):
         return binarize(X, threshold=self.threshold, copy=copy)
 
 
+def _transform_selected(X, transform, selected):
+    """Apply a transform function to a selected subset of the features.
+
+    Parameters
+    ----------
+    X : array-like or sparse matrix, shape=(n_samples, n_features)
+        Dense array or sparse matrix.
+
+    transform : callable
+        A callable transform(X) -> X_transformed
+
+    selected : "all" or array of indices or mask
+        Specify which features to apply the transform to.
+
+    Returns
+    -------
+    X : array or sparse matrix, shape=(n_samples, n_features_new)
+        Transformed selected features come first, then the untouched ones.
+    """
+    if len(selected) == 0:
+        return X
+    elif selected == "all":
+        return transform(X)
+    else:
+        X = atleast2d_or_csc(X)
+        n_features = X.shape[1]
+        ind = np.arange(n_features)
+        sel = np.zeros(n_features, dtype=bool)
+        sel[np.array(selected)] = True
+        not_sel = np.logical_not(sel)
+        n_selected = np.sum(sel)
+
+        if n_selected == 0:  # no features selected
+            return X
+        elif n_selected == n_features:  # all features selected
+            return transform(X)
+        else:
+            # Index with integer positions rather than the boolean mask.
+            X_sel = transform(X[:, ind[sel]])
+            X_not_sel = X[:, ind[not_sel]]
+
+            if sp.issparse(X_sel) or sp.issparse(X_not_sel):
+                return sp.hstack((X_sel, X_not_sel))
+            else:
+                return np.hstack((X_sel, X_not_sel))
+
+
 class OneHotEncoder(BaseEstimator, TransformerMixin):
     """Encode categorical integer features using a one-hot aka one-of-K scheme.
 
@@ -722,29 +770,6 @@ def fit(self, X, y=None):
         self.fit_transform(X)
         return self
 
-    def _apply_transform(self, X, transform):
-        if self.categorical_features == "all":
-            return transform(X)
-        else:
-            X = check_arrays(X, sparse_format='dense')[0]
-            n_features = X.shape[1]
-            ind = np.arange(n_features)
-            categorical = np.zeros(n_features, dtype=bool)
-            categorical[np.array(self.categorical_features)] = True
-            not_categorical = np.logical_not(categorical)
-            n_categorical = np.sum(categorical)
-
-            if n_categorical == 0:
-                # No categorical variables.
-                return X
-            elif n_categorical == n_features:
-                # All categorical variables.
-                return transform(X)
-            else:
-                X_cat = transform(X[:, categorical])
-                X_not_cat = X[:, not_categorical]
-                return sp.hstack((X_cat, X_not_cat))
-
     def _fit_transform(self, X):
         """Asssumes X contains only categorical features."""
         X = check_arrays(X, sparse_format='dense', dtype=np.int)[0]
@@ -793,7 +818,8 @@ def fit_transform(self, X, y=None):
         Equivalent to self.fit(X).transform(X), but more convenient and more
         efficient. See fit for the parameters, transform for the return value.
         """
-        return self._apply_transform(X, self._fit_transform)
+        return _transform_selected(X, self._fit_transform,
+                                   self.categorical_features)
 
     def _transform(self, X):
         """Asssumes X contains only categorical features."""
@@ -836,7 +862,8 @@ def transform(self, X):
         X_out : sparse matrix, dtype=int
             Transformed input.
         """
-        return self._apply_transform(X, self._transform)
+        return _transform_selected(X, self._transform,
+                                   self.categorical_features)
 
 
 class LabelEncoder(BaseEstimator, TransformerMixin):
 
@@ -15,6 +15,7 @@
 from sklearn.preprocessing import Binarizer
 from sklearn.preprocessing import KernelCenterer
 from sklearn.preprocessing import LabelBinarizer
+from sklearn.preprocessing import _transform_selected
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import LabelEncoder
 from sklearn.preprocessing import Normalizer
@@ -599,6 +600,27 @@ def test_one_hot_encoder():
     assert_raises(ValueError, enc.transform, [[0], [-1]])
 
 
+def _check_transform_selected(X, Xexpected, sel):
+    for data in (X, sp.csr_matrix(X)):  # dense input, then its CSR form
+        result = _transform_selected(data, Binarizer().transform, sel)
+        assert_array_equal(toarray(result), Xexpected)
+
+
+def test_transform_selected():
+    # Binarize feature subsets chosen by index list and by boolean mask.
+    X = [[3, 2, 1], [0, 1, 1]]
+
+    first_only = [[1, 2, 1], [0, 1, 1]]
+    everything = [[1, 1, 1], [0, 1, 1]]
+    cases = [(first_only, [0]), (first_only, [True, False, False]),
+             (everything, [0, 1, 2]), (everything, [True, True, True]),
+             # An empty selection must hand the input back unchanged.
+             (X, []), (X, [False, False, False])]
+
+    for expected, sel in cases:
+        _check_transform_selected(X, expected, sel)
+
+
 def _run_one_hot(X, X2, cat):
     enc = OneHotEncoder(categorical_features=cat)
     Xtr = enc.fit_transform(X)
@@ -608,18 +630,22 @@ def _run_one_hot(X, X2, cat):
 
 def _check_one_hot(X, X2, cat, n_features):
     ind = np.where(cat)[0]
+    # With mask
     A, B = _run_one_hot(X, X2, cat)
+    # With indices
     C, D = _run_one_hot(X, X2, ind)
+    # Check shape
     assert_equal(A.shape, (2, n_features))
     assert_equal(B.shape, (1, n_features))
     assert_equal(C.shape, (2, n_features))
     assert_equal(D.shape, (1, n_features))
+    # Check that mask and indices give the same results
     assert_array_equal(toarray(A), toarray(C))
     assert_array_equal(toarray(B), toarray(D))
 
 def test_one_hot_encoder_categorical_features():
-    X = [[3, 2, 1], [0, 1, 1]]
-    X2 = [[1, 1, 1]]
+    X = np.array([[3, 2, 1], [0, 1, 1]])
+    X2 = np.array([[1, 1, 1]])
 
     cat = [True, False, False]
     _check_one_hot(X, X2, cat, 4)