Merge pull request #2027 from mblondel/select_categorical · scikit-learn/scikit-learn@4fe51ba · GitHub

Commit 4fe51ba

committed
Merge pull request #2027 from mblondel/select_categorical
ENH Non-categorical variables in OneHotEncoder
2 parents d420aaf + e11873f commit 4fe51ba

3 files changed: +165 -28 lines changed

doc/modules/preprocessing.rst

Lines changed: 4 additions & 3 deletions
@@ -333,15 +333,16 @@ Continuing the example above::
 
     >>> enc = preprocessing.OneHotEncoder()
     >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
-    OneHotEncoder(dtype=<type 'float'>, n_values='auto')
+    OneHotEncoder(categorical_features='all', dtype=<type 'float'>,
+           n_values='auto')
     >>> enc.transform([[0, 1, 3]]).toarray()
     array([[ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])
 
 By default, how many values each feature can take is inferred automatically from the dataset.
-It is possible to specify this explicitly using the parameter ``n_values``.
+It is possible to specify this explicitly using the parameter ``n_values``.
 There are two genders, three possible continents and four web browsers in our
 dataset.
-Then we fit the estimator, and transform a data point.
+Then we fit the estimator, and transform a data point.
 In the result, the first two numbers encode the gender, the next set of three
 numbers the continent and the last four the web browser.
sklearn/preprocessing.py

Lines changed: 99 additions & 25 deletions
@@ -12,7 +12,11 @@
 
 from .base import BaseEstimator, TransformerMixin
 from .externals.six import string_types
-from .utils import check_arrays, array2d, atleast2d_or_csr, safe_asarray
+from .utils import check_arrays
+from .utils import array2d
+from .utils import atleast2d_or_csr
+from .utils import atleast2d_or_csc
+from .utils import safe_asarray
 from .utils import warn_if_not_float
 from .utils.fixes import unique
 
@@ -35,6 +39,7 @@
     'LabelEncoder',
     'MinMaxScaler',
     'Normalizer',
+    'OneHotEncoder',
     'StandardScaler',
     'binarize',
     'normalize',
@@ -632,6 +637,53 @@ def transform(self, X, y=None, copy=None):
         return binarize(X, threshold=self.threshold, copy=copy)
 
 
+def _transform_selected(X, transform, selected="all"):
+    """Apply a transform function to portion of selected features
+
+    Parameters
+    ----------
+    X : array-like or sparse matrix, shape=(n_samples, n_features)
+        Dense array or sparse matrix.
+
+    transform : callable
+        A callable transform(X) -> X_transformed
+
+    selected: "all" or array of indices or mask
+        Specify what features to apply the transform to.
+
+    Returns
+    -------
+    X : array or sparse matrix, shape=(n_samples, n_features_new)
+    """
+    if selected == "all":
+        return transform(X)
+    elif len(selected) == 0:
+        return X
+    else:
+        X = atleast2d_or_csc(X)
+        n_features = X.shape[1]
+        ind = np.arange(n_features)
+        sel = np.zeros(n_features, dtype=bool)
+        sel[np.array(selected)] = True
+        not_sel = np.logical_not(sel)
+        n_selected = np.sum(sel)
+
+        if n_selected == 0:
+            # No features selected.
+            return X
+        elif n_selected == n_features:
+            # All features selected.
+            return transform(X)
+        else:
+            X_sel = transform(X[:, ind[sel]])
+            X_not_sel = X[:, ind[not_sel]]
+
+            if sp.issparse(X_sel) or sp.issparse(X_not_sel):
+                return sp.hstack((X_sel, X_not_sel))
+            else:
+                return np.hstack((X_sel, X_not_sel))
+
+
 class OneHotEncoder(BaseEstimator, TransformerMixin):
     """Encode categorical integer features using a one-hot aka one-of-K scheme.
 
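
The new helper applies the given transform only to the selected columns and stacks the untouched columns to the right. For illustration only (not from the commit), a NumPy-only sketch of that behaviour on a dense array, mirroring the expectation in the new tests below:

    import numpy as np

    X = np.array([[3., 2., 1.],
                  [0., 1., 1.]])
    selected = [0]                                # transform column 0 only
    binarize = lambda A: (A > 0).astype(float)    # stand-in transform(X) callable

    sel = np.zeros(X.shape[1], dtype=bool)
    sel[np.array(selected)] = True
    # transformed selected columns first, untouched columns stacked to the right
    out = np.hstack((binarize(X[:, sel]), X[:, ~sel]))
    assert np.array_equal(out, [[1., 2., 1.], [0., 1., 1.]])

The rest of the diff continues below.
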
@@ -646,11 +698,21 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
     Parameters
     ----------
-    n_values : 'auto', int or array of int
+    n_values : 'auto', int or array of ints
        Number of values per feature.
-        'auto' : determine value range from training data.
-        int : maximum value for all features.
-        array : maximum value per feature.
+
+        - 'auto' : determine value range from training data.
+        - int : maximum value for all features.
+        - array : maximum value per feature.
+
+    categorical_features: "all" or array of indices or mask
+        Specify what features are treated as categorical.
+
+        - 'all' (default): All features are treated as categorical.
+        - array of indices: Array of categorical feature indices.
+        - mask: Array of length n_features and with dtype=bool.
+
+        Non-categorical features are always stacked to the right of the matrix.
 
     dtype : number type, default=np.float
         Desired dtype of output.
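
To make the new parameter concrete, a hedged usage sketch (not from the diff itself; the expected column count matches the new tests added further down, under the assumption that n_values='auto' keeps only values seen during fit):

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array([[3, 2, 1],
                  [0, 1, 1]])
    enc = OneHotEncoder(categorical_features=[0])               # indices form
    out = enc.fit_transform(X).toarray()
    # Column 0 took the values {0, 3}, so it expands to two active one-hot
    # columns; the two non-categorical columns are stacked to the right of
    # them, giving shape (2, 4), the same count the new tests expect.
    assert out.shape == (2, 4)

    enc = OneHotEncoder(categorical_features=[True, False, False])   # mask form
    assert enc.fit_transform(X).toarray().shape == (2, 4)
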
@@ -680,7 +742,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     >>> enc = OneHotEncoder()
     >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \
 [1, 0, 2]])  # doctest: +ELLIPSIS
-    OneHotEncoder(dtype=<... 'float'>, n_values='auto')
+    OneHotEncoder(categorical_features='all', dtype=<type 'float'>,
+           n_values='auto')
     >>> enc.n_values_
     array([2, 3, 4])
     >>> enc.feature_indices_
@@ -690,12 +753,13 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
 
     See also
     --------
-    LabelEncoder : performs a one-hot encoding on arbitrary class labels.
     sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
       dictionary items (also handles string-valued features).
     """
-    def __init__(self, n_values="auto", dtype=np.float):
+    def __init__(self, n_values="auto", categorical_features="all",
+                 dtype=np.float):
         self.n_values = n_values
+        self.categorical_features = categorical_features
         self.dtype = dtype
 
     def fit(self, X, y=None):
@@ -713,12 +777,8 @@ def fit(self, X, y=None):
         self.fit_transform(X)
         return self
 
-    def fit_transform(self, X, y=None):
-        """Fit OneHotEncoder to X, then transform X.
-
-        Equivalent to self.fit(X).transform(X), but more convenient and more
-        efficient. See fit for the parameters, transform for the return value.
-        """
+    def _fit_transform(self, X):
+        """Asssumes X contains only categorical features."""
         X = check_arrays(X, sparse_format='dense', dtype=np.int)[0]
         if np.any(X < 0):
             raise ValueError("X needs to contain only non-negative integers.")
759819

760820
return out
761821

762-
def transform(self, X):
763-
"""Transform X using one-hot encoding.
764-
765-
Parameters
766-
----------
767-
X : array-like, shape=(n_samples, feature_indices_[-1])
768-
Input array of type int.
822+
def fit_transform(self, X, y=None):
823+
"""Fit OneHotEncoder to X, then transform X.
769824
770-
Returns
771-
-------
772-
X_out : sparse matrix, dtype=int
773-
Transformed input.
825+
Equivalent to self.fit(X).transform(X), but more convenient and more
826+
efficient. See fit for the parameters, transform for the return value.
774827
"""
828+
return _transform_selected(X, self._fit_transform,
829+
self.categorical_features)
830+
831+
def _transform(self, X):
832+
"""Asssumes X contains only categorical features."""
775833
X = check_arrays(X, sparse_format='dense', dtype=np.int)[0]
776834
if np.any(X < 0):
777835
raise ValueError("X needs to contain only non-negative integers.")
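
The refactoring keeps the documented guarantee that fit_transform is equivalent to fit followed by transform; both now delegate the categorical part to _transform_selected. A small sanity sketch of that equivalence (an assumption-checking snippet, not part of the commit):

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
    A = OneHotEncoder().fit_transform(X).toarray()
    B = OneHotEncoder().fit(X).transform(X).toarray()
    assert np.array_equal(A, B)   # same encoding either way
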
@@ -798,6 +856,22 @@ def transform(self, X):
             out = out[:, self.active_features_]
         return out
 
+    def transform(self, X):
+        """Transform X using one-hot encoding.
+
+        Parameters
+        ----------
+        X : array-like, shape=(n_samples, n_features)
+            Input array of type int.
+
+        Returns
+        -------
+        X_out : sparse matrix, dtype=int
+            Transformed input.
+        """
+        return _transform_selected(X, self._transform,
+                                   self.categorical_features)
+
 
 class LabelEncoder(BaseEstimator, TransformerMixin):
     """Encode labels with value between 0 and n_classes-1.

sklearn/tests/test_preprocessing.py

Lines changed: 62 additions & 0 deletions
@@ -15,6 +15,7 @@
 from sklearn.preprocessing import Binarizer
 from sklearn.preprocessing import KernelCenterer
 from sklearn.preprocessing import LabelBinarizer
+from sklearn.preprocessing import _transform_selected
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import LabelEncoder
 from sklearn.preprocessing import Normalizer
@@ -612,6 +613,67 @@ def test_one_hot_encoder():
     assert_raises(ValueError, enc.transform, [[0], [-1]])
 
 
+def _check_transform_selected(X, Xexpected, sel):
+    for M in (X, sp.csr_matrix(X)):
+        Xtr = _transform_selected(M, Binarizer().transform, sel)
+        assert_array_equal(toarray(Xtr), Xexpected)
+
+
+def test_transform_selected():
+    X = [[3, 2, 1], [0, 1, 1]]
+
+    Xexpected = [[1, 2, 1], [0, 1, 1]]
+    _check_transform_selected(X, Xexpected, [0])
+    _check_transform_selected(X, Xexpected, [True, False, False])
+
+    Xexpected = [[1, 1, 1], [0, 1, 1]]
+    _check_transform_selected(X, Xexpected, [0, 1, 2])
+    _check_transform_selected(X, Xexpected, [True, True, True])
+    _check_transform_selected(X, Xexpected, "all")
+
+    _check_transform_selected(X, X, [])
+    _check_transform_selected(X, X, [False, False, False])
+
+
+def _run_one_hot(X, X2, cat):
+    enc = OneHotEncoder(categorical_features=cat)
+    Xtr = enc.fit_transform(X)
+    X2tr = enc.transform(X2)
+    return Xtr, X2tr
+
+
+def _check_one_hot(X, X2, cat, n_features):
+    ind = np.where(cat)[0]
+    # With mask
+    A, B = _run_one_hot(X, X2, cat)
+    # With indices
+    C, D = _run_one_hot(X, X2, ind)
+    # Check shape
+    assert_equal(A.shape, (2, n_features))
+    assert_equal(B.shape, (1, n_features))
+    assert_equal(C.shape, (2, n_features))
+    assert_equal(D.shape, (1, n_features))
+    # Check that mask and indices give the same results
+    assert_array_equal(toarray(A), toarray(C))
+    assert_array_equal(toarray(B), toarray(D))
+
+
+def test_one_hot_encoder_categorical_features():
+    X = np.array([[3, 2, 1], [0, 1, 1]])
+    X2 = np.array([[1, 1, 1]])
+
+    cat = [True, False, False]
+    _check_one_hot(X, X2, cat, 4)
+
+    # Edge case: all non-categorical
+    cat = [False, False, False]
+    _check_one_hot(X, X2, cat, 3)
+
+    # Edge case: all categorical
+    cat = [True, True, True]
+    _check_one_hot(X, X2, cat, 5)
+
+
 def test_label_encoder():
     """Test LabelEncoder's transform and inverse_transform methods"""
     le = LabelEncoder()
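
For reference, a worked check of where the expected widths 4, 3 and 5 in test_one_hot_encoder_categorical_features come from, assuming (as in this commit) that n_values='auto' keeps only the values seen during fit:

    import numpy as np

    X = np.array([[3, 2, 1], [0, 1, 1]])
    widths = [len(np.unique(col)) for col in X.T]   # one-hot width per column
    assert widths == [2, 2, 1]
    assert widths[0] + 2 == 4      # cat=[True, False, False]: 2 encoded + 2 pass-through
    assert X.shape[1] == 3         # cat=[False, False, False]: nothing is encoded
    assert sum(widths) == 5        # cat=[True, True, True]: 2 + 2 + 1
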

0 commit comments