8000 [MRG+2] Add fixed width discretization to scikit-learn by hlin117 · Pull Request #7668 · scikit-learn/scikit-learn · GitHub
[MRG+2] Add fixed width discretization to scikit-learn #7668


Closed · wants to merge 42 commits

Changes from all commits · 42 commits
285c80e
Issue #5778: Proof of concept rewrite of fixed width discretization
hlin117 Oct 14, 2016
230bc35
Removing copy parameter
hlin117 Oct 14, 2016
6b61f7b
Improved postprocess clipping
hlin117 Oct 15, 2016
1a7f6f1
A few changes
hlin117 Oct 22, 2016
face238
Wrote all code for new fitting function
hlin117 Oct 22, 2016
503ef2f
Adding comma to __init__.py
hlin117 Oct 23, 2016
960e5ca
Using relative imports to pass tests
hlin117 Oct 23, 2016
ddb0357
Pyflakes
hlin117 Oct 23, 2016
914d639
Fixing flake8 errors
hlin117 Oct 28, 2016
f15c06c
Addressing code comments
Feb 19, 2017
86d3e32
Testing n_bin array case
Feb 19, 2017
620fd63
Addressing jnothman's comments
Feb 20, 2017
f470f10
fix doctest for python3
Feb 20, 2017
08220d2
set clip_min and clip_max to be private, and removing n_features attr
Feb 20, 2017
ce96c63
Addressed code comments
Feb 22, 2017
8621dbd
Editing documentation, bug fixes, and implementing inverse_transform
Feb 22, 2017
391a9e2
Superficial changes
Feb 26, 2017
1b43a19
Cleaning code up
Feb 27, 2017
cfaaa4e
Modified and used _transform_selected, fixed build
Mar 1, 2017
f3f64d1
Removing unnecessary change to _check_transform_selected
Mar 1, 2017
942d132
Updating based on comments
Mar 26, 2017
becd582
Forgot about flake8
Mar 26, 2017
2753c53
Addressed a small comment change
Apr 7, 2017
b569d56
Spaces before comments
Apr 7, 2017
1d5c91c
Preventing numeric instability errors
Apr 17, 2017
49e8a33
Last code comments
hlin117 May 2, 2017
994db71
min -> offset
hlin117 May 2, 2017
681b027
Minor code comments
hlin117 May 12, 2017
f11e085
Avoiding adding 0.5 to values which would be truncated
hlin117 May 13, 2017
04553f5
Reorganizing the rst file for discretization
hlin117 May 14, 2017
87e571f
Small documentation fix
hlin117 May 14, 2017
8466653
Using original discretization correction method, test nano case
hlin117 May 14, 2017
075ced8
Simplifying numeric stability code
hlin117 May 18, 2017
d813ecb
Flake 8
hlin117 May 18, 2017
8144abb
Removing use of isclose
hlin117 Jun 3, 2017
3e67b2e
Putting back feature binarization tag
hlin117 Jun 7, 2017
0189c29
Merge branch 'master' into dis2
jnothman Jun 19, 2017
b397dd4
Taking into account of @glemaitre's comments
hlin117 Jun 24, 2017
953cde8
Still addressing code comments
hlin117 Jun 24, 2017
7bf4e1f
Still hacking at comments
hlin117 Jun 24, 2017
be49720
Type checking n_bins, and other nits
hlin117 Jul 2, 2017
54a0dc7
Fixing doctests
hlin117 Jul 2, 2017
1 change: 1 addition & 0 deletions doc/modules/classes.rst
@@ -1190,6 +1190,7 @@ See the :ref:`metrics` section of the user guide for further details.
preprocessing.Binarizer
preprocessing.FunctionTransformer
preprocessing.Imputer
preprocessing.KBinsDiscretizer

hlin117 (Contributor, Author) commented:
This is the rendered page:

[screenshot: rendered classes.rst page, 2017-02-26]

A Member replied:
We should do some subdivision of this section... but not in this PR.

preprocessing.KernelCenterer
preprocessing.LabelBinarizer
preprocessing.LabelEncoder
162 changes: 103 additions & 59 deletions doc/modules/preprocessing.rst
@@ -381,10 +381,107 @@ The normalizer instance can then be used on sample vectors as any transformer::
efficient Cython routines. To avoid unnecessary memory copies, it is
recommended to choose the CSR representation upstream.

.. _preprocessing_binarization:

A Member commented:
This heading is referenced elsewhere and needs to be somewhere...

hlin117 (Contributor, Author) replied on Jun 3, 2017:
On the master branch, I did grep preprocessing_binarization --include="*.rst" . -nr, and found that this was the only instance where this header was being called. Do you still think that it's being referenced somewhere?

A Member replied:
I found it referenced in the Binarizer docstring last I checked

hlin117 (Contributor, Author) replied:
Good catch. You're right, it's in the Binarizer docstring.

.. _preprocessing_categorical_features:

Encoding categorical features
=============================
Often features are not given as continuous values but categorical.
For example a person could have features ``["male", "female"]``,
``["from Europe", "from US", "from Asia"]``,
``["uses Firefox", "uses Chrome", "uses Safari", "uses Internet Explorer"]``.
Such features can be efficiently coded as integers, for instance
``["male", "from US", "uses Internet Explorer"]`` could be expressed as
``[0, 1, 3]`` while ``["female", "from Asia", "uses Chrome"]`` would be
``[1, 2, 1]``.

Such integer representation can not be used directly with scikit-learn estimators, as these
expect continuous input, and would interpret the categories as being ordered, which is often
not desired (i.e. the set of browsers was ordered arbitrarily).

One possibility to convert categorical features to features that can be used
with scikit-learn estimators is to use a one-of-K or one-hot encoding, which is
implemented in :class:`OneHotEncoder`. This estimator transforms each
categorical feature with ``m`` possible values into ``m`` binary features, with
only one active.

Continuing the example above::

>>> enc = preprocessing.OneHotEncoder()
>>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS
OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
handle_unknown='error', n_values='auto', sparse=True)
>>> enc.transform([[0, 1, 3]]).toarray()
array([[ 1., 0., 0., 1., 0., 0., 0., 0., 1.]])

By default, how many values each feature can take is inferred automatically from the dataset.
It is possible to specify this explicitly using the parameter ``n_values``.
There are two genders, three possible continents and four web browsers in our
dataset.
Then we fit the estimator, and transform a data point.
In the result, the first two numbers encode the gender, the next set of three
numbers the continent and the last four the web browser.
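
This mapping can be inspected on the fitted encoder. A small sketch, assuming
the ``n_values_`` and ``feature_indices_`` attributes of the current
:class:`OneHotEncoder` (illustrative, not part of this diff)::

>>> enc.n_values_ # doctest: +SKIP
array([2, 3, 4])
>>> enc.feature_indices_ # doctest: +SKIP
array([0, 2, 5, 9])
>>> # columns [0, 2) encode the gender, [2, 5) the continent, [5, 9) the browser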

Note that, if there is a possibility that the training data might have missing categorical
features, one has to explicitly set ``n_values``. For example,

>>> enc = preprocessing.OneHotEncoder(n_values=[2, 3, 4])
>>> # Note that there are missing categorical values for the 2nd and 3rd
>>> # features
>>> enc.fit([[1, 2, 3], [0, 2, 0]]) # doctest: +ELLIPSIS
OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
handle_unknown='error', n_values=[2, 3, 4], sparse=True)
>>> enc.transform([[1, 0, 0]]).toarray()
array([[ 0., 1., 1., 0., 0., 1., 0., 0., 0.]])

See :ref:`dict_feature_extraction` for categorical features that are represented
as a dict, not as integers.
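
For reference, a minimal sketch of that dict-based route, assuming the
standard :class:`~sklearn.feature_extraction.DictVectorizer` API (illustrative,
not part of this PR)::

>>> from sklearn.feature_extraction import DictVectorizer
>>> measurements = [
...     {'gender': 'male', 'browser': 'Internet Explorer'},
...     {'gender': 'female', 'browser': 'Chrome'},
... ]
>>> vec = DictVectorizer(sparse=False)
>>> vec.fit_transform(measurements) # doctest: +SKIP
array([[ 0.,  1.,  0.,  1.],
       [ 1.,  0.,  1.,  0.]])
>>> vec.get_feature_names() # doctest: +SKIP
['browser=Chrome', 'browser=Internet Explorer', 'gender=female', 'gender=male']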

.. _discretization:

Discretization
==============

`Discretization <https://en.wikipedia.org/wiki/Discretization_of_continuous_features>`_
(otherwise known as quantization or binning) provides a way to partition continuous
features into discrete values. Certain datasets with continuous features
may benefit from discretization, because it can transform a dataset of
continuous attributes into one with only nominal attributes.

K-bins discretization
---------------------

:class:`KBinsDiscretizer` discretizes features into ``k`` equal width bins::

>>> X = np.array([[ -3., 5., 15 ],
... [ 0., 6., 14 ],
... [ 6., 3., 11 ]])
>>> est = preprocessing.KBinsDiscretizer(n_bins=[3, 3, 2]).fit(X)
>>> est.bin_width_
array([ 3., 1., 2.])

For each feature, the bin width is computed during ``fit``; together with
the number of bins, it defines the discretization intervals. For the current
example, these intervals are:

- feature 1: :math:`{[-\infty, 0), [0, 3), [3, \infty)}`
- feature 2: :math:`{[-\infty, 4), [4, 5), [5, \infty)}`
- feature 3: :math:`{[-\infty, 13), [13, \infty)}`

Based on these bin intervals, ``X`` is transformed as follows::

>>> est.transform(X) # doctest: +SKIP
array([[ 0., 2., 1.],
[ 1., 2., 1.],
[ 2., 0., 0.]])
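
The same bin indices can be reproduced with plain NumPy; a conceptual sketch
of the equal width rule (not the estimator's actual implementation)::

>>> import numpy as np
>>> width = (X.max(axis=0) - X.min(axis=0)) / [3, 3, 2] # per-feature bin width
>>> edges = [X[:, j].min() + width[j] * np.arange(1, k)
...          for j, k in enumerate([3, 3, 2])] # interior bin edges
>>> np.column_stack([np.digitize(X[:, j], edges[j])
...                  for j in range(X.shape[1])]) # doctest: +SKIP
array([[0, 2, 1],
       [1, 2, 1],
       [2, 0, 0]])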

The resulting dataset contains ordinal attributes which can be further used
in a :class:`sklearn.pipeline.Pipeline`.
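
For instance, a hedged sketch of such a pipeline (the dataset and downstream
estimator are illustrative choices, not part of this PR)::

>>> from sklearn.datasets import load_iris
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.pipeline import make_pipeline
>>> X_iris, y_iris = load_iris(return_X_y=True)
>>> pipe = make_pipeline(preprocessing.KBinsDiscretizer(n_bins=4),
...                      LogisticRegression())
>>> pipe.fit(X_iris, y_iris) # doctest: +SKIP
Pipeline(...)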

Discretization is similar to constructing histograms for continuous data.
However, histograms focus on counting the number of values that fall into
particular bins, whereas discretization focuses on assigning each feature
value to one of these bins.

.. _preprocessing_binarization:

Feature binarization
--------------------
@@ -431,6 +528,9 @@ As for the :class:`StandardScaler` and :class:`Normalizer` classes, the
preprocessing module provides a companion function :func:`binarize`
to be used when the transformer API is not necessary.

Note that the :class:`Binarizer` is similar to the :class:`KBinsDiscretizer`
when ``k = 2``, and when the bin edge is at the value ``threshold``.
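
In other words, ``threshold`` acts as a single bin edge that splits each
feature into two bins. A small sketch (the threshold value is illustrative)::

>>> X = np.array([[ 1., -1.,  2.],
...               [ 2.,  0.,  0.],
...               [ 0.,  1., -1.]])
>>> preprocessing.Binarizer(threshold=1.1).transform(X) # doctest: +SKIP
array([[ 0.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  0.,  0.]])
>>> (X > 1.1).astype(X.dtype) # the same two-bin split, done by hand # doctest: +SKIP
array([[ 0.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  0.,  0.]])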

.. topic:: Sparse input

:func:`binarize` and :class:`Binarizer` accept **both dense array-like
@@ -441,62 +541,6 @@ to be used when the transformer API is not necessary.
To avoid unnecessary memory copies, it is recommended to choose the CSR
representation upstream.


.. _preprocessing_categorical_features:

Encoding categorical features
=============================
Often features are not given as continuous values but categorical.
For example a person could have features ``["male", "female"]``,
``["from Europe", "from US", "from Asia"]``,
``["uses Firefox", "uses Chrome", "uses Safari", "uses Internet Explorer"]``.
Such features can be efficiently coded as integers, for instance
``["male", "from US", "uses Internet Explorer"]`` could be expressed as
``[0, 1, 3]`` while ``["female", "from Asia", "uses Chrome"]`` would be
``[1, 2, 1]``.

Such integer representation can not be used directly with scikit-learn estimators, as these
expect continuous input, and would interpret the categories as being ordered, which is often
not desired (i.e. the set of browsers was ordered arbitrarily).

One possibility to convert categorical features to features that can be used
with scikit-learn estimators is to use a one-of-K or one-hot encoding, which is
implemented in :class:`OneHotEncoder`. This estimator transforms each
categorical feature with ``m`` possible values into ``m`` binary features, with
only one active.

Continuing the example above::

>>> enc = preprocessing.OneHotEncoder()
>>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS
OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
handle_unknown='error', n_values='auto', sparse=True)
>>> enc.transform([[0, 1, 3]]).toarray()
array([[ 1., 0., 0., 1., 0., 0., 0., 0., 1.]])

By default, how many values each feature can take is inferred automatically from the dataset.
It is possible to specify this explicitly using the parameter ``n_values``.
There are two genders, three possible continents and four web browsers in our
dataset.
Then we fit the estimator, and transform a data point.
In the result, the first two numbers encode the gender, the next set of three
numbers the continent and the last four the web browser.

Note that, if there is a possibility that the training data might have missing categorical
features, one has to explicitly set ``n_values``. For example,

>>> enc = preprocessing.OneHotEncoder(n_values=[2, 3, 4])
>>> # Note that there are missing categorical values for the 2nd and 3rd
>>> # features
>>> enc.fit([[1, 2, 3], [0, 2, 0]]) # doctest: +ELLIPSIS
OneHotEncoder(categorical_features='all', dtype=<... 'numpy.float64'>,
handle_unknown='error', n_values=[2, 3, 4], sparse=True)
>>> enc.transform([[1, 0, 0]]).toarray()
array([[ 0., 1., 1., 0., 0., 1., 0., 0., 0.]])

See :ref:`dict_feature_extraction` for categorical features that are represented
as a dict, not as integers.

.. _imputation:

Imputation of missing values
2 changes: 2 additions & 0 deletions sklearn/preprocessing/__init__.py
@@ -24,6 +24,7 @@
from .data import OneHotEncoder

from .data import PolynomialFeatures
from .discretization import KBinsDiscretizer

from .label import label_binarize
from .label import LabelBinarizer
@@ -37,6 +38,7 @@
'Binarizer',
'FunctionTransformer',
'Imputer',
'KBinsDiscretizer',
'KernelCenterer',
'LabelBinarizer',
'LabelEncoder',
48 changes: 36 additions & 12 deletions sklearn/preprocessing/data.py
@@ -1677,8 +1677,12 @@ def add_dummy_feature(X, value=1.0):
return np.hstack((np.ones((n_samples, 1)) * value, X))


def _transform_selected(X, transform, selected="all", copy=True):
"""Apply a transform function to portion of selected features
def _transform_selected(X, transform, selected="all", copy=True,
retain_order=False):
"""Apply a transform function to portion of selected features.

Returns an array Xt, where the non-selected features appear on the right
side (largest column indices) of Xt.

Parameters
----------
@@ -1688,18 +1692,28 @@ def _transform_selected(X, transform, selected="all", copy=True):
transform : callable
A callable transform(X) -> X_transformed

copy : boolean, optional
copy : boolean, default=True
Copy X even if it could be avoided.

selected: "all" or array of indices or mask
selected : "all" or array of indices or mask
Specify which features to apply the transform to.

retain_order : boolean, default=False
If True, the non-selected features will not be displaced to the right
side of the transformed array. The number of features in Xt must

A Member commented:
"but reinserted into Xt"

hlin117 (Contributor, Author) replied:
Technically, the non-selected features aren't reinserted into Xt; they're just not modified by the transformation.

match the number of features in X. Furthermore, X and Xt cannot be
sparse.

Returns
-------
X : array or sparse matrix, shape=(n_samples, n_features_new)
Xt : array or sparse matrix, shape=(n_samples, n_features_new)
"""
X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)

if sparse.issparse(X) and retain_order:
raise ValueError("The retain_order option can only be set to True "
"for dense matrices.")

if isinstance(selected, six.string_types) and selected == "all":
return transform(X)

@@ -1719,14 +1733,24 @@ def _transform_selected(X, transform, selected="all", copy=True):
elif n_selected == n_features:
# All features selected.
return transform(X)
else:
X_sel = transform(X[:, ind[sel]])
X_not_sel = X[:, ind[not_sel]]

if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
return sparse.hstack((X_sel, X_not_sel))
else:
return np.hstack((X_sel, X_not_sel))
X_sel = transform(X[:, ind[sel]])
X_not_sel = X[:, ind[not_sel]]

if retain_order:
if X_sel.shape[1] + X_not_sel.shape[1] != n_features:
raise ValueError("The retain_order option can only be set to True "
"if the dimensions of the input array match the "
"dimensions of the transformed array.")

# Fancy indexing not supported for sparse matrices

A Member commented:
FWIW it's not particularly hard to do this kind of splicing operation. Three options:

  • create a sparse diagonal matrix D of size (X_sel.shape[1], X.shape[1]) such that X_sel @ D effectively inserts empty columns in X_sel. Then do the same for X_not_sel and sum them together. (Easiest to read?)
  • convert to COO, map .col to the new space in both X_sel and X_not_sel, then create a new COO from the concatenation of their row, col and data.
  • convert to CSC, and use np.insert on indices, on data and on np.diff(indptr) to define the output matrix. (Most efficient but hardest to implement.)

But it's fine being crippled for now.
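
For concreteness, a hedged sketch of the first option above, built with
scipy.sparse scatter matrices (names are illustrative, not part of this PR):

import numpy as np
import scipy.sparse as sp

def _splice_columns(X_sel, X_not_sel, sel, not_sel, n_features):
    # Scatter matrix with ones at (i, positions[i]): right-multiplying moves
    # column i of the source matrix to column positions[i] of the output.
    def scatter(n_cols, positions):
        return sp.csr_matrix((np.ones(n_cols), (np.arange(n_cols), positions)),
                             shape=(n_cols, n_features))
    return (X_sel.dot(scatter(X_sel.shape[1], sel)) +
            X_not_sel.dot(scatter(X_not_sel.shape[1], not_sel)))

# Columns 0 and 2 were transformed, column 1 was left untouched:
X_sel = sp.csr_matrix([[1., 2.], [3., 4.]])
X_not_sel = sp.csr_matrix([[9.], [8.]])
_splice_columns(X_sel, X_not_sel, sel=[0, 2], not_sel=[1], n_features=3).toarray()
# -> array([[ 1., 9., 2.],
#           [ 3., 8., 4.]])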

X[:, ind[sel]] = X_sel
return X

if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
return sparse.hstack((X_sel, X_not_sel))
else:
return np.hstack((X_sel, X_not_sel))
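
# A hedged usage sketch of the helper above (expected results follow from the
# documented behaviour; illustrative, not a doctest in this PR):
#
#   X = np.array([[1., 10.],
#                 [2., 20.]])
#   double = lambda Z: 2 * Z
#
#   # Default: transformed columns are stacked first, untouched ones move right.
#   _transform_selected(X, double, selected=[1])
#   # -> array([[20., 1.], [40., 2.]])
#
#   # retain_order=True keeps every column at its original index (dense only).
#   _transform_selected(X, double, selected=[1], retain_order=True)
#   # -> array([[ 1., 20.], [ 2., 40.]])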


class OneHotEncoder(BaseEstimator, TransformerMixin):