[MRG+1] discrete branch: add encoding option to KBinsDiscretizer by qinhanmin2014 · Pull Request #9647 · scikit-learn/scikit-learn · GitHub

[MRG+1] discrete branch: add encoding option to KBinsDiscretizer #9647


Merged · 20 commits · Sep 7, 2017
5 changes: 4 additions & 1 deletion doc/modules/preprocessing.rst
@@ -455,10 +455,13 @@ K-bins discretization
>>> X = np.array([[ -3., 5., 15 ],
... [ 0., 6., 14 ],
... [ 6., 3., 11 ]])
- >>> est = preprocessing.KBinsDiscretizer(n_bins=[3, 3, 2]).fit(X)
+ >>> est = preprocessing.KBinsDiscretizer(n_bins=[3, 3, 2], encode='ordinal').fit(X)
Review comment (Member):
Below please note that by default the output is one-hot encoded into a sparse matrix (use :ref: to link to the right place), but that this can be configured with the encode parameter.

>>> est.bin_width_
array([ 3., 1., 2.])

By default the output is one-hot encoded into a sparse matrix
(see :ref:`preprocessing_categorical_features`),
and this can be configured with the ``encode`` parameter.
For each feature, the bin width is computed during ``fit``; together with
the number of bins, it defines the intervals. Therefore, for the current
example, these intervals are defined as:
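The interval listing itself is collapsed in this view. As a sketch (not part of the changeset), the bin edges can be recomputed from the fitted values shown above, assuming each feature's first bin starts at that feature's minimum:

>>> import numpy as np
>>> mins = X.min(axis=0)              # array([-3., 3., 11.])
>>> widths = [3., 1., 2.]             # est.bin_width_ from above
>>> edges = [m + w * np.arange(n + 1)
...          for m, w, n in zip(mins, widths, [3, 3, 2])]

giving edges [-3, 0, 3, 6], [3, 4, 5, 6] and [11, 13, 15], i.e. intervals
[-3, 0), [0, 3), [3, 6] for the first feature, and similarly for the others.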
58 changes: 52 additions & 6 deletions sklearn/preprocessing/discretization.py
@@ -9,6 +9,7 @@
import warnings

from sklearn.base import BaseEstimator, TransformerMixin
+ from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing.data import _transform_selected
from sklearn.utils.validation import check_array
from sklearn.utils.validation import check_is_fitted
@@ -18,6 +19,8 @@
class KBinsDiscretizer(BaseEstimator, TransformerMixin):
"""Bins continuous data into k equal width intervals.

Read more in the :ref:`User Guide <discretization>`.

Parameters
----------
n_bins : int or array-like, shape (n_features,) (default=2)
Expand All @@ -32,6 +35,20 @@ class KBinsDiscretizer(BaseEstimator, TransformerMixin):
Column indices of ignored features. (Example: Categorical features.)
If ``None``, all features will be discretized.

encode : {'onehot', 'onehot-dense', 'ordinal'}, (default='onehot')
Method used to encode the transformed result.

onehot
Encode the transformed result with one-hot encoding
and return a sparse matrix. Ignored features are always
stacked to the right.
onehot-dense
Encode the transformed result with one-hot encoding
and return a dense array. Ignored features are always
stacked to the right.
ordinal
Return the bin identifier encoded as an integer value.

Attributes
----------
offset_ : float array, shape (n_features,)
@@ -55,7 +72,7 @@ class KBinsDiscretizer(BaseEstimator, TransformerMixin):
... [-1, 2, -3, -0.5],
... [ 0, 3, -2, 0.5],
... [ 1, 4, -1, 2]]
- >>> est = KBinsDiscretizer(n_bins=3)
+ >>> est = KBinsDiscretizer(n_bins=3, encode='ordinal')
>>> est.fit(X) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
KBinsDiscretizer(...)
>>> Xt = est.transform(X)
@@ -94,9 +111,10 @@ class KBinsDiscretizer(BaseEstimator, TransformerMixin):
``1`` based on a parameter ``threshold``.
"""

- def __init__(self, n_bins=2, ignored_features=None):
+ def __init__(self, n_bins=2, ignored_features=None, encode='onehot'):
self.n_bins = n_bins
self.ignored_features = ignored_features
self.encode = encode
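As an aside (not part of the diff), here is a quick sketch of how the three ``encode`` options shape the output, using the 4x4 example ``X`` from the docstring above and the API as it stands on this branch:

>>> import scipy.sparse as sp
>>> from sklearn.preprocessing import KBinsDiscretizer
>>> X = [[-2, 1, -4,   -1],
...      [-1, 2, -3, -0.5],
...      [ 0, 3, -2,  0.5],
...      [ 1, 4, -1,    2]]
>>> KBinsDiscretizer(n_bins=3, encode='ordinal').fit_transform(X).shape
(4, 4)
>>> Xt = KBinsDiscretizer(n_bins=3, encode='onehot').fit_transform(X)
>>> sp.issparse(Xt), Xt.shape    # the default: sparse one-hot, 4 features x 3 bins
(True, (4, 12))
>>> KBinsDiscretizer(n_bins=3, encode='onehot-dense').fit_transform(X).shape
(4, 12)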

def fit(self, X, y=None):
"""Fits the estimator.
@@ -114,6 +132,12 @@ def fit(self, X, y=None):
"""
X = check_array(X, dtype='numeric')

valid_encode = ('onehot', 'onehot-dense', 'ordinal')
if self.encode not in valid_encode:
raise ValueError("Valid options for 'encode' are {}. "
"Got 'encode = {}' instead."
.format(valid_encode, self.encode))

n_features = X.shape[1]
ignored = self._validate_ignored_features(n_features)
self.transformed_features_ = np.delete(np.arange(n_features), ignored)
@@ -199,15 +223,29 @@ def transform(self, X):

Returns
-------
- Xt : numeric array-like, shape (n_samples, n_features)
+ Xt : numeric array-like or sparse matrix
Data in the binned space.
"""
check_is_fitted(self, ["offset_", "bin_width_"])
X = self._validate_X_post_fit(X)

- return _transform_selected(X, self._transform,
-                            self.transformed_features_, copy=True,
-                            retain_order=True)
+ Xt = _transform_selected(X, self._transform,
+                          self.transformed_features_, copy=True,
+                          retain_order=True)

if self.encode == 'ordinal':
return Xt

# Only one-hot encode discretized features
mask = np.ones(X.shape[1], dtype=bool)
if self.ignored_features is not None:
mask[self.ignored_features] = False

encode_sparse = self.encode == 'onehot'
return OneHotEncoder(n_values=self.n_bins_[mask],
categorical_features='all'
if self.ignored_features is None else mask,
sparse=encode_sparse).fit_transform(Xt)
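To make the masking concrete, here is a sketch (not part of the diff): with four features and ``ignored_features=[1, 2]``, only the discretized columns are one-hot encoded, and the legacy ``n_values``/``categorical_features`` arguments of ``OneHotEncoder`` stack the ignored columns on the right, as exercised by ``test_one_hot_encode_with_ignored_features`` below:

import numpy as np

mask = np.ones(4, dtype=bool)     # four features in total
mask[[1, 2]] = False              # mask == [True, False, False, True]
# n_bins_[mask] then selects bin counts for columns 0 and 3 only, and
# OneHotEncoder(categorical_features=mask) one-hot encodes those columns,
# appending the pass-through (ignored) columns to the right.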

def _validate_X_post_fit(self, X):
X = check_array(X, dtype='numeric')
@@ -259,6 +297,14 @@ def inverse_transform(self, Xt):
Data in the original feature space.
"""
check_is_fitted(self, ["offset_", "bin_width_"])

# Currently, OneHotEncoder doesn't support inverse_transform
if self.encode != 'ordinal':
raise ValueError("inverse_transform only supports "
"'encode = ordinal'. "
"Got 'encode = {}' instead."
.format(self.encode))

Xt = self._validate_X_post_fit(Xt)
trans = self.transformed_features_
Xinv = Xt.copy()
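A usage sketch of the restriction above (not part of the changeset; the remainder of the method is collapsed in this view): ``inverse_transform`` round-trips only with ordinal encoding, while a one-hot encoding raises the ``ValueError`` defined here:

>>> from sklearn.preprocessing import KBinsDiscretizer
>>> X = [[-2, 1, -4,   -1],
...      [-1, 2, -3, -0.5],
...      [ 0, 3, -2,  0.5],
...      [ 1, 4, -1,    2]]
>>> est = KBinsDiscretizer(n_bins=3, encode='ordinal').fit(X)
>>> est.inverse_transform(est.transform(X)).shape
(4, 4)
>>> est = KBinsDiscretizer(n_bins=3, encode='onehot').fit(X)
>>> est.inverse_transform(est.transform(X))
Traceback (most recent call last):
    ...
ValueError: inverse_transform only supports 'encode = ordinal'. Got 'encode = onehot' instead.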
63 changes: 56 additions & 7 deletions sklearn/preprocessing/tests/test_discretization.py
@@ -1,10 +1,12 @@
from __future__ import absolute_import

import numpy as np
- from six.moves import range
+ import scipy.sparse as sp
import warnings

+ from sklearn.externals.six.moves import xrange as range
from sklearn.preprocessing import KBinsDiscretizer
+ from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.testing import (
assert_array_equal,
assert_raises,
@@ -19,7 +21,7 @@


def test_fit_transform():
- est = KBinsDiscretizer(n_bins=3).fit(X)
+ est = KBinsDiscretizer(n_bins=3, encode='ordinal').fit(X)
expected = [[0, 0, 0, 0],
[1, 1, 1, 0],
[2, 2, 2, 1],
@@ -74,7 +76,7 @@ def test_invalid_n_bins_array():


def test_fit_transform_n_bins_array():
- est = KBinsDiscretizer(n_bins=[2, 3, 3, 3]).fit(X)
+ est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode='ordinal').fit(X)
expected = [[0, 0, 0, 0],
[0, 1, 1, 0],
[1, 2, 2, 1],
@@ -92,7 +94,8 @@ def test_invalid_n_features():

def test_ignored_transform():
# Feature at col_idx=1 should not change
- est = KBinsDiscretizer(n_bins=3, ignored_features=[1]).fit(X)
+ est = KBinsDiscretizer(n_bins=3, ignored_features=[1],
+                        encode='ordinal').fit(X)

expected = [[0., 1.5, 0., 0.],
[1., 2.5, 1., 0.],
@@ -129,7 +132,8 @@ def test_same_min_max():
[1, 1]])
est = assert_warns_message(UserWarning,
"Features 0 are constant and will be replaced "
"with 0.", KBinsDiscretizer(n_bins=3).fit, X)
"with 0.", KBinsDiscretizer
(n_bins=3, encode='ordinal').fit, X)
Xt = est.transform(X)

expected = [[0, 0],
@@ -150,7 +154,8 @@ def test_transform_1d_behavior():


def test_inverse_transform_with_ignored():
- est = KBinsDiscretizer(n_bins=[2, 3, 0, 3], ignored_features=[1, 2]).fit(X)
+ est = KBinsDiscretizer(n_bins=[2, 3, 0, 3], ignored_features=[1, 2],
+                        encode='ordinal').fit(X)
Xt = [[0, 1, -4.5, 0],
[0, 2, -3.5, 0],
[1, 3, -2.5, 1],
@@ -172,5 +177,49 @@ def test_numeric_stability():
# Test up to discretizing nano units
for i in range(1, 9):
X = X_init / 10**i
- Xt = KBinsDiscretizer(n_bins=2).fit_transform(X)
+ Xt = KBinsDiscretizer(n_bins=2, encode='ordinal').fit_transform(X)
assert_array_equal(Xt_expected, Xt)


def test_invalid_encode_option():
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode='invalid-encode')
assert_raise_message(ValueError, "Valid options for 'encode' are "
"('onehot', 'onehot-dense', 'ordinal'). "
"Got 'encode = invalid-encode' instead.",
est.fit, X)


def test_encode_options():
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
encode='ordinal').fit(X)
Xt_1 = est.transform(X)
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
encode='onehot-dense').fit(X)
Xt_2 = est.transform(X)
assert not sp.issparse(Xt_2)
assert_array_equal(OneHotEncoder(n_values=[2, 3, 3, 3], sparse=False)
.fit_transform(Xt_1), Xt_2)
assert_raise_message(ValueError, "inverse_transform only supports "
"'encode = ordinal'. Got 'encode = onehot-dense' "
"instead.", est.inverse_transform, Xt_2)
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
encode='onehot').fit(X)
Xt_3 = est.transform(X)
assert sp.issparse(Xt_3)
assert_array_equal(OneHotEncoder(n_values=[2, 3, 3, 3], sparse=True)
.fit_transform(Xt_1).toarray(),
Xt_3.toarray())
assert_raise_message(ValueError, "inverse_transform only supports "
"'encode = ordinal'. Got 'encode = onehot' "
"instead.", est.inverse_transform, Xt_2)


def test_one_hot_encode_with_ignored_features():
est = KBinsDiscretizer(n_bins=3, ignored_features=[1, 2],
encode='onehot-dense').fit(X)
Xt = est.transform(X)
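# Features 0 and 3 are discretized into 3 bins each and one-hot
# encoded into the first six columns; the ignored features 1 and 2
# are passed through unchanged and stacked on the right.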
Xt_expected = [[1, 0, 0, 1, 0, 0, 1.5, -4],
[0, 1, 0, 1, 0, 0, 2.5, -3],
[0, 0, 1, 0, 1, 0, 3.5, -2],
[0, 0, 1, 0, 0, 1, 4.5, -1]]
assert_array_equal(Xt_expected, Xt)