diff --git a/examples/preprocessing/plot_discretization.py b/examples/preprocessing/plot_discretization.py
new file mode 100644
index 0000000000000..9c428a8d93b19
--- /dev/null
+++ b/examples/preprocessing/plot_discretization.py
@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+
+"""
+================================================================
+Using KBinsDiscretizer to discretize continuous features
+================================================================
+
+This example compares the predictions of linear regression (a linear model)
+and decision tree regression (a tree-based model) with and without
+discretization of the real-valued input feature.
+
+As the plot before discretization shows, the linear model is fast to build
+and relatively straightforward to interpret, but it can only capture
+linear relationships, whereas the decision tree can fit a much more
+complex model of the data. One way to make a linear model more powerful
+on continuous data is to use discretization (also known as binning). In
+this example, we discretize the feature and one-hot encode the
+transformed data. Note that if the bins are not reasonably wide, the risk
+of overfitting increases substantially, so the discretizer parameters
+should usually be tuned under cross-validation.
+
+After discretization, linear regression and decision tree make exactly the
+same prediction. Because the features are constant within each bin, any
+model must predict the same value for all points inside a bin. Compared
+with the result before discretization, the linear model becomes much more
+flexible, while the decision tree becomes much less flexible. Note that
+binning features generally has no beneficial effect for tree-based models,
+as these models can learn to split up the data anywhere.
+
+"""
+
+# Author: Andreas Müller
+#         Hanmin Qin
+# License: BSD 3 clause
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from sklearn.linear_model import LinearRegression
+from sklearn.preprocessing import KBinsDiscretizer
+from sklearn.tree import DecisionTreeRegressor
+
+print(__doc__)
+
+# construct the dataset
+rnd = np.random.RandomState(42)
+X = rnd.uniform(-3, 3, size=100)
+y = np.sin(X) + rnd.normal(size=len(X)) / 3
+X = X.reshape(-1, 1)
+
+# transform the dataset with KBinsDiscretizer
+enc = KBinsDiscretizer(n_bins=10, encode='onehot')
+X_binned = enc.fit_transform(X)
+
+# predict with original dataset
+fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(10, 4))
+line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)
+reg = LinearRegression().fit(X, y)
+ax1.plot(line, reg.predict(line), linewidth=2, color='green',
+         label="linear regression")
+reg = DecisionTreeRegressor(min_samples_split=3, random_state=0).fit(X, y)
+ax1.plot(line, reg.predict(line), linewidth=2, color='red',
+         label="decision tree")
+ax1.plot(X[:, 0], y, 'o', c='k')
+ax1.legend(loc="best")
+ax1.set_ylabel("Regression output")
+ax1.set_xlabel("Input feature")
+ax1.set_title("Result before discretization")
+
+# predict with transformed dataset
+line_binned = enc.transform(line)
+reg = LinearRegression().fit(X_binned, y)
+ax2.plot(line, reg.predict(line_binned), linewidth=2, color='green',
+         linestyle='-', label='linear regression')
+reg = DecisionTreeRegressor(min_samples_split=3,
+                            random_state=0).fit(X_binned, y)
+ax2.plot(line, reg.predict(line_binned), linewidth=2, color='red',
+         linestyle=':', label='decision tree')
+ax2.plot(X[:, 0], y, 'o', c='k')
+bins = enc.offset_[0] + enc.bin_width_[0] * np.arange(1, enc.n_bins_[0])
+ax2.vlines(bins, *plt.gca().get_ylim(), linewidth=1, alpha=.2)
+ax2.legend(loc="best")
+ax2.set_xlabel("Input feature")
+ax2.set_title("Result after discretization")
+
+plt.tight_layout()
+plt.show()
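
The docstring's claim that any model must predict the same value for all points in a bin follows directly from the encoded representation: after discretization and one-hot encoding, every sample that falls in the same bin is mapped to an identical indicator vector, so no estimator can tell them apart. A minimal sketch of inspecting that representation (not part of the diff above; it assumes the released KBinsDiscretizer API and uses encode='onehot-dense' only so the indicator matrix is easy to print):

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

# same toy feature as in the example
rnd = np.random.RandomState(42)
X = rnd.uniform(-3, 3, size=100).reshape(-1, 1)

# dense output only so the indicator matrix is easy to inspect
enc = KBinsDiscretizer(n_bins=10, encode='onehot-dense')
X_binned = enc.fit_transform(X)

print(X_binned.shape)   # (100, 10): one indicator column per bin
print(X_binned[:3])     # each row contains a single 1 marking its bin

Fitting a linear model on such indicator columns amounts to learning one coefficient per bin, which is exactly the piecewise-constant prediction shown in the right-hand panel.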
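
The docstring also recommends tuning the discretizer parameters under cross-validation. A minimal sketch of one way to do so with a Pipeline and GridSearchCV (again not part of the diff; the candidate values for n_bins and cv=5 are illustrative choices, not taken from the example):

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer

# same toy dataset as in the example
rnd = np.random.RandomState(42)
X = rnd.uniform(-3, 3, size=100)
y = np.sin(X) + rnd.normal(size=len(X)) / 3
X = X.reshape(-1, 1)

# discretize, one-hot encode, then fit a linear model; search over n_bins
pipe = make_pipeline(KBinsDiscretizer(encode='onehot'), LinearRegression())
param_grid = {'kbinsdiscretizer__n_bins': [2, 3, 5, 10, 20]}
search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(X, y)

print(search.best_params_)   # number of bins selected by cross-validation
print(search.best_score_)    # mean cross-validated R^2 for that setting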