#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
======================
Feature discretization
======================

A demonstration of feature discretization on synthetic classification datasets.
Feature discretization decomposes each feature into a set of bins, here equally
distributed in width. The discrete values are then one-hot encoded and given
to a linear classifier. This preprocessing enables non-linear behavior even
though the classifier is linear.

In this example, the first two rows represent linearly non-separable datasets
(moons and concentric circles) while the third is approximately linearly
separable. On the two linearly non-separable datasets, feature discretization
substantially increases the performance of the linear classifiers. On the
linearly separable dataset, feature discretization decreases their performance.
Two non-linear classifiers are also shown for comparison.

This example should be taken with a grain of salt, as the intuition conveyed
does not necessarily carry over to real datasets. Particularly in
high-dimensional spaces, data can more easily be separated linearly. Moreover,
using feature discretization and one-hot encoding increases the number of
features, which can easily lead to overfitting when the number of samples is
small.

The plots show training points in solid colors and testing points
semi-transparent. The lower right of each plot shows the classification
accuracy on the test set.
"""
print(__doc__)

# Code source: Tom Dupré la Tour
# Adapted from plot_classifier_comparison by Gaël Varoquaux and Andreas Müller
#
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
from sklearn.svm import SVC, LinearSVC

h = .02  # step size in the mesh
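
# Aside (not part of the original example): a minimal sketch of what the
# discretization + one-hot step does to a single feature. The sample values,
# n_bins=3 and the 'uniform' (equal-width) strategy are illustrative
# assumptions only; the pipelines below use KBinsDiscretizer's defaults.
_demo = KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='uniform')
print(_demo.fit_transform([[-1.0], [0.1], [0.2], [1.5]]))
# Each sample is mapped to a one-hot row over 3 equal-width bins:
# [[1. 0. 0.]
#  [0. 1. 0.]
#  [0. 1. 0.]
#  [0. 0. 1.]]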


def get_name(estimator):
    name = estimator.__class__.__name__
    if name == 'Pipeline':
        name = [get_name(est[1]) for est in estimator.steps]
        name = ' + '.join(name)
    return name
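
# For instance (illustrative, mirroring the pipelines defined below),
# get_name(make_pipeline(KBinsDiscretizer(), LogisticRegression())) returns
# 'KBinsDiscretizer + LogisticRegression', used for the column titles.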


# list of (estimator, param_grid), where param_grid is used in GridSearchCV
classifiers = [
    (LogisticRegression(solver='lbfgs', random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (LinearSVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (make_pipeline(
        KBinsDiscretizer(encode='onehot'),
        LogisticRegression(solver='lbfgs', random_state=0)), {
            'kbinsdiscretizer__n_bins': np.arange(2, 10),
            'logisticregression__C': np.logspace(-2, 7, 10),
        }),
    (make_pipeline(
        KBinsDiscretizer(encode='onehot'), LinearSVC(random_state=0)), {
            'kbinsdiscretizer__n_bins': np.arange(2, 10),
            'linearsvc__C': np.logspace(-2, 7, 10),
        }),
    (GradientBoostingClassifier(n_estimators=50, random_state=0), {
        'learning_rate': np.logspace(-4, 0, 10)
    }),
    (SVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
]
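# Note (not part of the original example): for the two discretization
# pipelines, the grid above contains 8 values of n_bins (2 through 9) times
# 10 values of C, i.e. 80 candidate settings, each evaluated with 5-fold
# cross-validation in the loop below.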

names = [get_name(e) for e, g in classifiers]

n_samples = 100
datasets = [
    make_moons(n_samples=n_samples, noise=0.2, random_state=0),
    make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1),
    make_classification(n_samples=n_samples, n_features=2, n_redundant=0,
                        n_informative=2, random_state=2,
                        n_clusters_per_class=1)
]

figure = plt.figure(figsize=(21, 9))
cm = plt.cm.PiYG
cm_bright = ListedColormap(['#b30065', '#178000'])
i = 1
# iterate over datasets
for ds_cnt, (X, y) in enumerate(datasets):
    print('\ndataset %d\n---------' % ds_cnt)

    # preprocess dataset, split into training and test part
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.5, random_state=42)

    # create the grid for background colors
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # plot the dataset first
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
               edgecolors='k')
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
               edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, (estimator, param_grid) in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)

        clf = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print('%s: %.2f' % (name, score))

        # plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]*[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                   edgecolors='k')
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   edgecolors='k', alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())

        if ds_cnt == 0:
            ax.set_title(name.replace(' + ', '\n'))
        ax.text(0.95, 0.06, ('%.2f' % score).lstrip('0'), size=15,
                bbox=dict(boxstyle='round', alpha=0.8, facecolor='white'),
                transform=ax.transAxes, horizontalalignment='right')

        i += 1

plt.tight_layout()

# Add suptitles above the figure
plt.subplots_adjust(top=0.90)
suptitles = [
    'Linear classifiers',
    'Feature discretization and linear classifiers',
    'Non-linear classifiers',
]
for i, suptitle in zip([2, 4, 6], suptitles):
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    ax.text(1.05, 1.25, suptitle, transform=ax.transAxes,
            horizontalalignment='center', size='x-large')
plt.show()