#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
======================
Feature discretization
======================

A demonstration of feature discretization on synthetic classification
datasets. Feature discretization decomposes each feature into a set of bins,
here equally distributed in width. The discrete values are then one-hot
encoded and given to a linear classifier. On the two non-linearly separable
datasets, feature discretization substantially improves the performance of
the linear classifiers.

This should be taken with a grain of salt, as the intuition conveyed by these
examples does not necessarily carry over to real datasets; particularly in
high-dimensional spaces, data can more easily be separated linearly.

The plots show training points in solid colors and testing points
semi-transparent. The lower right of each plot shows the classification
accuracy on the test set.
"""
print(__doc__)

# Code source: Tom Dupré la Tour
# Adapted from plot_classifier_comparison by Gaël Varoquaux and Andreas Müller
#
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
from sklearn.svm import SVC, LinearSVC

h = .02  # step size in the mesh


def get_name(estimator):
    """Return a display name for an estimator, stacking Pipeline step names."""
    name = estimator.__class__.__name__
    if name == 'Pipeline':
        name = [get_name(est[1]) for est in estimator.steps]
        name = '\n'.join(name)
    return name


# Each entry is an (estimator, param_grid) pair; the grid is searched with
# GridSearchCV before plotting.
classifiers = [
    (LogisticRegression(solver='lbfgs', random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (LinearSVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (GradientBoostingClassifier(n_estimators=50, random_state=0), {
        'learning_rate': np.logspace(-4, 0, 10)
    }),
    (SVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (make_pipeline(
        KBinsDiscretizer(encode='onehot'),
        LogisticRegression(solver='lbfgs', random_state=0)), {
            'kbinsdiscretizer__n_bins': np.arange(2, 10),
            'logisticregression__C': np.logspace(-2, 7, 10),
        }),
    (make_pipeline(
        KBinsDiscretizer(encode='onehot'), LinearSVC(random_state=0)), {
            'kbinsdiscretizer__n_bins': np.arange(2, 10),
            'linearsvc__C': np.logspace(-2, 7, 10),
        }),
]

names = [get_name(e) for e, g in classifiers]

n_samples = 100
datasets = [
    make_moons(n_samples=n_samples, noise=0.2, random_state=0),
    make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1),
    make_classification(n_samples=n_samples, n_features=2, n_redundant=0,
                        n_informative=2, random_state=2,
                        n_clusters_per_class=1)
]

figure = plt.figure(figsize=(21, 9))
i = 1
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.5, random_state=42)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.PiYG
    cm_bright = ListedColormap(['#b30065', '#178000'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
               edgecolors='k')
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
               edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, (estimator, param_grid) in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print(ds_cnt, name, score)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                   edgecolors='k')
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   edgecolors='k', alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

plt.tight_layout()
plt.show()