Merge pull request #5466 from glouppe/gp_new · scikit-learn/scikit-learn@5c697ba · GitHub

Commit 5c697ba

Merge pull request #5466 from glouppe/gp_new
[MRG+2] Gaussian process by @jmetzen
2 parents 40ba4fc + cea3f29 commit 5c697ba

25 files changed: +5044, -220 lines

doc/modules/classes.rst

Lines changed: 20 additions & 12 deletions
@@ -495,22 +495,30 @@ From text
    :toctree: generated/
    :template: class.rst

+   gaussian_process.GaussianProcessRegressor
+   gaussian_process.GaussianProcessClassifier
    gaussian_process.GaussianProcess

-.. autosummary::
-   :toctree: generated
-   :template: function.rst
+Kernels:

-   gaussian_process.correlation_models.absolute_exponential
-   gaussian_process.correlation_models.squared_exponential
-   gaussian_process.correlation_models.generalized_exponential
-   gaussian_process.correlation_models.pure_nugget
-   gaussian_process.correlation_models.cubic
-   gaussian_process.correlation_models.linear
-   gaussian_process.regression_models.constant
-   gaussian_process.regression_models.linear
-   gaussian_process.regression_models.quadratic
+.. autosummary::
+   :toctree: generated/
+   :template: class.rst

+   gaussian_process.kernels.Kernel
+   gaussian_process.kernels.Sum
+   gaussian_process.kernels.Product
+   gaussian_process.kernels.Exponentiation
+   gaussian_process.kernels.ConstantKernel
+   gaussian_process.kernels.WhiteKernel
+   gaussian_process.kernels.RBF
+   gaussian_process.kernels.Matern
+   gaussian_process.kernels.RationalQuadratic
+   gaussian_process.kernels.ExpSineSquared
+   gaussian_process.kernels.DotProduct
+   gaussian_process.kernels.PairwiseKernel
+   gaussian_process.kernels.CompoundKernel
+   gaussian_process.kernels.Hyperparameter

 .. _grid_search_ref:
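The kernel classes added above are meant to be composed: Sum, Product, and Exponentiation are what the +, * and ** operators on kernel objects produce, which is how the example diffs below write expressions such as 1.0 * RBF(...). A minimal sketch of that composition, assuming only those operator overloads:

# Sketch of kernel composition with the new gaussian_process.kernels classes.
from sklearn.gaussian_process.kernels import (RBF, WhiteKernel, ConstantKernel,
                                              ExpSineSquared)

# ConstantKernel * RBF scales the RBF; adding WhiteKernel models i.i.d. noise.
kernel = ConstantKernel(1.0) * RBF(length_scale=1.0) + WhiteKernel(noise_level=0.1)

# The ** operator builds an Exponentiation kernel.
periodic_squared = ExpSineSquared(length_scale=1.0, periodicity=3.0) ** 2

print(kernel)        # repr shows the composed Sum/Product structure
print(kernel.theta)  # log-transformed hyperparameters of all components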

doc/modules/gaussian_process.rst

Lines changed: 589 additions & 74 deletions
Large diffs are not rendered by default.
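The narrative documentation diff is not rendered here, but the regression API it describes appears throughout the example diffs below. A minimal sketch of that workflow (composite kernel, fit, predict with return_std), assuming only what those examples show:

# Minimal GPR sketch: noisy 1-D data, composite kernel, predictive mean and std.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

rng = np.random.RandomState(0)
X = rng.uniform(0, 5, 30)[:, np.newaxis]
y = np.sin(X).ravel() + 0.1 * rng.randn(30)

kernel = 1.0 * RBF(length_scale=1.0) + WhiteKernel(noise_level=0.1)
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)

X_test = np.linspace(0, 5, 100)[:, np.newaxis]
y_mean, y_std = gpr.predict(X_test, return_std=True)  # posterior mean and std
print(gpr.kernel_)  # kernel with hyperparameters optimized on the marginal likelihood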

examples/classification/plot_classification_probability.py

Lines changed: 7 additions & 3 deletions
@@ -6,7 +6,7 @@
 Plot the classification probability for different classifiers. We use a 3
 class dataset, and we classify it with a Support Vector classifier, L1
 and L2 penalized logistic regression with either a One-Vs-Rest or multinomial
-setting.
+setting, and Gaussian process classification.

 The logistic regression is not a multiclass classifier out of the box. As
 a result it can identify only the first class.
@@ -21,6 +21,8 @@ class dataset, and we classify it with a Support Vector classifier, L1

 from sklearn.linear_model import LogisticRegression
 from sklearn.svm import SVC
+from sklearn.gaussian_process import GaussianProcessClassifier
+from sklearn.gaussian_process.kernels import RBF
 from sklearn import datasets

 iris = datasets.load_iris()
@@ -30,6 +32,7 @@ class dataset, and we classify it with a Support Vector classifier, L1
 n_features = X.shape[1]

 C = 1.0
+kernel = 1.0 * RBF([1.0, 1.0])  # for GPC

 # Create different classifiers. The logistic regression cannot do
 # multiclass out of the box.
@@ -38,8 +41,9 @@ class dataset, and we classify it with a Support Vector classifier, L1
     'Linear SVC': SVC(kernel='linear', C=C, probability=True,
                       random_state=0),
     'L2 logistic (Multinomial)': LogisticRegression(
-        C=C, solver='lbfgs', multi_class='multinomial'
-        )}
+        C=C, solver='lbfgs', multi_class='multinomial'),
+    'GPC': GaussianProcessClassifier(kernel)
+    }

 n_classifiers = len(classifiers)
examples/classification/plot_classifier_comparison.py

Lines changed: 10 additions & 5 deletions
@@ -36,6 +36,8 @@
 from sklearn.datasets import make_moons, make_circles, make_classification
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.svm import SVC
+from sklearn.gaussian_process import GaussianProcessClassifier
+from sklearn.gaussian_process.kernels import RBF
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
 from sklearn.naive_bayes import GaussianNB
@@ -44,13 +46,15 @@

 h = .02  # step size in the mesh

-names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
-         "Random Forest", "AdaBoost", "Naive Bayes", "Linear Discriminant Analysis",
-         "Quadratic Discriminant Analysis"]
+names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
+         "Decision Tree", "Random Forest", "AdaBoost", "Naive Bayes",
+         "Linear Discriminant Analysis", "Quadratic Discriminant Analysis"]
+
 classifiers = [
     KNeighborsClassifier(3),
     SVC(kernel="linear", C=0.025),
     SVC(gamma=2, C=1),
+    GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
     DecisionTreeClassifier(max_depth=5),
     RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
     AdaBoostClassifier(),
@@ -76,7 +80,8 @@
     # preprocess dataset, split into training and test part
     X, y = ds
     X = StandardScaler().fit_transform(X)
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
+    X_train, X_test, y_train, y_test = \
+        train_test_split(X, y, test_size=.4, random_state=42)

     x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
     y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
@@ -129,5 +134,5 @@
                 size=15, horizontalalignment='right')
         i += 1

-figure.subplots_adjust(left=.02, right=.98)
+plt.tight_layout()
 plt.show()
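As a rough illustration of the new comparison entry, a hedged sketch fitting the same GPC configuration on one of the synthetic datasets and computing the probability grid that the script uses as its decision surface; the dataset parameters mirror the example but are restated here as assumptions:

# Hedged sketch: the Gaussian Process entry from the comparison, fit on
# make_moons, with a class-1 probability grid for contour plotting.
import numpy as np
from sklearn.datasets import make_moons
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

X, y = make_moons(noise=0.3, random_state=0)
X = StandardScaler().fit_transform(X)

clf = GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True)
clf.fit(X, y)
print("training accuracy: %.3f" % clf.score(X, y))

# Class-1 probability on a grid, reshaped for contour plotting.
xx, yy = np.meshgrid(np.linspace(-3, 3, 100), np.linspace(-3, 3, 100))
Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1].reshape(xx.shape)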

examples/gaussian_process/gp_diabetes_dataset.py

Lines changed: 0 additions & 51 deletions
This file was deleted.
New file (filename not rendered in this view)

Lines changed: 116 additions & 0 deletions
"""
==========================================================
Comparison of kernel ridge and Gaussian process regression
==========================================================

Both kernel ridge regression (KRR) and Gaussian process regression (GPR) learn
a target function by employing internally the "kernel trick". KRR learns a
linear function in the space induced by the respective kernel, which corresponds
to a non-linear function in the original space. The linear function in the
kernel space is chosen based on the mean-squared error loss with
ridge regularization. GPR uses the kernel to define the covariance of
a prior distribution over the target functions and uses the observed training
data to define a likelihood function. Based on Bayes theorem, a (Gaussian)
posterior distribution over target functions is defined, whose mean is used
for prediction.

A major difference is that GPR can choose the kernel's hyperparameters based
on gradient ascent on the marginal likelihood function, while KRR needs to
perform a grid search on a cross-validated loss function (mean-squared error
loss). A further difference is that GPR learns a generative, probabilistic
model of the target function and can thus provide meaningful confidence
intervals and posterior samples along with the predictions, while KRR only
provides predictions.

This example illustrates both methods on an artificial dataset, which
consists of a sinusoidal target function and strong noise. The figure compares
the learned model of KRR and GPR based on an ExpSineSquared kernel, which is
suited for learning periodic functions. The kernel's hyperparameters control
the smoothness (l) and periodicity (p) of the kernel. Moreover, the noise level
of the data is learned explicitly by GPR through an additional WhiteKernel
component in the kernel, and by the regularization parameter alpha of KRR.

The figure shows that both methods learn reasonable models of the target
function. GPR correctly identifies the periodicity of the function to be
roughly 2*pi (6.28), while KRR chooses the doubled periodicity 4*pi. Besides
that, GPR provides reasonable confidence bounds on the prediction, which are
not available for KRR. A major difference between the two methods is the time
required for fitting and predicting: while fitting KRR is fast in principle,
the grid search for hyperparameter optimization scales exponentially with the
number of hyperparameters ("curse of dimensionality"). The gradient-based
optimization of the parameters in GPR does not suffer from this exponential
scaling and is thus considerably faster on this example with its 3-dimensional
hyperparameter space. The time for predicting is similar; however, generating
the variance of the predictive distribution of GPR takes considerably longer
than just predicting the mean.
"""
print(__doc__)

# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# License: BSD 3 clause


import time

import numpy as np

import matplotlib.pyplot as plt

from sklearn.kernel_ridge import KernelRidge
from sklearn.grid_search import GridSearchCV
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ExpSineSquared

rng = np.random.RandomState(0)

# Generate sample data
X = 15 * rng.rand(100, 1)
y = np.sin(X).ravel()
y += 3 * (0.5 - rng.rand(X.shape[0]))  # add noise

# Fit KernelRidge with parameter selection based on 5-fold cross validation
param_grid = {"alpha": [1e0, 1e-1, 1e-2, 1e-3],
              "kernel": [ExpSineSquared(l, p)
                         for l in np.logspace(-2, 2, 10)
                         for p in np.logspace(0, 2, 10)]}
kr = GridSearchCV(KernelRidge(), cv=5, param_grid=param_grid)
stime = time.time()
kr.fit(X, y)
print("Time for KRR fitting: %.3f" % (time.time() - stime))

gp_kernel = ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1)) \
    + WhiteKernel(1e-1)
gpr = GaussianProcessRegressor(kernel=gp_kernel)
stime = time.time()
gpr.fit(X, y)
print("Time for GPR fitting: %.3f" % (time.time() - stime))

# Predict using kernel ridge
X_plot = np.linspace(0, 20, 10000)[:, None]
stime = time.time()
y_kr = kr.predict(X_plot)
print("Time for KRR prediction: %.3f" % (time.time() - stime))

# Predict using Gaussian process regression (mean only)
stime = time.time()
y_gpr = gpr.predict(X_plot, return_std=False)
print("Time for GPR prediction: %.3f" % (time.time() - stime))

stime = time.time()
y_gpr, y_std = gpr.predict(X_plot, return_std=True)
print("Time for GPR prediction with standard-deviation: %.3f"
      % (time.time() - stime))

# Plot results
plt.scatter(X, y, c='k', label='data')
plt.plot(X_plot, np.sin(X_plot), c='k', label='True')
plt.plot(X_plot, y_kr, c='g', label='KRR (%s)' % kr.best_params_)
plt.plot(X_plot, y_gpr, c='r', label='GPR (%s)' % gpr.kernel_)
plt.fill_between(X_plot[:, 0], y_gpr - y_std, y_gpr + y_std, color='r',
                 alpha=0.2)
plt.xlabel('data')
plt.ylabel('target')
plt.xlim(0, 20)
plt.title('GPR versus Kernel Ridge')
plt.legend(loc="best", prop={'size': 10})
plt.show()
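The docstring above notes that GPR can provide posterior samples in addition to confidence intervals, though the script only plots the mean and a one-standard-deviation band. A small hedged follow-up, reusing the fitted gpr and X_plot from the script and assuming GaussianProcessRegressor.sample_y is available:

# Hedged follow-up (reuses gpr and X_plot defined above): draw a few posterior
# samples to visualize alongside the mean +/- std band, before calling plt.show().
y_samples = gpr.sample_y(X_plot, n_samples=3, random_state=0)
plt.plot(X_plot, y_samples, lw=0.5, alpha=0.7)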

examples/gaussian_process/plot_gpc.py

Lines changed: 100 additions & 0 deletions
"""
====================================================================
Probabilistic predictions with Gaussian process classification (GPC)
====================================================================

This example illustrates the predicted probability of GPC for an RBF kernel
with different choices of the hyperparameters. The first figure shows the
predicted probability of GPC with arbitrarily chosen hyperparameters and with
the hyperparameters corresponding to the maximum log-marginal-likelihood (LML).

While the hyperparameters chosen by optimizing LML have a considerably larger
LML, they perform slightly worse according to the log-loss on test data. The
figure shows that this is because they exhibit a steep change of the class
probabilities at the class boundaries (which is good) but have predicted
probabilities close to 0.5 far away from the class boundaries (which is bad).
This undesirable effect is caused by the Laplace approximation used
internally by GPC.

The second figure shows the log-marginal-likelihood for different choices of
the kernel's hyperparameters, highlighting the two choices of the
hyperparameters used in the first figure by black dots.
"""
print(__doc__)

# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
#
# License: BSD 3 clause

import numpy as np

from matplotlib import pyplot as plt

from sklearn.metrics.classification import accuracy_score, log_loss
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF


# Generate data
train_size = 50
rng = np.random.RandomState(0)
X = rng.uniform(0, 5, 100)[:, np.newaxis]
y = np.array(X[:, 0] > 2.5, dtype=int)

# Specify Gaussian Processes with fixed and optimized hyperparameters
gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0),
                                   optimizer=None)
gp_fix.fit(X[:train_size], y[:train_size])

gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
gp_opt.fit(X[:train_size], y[:train_size])

print("Log Marginal Likelihood (initial): %.3f"
      % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta))
print("Log Marginal Likelihood (optimized): %.3f"
      % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta))

print("Accuracy: %.3f (initial) %.3f (optimized)"
      % (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])),
         accuracy_score(y[:train_size], gp_opt.predict(X[:train_size]))))
print("Log-loss: %.3f (initial) %.3f (optimized)"
      % (log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]),
         log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1])))


# Plot posteriors
plt.figure(0)
plt.scatter(X[:train_size, 0], y[:train_size], c='k', label="Train data")
plt.scatter(X[train_size:, 0], y[train_size:], c='g', label="Test data")
X_ = np.linspace(0, 5, 100)
plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis])[:, 1], 'r',
         label="Initial kernel: %s" % gp_fix.kernel_)
plt.plot(X_, gp_opt.predict_proba(X_[:, np.newaxis])[:, 1], 'b',
         label="Optimized kernel: %s" % gp_opt.kernel_)
plt.xlabel("Feature")
plt.ylabel("Class 1 probability")
plt.xlim(0, 5)
plt.ylim(-0.25, 1.5)
plt.legend(loc="best")

# Plot LML landscape
plt.figure(1)
theta0 = np.logspace(0, 8, 30)
theta1 = np.logspace(-1, 1, 29)
Theta0, Theta1 = np.meshgrid(theta0, theta1)
LML = [[gp_opt.log_marginal_likelihood(np.log([Theta0[i, j], Theta1[i, j]]))
        for i in range(Theta0.shape[0])] for j in range(Theta0.shape[1])]
LML = np.array(LML).T
plt.plot(np.exp(gp_fix.kernel_.theta)[0], np.exp(gp_fix.kernel_.theta)[1],
         'ko', zorder=10)
plt.plot(np.exp(gp_opt.kernel_.theta)[0], np.exp(gp_opt.kernel_.theta)[1],
         'ko', zorder=10)
plt.pcolor(Theta0, Theta1, LML)
plt.xscale("log")
plt.yscale("log")
plt.colorbar(label="Log-marginal Likelihood")
plt.xlabel("Magnitude")
plt.ylabel("Length-scale")
plt.title("Log-marginal-likelihood")

plt.show()
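A note on the log-space convention used above: kernel_.theta stores the log-transformed hyperparameters, which is why the LML grid wraps values in np.log and the plot applies np.exp before plotting magnitude and length-scale. A minimal hedged check, reusing gp_opt from the script:

# Reuses gp_opt from the script above (illustrative assumption).
theta = gp_opt.kernel_.theta                  # log-transformed hyperparameters
print(np.exp(theta))                          # [constant_value, length_scale] on the original scale
print(gp_opt.log_marginal_likelihood(theta))  # matches the "optimized" LML printed earlier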
