Merge pull request #5466 from glouppe/gp_new · scikit-learn/scikit-learn@5c697ba · GitHub

Commit 5c697ba

Merge pull request #5466 from glouppe/gp_new
[MRG+2] Gaussian process by @jmetzen
2 parents 40ba4fc + cea3f29 commit 5c697ba

25 files changed: +5044, -220 lines

doc/modules/classes.rst

Lines changed: 20 additions & 12 deletions
@@ -495,22 +495,30 @@ From text
    :toctree: generated/
    :template: class.rst

+   gaussian_process.GaussianProcessRegressor
+   gaussian_process.GaussianProcessClassifier
    gaussian_process.GaussianProcess

-.. autosummary::
-   :toctree: generated
-   :template: function.rst
+Kernels:

-   gaussian_process.correlation_models.absolute_exponential
-   gaussian_process.correlation_models.squared_exponential
-   gaussian_process.correlation_models.generalized_exponential
-   gaussian_process.correlation_models.pure_nugget
-   gaussian_process.correlation_models.cubic
-   gaussian_process.correlation_models.linear
-   gaussian_process.regression_models.constant
-   gaussian_process.regression_models.linear
-   gaussian_process.regression_models.quadratic
+.. autosummary::
+   :toctree: generated/
+   :template: class.rst

+   gaussian_process.kernels.Kernel
+   gaussian_process.kernels.Sum
+   gaussian_process.kernels.Product
+   gaussian_process.kernels.Exponentiation
+   gaussian_process.kernels.ConstantKernel
+   gaussian_process.kernels.WhiteKernel
+   gaussian_process.kernels.RBF
+   gaussian_process.kernels.Matern
+   gaussian_process.kernels.RationalQuadratic
+   gaussian_process.kernels.ExpSineSquared
+   gaussian_process.kernels.DotProduct
+   gaussian_process.kernels.PairwiseKernel
+   gaussian_process.kernels.CompoundKernel
+   gaussian_process.kernels.Hyperparameter

 .. _grid_search_ref:
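The kernel classes added above are meant to be composed: Sum, Product, and Exponentiation are what the +, * and ** operators on kernel objects produce, which is how the example diffs below write expressions such as 1.0 * RBF(...). A minimal sketch of that composition, assuming only those operator overloads:

# Sketch of kernel composition with the new gaussian_process.kernels classes.
from sklearn.gaussian_process.kernels import (RBF, WhiteKernel, ConstantKernel,
                                              ExpSineSquared)

# ConstantKernel * RBF scales the RBF; adding WhiteKernel models i.i.d. noise.
kernel = ConstantKernel(1.0) * RBF(length_scale=1.0) + WhiteKernel(noise_level=0.1)

# The ** operator builds an Exponentiation kernel.
periodic_squared = ExpSineSquared(length_scale=1.0, periodicity=3.0) ** 2

print(kernel)        # repr shows the composed Sum/Product structure
print(kernel.theta)  # log-transformed hyperparameters of all components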

doc/modules/gaussian_process.rst

Lines changed: 589 additions & 74 deletions
Large diffs are not rendered by default.
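The narrative documentation diff is not rendered here, but the regression API it describes appears throughout the example diffs below. A minimal sketch of that workflow (composite kernel, fit, predict with return_std), assuming only what those examples show:

# Minimal GPR sketch: noisy 1-D data, composite kernel, predictive mean and std.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

rng = np.random.RandomState(0)
X = rng.uniform(0, 5, 30)[:, np.newaxis]
y = np.sin(X).ravel() + 0.1 * rng.randn(30)

kernel = 1.0 * RBF(length_scale=1.0) + WhiteKernel(noise_level=0.1)
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)

X_test = np.linspace(0, 5, 100)[:, np.newaxis]
y_mean, y_std = gpr.predict(X_test, return_std=True)  # posterior mean and std
print(gpr.kernel_)  # kernel with hyperparameters optimized on the marginal likelihood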

examples/classification/plot_classification_probability.py

Lines changed: 7 additions & 3 deletions
@@ -6,7 +6,7 @@
 Plot the classification probability for different classifiers. We use a 3
 class dataset, and we classify it with a Support Vector classifier, L1
 and L2 penalized logistic regression with either a One-Vs-Rest or multinomial
-setting.
+setting, and Gaussian process classification.

 The logistic regression is not a multiclass classifier out of the box. As
 a result it can identify only the first class.
@@ -21,6 +21,8 @@ class dataset, and we classify it with a Support Vector classifier, L1

 from sklearn.linear_model import LogisticRegression
 from sklearn.svm import SVC
+from sklearn.gaussian_process import GaussianProcessClassifier
+from sklearn.gaussian_process.kernels import RBF
 from sklearn import datasets

 iris = datasets.load_iris()
@@ -30,6 +32,7 @@ class dataset, and we classify it with a Support Vector classifier, L1
 n_features = X.shape[1]

 C = 1.0
+kernel = 1.0 * RBF([1.0, 1.0])  # for GPC

 # Create different classifiers. The logistic regression cannot do
 # multiclass out of the box.
@@ -38,8 +41,9 @@ class dataset, and we classify it with a Support Vector classifier, L1
     'Linear SVC': SVC(kernel='linear', C=C, probability=True,
                       random_state=0),
     'L2 logistic (Multinomial)': LogisticRegression(
-        C=C, solver='lbfgs', multi_class='multinomial'
-        )}
+        C=C, solver='lbfgs', multi_class='multinomial'),
+    'GPC': GaussianProcessClassifier(kernel)
+    }

 n_classifiers = len(classifiers)
examples/classification/plot_classifier_comparison.py

Lines changed: 10 additions & 5 deletions
@@ -36,6 +36,8 @@
 from sklearn.datasets import make_moons, make_circles, make_classification
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.svm import SVC
+from sklearn.gaussian_process import GaussianProcessClassifier
+from sklearn.gaussian_process.kernels import RBF
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
 from sklearn.naive_bayes import GaussianNB
@@ -44,13 +46,15 @@

 h = .02  # step size in the mesh

-names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
-         "Random Forest", "AdaBoost", "Naive Bayes", "Linear Discriminant Analysis",
-         "Quadratic Discriminant Analysis"]
+names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
+         "Decision Tree", "Random Forest", "AdaBoost", "Naive Bayes",
+         "Linear Discriminant Analysis", "Quadratic Discriminant Analysis"]
+
 classifiers = [
     KNeighborsClassifier(3),
     SVC(kernel="linear", C=0.025),
     SVC(gamma=2, C=1),
+    GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
     DecisionTreeClassifier(max_depth=5),
     RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
     AdaBoostClassifier(),
@@ -76,7 +80,8 @@
     # preprocess dataset, split into training and test part
     X, y = ds
     X = StandardScaler().fit_transform(X)
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
+    X_train, X_test, y_train, y_test = \
+        train_test_split(X, y, test_size=.4, random_state=42)

     x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
     y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
@@ -129,5 +134,5 @@
                 size=15, horizontalalignment='right')
         i += 1

-figure.subplots_adjust(left=.02, right=.98)
+plt.tight_layout()
 plt.show()
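As a rough illustration of the new comparison entry, a hedged sketch fitting the same GPC configuration on one of the synthetic datasets and computing the probability grid that the script uses as its decision surface; the dataset parameters mirror the example but are restated here as assumptions:

# Hedged sketch: the Gaussian Process entry from the comparison, fit on
# make_moons, with a class-1 probability grid for contour plotting.
import numpy as np
from sklearn.datasets import make_moons
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

X, y = make_moons(noise=0.3, random_state=0)
X = StandardScaler().fit_transform(X)

clf = GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True)
clf.fit(X, y)
print("training accuracy: %.3f" % clf.score(X, y))

# Class-1 probability on a grid, reshaped for contour plotting.
xx, yy = np.meshgrid(np.linspace(-3, 3, 100), np.linspace(-3, 3, 100))
Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1].reshape(xx.shape)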

examples/gaussian_process/gp_diabetes_dataset.py

Lines changed: 0 additions & 51 deletions
This file was deleted.
New file (filename not rendered in this view)

Lines changed: 116 additions & 0 deletions
"""
==========================================================
Comparison of kernel ridge and Gaussian process regression
==========================================================

Both kernel ridge regression (KRR) and Gaussian process regression (GPR) learn
a target function by employing internally the "kernel trick". KRR learns a
linear function in the space induced by the respective kernel, which corresponds
to a non-linear function in the original space. The linear function in the
kernel space is chosen based on the mean-squared error loss with
ridge regularization. GPR uses the kernel to define the covariance of
a prior distribution over the target functions and uses the observed training
data to define a likelihood function. Based on Bayes theorem, a (Gaussian)
posterior distribution over target functions is defined, whose mean is used
for prediction.

A major difference is that GPR can choose the kernel's hyperparameters based
on gradient ascent on the marginal likelihood function, while KRR needs to
perform a grid search on a cross-validated loss function (mean-squared error
loss). A further difference is that GPR learns a generative, probabilistic
model of the target function and can thus provide meaningful confidence
intervals and posterior samples along with the predictions, while KRR only
provides predictions.

This example illustrates both methods on an artificial dataset, which
consists of a sinusoidal target function and strong noise. The figure compares
the learned model of KRR and GPR based on an ExpSineSquared kernel, which is
suited for learning periodic functions. The kernel's hyperparameters control
the smoothness (l) and periodicity (p) of the kernel. Moreover, the noise level
of the data is learned explicitly by GPR through an additional WhiteKernel
component in the kernel, and by the regularization parameter alpha of KRR.

The figure shows that both methods learn reasonable models of the target
function. GPR correctly identifies the periodicity of the function to be
roughly 2*pi (6.28), while KRR chooses the doubled periodicity 4*pi. Besides
that, GPR provides reasonable confidence bounds on the prediction, which are
not available for KRR. A major difference between the two methods is the time
required for fitting and predicting: while fitting KRR is fast in principle,
the grid search for hyperparameter optimization scales exponentially with the
number of hyperparameters ("curse of dimensionality"). The gradient-based
optimization of the parameters in GPR does not suffer from this exponential
scaling and is thus considerably faster on this example with its 3-dimensional
hyperparameter space. The time for predicting is similar; however, generating
the variance of the predictive distribution of GPR takes considerably longer
than just predicting the mean.
"""
print(__doc__)

# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# License: BSD 3 clause


import time

import numpy as np

import matplotlib.pyplot as plt

from sklearn.kernel_ridge import KernelRidge
from sklearn.grid_search import GridSearchCV
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ExpSineSquared

rng = np.random.RandomState(0)

# Generate sample data
X = 15 * rng.rand(100, 1)
y = np.sin(X).ravel()
y += 3 * (0.5 - rng.rand(X.shape[0]))  # add noise

# Fit KernelRidge with parameter selection based on 5-fold cross validation
param_grid = {"alpha": [1e0, 1e-1, 1e-2, 1e-3],
              "kernel": [ExpSineSquared(l, p)
                         for l in np.logspace(-2, 2, 10)
                         for p in np.logspace(0, 2, 10)]}
kr = GridSearchCV(KernelRidge(), cv=5, param_grid=param_grid)
stime = time.time()
kr.fit(X, y)
print("Time for KRR fitting: %.3f" % (time.time() - stime))

gp_kernel = ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1)) \
    + WhiteKernel(1e-1)
gpr = GaussianProcessRegressor(kernel=gp_kernel)
stime = time.time()
gpr.fit(X, y)
print("Time for GPR fitting: %.3f" % (time.time() - stime))

# Predict using kernel ridge
X_plot = np.linspace(0, 20, 10000)[:, None]
stime = time.time()
y_kr = kr.predict(X_plot)
print("Time for KRR prediction: %.3f" % (time.time() - stime))

# Predict using Gaussian process regression (mean only)
stime = time.time()
y_gpr = gpr.predict(X_plot, return_std=False)
print("Time for GPR prediction: %.3f" % (time.time() - stime))

stime = time.time()
y_gpr, y_std = gpr.predict(X_plot, return_std=True)
print("Time for GPR prediction with standard-deviation: %.3f"
      % (time.time() - stime))

# Plot results
plt.scatter(X, y, c='k', label='data')
plt.plot(X_plot, np.sin(X_plot), c='k', label='True')
plt.plot(X_plot, y_kr, c='g', label='KRR (%s)' % kr.best_params_)
plt.plot(X_plot, y_gpr, c='r', label='GPR (%s)' % gpr.kernel_)
plt.fill_between(X_plot[:, 0], y_gpr - y_std, y_gpr + y_std, color='r',
                 alpha=0.2)
plt.xlabel('data')
plt.ylabel('target')
plt.xlim(0, 20)
plt.title('GPR versus Kernel Ridge')
plt.legend(loc="best", prop={'size': 10})
plt.show()
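The docstring above notes that GPR can provide posterior samples in addition to confidence intervals, though the script only plots the mean and a one-standard-deviation band. A small hedged follow-up, reusing the fitted gpr and X_plot from the script and assuming GaussianProcessRegressor.sample_y is available:

# Hedged follow-up (reuses gpr and X_plot defined above): draw a few posterior
# samples to visualize alongside the mean +/- std band, before calling plt.show().
y_samples = gpr.sample_y(X_plot, n_samples=3, random_state=0)
plt.plot(X_plot, y_samples, lw=0.5, alpha=0.7)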

examples/gaussian_process/plot_gpc.py

Lines changed: 100 additions & 0 deletions
"""
====================================================================
Probabilistic predictions with Gaussian process classification (GPC)
====================================================================

This example illustrates the predicted probability of GPC for an RBF kernel
with different choices of the hyperparameters. The first figure shows the
predicted probability of GPC with arbitrarily chosen hyperparameters and with
the hyperparameters corresponding to the maximum log-marginal-likelihood (LML).

While the hyperparameters chosen by optimizing LML have a considerably larger
LML, they perform slightly worse according to the log-loss on test data. The
figure shows that this is because they exhibit a steep change of the class
probabilities at the class boundaries (which is good) but have predicted
probabilities close to 0.5 far away from the class boundaries (which is bad).
This undesirable effect is caused by the Laplace approximation used
internally by GPC.

The second figure shows the log-marginal-likelihood for different choices of
the kernel's hyperparameters, highlighting the two choices of the
hyperparameters used in the first figure by black dots.
"""
print(__doc__)

# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
#
# License: BSD 3 clause

import numpy as np

from matplotlib import pyplot as plt

from sklearn.metrics.classification import accuracy_score, log_loss
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF


# Generate data
train_size = 50
rng = np.random.RandomState(0)
X = rng.uniform(0, 5, 100)[:, np.newaxis]
y = np.array(X[:, 0] > 2.5, dtype=int)

# Specify Gaussian Processes with fixed and optimized hyperparameters
gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0),
                                   optimizer=None)
gp_fix.fit(X[:train_size], y[:train_size])

gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
gp_opt.fit(X[:train_size], y[:train_size])

print("Log Marginal Likelihood (initial): %.3f"
      % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta))
print("Log Marginal Likelihood (optimized): %.3f"
      % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta))

print("Accuracy: %.3f (initial) %.3f (optimized)"
      % (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])),
         accuracy_score(y[:train_size], gp_opt.predict(X[:train_size]))))
print("Log-loss: %.3f (initial) %.3f (optimized)"
      % (log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]),
         log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1])))


# Plot posteriors
plt.figure(0)
plt.scatter(X[:train_size, 0], y[:train_size], c='k', label="Train data")
plt.scatter(X[train_size:, 0], y[train_size:], c='g', label="Test data")
X_ = np.linspace(0, 5, 100)
plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis])[:, 1], 'r',
         label="Initial kernel: %s" % gp_fix.kernel_)
plt.plot(X_, gp_opt.predict_proba(X_[:, np.newaxis])[:, 1], 'b',
         label="Optimized kernel: %s" % gp_opt.kernel_)
plt.xlabel("Feature")
plt.ylabel("Class 1 probability")
plt.xlim(0, 5)
plt.ylim(-0.25, 1.5)
plt.legend(loc="best")

# Plot LML landscape
plt.figure(1)
theta0 = np.logspace(0, 8, 30)
theta1 = np.logspace(-1, 1, 29)
Theta0, Theta1 = np.meshgrid(theta0, theta1)
LML = [[gp_opt.log_marginal_likelihood(np.log([Theta0[i, j], Theta1[i, j]]))
        for i in range(Theta0.shape[0])] for j in range(Theta0.shape[1])]
LML = np.array(LML).T
plt.plot(np.exp(gp_fix.kernel_.theta)[0], np.exp(gp_fix.kernel_.theta)[1],
         'ko', zorder=10)
plt.plot(np.exp(gp_opt.kernel_.theta)[0], np.exp(gp_opt.kernel_.theta)[1],
         'ko', zorder=10)
plt.pcolor(Theta0, Theta1, LML)
plt.xscale("log")
plt.yscale("log")
plt.colorbar(label="Log-marginal Likelihood")
plt.xlabel("Magnitude")
plt.ylabel("Length-scale")
plt.title("Log-marginal-likelihood")

plt.show()
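A note on the log-space convention used above: kernel_.theta stores the log-transformed hyperparameters, which is why the LML grid wraps values in np.log and the plot applies np.exp before plotting magnitude and length-scale. A minimal hedged check, reusing gp_opt from the script:

# Reuses gp_opt from the script above (illustrative assumption).
theta = gp_opt.kernel_.theta                  # log-transformed hyperparameters
print(np.exp(theta))                          # [constant_value, length_scale] on the original scale
print(gp_opt.log_marginal_likelihood(theta))  # matches the "optimized" LML printed earlier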
