In both cases, the kernel's parameters are estimated using the maximum
likelihood principle.

-The figures illustrate the interpolating property of the Gaussian Process
-model as well as its probabilistic nature in the form of a pointwise 95%
-confidence interval.
-
-Note that the parameter ``alpha`` is applied as a Tikhonov
-regularization of the assumed covariance between the training points.
+The figures illustrate the interpolating property of the Gaussian Process model
+as well as its probabilistic nature in the form of a pointwise 95% confidence
+interval.

+Note that `alpha` is a parameter that controls the strength of the Tikhonov
+regularization applied to the covariance matrix of the training points.
"""

# Author: Vincent Dubourg <vincent.dubourg@gmail.com>
#         Jake Vanderplas <vanderplas@astro.washington.edu>
-#         Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>s
+#         Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
+#         Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: BSD 3 clause

+# %%
+# Dataset generation
+# ------------------
+#
+# We will start by generating a synthetic dataset. The true generative process
+# is defined as :math:`f(x) = x \sin(x)`.
import numpy as np
-from matplotlib import pyplot as plt
-
-from sklearn.gaussian_process import GaussianProcessRegressor
-from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
-
-np.random.seed(1)
-
-
-def f(x):
-    """The function to predict."""
-    return x * np.sin(x)
-
-
-# ----------------------------------------------------------------------
-# First the noiseless case
-X = np.atleast_2d([1.0, 3.0, 5.0, 6.0, 7.0, 8.0]).T

-# Observations
-y = f(X).ravel()
+X = np.linspace(start=0, stop=10, num=1_000).reshape(-1, 1)
+y = np.squeeze(X * np.sin(X))

-# Mesh the input space for evaluations of the real function, the prediction and
-# its MSE
-x = np.atleast_2d(np.linspace(0, 10, 1000)).T
+# %%
+import matplotlib.pyplot as plt

-# Instantiate a Gaussian Process model
-kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
-gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)
-
-# Fit to data using Maximum Likelihood Estimation of the parameters
-gp.fit(X, y)
-
-# Make the prediction on the meshed x-axis (ask for MSE as well)
-y_pred, sigma = gp.predict(x, return_std=True)
-
-# Plot the function, the prediction and the 95% confidence interval based on
-# the MSE
-plt.figure()
-plt.plot(x, f(x), "r:", label=r"$f(x) = x\,\sin(x)$")
-plt.plot(X, y, "r.", markersize=10, label="Observations")
-plt.plot(x, y_pred, "b-", label="Prediction")
-plt.fill(
-    np.concatenate([x, x[::-1]]),
-    np.concatenate([y_pred - 1.9600 * sigma, (y_pred + 1.9600 * sigma)[::-1]]),
+plt.plot(X, y, label=r"$f(x) = x \sin(x)$", linestyle="dotted")
+plt.legend()
+plt.xlabel("$x$")
+plt.ylabel("$f(x)$")
+_ = plt.title("True generative process")
+
+# %%
+# We will use this dataset in the next experiment to illustrate how Gaussian
+# Process regression works.
+#
+# Example with noise-free target
+# ------------------------------
+#
+# In this first example, we will use the true generative process without
+# adding any noise. For training the Gaussian Process regression, we will only
+# select a few samples.
+rng = np.random.RandomState(1)
+training_indices = rng.choice(np.arange(y.size), size=6, replace=False)
+X_train, y_train = X[training_indices], y[training_indices]
+
+# %%
+# Now, we fit a Gaussian process on these few training data samples. We will
+# use a radial basis function (RBF) kernel and a constant parameter to fit the
+# amplitude.
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import RBF
+
+kernel = 1 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))
+gaussian_process = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)
+gaussian_process.fit(X_train, y_train)
+gaussian_process.kernel_
+
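The hyperparameters shown by `kernel_` are the ones selected by maximizing the
log-marginal likelihood during `fit`. A minimal sketch of how to inspect them,
assuming the `gaussian_process` object fitted above (`kernel_.theta` and
`log_marginal_likelihood` are public scikit-learn API):

# Report the kernel hyperparameters chosen by maximum likelihood and the
# corresponding log-marginal likelihood of the fitted model.
print(gaussian_process.kernel_)
print(gaussian_process.log_marginal_likelihood(gaussian_process.kernel_.theta))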
+# %%
+# After fitting our model, we see that the hyperparameters of the kernel have
+# been optimized. Now, we will use our fitted model to compute the mean
+# prediction on the full dataset and plot the 95% confidence interval.
+mean_prediction, std_prediction = gaussian_process.predict(X, return_std=True)
+
+plt.plot(X, y, label=r"$f(x) = x \sin(x)$", linestyle="dotted")
+plt.scatter(X_train, y_train, label="Observations")
+plt.plot(X, mean_prediction, label="Mean prediction")
+plt.fill_between(
+    X.ravel(),
+    mean_prediction - 1.96 * std_prediction,
+    mean_prediction + 1.96 * std_prediction,
    alpha=0.5,
-    fc="b",
-    ec="None",
-    label="95% confidence interval",
+    label=r"95% confidence interval",
)
+plt.legend()
plt.xlabel("$x$")
plt.ylabel("$f(x)$")
-plt.ylim(-10, 20)
-plt.legend(loc="upper left")
-
-# ----------------------------------------------------------------------
-# now the noisy case
-X = np.linspace(0.1, 9.9, 20)
-X = np.atleast_2d(X).T
-
-# Observations and noise
-y = f(X).ravel()
-dy = 0.5 + 1.0 * np.random.random(y.shape)
-noise = np.random.normal(0, dy)
-y += noise
-
-# Instantiate a Gaussian Process model
-gp = GaussianProcessRegressor(kernel=kernel, alpha=dy ** 2, n_restarts_optimizer=10)
-
-# Fit to data using Maximum Likelihood Estimation of the parameters
-gp.fit(X, y)
-
-# Make the prediction on the meshed x-axis (ask for MSE as well)
-y_pred, sigma = gp.predict(x, return_std=True)
-
-# Plot the function, the prediction and the 95% confidence interval based on
-# the MSE
-plt.figure()
-plt.plot(x, f(x), "r:", label=r"$f(x) = x\,\sin(x)$")
-plt.errorbar(X.ravel(), y, dy, fmt="r.", markersize=10, label="Observations")
-plt.plot(x, y_pred, "b-", label="Prediction")
-plt.fill(
-    np.concatenate([x, x[::-1]]),
-    np.concatenate([y_pred - 1.9600 * sigma, (y_pred + 1.9600 * sigma)[::-1]]),
+_ = plt.title("Gaussian process regression on noise-free dataset")
+
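The interpolating property mentioned in the docstring can be verified
numerically: with noise-free targets, the posterior mean passes through the
training points and the predictive standard deviation shrinks towards zero
there. A small sketch, assuming the noise-free `gaussian_process` fitted above:

# Check the interpolation at the training points (noise-free case): the mean
# should match y_train and the standard deviation should be close to zero.
mean_at_train, std_at_train = gaussian_process.predict(X_train, return_std=True)
print(np.allclose(mean_at_train, y_train, atol=1e-6))  # expected: True
print(std_at_train.max())                              # expected: close to 0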
+# %%
+# We see that for a prediction made on a data point close to the one from the
+# training set, the 95% confidence interval has a small amplitude. Whenever a
+# sample falls far from the training data, our model's prediction is less
+# accurate and less precise (higher uncertainty).
+#
+# Example with noisy targets
+# --------------------------
+#
+# We can repeat a similar experiment, this time adding noise to the target.
+# This makes it possible to see the effect of the noise on the fitted model.
+#
+# We add some random Gaussian noise to the target with an arbitrary
+# standard deviation.
+noise_std = 0.75
+y_train_noisy = y_train + rng.normal(loc=0.0, scale=noise_std, size=y_train.shape)
+
+# %%
+# We create a similar Gaussian process model. In addition to the kernel, this
+# time we specify the parameter `alpha`, which can be interpreted as the
+# variance of additive Gaussian noise on the targets.
+gaussian_process = GaussianProcessRegressor(
+    kernel=kernel, alpha=noise_std ** 2, n_restarts_optimizer=9
+)
+gaussian_process.fit(X_train, y_train_noisy)
+mean_prediction, std_prediction = gaussian_process.predict(X, return_std=True)
+
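To make the Tikhonov remark from the docstring concrete: `alpha` is added to
the diagonal of the kernel matrix evaluated at the training points, so the
model effectively works with K(X_train, X_train) + alpha * I. A sketch of that
regularized covariance, assuming the objects defined above (scikit-learn
kernels are callable):

# Build the regularized training covariance used by the regressor internally:
# the kernel matrix plus alpha on its diagonal.
K = gaussian_process.kernel_(X_train)
K_regularized = K + noise_std ** 2 * np.eye(len(X_train))
print(K_regularized.shape)  # (6, 6): one row/column per training sample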
+# %%
+# Let's plot the mean prediction and the uncertainty region as before.
+plt.plot(X, y, label=r"$f(x) = x \sin(x)$", linestyle="dotted")
+plt.errorbar(
+    X_train,
+    y_train_noisy,
+    noise_std,
+    linestyle="None",
+    color="tab:blue",
+    marker=".",
+    markersize=10,
+    label="Observations",
+)
+plt.plot(X, mean_prediction, label="Mean prediction")
+plt.fill_between(
+    X.ravel(),
+    mean_prediction - 1.96 * std_prediction,
+    mean_prediction + 1.96 * std_prediction,
+    color="tab:orange",
    alpha=0.5,
-    fc="b",
-    ec="None",
-    label="95% confidence interval",
+    label=r"95% confidence interval",
)
+plt.legend()
plt.xlabel("$x$")
plt.ylabel("$f(x)$")
-plt.ylim(-10, 20)
-plt.legend(loc="upper left")
+_ = plt.title("Gaussian process regression on a noisy dataset")

-plt.show()
+# %%
+# The noise affects the predictions close to the training samples: the
+# predictive uncertainty near the training samples is larger because we
+# explicitly model a given level of target noise that is independent of the
+# input variable.
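Under the same assumptions as the earlier sketches, this behaviour can be
checked directly: modeling the noise through `alpha` keeps the predictive
uncertainty from collapsing at the training points.

# With alpha > 0, the standard deviation at the training inputs stays
# strictly positive instead of shrinking towards zero as in the noise-free case.
_, std_at_train_noisy = gaussian_process.predict(X_train, return_std=True)
print(std_at_train_noisy.min())  # expected: clearly above zero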