diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index b310ba3ea3dbf..59a51fd01fa03 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -490,22 +490,30 @@ From text :toctree: generated/ :template: class.rst + gaussian_process.GaussianProcessRegressor + gaussian_process.GaussianProcessClassifier gaussian_process.GaussianProcess -.. autosummary:: - :toctree: generated - :template: function.rst +Kernels: - gaussian_process.correlation_models.absolute_exponential - gaussian_process.correlation_models.squared_exponential - gaussian_process.correlation_models.generalized_exponential - gaussian_process.correlation_models.pure_nugget - gaussian_process.correlation_models.cubic - gaussian_process.correlation_models.linear - gaussian_process.regression_models.constant - gaussian_process.regression_models.linear - gaussian_process.regression_models.quadratic +.. autosummary:: + :toctree: generated/ + :template: class.rst + gaussian_process.kernels.Kernel + gaussian_process.kernels.Sum + gaussian_process.kernels.Product + gaussian_process.kernels.Exponentiation + gaussian_process.kernels.ConstantKernel + gaussian_process.kernels.WhiteKernel + gaussian_process.kernels.RBF + gaussian_process.kernels.Matern + gaussian_process.kernels.RationalQuadratic + gaussian_process.kernels.ExpSineSquared + gaussian_process.kernels.DotProduct + gaussian_process.kernels.PairwiseKernel + gaussian_process.kernels.CompoundKernel + gaussian_process.kernels.Hyperparameter .. _grid_search_ref: diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst index a272dd177fa1e..52049efc16153 100644 --- a/doc/modules/gaussian_process.rst +++ b/doc/modules/gaussian_process.rst @@ -8,45 +8,598 @@ Gaussian Processes .. currentmodule:: sklearn.gaussian_process -**Gaussian Processes for Machine Learning (GPML)** is a generic supervised -learning method primarily designed to solve *regression* problems. It has also -been extended to *probabilistic classification*, but in the present -implementation, this is only a post-processing of the *regression* exercise. +**Gaussian Processes (GP)** are a generic supervised learning method designed +to solve *regression* and *probabilistic classification* problems. -The advantages of Gaussian Processes for Machine Learning are: +The advantages of Gaussian processes are: - The prediction interpolates the observations (at least for regular - correlation models). + kernels). - The prediction is probabilistic (Gaussian) so that one can compute - empirical confidence intervals and exceedance probabilities that might be - used to refit (online fitting, adaptive fitting) the prediction in some + empirical confidence intervals and decide based on those if one should + refit (online fitting, adaptive fitting) the prediction in some region of interest. - - Versatile: different :ref:`linear regression models - ` and :ref:`correlation models - ` can be specified. Common models are provided, but - it is also possible to specify custom models provided they are - stationary. + - Versatile: different :ref:`kernels + ` can be specified. Common kernels are provided, but + it is also possible to specify custom kernels. -The disadvantages of Gaussian Processes for Machine Learning include: +The disadvantages of Gaussian processes include: - - It is not sparse. It uses the whole samples/features information to + - They are not sparse, i.e., they use the whole samples/features information to perform the prediction. 
-  - It loses efficiency in high dimensional spaces -- namely when the number
-    of features exceeds a few dozens. It might indeed give poor performance
-    and it loses computational efficiency.
+  - They lose efficiency in high dimensional spaces -- namely when the number
+    of features exceeds a few dozen.

-  - Classification is only a post-processing, meaning that one first need
-    to solve a regression problem by providing the complete scalar float
-    precision output :math:`y` of the experiment one attempt to model.

-Thanks to the Gaussian property of the prediction, it has been given varied
-applications: e.g. for global optimization, probabilistic classification.
+.. _gpr:

-Examples
-========
+Gaussian Process Regression (GPR)
+=================================
+
+.. currentmodule:: sklearn.gaussian_process
+
+The :class:`GaussianProcessRegressor` implements Gaussian processes (GP) for
+regression purposes. For this, the prior of the GP needs to be specified. The
+prior mean is assumed to be constant and zero (for ``normalize_y=False``) or the
+training data's mean (for ``normalize_y=True``). The prior's
+covariance is specified by passing a :ref:`kernel <gp_kernels>` object. The
+hyperparameters of the kernel are optimized during fitting of
+GaussianProcessRegressor by maximizing the log-marginal-likelihood (LML) based
+on the passed ``optimizer``. As the LML may have multiple local optima, the
+optimizer can be started repeatedly by specifying ``n_restarts_optimizer``. The
+first run is always conducted starting from the initial hyperparameter values
+of the kernel; subsequent runs are conducted from hyperparameter values
+that have been chosen randomly from the range of allowed values.
+If the initial hyperparameters should be kept fixed, `None` can be passed as
+optimizer.
+
+The noise level in the targets can be specified by passing it via the
+parameter ``alpha``, either globally as a scalar or per datapoint.
+Note that a moderate noise level can also be helpful for dealing with numeric
+issues during fitting as it is effectively implemented as Tikhonov
+regularization, i.e., by adding it to the diagonal of the kernel matrix. An
+alternative to specifying the noise level explicitly is to include a
+WhiteKernel component into the kernel, which can estimate the global noise
+level from the data (see example below).
+
+The implementation is based on Algorithm 2.1 of [RW2006]_. In addition to
+the API of standard sklearn estimators, GaussianProcessRegressor:
+   * allows prediction without prior fitting (based on the GP prior)
+   * provides an additional method ``sample_y(X)``, which evaluates samples
+     drawn from the GPR (prior or posterior) at given inputs
+   * exposes a method ``log_marginal_likelihood(theta)``, which can be used
+     externally for other ways of selecting hyperparameters, e.g., via
+     Markov chain Monte Carlo.
+
+
+GPR examples
+============
+
+GPR with noise-level estimation
+-------------------------------
+This example illustrates that GPR with a sum-kernel including a WhiteKernel can
+estimate the noise level of data. An illustration of the
+log-marginal-likelihood (LML) landscape shows that there exist two local
+maxima of LML.
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_noisy_000.png
+   :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html
+   :align: center
+
+The first corresponds to a model with a high noise level and a
+large length scale, which explains all variations in the data by noise.
+
+..
figure:: ../auto_examples/gaussian_process/images/plot_gpr_noisy_001.png + :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html + :align: center + +The second one has a smaller noise level and shorter length scale, which explains +most of the variation by the noise-free functional relationship. The second +model has a higher likelihood; however, depending on the initial value for the +hyperparameters, the gradient-based optimization might also converge to the +high-noise solution. It is thus important to repeat the optimization several +times for different initializations. + +.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_noisy_002.png + :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html + :align: center + + +Comparison of GPR and Kernel Ridge Regression +--------------------------------------------- + +Both kernel ridge regression (KRR) and GPR learn +a target function by employing internally the "kernel trick". KRR learns a +linear function in the space induced by the respective kernel which corresponds +to a non-linear function in the original space. The linear function in the +kernel space is chosen based on the mean-squared error loss with +ridge regularization. GPR uses the kernel to define the covariance of +a prior distribution over the target functions and uses the observed training +data to define a likelihood function. Based on Bayes theorem, a (Gaussian) +posterior distribution over target functions is defined, whose mean is used +for prediction. + +A major difference is that GPR can choose the kernel's hyperparameters based +on gradient-ascent on the marginal likelihood function while KRR needs to +perform a grid search on a cross-validated loss function (mean-squared error +loss). A further difference is that GPR learns a generative, probabilistic +model of the target function and can thus provide meaningful confidence +intervals and posterior samples along with the predictions while KRR only +provides predictions. + +The following figure illustrates both methods on an artificial dataset, which +consists of a sinusoidal target function and strong noise. The figure compares +the learned model of KRR and GPR based on a ExpSineSquared kernel, which is +suited for learning periodic functions. The kernel's hyperparameters control +the smoothness (length_scale) and periodicity of the kernel (periodicity). +Moreover, the noise level +of the data is learned explicitly by GPR by an additional WhiteKernel component +in the kernel and by the regularization parameter alpha of KRR. + +.. figure:: ../auto_examples/gaussian_process/images/plot_compare_gpr_krr_001.png + :target: ../auto_examples/gaussian_process/plot_compare_gpr_krr.html + :align: center + +The figure shows that both methods learn reasonable models of the target +function. GPR correctly identifies the periodicity of the function to be +roughly :math:`2*\pi` (6.28), while KRR chooses the doubled periodicity +:math:`4*\pi` . Besides +that, GPR provides reasonable confidence bounds on the prediction which are not +available for KRR. A major difference between the two methods is the time +required for fitting and predicting: while fitting KRR is fast in principle, +the grid-search for hyperparameter optimization scales exponentially with the +number of hyperparameters ("curse of dimensionality"). The gradient-based +optimization of the parameters in GPR does not suffer from this exponential +scaling and is thus considerable faster on this example with 3-dimensional +hyperparameter space. 
The time for predicting is similar; however, generating +the variance of the predictive distribution of GPR takes considerable longer +than just predicting the mean. + +GPR on Mauna Loa CO2 data +------------------------- + +This example is based on Section 5.4.3 of "Gaussian Processes for Machine +Learning" [RW2006]_. It illustrates an example of complex kernel engineering and +hyperparameter optimization using gradient ascent on the +log-marginal-likelihood. The data consists of the monthly average atmospheric +CO2 concentrations (in parts per million by volume (ppmv)) collected at the +Mauna Loa Observatory in Hawaii, between 1958 and 1997. The objective is to +model the CO2 concentration as a function of the time t. + +The kernel is composed of several terms that are responsible for explaining +different properties of the signal: + - a long term, smooth rising trend is to be explained by an RBF kernel. The + RBF kernel with a large length-scale enforces this component to be smooth; + it is not enforced that the trend is rising which leaves this choice to the + GP. The specific length-scale and the amplitude are free hyperparameters. + - a seasonal component, which is to be explained by the periodic + ExpSineSquared kernel with a fixed periodicity of 1 year. The length-scale + of this periodic component, controlling its smoothness, is a free parameter. + In order to allow decaying away from exact periodicity, the product with an + RBF kernel is taken. The length-scale of this RBF component controls the + decay time and is a further free parameter. + - smaller, medium term irregularities are to be explained by a + RationalQuadratic kernel component, whose length-scale and alpha parameter, + which determines the diffuseness of the length-scales, are to be determined. + According to [RW2006]_, these irregularities can better be explained by + a RationalQuadratic than an RBF kernel component, probably because it can + accommodate several length-scales. + - a "noise" term, consisting of an RBF kernel contribution, which shall + explain the correlated noise components such as local weather phenomena, + and a WhiteKernel contribution for the white noise. The relative amplitudes + and the RBF's length scale are further free parameters. + +Maximizing the log-marginal-likelihood after subtracting the target's mean +yields the following kernel with an LML of -83.214: + +:: + + 34.4**2 * RBF(length_scale=41.8) + + 3.27**2 * RBF(length_scale=180) * ExpSineSquared(length_scale=1.44, + periodicity=1) + + 0.446**2 * RationalQuadratic(alpha=17.7, length_scale=0.957) + + 0.197**2 * RBF(length_scale=0.138) + WhiteKernel(noise_level=0.0336) + +Thus, most of the target signal (34.4ppm) is explained by a long-term rising +trend (length-scale 41.8 years). The periodic component has an amplitude of +3.27ppm, a decay time of 180 years and a length-scale of 1.44. The long decay +time indicates that we have a locally very close to periodic seasonal +component. The correlated noise has an amplitude of 0.197ppm with a length +scale of 0.138 years and a white-noise contribution of 0.197ppm. Thus, the +overall noise level is very small, indicating that the data can be very well +explained by the model. The figure shows also that the model makes very +confident predictions until around 2015 + +.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_co2_001.png + :target: ../auto_examples/gaussian_process/plot_gpr_co2.html + :align: center + +.. 
_gpc:
+
+Gaussian Process Classification (GPC)
+=====================================
+
+.. currentmodule:: sklearn.gaussian_process
+
+The :class:`GaussianProcessClassifier` implements Gaussian processes (GP) for
+classification purposes, more specifically for probabilistic classification,
+where test predictions take the form of class probabilities.
+GaussianProcessClassifier places a GP prior on a latent function :math:`f`,
+which is then squashed through a link function to obtain the probabilistic
+classification. The latent function :math:`f` is a so-called nuisance function,
+whose values are not observed and are not relevant by themselves.
+Its purpose is to allow a convenient formulation of the model, and :math:`f`
+is removed (integrated out) during prediction. GaussianProcessClassifier
+implements the logistic link function, for which the integral cannot be
+computed analytically but is easily approximated in the binary case.
+
+In contrast to the regression setting, the posterior of the latent function
+:math:`f` is not Gaussian even for a GP prior since a Gaussian likelihood is
+inappropriate for discrete class labels. Rather, a non-Gaussian likelihood
+corresponding to the logistic link function (logit) is used.
+GaussianProcessClassifier approximates the non-Gaussian posterior with a
+Gaussian based on the Laplace approximation. More details can be found in
+Chapter 3 of [RW2006]_.
+
+The GP prior mean is assumed to be zero. The prior's
+covariance is specified by passing a :ref:`kernel <gp_kernels>` object. The
+hyperparameters of the kernel are optimized during fitting of
+GaussianProcessClassifier by maximizing the log-marginal-likelihood (LML) based
+on the passed ``optimizer``. As the LML may have multiple local optima, the
+optimizer can be started repeatedly by specifying ``n_restarts_optimizer``. The
+first run is always conducted starting from the initial hyperparameter values
+of the kernel; subsequent runs are conducted from hyperparameter values
+that have been chosen randomly from the range of allowed values.
+If the initial hyperparameters should be kept fixed, `None` can be passed as
+optimizer.
+
+:class:`GaussianProcessClassifier` supports multi-class classification
+by performing either one-versus-rest or one-versus-one based training and
+prediction. In one-versus-rest, one binary Gaussian process classifier is
+fitted for each class, which is trained to separate this class from the rest.
+In "one_vs_one", one binary Gaussian process classifier is fitted for each pair
+of classes, which is trained to separate these two classes. The predictions of
+these binary predictors are combined into multi-class predictions. See the
+section on :ref:`multi-class classification <multiclass>` for more details.
+
+In the case of Gaussian process classification, "one_vs_one" might be
+computationally cheaper since it has to solve many problems involving only a
+subset of the whole training set rather than fewer problems on the whole
+dataset. Since Gaussian process classification scales cubically with the size
+of the dataset, this might be considerably faster. However, note that
+"one_vs_one" does not support predicting probability estimates but only plain
+predictions. Moreover, note that :class:`GaussianProcessClassifier` does not
+(yet) implement a true multi-class Laplace approximation internally, but
+as discussed above is based on solving several binary classification tasks
+internally, which are combined using one-versus-rest or one-versus-one.
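+
+As a quick, minimal sketch of the API described above (this is not one of the
+shipped examples; the synthetic dataset and the hyperparameter values below are
+arbitrary illustrations), a binary GPC with an RBF kernel could be fitted and
+queried for class probabilities as follows::
+
+    >>> import numpy as np
+    >>> from sklearn.gaussian_process import GaussianProcessClassifier
+    >>> from sklearn.gaussian_process.kernels import RBF
+    >>> rng = np.random.RandomState(0)
+    >>> X = rng.uniform(0, 5, (50, 1))        # 50 one-dimensional inputs
+    >>> y = (X[:, 0] > 2.5).astype(int)       # binary labels
+    >>> kernel = 1.0 * RBF(length_scale=1.0)  # hyperparameters tuned by maximizing the LML
+    >>> gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
+    >>> proba = gpc.predict_proba(np.array([[1.0], [4.0]]))  # shape (2, n_classes)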
+
+GPC examples
+============
+
+Probabilistic predictions with GPC
+----------------------------------
+
+This example illustrates the predicted probability of GPC for an RBF kernel
+with different choices of the hyperparameters. The first figure shows the
+predicted probability of GPC with arbitrarily chosen hyperparameters and with
+the hyperparameters corresponding to the maximum log-marginal-likelihood (LML).
+
+While the hyperparameters chosen by optimizing LML have a considerably larger
+LML, they perform slightly worse according to the log-loss on test data. The
+figure shows that this is because they exhibit a steep change of the class
+probabilities at the class boundaries (which is good) but have predicted
+probabilities close to 0.5 far away from the class boundaries (which is bad).
+This undesirable effect is caused by the Laplace approximation used
+internally by GPC.
+
+The second figure shows the log-marginal-likelihood for different choices of
+the kernel's hyperparameters, highlighting the two choices of the
+hyperparameters used in the first figure by black dots.
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpc_000.png
+   :target: ../auto_examples/gaussian_process/plot_gpc.html
+   :align: center
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpc_001.png
+   :target: ../auto_examples/gaussian_process/plot_gpc.html
+   :align: center
+
+
+Illustration of GPC on the XOR dataset
+--------------------------------------
+
+.. currentmodule:: sklearn.gaussian_process.kernels
+
+This example illustrates GPC on XOR data. Compared are a stationary, isotropic
+kernel (:class:`RBF`) and a non-stationary kernel (:class:`DotProduct`). On this particular
+dataset, the :class:`DotProduct` kernel obtains considerably better results because the
+class-boundaries are linear and coincide with the coordinate axes. In practice,
+however, stationary kernels such as :class:`RBF` often obtain better results.
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpc_xor_001.png
+   :target: ../auto_examples/gaussian_process/plot_gpc_xor.html
+   :align: center
+
+.. currentmodule:: sklearn.gaussian_process
+
+
+Gaussian process classification (GPC) on iris dataset
+-----------------------------------------------------
+
+This example illustrates the predicted probability of GPC for an isotropic
+and anisotropic RBF kernel on a two-dimensional version of the iris dataset.
+This illustrates the applicability of GPC to non-binary classification.
+The anisotropic RBF kernel obtains slightly higher log-marginal-likelihood by
+assigning different length-scales to the two feature dimensions.
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpc_iris_001.png
+   :target: ../auto_examples/gaussian_process/plot_gpc_iris.html
+   :align: center
+
+
+.. _gp_kernels:
+
+Kernels for Gaussian Processes
+==============================
+.. currentmodule:: sklearn.gaussian_process.kernels
+
+Kernels (also called "covariance functions" in the context of GPs) are a crucial
+ingredient of GPs which determine the shape of prior and posterior of the GP.
+They encode the assumptions on the function being learned by defining the "similarity"
+of two datapoints combined with the assumption that similar datapoints should
+have similar target values.
Two categories of kernels can be distinguished:
+stationary kernels depend only on the distance of two datapoints and not on their
+absolute values :math:`k(x_i, x_j)= k(d(x_i, x_j))` and are thus invariant to
+translations in the input space, while non-stationary kernels
+depend also on the specific values of the datapoints. Stationary kernels can further
+be subdivided into isotropic and anisotropic kernels, where isotropic kernels are
+also invariant to rotations in the input space. For more details, we refer to
+Chapter 4 of [RW2006]_.
+
+Gaussian Process Kernel API
+---------------------------
+The main usage of a :class:`Kernel` is to compute the GP's covariance between
+datapoints. For this, the method ``__call__`` of the kernel can be called. This
+method can either be used to compute the "auto-covariance" of all pairs of
+datapoints in a 2d array X, or the "cross-covariance" of all combinations
+of datapoints of a 2d array X with datapoints in a 2d array Y. The following
+identity holds true for all kernels k (except for the :class:`WhiteKernel`):
+``k(X) == k(X, Y=X)``
+
+If only the diagonal of the auto-covariance is being used, the method ``diag()``
+of a kernel can be called, which is more computationally efficient than the
+equivalent call to ``__call__``: ``np.diag(k(X, X)) == k.diag(X)``
+
+Kernels are parameterized by a vector :math:`\theta` of hyperparameters. These
+hyperparameters can for instance control length-scales or periodicity of a
+kernel (see below). All kernels support computing analytic gradients
+of the kernel's auto-covariance with respect to :math:`\theta` via setting
+``eval_gradient=True`` in the ``__call__`` method. This gradient is used by the
+Gaussian process (both regressor and classifier) in computing the gradient
+of the log-marginal-likelihood, which in turn is used to determine the
+value of :math:`\theta` that maximizes the log-marginal-likelihood via
+gradient ascent. For each hyperparameter, the initial value and the
+bounds need to be specified when creating an instance of the kernel. The
+current value of :math:`\theta` can be retrieved and set via the property
+``theta`` of the kernel object. Moreover, the bounds of the hyperparameters can be
+accessed by the property ``bounds`` of the kernel. Note that both properties
+(theta and bounds) return log-transformed values of the internally used values
+since those are typically more amenable to gradient-based optimization.
+The specification of each hyperparameter is stored in the form of an instance of
+:class:`Hyperparameter` in the respective kernel. Note that a kernel using a
+hyperparameter with name "x" must have the attributes self.x and self.x_bounds.
+
+The abstract base class for all kernels is :class:`Kernel`. Kernel implements a
+similar interface to :class:`Estimator`, providing the methods ``get_params()``,
+``set_params()``, and ``clone()``. This allows setting kernel values also via
+meta-estimators such as :class:`Pipeline` or :class:`GridSearch`. Note that due to the nested
+structure of kernels (by applying kernel operators, see below), the names of
+kernel parameters might become relatively complicated. In general, for a
+binary kernel operator, parameters of the left operand are prefixed with ``k1__``
+and parameters of the right operand with ``k2__``. An additional convenience
+method is ``clone_with_theta(theta)``, which returns a cloned version of the
+kernel but with the hyperparameters set to ``theta``.
An illustrative example:
+
+    >>> from sklearn.gaussian_process.kernels import ConstantKernel, RBF
+    >>> kernel = ConstantKernel(constant_value=1.0, constant_value_bounds=(0.0, 10.0)) * RBF(length_scale=0.5, length_scale_bounds=(0.0, 10.0)) + RBF(length_scale=2.0, length_scale_bounds=(0.0, 10.0))
+    >>> for hyperparameter in kernel.hyperparameters: print(hyperparameter)
+    Hyperparameter(name='k1__k1__constant_value', value_type='numeric', bounds=array([[ 0., 10.]]), n_elements=1, fixed=False)
+    Hyperparameter(name='k1__k2__length_scale', value_type='numeric', bounds=array([[ 0., 10.]]), n_elements=1, fixed=False)
+    Hyperparameter(name='k2__length_scale', value_type='numeric', bounds=array([[ 0., 10.]]), n_elements=1, fixed=False)
+    >>> params = kernel.get_params()
+    >>> for key in sorted(params): print("%s : %s" % (key, params[key]))
+    k1 : 1**2 * RBF(length_scale=0.5)
+    k1__k1 : 1**2
+    k1__k1__constant_value : 1.0
+    k1__k1__constant_value_bounds : (0.0, 10.0)
+    k1__k2 : RBF(length_scale=0.5)
+    k1__k2__length_scale : 0.5
+    k1__k2__length_scale_bounds : (0.0, 10.0)
+    k2 : RBF(length_scale=2)
+    k2__length_scale : 2.0
+    k2__length_scale_bounds : (0.0, 10.0)
+    >>> print(kernel.theta)  # Note: log-transformed
+    [ 0. -0.69314718 0.69314718]
+    >>> print(kernel.bounds)  # Note: log-transformed
+    [[ -inf 2.30258509]
+     [ -inf 2.30258509]
+     [ -inf 2.30258509]]
+
+
+All Gaussian process kernels are interoperable with :mod:`sklearn.metrics.pairwise`
+and vice versa: instances of subclasses of :class:`Kernel` can be passed as
+``metric`` to ``pairwise_kernels`` from :mod:`sklearn.metrics.pairwise`. Moreover,
+kernel functions from pairwise can be used as GP kernels by using the wrapper
+class :class:`PairwiseKernel`. The only caveat is that the gradient of
+the hyperparameters is not analytic but numeric and all those kernels support
+only isotropic distances. The parameter ``gamma`` is considered to be a
+hyperparameter and may be optimized. The other kernel parameters are set
+directly at initialization and are kept fixed.
+
+
+Basic kernels
+-------------
+The :class:`ConstantKernel` kernel can be used as part of a :class:`Product`
+kernel where it scales the magnitude of the other factor (kernel) or as part
+of a :class:`Sum` kernel, where it modifies the mean of the Gaussian process.
+It depends on a parameter :math:`constant\_value`. It is defined as:
+
+.. math::
+    k(x_i, x_j) = constant\_value \;\forall\; x_i, x_j
+
+The main use-case of the :class:`WhiteKernel` kernel is as part of a
+sum-kernel where it explains the noise-component of the signal. Tuning its
+parameter :math:`noise\_level` corresponds to estimating the noise-level.
+It is defined as:
+
+.. math::
+    k(x_i, x_j) = noise\_level \text{ if } x_i == x_j \text{ else } 0
+
+
+Kernel operators
+----------------
+Kernel operators take one or two base kernels and combine them into a new
+kernel. The :class:`Sum` kernel takes two kernels :math:`k1` and :math:`k2`
+and combines them via :math:`k_{sum}(X, Y) = k1(X, Y) + k2(X, Y)`.
+The :class:`Product` kernel takes two kernels :math:`k1` and :math:`k2`
+and combines them via :math:`k_{product}(X, Y) = k1(X, Y) * k2(X, Y)`.
+The :class:`Exponentiation` kernel takes one base kernel and a scalar parameter
+:math:`exponent` and combines them via
+:math:`k_{exp}(X, Y) = k(X, Y)^\text{exponent}`.
+
+Radial-basis function (RBF) kernel
+----------------------------------
+The :class:`RBF` kernel is a stationary kernel. It is also known as the "squared
+exponential" kernel.
It is parameterized by a length-scale parameter :math:`l>0`, which +can either be a scalar (isotropic variant of the kernel) or a vector with the same +number of dimensions as the inputs :math:`x` (anisotropic variant of the kernel). +The kernel is given by: + +.. math:: + k(x_i, x_j) = \text{exp}\left(-\frac{1}{2} d(x_i / l, x_j / l)^2\right) + +This kernel is infinitely differentiable, which implies that GPs with this +kernel as covariance function have mean square derivatives of all orders, and are thus +very smooth. The prior and posterior of a GP resulting from an RBF kernel are shown in +the following figure: + +.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_prior_posterior_000.png + :target: ../auto_examples/gaussian_process/plot_gpr_prior_posterior.html + :align: center + + +Matérn kernel +------------- +The :class:`Matern` kernel is a stationary kernel and a generalization of the +:class:`RBF` kernel. It has an additional parameter :math:`\nu` which controls +the smoothness of the resulting function. It is parameterized by a length-scale parameter :math:`l>0`, which can either be a scalar (isotropic variant of the kernel) or a vector with the same number of dimensions as the inputs :math:`x` (anisotropic variant of the kernel). The kernel is given by: + +.. math:: + + k(x_i, x_j) = \sigma^2\frac{1}{\Gamma(\nu)2^{\nu-1}}\Bigg(\gamma\sqrt{2\nu} d(x_i / l, x_j / l)\Bigg)^\nu K_\nu\Bigg(\gamma\sqrt{2\nu} d(x_i / l, x_j / l)\Bigg), + +As :math:`\nu\rightarrow\infty`, the Matérn kernel converges to the RBF kernel. +When :math:`\nu = 1/2`, the Matérn kernel becomes identical to the absolute +exponential kernel, i.e., + +.. math:: + k(x_i, x_j) = \sigma^2 \exp \Bigg(-\gamma d(x_i / l, x_j / l) \Bigg) \quad \quad \nu= \tfrac{1}{2} + +In particular, :math:`\nu = 3/2`: + +.. math:: + k(x_i, x_j) = \sigma^2 \Bigg(1 + \gamma \sqrt{3} d(x_i / l, x_j / l)\Bigg) \exp \Bigg(-\gamma \sqrt{3}d(x_i / l, x_j / l) \Bigg) \quad \quad \nu= \tfrac{3}{2} + +and :math:`\nu = 5/2`: + +.. math:: + k(x_i, x_j) = \sigma^2 \Bigg(1 + \gamma \sqrt{5}d(x_i / l, x_j / l) +\frac{5}{3} \gamma^2d(x_i / l, x_j / l)^2 \Bigg) \exp \Bigg(-\gamma \sqrt{5}d(x_i / l, x_j / l) \Bigg) \quad \quad \nu= \tfrac{5}{2} + +are popular choices for learning functions that are not infinitely +differentiable (as assumed by the RBF kernel) but at least once (:math:`\nu = +3/2`) or twice differentiable (:math:`\nu = 5/2`). + +The flexibility of controlling the smoothness of the learned function via :math:`\nu` +allows adapting to the properties of the true underlying functional relation. +The prior and posterior of a GP resulting from a Matérn kernel are shown in +the following figure: + +.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_prior_posterior_004.png + :target: ../auto_examples/gaussian_process/plot_gpr_prior_posterior.html + :align: center + +See [RW2006]_, pp84 for further details regarding the +different variants of the Matérn kernel. + +Rational quadratic kernel +------------------------- + +The :class:`RationalQuadratic` kernel can be seen as a scale mixture (an infinite sum) +of :class:`RBF` kernels with different characteristic length-scales. It is parameterized +by a length-scale parameter :math:`l>0` and a scale mixture parameter :math:`\alpha>0` +Only the isotropic variant where :math:`l` is a scalar is supported at the moment. +The kernel is given by: + +.. 
math::
+    k(x_i, x_j) = \left(1 + \frac{d(x_i, x_j)^2}{2\alpha l^2}\right)^\alpha
+
+The prior and posterior of a GP resulting from a RationalQuadratic kernel are shown in
+the following figure:
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_prior_posterior_001.png
+   :target: ../auto_examples/gaussian_process/plot_gpr_prior_posterior.html
+   :align: center
+
+Exp-Sine-Squared kernel
+-----------------------
+
+The :class:`ExpSineSquared` kernel allows modeling periodic functions.
+It is parameterized by a length-scale parameter :math:`l>0` and a periodicity parameter
+:math:`p>0`. Only the isotropic variant where :math:`l` is a scalar is supported at the moment.
+The kernel is given by:
+
+.. math::
+    k(x_i, x_j) = \text{exp}\left(-2 \left(\text{sin}(\pi / p * d(x_i, x_j)) / l\right)^2\right)
+
+The prior and posterior of a GP resulting from an ExpSineSquared kernel are shown in
+the following figure:
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_prior_posterior_002.png
+   :target: ../auto_examples/gaussian_process/plot_gpr_prior_posterior.html
+   :align: center
+
+Dot-Product kernel
+------------------
+
+The :class:`DotProduct` kernel is non-stationary and can be obtained from linear regression
+by putting :math:`N(0, 1)` priors on the coefficients of :math:`x_d (d = 1, \ldots, D)` and
+a prior of :math:`N(0, \sigma_0^2)` on the bias. The :class:`DotProduct` kernel is invariant to a rotation
+of the coordinates about the origin, but not translations.
+It is parameterized by a parameter :math:`\sigma_0^2`. For :math:`\sigma_0^2 = 0`, the kernel
+is called the homogeneous linear kernel, otherwise it is inhomogeneous. The kernel is given by:
+
+.. math::
+    k(x_i, x_j) = \sigma_0^2 + x_i \cdot x_j
+
+The :class:`DotProduct` kernel is commonly combined with exponentiation. An example with exponent 2 is
+shown in the following figure:
+
+.. figure:: ../auto_examples/gaussian_process/images/plot_gpr_prior_posterior_003.png
+   :target: ../auto_examples/gaussian_process/plot_gpr_prior_posterior.html
+   :align: center
+
+
+.. currentmodule:: sklearn.gaussian_process
+
+Legacy Gaussian Processes
+=========================
+
+In this section, the implementation of Gaussian processes used in sklearn until
+release 0.16.1 is described. Note that this implementation is deprecated and
+will be removed in version 0.18.
 
 An introductory regression example
 ----------------------------------
@@ -59,9 +612,6 @@ data. Depending on the number of parameters provided at instantiation, the
 fitting procedure may recourse to maximum likelihood estimation for the
 parameters or alternatively it uses the given parameters.
 
-.. figure:: ../auto_examples/gaussian_process/images/plot_gp_regression_001.png
-   :target: ../auto_examples/gaussian_process/plot_gp_regression.html
-   :align: center
 
 ::
 
@@ -88,34 +638,26 @@ Fitting Noisy Data
 
 When the data to be fit includes noise, the Gaussian process model
 can be used by specifying the variance of the noise for each point.
-:class:`GaussianProcess` takes a parameter ``nugget`` which
+:class:`GaussianProcess` takes a parameter ``nugget`` which
 is added to the diagonal of the correlation matrix between training points:
 in general this is a type of Tikhonov regularization. In the special case
-of a squared-exponential correlation function, this normalization is
+of a squared-exponential correlation function, this normalization is
 equivalent to specifying a fractional variance in the input. That is
 
 ..
math:: \mathrm{nugget}_i = \left[\frac{\sigma_i}{y_i}\right]^2 With ``nugget`` and ``corr`` properly set, Gaussian Processes can be -used to robustly recover an underlying function from noisy data: - -.. figure:: ../auto_examples/gaussian_process/images/plot_gp_regression_002.png - :target: ../auto_examples/gaussian_process/plot_gp_regression.html - :align: center - -.. topic:: Other examples - - * :ref:`example_gaussian_process_plot_gp_probabilistic_classification_after_regression.py` +used to robustly recover an underlying function from noisy data. Mathematical formulation -======================== +------------------------ The initial assumption ----------------------- +^^^^^^^^^^^^^^^^^^^^^^ Suppose one wants to model the output of a computer experiment, say a mathematical function: @@ -159,7 +701,7 @@ and zero otherwise : a *dirac* correlation model -- sometimes referred to as a The best linear unbiased prediction (BLUP) ------------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ We now derive the *best linear unbiased prediction* of the sample path :math:`g` conditioned on the observations: @@ -264,7 +806,7 @@ decomposition algorithm. The empirical best linear unbiased predictor (EBLUP) ----------------------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Until now, both the autocorrelation and regression models were assumed given. In practice however they are never known in advance so that one has to make @@ -282,37 +824,10 @@ fmin_cobyla optimization function from scipy.optimize. In the case of anisotropy however, we provide an implementation of Welch's componentwise optimization algorithm -- see references. -For a more comprehensive description of the theoretical aspects of Gaussian -Processes for Machine Learning, please refer to the references below: - -.. topic:: References: - - * `DACE, A Matlab Kriging Toolbox - `_ S Lophaven, HB Nielsen, J - Sondergaard 2002 - - - * `Screening, predicting, and computer experiments - `_ WJ Welch, RJ Buck, J Sacks, - HP Wynn, TJ Mitchell, and MD Morris Technometrics 34(1) 15--25, - 1992 - - - * `Gaussian Processes for Machine Learning - `_ CE - Rasmussen, CKI Williams MIT Press, 2006 (Ed. T Diettrich) - - - * `The design and analysis of computer experiments - `_ TJ Santner, BJ - Williams, W Notz Springer, 2003 - - - .. _correlation_models: Correlation Models -================== +------------------ Common correlation models matches some famous SVM's kernels because they are mostly built on equivalent assumptions. They must fulfill Mercer's conditions @@ -334,7 +849,7 @@ models, see the book by Rasmussen & Williams in references. Regression Models -================= +----------------- Common linear regression models involve zero- (constant), first- and second-order polynomials. But one may specify its own in the form of a Python @@ -345,9 +860,9 @@ that the underlying regression problem is not *underdetermined*. Implementation details -====================== +---------------------- -The present implementation is based on a translation of the DACE Matlab +The implementation is based on a translation of the DACE Matlab toolbox. .. 
topic:: References: diff --git a/examples/classification/plot_classification_probability.py b/examples/classification/plot_classification_probability.py index 1b768535e4391..3991a70cb4c0f 100644 --- a/examples/classification/plot_classification_probability.py +++ b/examples/classification/plot_classification_probability.py @@ -6,7 +6,7 @@ Plot the classification probability for different classifiers. We use a 3 class dataset, and we classify it with a Support Vector classifier, L1 and L2 penalized logistic regression with either a One-Vs-Rest or multinomial -setting. +setting, and Gaussian process classification. The logistic regression is not a multiclass classifier out of the box. As a result it can identify only the first class. @@ -21,6 +21,8 @@ class dataset, and we classify it with a Support Vector classifier, L1 from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC +from sklearn.gaussian_process import GaussianProcessClassifier +from sklearn.gaussian_process.kernels import RBF from sklearn import datasets iris = datasets.load_iris() @@ -30,6 +32,7 @@ class dataset, and we classify it with a Support Vector classifier, L1 n_features = X.shape[1] C = 1.0 +kernel = 1.0 * RBF([1.0, 1.0]) # for GPC # Create different classifiers. The logistic regression cannot do # multiclass out of the box. @@ -38,8 +41,9 @@ class dataset, and we classify it with a Support Vector classifier, L1 'Linear SVC': SVC(kernel='linear', C=C, probability=True, random_state=0), 'L2 logistic (Multinomial)': LogisticRegression( - C=C, solver='lbfgs', multi_class='multinomial' - )} + C=C, solver='lbfgs', multi_class='multinomial'), + 'GPC': GaussianProcessClassifier(kernel) + } n_classifiers = len(classifiers) diff --git a/examples/classification/plot_classifier_comparison.py b/examples/classification/plot_classifier_comparison.py index 7be3c0cacad9f..7e2b953df8dc7 100644 --- a/examples/classification/plot_classifier_comparison.py +++ b/examples/classification/plot_classifier_comparison.py @@ -36,6 +36,8 @@ from sklearn.datasets import make_moons, make_circles, make_classification from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC +from sklearn.gaussian_process import GaussianProcessClassifier +from sklearn.gaussian_process.kernels import RBF from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier from sklearn.naive_bayes import GaussianNB @@ -44,12 +46,14 @@ h = .02 # step size in the mesh -names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree", - "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"] +names = ["3 Near. 
Neighb.", "Linear SVM", "RBF SVM", "RBF GPC", + "Decision Tree", "Random Forest", "AdaBoost", "Naive Bayes", "LDA", + "QDA"] classifiers = [ KNeighborsClassifier(3), SVC(kernel="linear", C=0.025), SVC(gamma=2, C=1), + GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True), DecisionTreeClassifier(max_depth=5), RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), AdaBoostClassifier(), @@ -75,7 +79,8 @@ # preprocess dataset, split into training and test part X, y = ds X = StandardScaler().fit_transform(X) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4) + X_train, X_test, y_train, y_test = \ + train_test_split(X, y, test_size=.4, random_state=42) x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 @@ -128,5 +133,5 @@ size=15, horizontalalignment='right') i += 1 -figure.subplots_adjust(left=.02, right=.98) +plt.tight_layout() plt.show() diff --git a/examples/gaussian_process/gp_diabetes_dataset.py b/examples/gaussian_process/gp_diabetes_dataset.py deleted file mode 100644 index 9baaeccf17d6d..0000000000000 --- a/examples/gaussian_process/gp_diabetes_dataset.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -""" -======================================================================== -Gaussian Processes regression: goodness-of-fit on the 'diabetes' dataset -======================================================================== - -In this example, we fit a Gaussian Process model onto the diabetes -dataset. - -We determine the correlation parameters with maximum likelihood -estimation (MLE). We use an anisotropic squared exponential -correlation model with a constant regression model. We also use a -nugget of 1e-2 to account for the (strong) noise in the targets. - -We compute a cross-validation estimate of the coefficient of -determination (R2) without reperforming MLE, using the set of correlation -parameters found on the whole dataset. 
-""" -print(__doc__) - -# Author: Vincent Dubourg -# Licence: BSD 3 clause - -from sklearn import datasets -from sklearn.gaussian_process import GaussianProcess -from sklearn.cross_validation import cross_val_score, KFold - -# Load the dataset from scikit's data sets -diabetes = datasets.load_diabetes() -X, y = diabetes.data, diabetes.target - -# Instanciate a GP model -gp = GaussianProcess(regr='constant', corr='absolute_exponential', - theta0=[1e-4] * 10, thetaL=[1e-12] * 10, - thetaU=[1e-2] * 10, nugget=1e-2, optimizer='Welch') - -# Fit the GP model to the data performing maximum likelihood estimation -gp.fit(X, y) - -# Deactivate maximum likelihood estimation for the cross-validation loop -gp.theta0 = gp.theta_ # Given correlation parameter = MLE -gp.thetaL, gp.thetaU = None, None # None bounds deactivate MLE - -# Perform a cross-validation estimate of the coefficient of determination using -# the cross_validation module using all CPUs available on the machine -K = 20 # folds -R2 = cross_val_score(gp, X, y=y, cv=KFold(y.size, K), n_jobs=1).mean() -print("The %d-Folds estimate of the coefficient of determination is R2 = %s" - % (K, R2)) diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py new file mode 100644 index 0000000000000..6c728144e44ff --- /dev/null +++ b/examples/gaussian_process/plot_compare_gpr_krr.py @@ -0,0 +1,116 @@ +""" +========================================================== +Comparison of kernel ridge and Gaussian process regression +========================================================== + +Both kernel ridge regression (KRR) and Gaussian process regression (GPR) learn +a target function by employing internally the "kernel trick". KRR learns a +linear function in the space induced by the respective kernel which corresponds +to a non-linear function in the original space. The linear function in the +kernel space is chosen based on the mean-squared error loss with +ridge regularization. GPR uses the kernel to define the covariance of +a prior distribution over the target functions and uses the observed training +data to define a likelihood function. Based on Bayes theorem, a (Gaussian) +posterior distribution over target functions is defined, whose mean is used +for prediction. + +A major difference is that GPR can choose the kernel's hyperparameters based +on gradient-ascent on the marginal likelihood function while KRR needs to +perform a grid search on a cross-validated loss function (mean-squared error +loss). A further difference is that GPR learns a generative, probabilistic +model of the target function and can thus provide meaningful confidence +intervals and posterior samples along with the predictions while KRR only +provides predictions. + +This example illustrates both methods on an artificial dataset, which +consists of a sinusoidal target function and strong noise. The figure compares +the learned model of KRR and GPR based on a ExpSineSquared kernel, which is +suited for learning periodic functions. The kernel's hyperparameters control +the smoothness (l) and periodicity of the kernel (p). Moreover, the noise level +of the data is learned explicitly by GPR by an additional WhiteKernel component +in the kernel and by the regularization parameter alpha of KRR. + +The figure shows that both methods learn reasonable models of the target +function. GPR correctly identifies the periodicity of the function to be +roughly 2*pi (6.28), while KRR chooses the doubled periodicity 4*pi. 
Besides +that, GPR provides reasonable confidence bounds on the prediction which are not +available for KRR. A major difference between the two methods is the time +required for fitting and predicting: while fitting KRR is fast in principle, +the grid-search for hyperparameter optimization scales exponentially with the +number of hyperparameters ("curse of dimensionality"). The gradient-based +optimization of the parameters in GPR does not suffer from this exponential +scaling and is thus considerable faster on this example with 3-dimensional +hyperparameter space. The time for predicting is similar; however, generating +the variance of the predictive distribution of GPR takes considerable longer +than just predicting the mean. +""" +print(__doc__) + +# Authors: Jan Hendrik Metzen +# License: BSD 3 clause + + +import time + +import numpy as np + +import matplotlib.pyplot as plt + +from sklearn.kernel_ridge import KernelRidge +from sklearn.grid_search import GridSearchCV +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import WhiteKernel, ExpSineSquared + +rng = np.random.RandomState(0) + +# Generate sample data +X = 15 * rng.rand(100, 1) +y = np.sin(X).ravel() +y += 3 * (0.5 - rng.rand(X.shape[0])) # add noise + +# Fit KernelRidge with parameter selection based on 5-fold cross validation +param_grid = {"alpha": [1e0, 1e-1, 1e-2, 1e-3], + "kernel": [ExpSineSquared(l, p) + for l in np.logspace(-2, 2, 10) + for p in np.logspace(0, 2, 10)]} +kr = GridSearchCV(KernelRidge(), cv=5, param_grid=param_grid) +stime = time.time() +kr.fit(X, y) +print("Time for KRR fitting: %.3f" % (time.time() - stime)) + +gp_kernel = ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1)) \ + + WhiteKernel(1e-1) +gpr = GaussianProcessRegressor(kernel=gp_kernel) +stime = time.time() +gpr.fit(X, y) +print("Time for GPR fitting: %.3f" % (time.time() - stime)) + +# Predict using kernel ridge +X_plot = np.linspace(0, 20, 10000)[:, None] +stime = time.time() +y_kr = kr.predict(X_plot) +print("Time for KRR prediction: %.3f" % (time.time() - stime)) + +# Predict using kernel ridge +stime = time.time() +y_gpr = gpr.predict(X_plot, return_std=False) +print("Time for GPR prediction: %.3f" % (time.time() - stime)) + +stime = time.time() +y_gpr, y_std = gpr.predict(X_plot, return_std=True) +print("Time for GPR prediction with standard-deviation: %.3f" + % (time.time() - stime)) + +# Plot results +plt.scatter(X, y, c='k', label='data') +plt.plot(X_plot, np.sin(X_plot), c='k', label='True') +plt.plot(X_plot, y_kr, c='g', label='KRR (%s)' % kr.best_params_) +plt.plot(X_plot, y_gpr, c='r', label='GPR (%s)' % gpr.kernel_) +plt.fill_between(X_plot[:, 0], y_gpr - y_std, y_gpr + y_std, color='r', + alpha=0.2) +plt.xlabel('data') +plt.ylabel('target') +plt.xlim(0, 20) +plt.title('GPR versus Kernel Ridge') +plt.legend(loc="best", prop={'size': 10}) +plt.show() diff --git a/examples/gaussian_process/plot_gpc.py b/examples/gaussian_process/plot_gpc.py new file mode 100644 index 0000000000000..26b7d6d0decfe --- /dev/null +++ b/examples/gaussian_process/plot_gpc.py @@ -0,0 +1,100 @@ +""" +==================================================================== +Probabilistic predictions with Gaussian process classification (GPC) +==================================================================== + +This example illustrates the predicted probability of GPC for an RBF kernel +with different choices of the hyperparameters. 
The first figure shows the +predicted probability of GPC with arbitrarily chosen hyperparameters and with +the hyperparameters corresponding to the maximum log-marginal-likelihood (LML). + +While the hyperparameters chosen by optimizing LML have a considerable larger +LML, they perform slightly worse according to the log-loss on test data. The +figure shows that this is because they exhibit a steep change of the class +probabilities at the class boundaries (which is good) but have predicted +probabilities close to 0.5 far away from the class boundaries (which is bad) +This undesirable effect is caused by the Laplace approximation used +internally by GPC. + +The second figure shows the log-marginal-likelihood for different choices of +the kernel's hyperparameters, highlighting the two choices of the +hyperparameters used in the first figure by black dots. +""" +print(__doc__) + +# Authors: Jan Hendrik Metzen +# +# License: BSD 3 clause + +import numpy as np + +from matplotlib import pyplot as plt + +from sklearn.metrics.classification import accuracy_score, log_loss +from sklearn.gaussian_process import GaussianProcessClassifier +from sklearn.gaussian_process.kernels import RBF + + +# Generate data +train_size = 50 +rng = np.random.RandomState(0) +X = rng.uniform(0, 5, 100)[:, np.newaxis] +y = np.array(X[:, 0] > 2.5, dtype=int) + +# Specify Gaussian Processes with fixed and optimized hyperparameters +gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0), + optimizer=None) +gp_fix.fit(X[:train_size], y[:train_size]) + +gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0)) +gp_opt.fit(X[:train_size], y[:train_size]) + +print("Log Marginal Likelihood (initial): %.3f" + % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta)) +print("Log Marginal Likelihood (optimized): %.3f" + % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta)) + +print("Accuracy: %.3f (initial) %.3f (optimized)" + % (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])), + accuracy_score(y[:train_size], gp_opt.predict(X[:train_size])))) +print("Log-loss: %.3f (initial) %.3f (optimized)" + % (log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]), + log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1]))) + + +# Plot posteriors +plt.figure(0) +plt.scatter(X[:train_size, 0], y[:train_size], c='k', label="Train data") +plt.scatter(X[train_size:, 0], y[train_size:], c='g', label="Test data") +X_ = np.linspace(0, 5, 100) +plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis])[:, 1], 'r', + label="Initial kernel: %s" % gp_fix.kernel_) +plt.plot(X_, gp_opt.predict_proba(X_[:, np.newaxis])[:, 1], 'b', + label="Optimized kernel: %s" % gp_opt.kernel_) +plt.xlabel("Feature") +plt.ylabel("Class 1 probability") +plt.xlim(0, 5) +plt.ylim(-0.25, 1.5) +plt.legend(loc="best") + +# Plot LML landscape +plt.figure(1) +theta0 = np.logspace(0, 8, 30) +theta1 = np.logspace(-1, 1, 29) +Theta0, Theta1 = np.meshgrid(theta0, theta1) +LML = [[gp_opt.log_marginal_likelihood(np.log([Theta0[i, j], Theta1[i, j]])) + for i in range(Theta0.shape[0])] for j in range(Theta0.shape[1])] +LML = np.array(LML).T +plt.plot(np.exp(gp_fix.kernel_.theta)[0], np.exp(gp_fix.kernel_.theta)[1], + 'ko', zorder=10) +plt.plot(np.exp(gp_opt.kernel_.theta)[0], np.exp(gp_opt.kernel_.theta)[1], + 'ko', zorder=10) +plt.pcolor(Theta0, Theta1, LML) +plt.xscale("log") +plt.yscale("log") +plt.colorbar(label="Log-marginal Likelihood") +plt.xlabel("Magnitude") +plt.ylabel("Length-scale") 
+plt.title("Log-marginal-likelihood") + +plt.show() diff --git a/examples/gaussian_process/plot_gpc_iris.py b/examples/gaussian_process/plot_gpc_iris.py new file mode 100644 index 0000000000000..ab9c1c6810b75 --- /dev/null +++ b/examples/gaussian_process/plot_gpc_iris.py @@ -0,0 +1,62 @@ +""" +===================================================== +Gaussian process classification (GPC) on iris dataset +===================================================== + +This example illustrates the predicted probability of GPC for an isotropic +and anisotropic RBF kernel on a two-dimensional version for the iris-dataset. +The anisotropic RBF kernel obtains slightly higher log-marginal-likelihood by +assigning different length-scales to the two feature dimensions. +""" +print(__doc__) + +import numpy as np +import matplotlib.pyplot as plt +from sklearn import datasets +from sklearn.gaussian_process import GaussianProcessClassifier +from sklearn.gaussian_process.kernels import RBF + +# import some data to play with +iris = datasets.load_iris() +X = iris.data[:, :2] # we only take the first two features. +y = np.array(iris.target, dtype=int) + +h = .02 # step size in the mesh + +kernel = 1.0 * RBF([1.0]) +gpc_rbf_isotropic = GaussianProcessClassifier(kernel=kernel).fit(X, y) +kernel = 1.0 * RBF([1.0, 1.0]) +gpc_rbf_anisotropic = GaussianProcessClassifier(kernel=kernel).fit(X, y) + +# create a mesh to plot in +x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 +y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 +xx, yy = np.meshgrid(np.arange(x_min, x_max, h), + np.arange(y_min, y_max, h)) + +titles = ["Isotropic RBF", "Anisotropic RBF"] +plt.figure(figsize=(10, 5)) +for i, clf in enumerate((gpc_rbf_isotropic, gpc_rbf_anisotropic)): + # Plot the predicted probabilities. For that, we will assign a color to + # each point in the mesh [x_min, m_max]x[y_min, y_max]. 
+ plt.subplot(1, 2, i + 1) + + Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()]) + + # Put the result into a color plot + Z = Z.reshape((xx.shape[0], xx.shape[1], 3)) + plt.imshow(Z, extent=(x_min, x_max, y_min, y_max), origin="lower") + + # Plot also the training points + plt.scatter(X[:, 0], X[:, 1], c=np.array(["r", "g", "b"])[y]) + plt.xlabel('Sepal length') + plt.ylabel('Sepal width') + plt.xlim(xx.min(), xx.max()) + plt.ylim(yy.min(), yy.max()) + plt.xticks(()) + plt.yticks(()) + plt.title("%s, LML: %.3f" % + (titles[i], clf.log_marginal_likelihood(clf.kernel_.theta))) + +plt.tight_layout() +plt.show() diff --git a/examples/gaussian_process/plot_gp_probabilistic_classification_after_regression.py b/examples/gaussian_process/plot_gpc_isoprobability.py similarity index 56% rename from examples/gaussian_process/plot_gp_probabilistic_classification_after_regression.py rename to examples/gaussian_process/plot_gpc_isoprobability.py index 3c9887aa66852..21b3a010bf2f3 100644 --- a/examples/gaussian_process/plot_gp_probabilistic_classification_after_regression.py +++ b/examples/gaussian_process/plot_gpc_isoprobability.py @@ -2,33 +2,27 @@ # -*- coding: utf-8 -*- """ -============================================================================== -Gaussian Processes classification example: exploiting the probabilistic output -============================================================================== +================================================================= +Iso-probability lines for Gaussian Processes classification (GPC) +================================================================= -A two-dimensional regression exercise with a post-processing allowing for -probabilistic classification thanks to the Gaussian property of the prediction. - -The figure illustrates the probability that the prediction is negative with -respect to the remaining uncertainty in the prediction. The red and blue lines -corresponds to the 95% confidence interval on the prediction of the zero level -set. +A two-dimensional classification example showing iso-probability lines for +the predicted probabilities. """ print(__doc__) # Author: Vincent Dubourg +# Adapted to GaussianProcessClassifier: +# Jan Hendrik Metzen # Licence: BSD 3 clause import numpy as np -from scipy import stats -from sklearn.gaussian_process import GaussianProcess + from matplotlib import pyplot as pl from matplotlib import cm -# Standard normal distribution functions -phi = stats.distributions.norm().pdf -PHI = stats.distributions.norm().cdf -PHIinv = stats.distributions.norm().ppf +from sklearn.gaussian_process import GaussianProcessClassifier +from sklearn.gaussian_process.kernels import DotProduct, ConstantKernel as C # A few constants lim = 8 @@ -50,32 +44,28 @@ def g(x): [5.21301203, 4.26386883]]) # Observations -y = g(X) +y = np.array(g(X) > 0, dtype=int) # Instanciate and fit Gaussian Process Model -gp = GaussianProcess(theta0=5e-1) - -# Don't perform MLE or you'll get a perfect prediction for this simple example! 
+kernel = C(0.1, (1e-5, np.inf)) * DotProduct(sigma_0=0.1) ** 2 +gp = GaussianProcessClassifier(kernel=kernel) gp.fit(X, y) +print("Learned kernel: %s " % gp.kernel_) -# Evaluate real function, the prediction and its MSE on a grid +# Evaluate real function and the predicted probability res = 50 x1, x2 = np.meshgrid(np.linspace(- lim, lim, res), np.linspace(- lim, lim, res)) xx = np.vstack([x1.reshape(x1.size), x2.reshape(x2.size)]).T y_true = g(xx) -y_pred, MSE = gp.predict(xx, eval_MSE=True) -sigma = np.sqrt(MSE) +y_prob = gp.predict_proba(xx)[:, 1] y_true = y_true.reshape((res, res)) -y_pred = y_pred.reshape((res, res)) -sigma = sigma.reshape((res, res)) -k = PHIinv(.975) +y_prob = y_prob.reshape((res, res)) -# Plot the probabilistic classification iso-values using the Gaussian property -# of the prediction +# Plot the probabilistic classification iso-values fig = pl.figure(1) -ax = fig.add_subplot(111) +ax = fig.gca() ax.axes.set_aspect('equal') pl.xticks([]) pl.yticks([]) @@ -84,11 +74,12 @@ def g(x): pl.xlabel('$x_1$') pl.ylabel('$x_2$') -cax = pl.imshow(np.flipud(PHI(- y_pred / sigma)), cmap=cm.gray_r, alpha=0.8, - extent=(- lim, lim, - lim, lim)) +cax = pl.imshow(y_prob, cmap=cm.gray_r, alpha=0.8, + extent=(-lim, lim, -lim, lim)) norm = pl.matplotlib.colors.Normalize(vmin=0., vmax=0.9) cb = pl.colorbar(cax, ticks=[0., 0.2, 0.4, 0.6, 0.8, 1.], norm=norm) cb.set_label('${\\rm \mathbb{P}}\left[\widehat{G}(\mathbf{x}) \leq 0\\right]$') +pl.clim(0, 1) pl.plot(X[y <= 0, 0], X[y <= 0, 1], 'r.', markersize=12) @@ -96,15 +87,15 @@ def g(x): cs = pl.contour(x1, x2, y_true, [0.], colors='k', linestyles='dashdot') -cs = pl.contour(x1, x2, PHI(- y_pred / sigma), [0.025], colors='b', +cs = pl.contour(x1, x2, y_prob, [0.666], colors='b', linestyles='solid') pl.clabel(cs, fontsize=11) -cs = pl.contour(x1, x2, PHI(- y_pred / sigma), [0.5], colors='k', +cs = pl.contour(x1, x2, y_prob, [0.5], colors='k', linestyles='dashed') pl.clabel(cs, fontsize=11) -cs = pl.contour(x1, x2, PHI(- y_pred / sigma), [0.975], colors='r', +cs = pl.contour(x1, x2, y_prob, [0.334], colors='r', linestyles='solid') pl.clabel(cs, fontsize=11) diff --git a/examples/gaussian_process/plot_gpc_xor.py b/examples/gaussian_process/plot_gpc_xor.py new file mode 100644 index 0000000000000..084f6b4b7cace --- /dev/null +++ b/examples/gaussian_process/plot_gpc_xor.py @@ -0,0 +1,57 @@ +""" +======================================================================== +Illustration of Gaussian process classification (GPC) on the XOR dataset +======================================================================== + +This example illustrates GPC on XOR data. Compared are a stationary, isotropic +kernel (RBF) and a non-stationary kernel (DotProduct). On this particular +dataset, the DotProduct kernel obtains considerably better results because the +class-boundaries are linear and coincide with the coordinate axes. In general, +stationary kernels often obtain better results. 
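For intuition, the stationarity contrast can be checked directly on the kernel
objects; the following is a small illustrative sketch (not part of the example
script that follows): an RBF value depends only on the difference between two
inputs, whereas a DotProduct value changes when both inputs are shifted by the
same offset::

    import numpy as np
    from sklearn.gaussian_process.kernels import RBF, DotProduct

    rbf, dot = RBF(length_scale=1.0), DotProduct(sigma_0=1.0)
    a = np.array([[0.0, 0.0], [1.0, 1.0]])
    b = a + 5.0  # both points shifted by the same offset

    print(rbf(a)[0, 1], rbf(b)[0, 1])  # identical values: stationary kernel
    print(dot(a)[0, 1], dot(b)[0, 1])  # different values: non-stationary kernel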
+""" +print(__doc__) + +# Authors: Jan Hendrik Metzen +# +# License: BSD 3 clause + +import numpy as np +import matplotlib.pyplot as plt + +from sklearn.gaussian_process import GaussianProcessClassifier +from sklearn.gaussian_process.kernels import RBF, DotProduct + + +xx, yy = np.meshgrid(np.linspace(-3, 3, 50), + np.linspace(-3, 3, 50)) +rng = np.random.RandomState(0) +X = rng.randn(200, 2) +Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0) + +# fit the model +plt.figure(figsize=(10, 5)) +kernels = [1.0 * RBF(length_scale=1.0), 1.0 * DotProduct(sigma_0=1.0)**2] +for i, kernel in enumerate(kernels): + clf = GaussianProcessClassifier(kernel=kernel, warm_start=True).fit(X, Y) + + # plot the decision function for each datapoint on the grid + Z = clf.predict_proba(np.vstack((xx.ravel(), yy.ravel())).T)[:, 1] + Z = Z.reshape(xx.shape) + + plt.subplot(1, 2, i + 1) + image = plt.imshow(Z, interpolation='nearest', + extent=(xx.min(), xx.max(), yy.min(), yy.max()), + aspect='auto', origin='lower', cmap=plt.cm.PuOr_r) + contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2, + linetypes='--') + plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired) + plt.xticks(()) + plt.yticks(()) + plt.axis([-3, 3, -3, 3]) + plt.colorbar(image) + plt.title("%s\n Log-Marginal-Likelihood:%.3f" + % (clf.kernel_, clf.log_marginal_likelihood(clf.kernel_.theta)), + fontsize=12) + +plt.tight_layout() +plt.show() diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py new file mode 100644 index 0000000000000..07f1d8214df91 --- /dev/null +++ b/examples/gaussian_process/plot_gpr_co2.py @@ -0,0 +1,125 @@ +""" +======================================================== +Gaussian process regression (GPR) on Mauna Loa CO2 data. +======================================================== + +This example is based on Section 5.4.3 of "Gaussian Processes for Machine +Learning" [RW2006]. It illustrates an example of complex kernel engineering and +hyperparameter optimization using gradient ascent on the +log-marginal-likelihood. The data consists of the monthly average atmospheric +CO2 concentrations (in parts per million by volume (ppmv)) collected at the +Mauna Loa Observatory in Hawaii, between 1958 and 1997. The objective is to +model the CO2 concentration as a function of the time t. + +The kernel is composed of several terms that are responsible for explaining +different properties of the signal: + - a long term, smooth rising trend is to be explained by an RBF kernel. The + RBF kernel with a large length-scale enforces this component to be smooth; + it is not enforced that the trend is rising which leaves this choice to the + GP. The specific length-scale and the amplitude are free hyperparameters. + - a seasonal component, which is to be explained by the periodic + ExpSineSquared kernel with a fixed periodicity of 1 year. The length-scale + of this periodic component, controlling its smoothness, is a free parameter. + In order to allow decaying away from exact periodicity, the product with an + RBF kernel is taken. The length-scale of this RBF component controls the + decay time and is a further free parameter. + - smaller, medium term irregularities are to be explained by a + RationalQuadratic kernel component, whose length-scale and alpha parameter, + which determines the diffuseness of the length-scales, are to be determined. 
+ According to [RW2006], these irregularities can better be explained by + a RationalQuadratic than an RBF kernel component, probably because it can + accommodate several length-scales. + - a "noise" term, consisting of an RBF kernel contribution, which shall + explain the correlated noise components such as local weather phenomena, + and a WhiteKernel contribution for the white noise. The relative amplitudes + and the RBF's length scale are further free parameters. + +Maximizing the log-marginal-likelihood after subtracting the target's mean +yields the following kernel with an LML of -83.214: + 34.4**2 * RBF(length_scale=41.8) + + 3.27**2 * RBF(length_scale=180) * ExpSineSquared(length_scale=1.44, + periodicity=1) + + 0.446**2 * RationalQuadratic(alpha=17.7, length_scale=0.957) + + 0.197**2 * RBF(length_scale=0.138) + WhiteKernel(noise_level=0.0336) +Thus, most of the target signal (34.4ppm) is explained by a long-term rising +trend (length-scale 41.8 years). The periodic component has an amplitude of +3.27ppm, a decay time of 180 years and a length-scale of 1.44. The long decay +time indicates that we have a locally very close to periodic seasonal +component. The correlated noise has an amplitude of 0.197ppm with a length +scale of 0.138 years and a white-noise contribution of 0.197ppm. Thus, the +overall noise level is very small, indicating that the data can be very well +explained by the model. The figure shows also that the model makes very +confident predictions until around 2015. +""" +print(__doc__) + +# Authors: Jan Hendrik Metzen +# +# License: BSD 3 clause + +import numpy as np + +from matplotlib import pyplot as plt + +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels \ + import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared +from sklearn.datasets import fetch_mldata + +data = fetch_mldata('mauna-loa-atmospheric-co2').data +X = data[:, [1]] +y = data[:, 0] + +# Kernel with parameters given in GPML book +k1 = 66.0**2 * RBF(length_scale=67.0) # long term smooth rising trend +k2 = 2.4**2 * RBF(length_scale=90.0) \ + * ExpSineSquared(length_scale=1.3, periodicity=1.0) # seasonal component +# medium term irregularity +k3 = 0.66**2 \ + * RationalQuadratic(length_scale=1.2, alpha=0.78) +k4 = 0.18**2 * RBF(length_scale=0.134) \ + + WhiteKernel(noise_level=0.19**2) # noise terms +kernel_gpml = k1 + k2 + k3 + k4 + +gp = GaussianProcessRegressor(kernel=kernel_gpml, alpha=0, + optimizer=None, normalize_y=True) +gp.fit(X, y) + +print("GPML kernel: %s" % gp.kernel_) +print("Log-marginal-likelihood: %.3f" + % gp.log_marginal_likelihood(gp.kernel_.theta)) + +# Kernel with optimized parameters +k1 = 50.0**2 * RBF(length_scale=50.0) # long term smooth rising trend +k2 = 2.0**2 * RBF(length_scale=100.0) \ + * ExpSineSquared(length_scale=1.0, periodicity=1.0, + periodicity_bounds="fixed") # seasonal component +# medium term irregularities +k3 = 0.5**2 * RationalQuadratic(length_scale=1.0, alpha=1.0) +k4 = 0.1**2 * RBF(length_scale=0.1) \ + + WhiteKernel(noise_level=0.1**2, + noise_level_bounds=(1e-3, np.inf)) # noise terms +kernel = k1 + k2 + k3 + k4 + +gp = GaussianProcessRegressor(kernel=kernel, alpha=0, + normalize_y=True) +gp.fit(X, y) + +print("\nLearned kernel: %s" % gp.kernel_) +print("Log-marginal-likelihood: %.3f" + % gp.log_marginal_likelihood(gp.kernel_.theta)) + +X_ = np.linspace(X.min(), X.max() + 30, 1000)[:, np.newaxis] +y_pred, y_std = gp.predict(X_, return_std=True) + +# Illustration +plt.scatter(X, y, c='k') +plt.plot(X_, 
y_pred) +plt.fill_between(X_[:, 0], y_pred - y_std, y_pred + y_std, + alpha=0.5, color='k') +plt.xlim(X_.min(), X_.max()) +plt.xlabel("Year") +plt.ylabel(r"CO$_2$ in ppm") +plt.title(r"Atmospheric CO$_2$ concentration at Mauna Loa") +plt.tight_layout() +plt.show() diff --git a/examples/gaussian_process/plot_gpr_noisy.py b/examples/gaussian_process/plot_gpr_noisy.py new file mode 100644 index 0000000000000..55af701f50d36 --- /dev/null +++ b/examples/gaussian_process/plot_gpr_noisy.py @@ -0,0 +1,97 @@ +""" +============================================================= +Gaussian process regression (GPR) with noise-level estimation +============================================================= + +This example illustrates that GPR with a sum-kernel including a WhiteKernel can +estimate the noise level of data. An illustration of the +log-marginal-likelihood (LML) landscape shows that there exist two local +maxima of LML. The first corresponds to a model with a high noise level and a +large length scale, which explains all variations in the data by noise. The +second one has a smaller noise level and shorter length scale, which explains +most of the variation by the noise-free functional relationship. The second +model has a higher likelihood; however, depending on the initial value for the +hyperparameters, the gradient-based optimization might also converge to the +high-noise solution. It is thus important to repeat the optimization several +times for different initializations. +""" +print(__doc__) + +# Authors: Jan Hendrik Metzen +# +# License: BSD 3 clause + +import numpy as np + +from matplotlib import pyplot as plt +from matplotlib.colors import LogNorm + +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import RBF, WhiteKernel + + +rng = np.random.RandomState(0) +X = rng.uniform(0, 5, 20)[:, np.newaxis] +y = 0.5 * np.sin(3 * X[:, 0]) + rng.normal(0, 0.5, X.shape[0]) + +# First run +plt.figure(0) +kernel = 1.0 * RBF(length_scale=100.0, length_scale_bounds=(1e-2, 1e3)) \ + + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1)) +gp = GaussianProcessRegressor(kernel=kernel, + alpha=0.0).fit(X, y) +X_ = np.linspace(0, 5, 100) +y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True) +plt.plot(X_, y_mean, 'k', lw=3, zorder=9) +plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)), + y_mean + np.sqrt(np.diag(y_cov)), + alpha=0.5, color='k') +plt.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9) +plt.scatter(X[:, 0], y, c='r', s=50, zorder=10) +plt.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s" + % (kernel, gp.kernel_, + gp.log_marginal_likelihood(gp.kernel_.theta))) +plt.tight_layout() + +# Second run +plt.figure(1) +kernel = 1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) \ + + WhiteKernel(noise_level=1e-5, noise_level_bounds=(1e-10, 1e+1)) +gp = GaussianProcessRegressor(kernel=kernel, + alpha=0.0).fit(X, y) +X_ = np.linspace(0, 5, 100) +y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True) +plt.plot(X_, y_mean, 'k', lw=3, zorder=9) +plt.fill_between(X_, y_mean - np.sqrt(np.diag(y_cov)), + y_mean + np.sqrt(np.diag(y_cov)), + alpha=0.5, color='k') +plt.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9) +plt.scatter(X[:, 0], y, c='r', s=50, zorder=10) +plt.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s" + % (kernel, gp.kernel_, + gp.log_marginal_likelihood(gp.kernel_.theta))) +plt.tight_layout() + +# Plot LML landscape +plt.figure(2) +theta0 = np.logspace(-2, 3, 49) +theta1 = 
np.logspace(-2, 0, 50) +Theta0, Theta1 = np.meshgrid(theta0, theta1) +LML = [[gp.log_marginal_likelihood(np.log([0.36, Theta0[i, j], Theta1[i, j]])) + for i in range(Theta0.shape[0])] for j in range(Theta0.shape[1])] +LML = np.array(LML).T + +vmin, vmax = (-LML).min(), (-LML).max() +vmax = 50 +plt.contour(Theta0, Theta1, -LML, + levels=np.logspace(np.log10(vmin), np.log10(vmax), 50), + norm=LogNorm(vmin=vmin, vmax=vmax)) +plt.colorbar() +plt.xscale("log") +plt.yscale("log") +plt.xlabel("Length-scale") +plt.ylabel("Noise-level") +plt.title("Log-marginal-likelihood") +plt.tight_layout() + +plt.show() diff --git a/examples/gaussian_process/plot_gp_regression.py b/examples/gaussian_process/plot_gpr_noisy_targets.py similarity index 65% rename from examples/gaussian_process/plot_gp_regression.py rename to examples/gaussian_process/plot_gpr_noisy_targets.py index 33b78750d1fe4..329d0384b40ea 100644 --- a/examples/gaussian_process/plot_gp_regression.py +++ b/examples/gaussian_process/plot_gpr_noisy_targets.py @@ -1,42 +1,36 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -r""" +""" ========================================================= Gaussian Processes regression: basic introductory example ========================================================= -A simple one-dimensional regression exercise computed in two different ways: +A simple one-dimensional regression example computed in two different ways: -1. A noise-free case with a cubic correlation model -2. A noisy case with a squared Euclidean correlation model +1. A noise-free case +2. A noisy case with known noise-level per datapoint -In both cases, the model parameters are estimated using the maximum +In both cases, the kernel's parameters are estimated using the maximum likelihood principle. The figures illustrate the interpolating property of the Gaussian Process model as well as its probabilistic nature in the form of a pointwise 95% confidence interval. -Note that the parameter ``nugget`` is applied as a Tikhonov regularization -of the assumed covariance between the training points. In the special case -of the squared euclidean correlation model, nugget is mathematically equivalent -to a normalized variance: That is - -.. math:: - \mathrm{nugget}_i = \left[\frac{\sigma_i}{y_i}\right]^2 - +Note that the parameter ``alpha`` is applied as a Tikhonov +regularization of the assumed covariance between the training points. 
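As a rough sketch of what this means in code (``alpha`` is added to the
diagonal of the training covariance matrix, mirroring the update performed
inside ``GaussianProcessRegressor.fit``; the values here are illustrative)::

    import numpy as np
    from sklearn.gaussian_process.kernels import RBF

    X = np.array([[1.0], [3.0], [5.0]])
    alpha = np.array([0.1, 0.2, 0.3])     # per-point noise variance

    K = RBF(length_scale=1.0)(X)          # training covariance matrix
    K[np.diag_indices_from(K)] += alpha   # Tikhonov regularization of the diagonal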
""" print(__doc__) # Author: Vincent Dubourg # Jake Vanderplas +# Jan Hendrik Metzen s # Licence: BSD 3 clause import numpy as np -from sklearn.gaussian_process import GaussianProcess from matplotlib import pyplot as pl +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C + np.random.seed(1) @@ -44,7 +38,7 @@ def f(x): """The function to predict.""" return x * np.sin(x) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # First the noiseless case X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T @@ -56,15 +50,14 @@ def f(x): x = np.atleast_2d(np.linspace(0, 10, 1000)).T # Instanciate a Gaussian Process model -gp = GaussianProcess(corr='cubic', theta0=1e-2, thetaL=1e-4, thetaU=1e-1, - random_start=100) +kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2)) +gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9) # Fit to data using Maximum Likelihood Estimation of the parameters gp.fit(X, y) # Make the prediction on the meshed x-axis (ask for MSE as well) -y_pred, MSE = gp.predict(x, eval_MSE=True) -sigma = np.sqrt(MSE) +y_pred, sigma = gp.predict(x, return_std=True) # Plot the function, the prediction and the 95% confidence interval based on # the MSE @@ -81,7 +74,7 @@ def f(x): pl.ylim(-10, 20) pl.legend(loc='upper left') -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # now the noisy case X = np.linspace(0.1, 9.9, 20) X = np.atleast_2d(X).T @@ -92,22 +85,15 @@ def f(x): noise = np.random.normal(0, dy) y += noise -# Mesh the input space for evaluations of the real function, the prediction and -# its MSE -x = np.atleast_2d(np.linspace(0, 10, 1000)).T - # Instanciate a Gaussian Process model -gp = GaussianProcess(corr='squared_exponential', theta0=1e-1, - thetaL=1e-3, thetaU=1, - nugget=(dy / y) ** 2, - random_start=100) +gp = GaussianProcessRegressor(kernel=kernel, alpha=(dy / y) ** 2, + n_restarts_optimizer=10) # Fit to data using Maximum Likelihood Estimation of the parameters gp.fit(X, y) # Make the prediction on the meshed x-axis (ask for MSE as well) -y_pred, MSE = gp.predict(x, eval_MSE=True) -sigma = np.sqrt(MSE) +y_pred, sigma = gp.predict(x, return_std=True) # Plot the function, the prediction and the 95% confidence interval based on # the MSE diff --git a/examples/gaussian_process/plot_gpr_prior_posterior.py b/examples/gaussian_process/plot_gpr_prior_posterior.py new file mode 100644 index 0000000000000..6c2bfe525f730 --- /dev/null +++ b/examples/gaussian_process/plot_gpr_prior_posterior.py @@ -0,0 +1,78 @@ +""" +========================================================================== +Illustration of prior and posterior Gaussian process for different kernels +========================================================================== + +This example illustrates the prior and posterior of a GPR with different +kernels. Mean, standard deviation, and 10 samples are shown for both prior +and posterior. 
+""" +print(__doc__) + +# Authors: Jan Hendrik Metzen +# +# License: BSD 3 clause + +import numpy as np + +from matplotlib import pyplot as plt + +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import (RBF, Matern, RationalQuadratic, + ExpSineSquared, DotProduct, + ConstantKernel) + + +kernels = [1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-1, 10.0)), + 1.0 * RationalQuadratic(length_scale=1.0, alpha=0.1), + 1.0 * ExpSineSquared(length_scale=1.0, periodicity=3.0, + length_scale_bounds=(0.1, 10.0), + periodicity_bounds=(1.0, 10.0)), + ConstantKernel(0.1, (0.01, 10.0)) + * (DotProduct(sigma_0=1.0, sigma_0_bounds=(0.0, 10.0)) ** 2), + 1.0 * Matern(length_scale=1.0, length_scale_bounds=(1e-1, 10.0), + nu=1.5)] + +for fig_index, kernel in enumerate(kernels): + # Specify Gaussian Process + gp = GaussianProcessRegressor(kernel=kernel) + + # Plot prior + plt.figure(fig_index, figsize=(8, 8)) + plt.subplot(2, 1, 1) + X_ = np.linspace(0, 5, 100) + y_mean, y_std = gp.predict(X_[:, np.newaxis], return_std=True) + plt.plot(X_, y_mean, 'k', lw=3, zorder=9) + plt.fill_between(X_, y_mean - y_std, y_mean + y_std, + alpha=0.5, color='k') + y_samples = gp.sample_y(X_[:, np.newaxis], 10) + plt.plot(X_, y_samples, lw=1) + plt.xlim(0, 5) + plt.ylim(-3, 3) + plt.title("Prior (kernel: %s)" % kernel, fontsize=12) + + # Generate data and fit GP + rng = np.random.RandomState(4) + X = rng.uniform(0, 5, 10)[:, np.newaxis] + y = np.sin((X[:, 0] - 2.5) ** 2) + gp.fit(X, y) + + # Plot posterior + plt.subplot(2, 1, 2) + X_ = np.linspace(0, 5, 100) + y_mean, y_std = gp.predict(X_[:, np.newaxis], return_std=True) + plt.plot(X_, y_mean, 'k', lw=3, zorder=9) + plt.fill_between(X_, y_mean - y_std, y_mean + y_std, + alpha=0.5, color='k') + + y_samples = gp.sample_y(X_[:, np.newaxis], 10) + plt.plot(X_, y_samples, lw=1) + plt.scatter(X[:, 0], y, c='r', s=50, zorder=10) + plt.xlim(0, 5) + plt.ylim(-3, 3) + plt.title("Posterior (kernel: %s)\n Log-Likelihood: %.3f" + % (gp.kernel_, gp.log_marginal_likelihood(gp.kernel_.theta)), + fontsize=12) + plt.tight_layout() + +plt.show() diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index fa7c7f210bc05..baafcf0bba32d 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -31,6 +31,7 @@ from .externals.six.moves import zip from .metrics.scorer import check_scoring from .utils.fixes import bincount +from .gaussian_process.kernels import Kernel as GPKernel __all__ = ['KFold', 'LeaveOneLabelOut', @@ -1338,7 +1339,8 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, def _safe_split(estimator, X, y, indices, train_indices=None): """Create subset of dataset and properly handle kernels.""" - if hasattr(estimator, 'kernel') and callable(estimator.kernel): + if hasattr(estimator, 'kernel') and callable(estimator.kernel) \ + and not isinstance(estimator.kernel, GPKernel): # cannot compute the kernel values with custom function raise ValueError("Cannot use a custom kernel function. 
" "Precompute the kernel matrix instead.") diff --git a/sklearn/gaussian_process/__init__.py b/sklearn/gaussian_process/__init__.py index 1d903181be719..0d6fa6bd1e765 100644 --- a/sklearn/gaussian_process/__init__.py +++ b/sklearn/gaussian_process/__init__.py @@ -1,16 +1,23 @@ # -*- coding: utf-8 -*- -# Author: Vincent Dubourg +# Author: Jan Hendrik Metzen +# Vincent Dubourg # (mostly translation, see implementation details) # Licence: BSD 3 clause """ -The :mod:`sklearn.gaussian_process` module implements scalar Gaussian Process -based predictions. +The :mod:`sklearn.gaussian_process` module implements Gaussian Process +based regression and classification. """ +from .gpr import GaussianProcessRegressor +from .gpc import GaussianProcessClassifier +from . import kernels + from .gaussian_process import GaussianProcess from . import correlation_models from . import regression_models -__all__ = ['GaussianProcess', 'correlation_models', 'regression_models'] +__all__ = ['GaussianProcess', 'correlation_models', 'regression_models', + 'GaussianProcessRegressor', 'GaussianProcessClassifier', + 'kernels'] diff --git a/sklearn/gaussian_process/gaussian_process.py b/sklearn/gaussian_process/gaussian_process.py index 07f688bc57f61..ab8225ea5bd2a 100644 --- a/sklearn/gaussian_process/gaussian_process.py +++ b/sklearn/gaussian_process/gaussian_process.py @@ -15,10 +15,12 @@ from ..utils.validation import check_is_fitted from . import regression_models as regression from . import correlation_models as correlation +from ..utils import deprecated MACHINE_EPSILON = np.finfo(np.double).eps +@deprecated("l1_cross_distances is deprecated and will be removed in 0.18.") def l1_cross_distances(X): """ Computes the nonzero componentwise L1 cross-distances between the vectors @@ -56,8 +58,13 @@ def l1_cross_distances(X): return D, ij +@deprecated("GaussianProcess is deprecated and will be removed in 0.18. " + "Use the GaussianProcessRegressor instead.") class GaussianProcess(BaseEstimator, RegressorMixin): - """The Gaussian Process model class. + """The legacy Gaussian Process model class. + + Note that this class is deprecated and will be removed in 0.18. + Use the GaussianProcessRegressor instead. Read more in the :ref:`User Guide `. diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py new file mode 100644 index 0000000000000..f9c00a98be203 --- /dev/null +++ b/sklearn/gaussian_process/gpc.py @@ -0,0 +1,729 @@ +"""Gaussian processes classification.""" + +# Authors: Jan Hendrik Metzen +# +# License: BSD 3 clause + +import warnings +from operator import itemgetter + +import numpy as np +from scipy.linalg import cholesky, cho_solve, solve +from scipy.optimize import fmin_l_bfgs_b +from scipy.special import erf + +from sklearn.base import BaseEstimator, ClassifierMixin, clone +from sklearn.gaussian_process.kernels \ + import RBF, CompoundKernel, ConstantKernel as C +from sklearn.utils.validation import check_X_y, check_is_fitted, check_array +from sklearn.utils import check_random_state +from sklearn.preprocessing import LabelEncoder +from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier + + +# Values required for approximating the logistic sigmoid by +# error functions. 
coefs are obtained via: +# x = np.array([0, 0.6, 2, 3.5, 4.5, np.inf]) +# b = logistic(x) +# A = (erf(np.dot(x, self.lambdas)) + 1) / 2 +# coefs = lstsq(A, b)[0] +LAMBDAS = np.array([0.41, 0.4, 0.37, 0.44, 0.39])[:, np.newaxis] +COEFS = np.array([-1854.8214151, 3516.89893646, 221.29346712, + 128.12323805, -2010.49422654])[:, np.newaxis] + + +class _BinaryGaussianProcessClassifierLaplace(BaseEstimator): + """Binary Gaussian process classification based on Laplace approximation. + + The implementation is based on Algorithm 3.1, 3.2, and 5.1 of + ``Gaussian Processes for Machine Learning'' (GPML) by Rasmussen and + Williams. + + Internally, the Laplace approximation is used for approximating the + non-Gaussian posterior by a Gaussian. + + Currently, the implementation is restricted to using the logistic link + function. + + Parameters + ---------- + kernel : kernel object + The kernel specifying the covariance function of the GP. If None is + passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that + the kernel's hyperparameters are optimized during fitting. + + optimizer : string or callable, optional (default: "fmin_l_bfgs_b") + Can either be one of the internally supported optimizers for optimizing + the kernel's parameters, specified by a string, or an externally + defined optimizer passed as a callable. If a callable is passed, it + must have the signature:: + + def optimizer(obj_func, initial_theta, bounds): + # * 'obj_func' is the objective function to be maximized, which + # takes the hyperparameters theta as parameter and an + # optional flag eval_gradient, which determines if the + # gradient is returned additionally to the function value + # * 'initial_theta': the initial value for theta, which can be + # used by local optimizers + # * 'bounds': the bounds on the values of theta + .... + # Returned are the best found hyperparameters theta and + # the corresponding value of the target function. + return theta_opt, func_min + + Per default, the 'fmin_l_bfgs_b' algorithm from scipy.optimize + is used. If None is passed, the kernel's parameters are kept fixed. + Available internal optimizers are:: + + 'fmin_l_bfgs_b' + + n_restarts_optimizer: int, optional (default: 0) + The number of restarts of the optimizer for finding the kernel's + parameters which maximize the log-marginal likelihood. The first run + of the optimizer is performed from the kernel's initial parameters, + the remaining ones (if any) from thetas sampled log-uniform randomly + from the space of allowed theta-values. If greater than 0, all bounds + must be finite. Note that n_restarts_optimizer=0 implies that one + run is performed. + + max_iter_predict: int, optional (default: 100) + The maximum number of iterations in Newton's method for approximating + the posterior during predict. Smaller values will reduce computation + time at the cost of worse results. + + warm_start : bool, optional (default: False) + If warm-starts are enabled, the solution of the last Newton iteration + on the Laplace approximation of the posterior mode is used as + initialization for the next call of _posterior_mode(). This can speed + up convergence when _posterior_mode is called several times on similar + problems as in hyperparameter optimization. + + copy_X_train : bool, optional (default: True) + If True, a persistent copy of the training data is stored in the + object. Otherwise, just a reference to the training data is stored, + which might cause predictions to change if the data is modified + externally. 
+ + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + Attributes + ---------- + X_train_ : array-like, shape = (n_samples, n_features) + Feature values in training data (also required for prediction) + + y_train_: array-like, shape = (n_samples,) + Target values in training data (also required for prediction) + + classes_ : array-like, shape = (n_classes,) + Unique class labels. + + kernel_: kernel object + The kernel used for prediction. The structure of the kernel is the + same as the one passed as parameter but with optimized hyperparameters + + L_: array-like, shape = (n_samples, n_samples) + Lower-triangular Cholesky decomposition of the kernel in X_train_ + + pi_: array-like, shape = (n_samples,) + The probabilities of the positive class for the training points + X_train_ + + W_sr_: array-like, shape = (n_samples,) + Square root of W, the Hessian of log-likelihood of the latent function + values for the observed labels. Since W is diagonal, only the diagonal + of sqrt(W) is stored. + + log_marginal_likelihood_value_: float + The log-marginal-likelihood of self.kernel_.theta + """ + def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b", + n_restarts_optimizer=0, max_iter_predict=100, + warm_start=False, copy_X_train=True, random_state=None): + self.kernel = kernel + self.optimizer = optimizer + self.n_restarts_optimizer = n_restarts_optimizer + self.max_iter_predict = max_iter_predict + self.warm_start = warm_start + self.copy_X_train = copy_X_train + self.random_state = random_state + + def fit(self, X, y): + """Fit Gaussian process classification model + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + Training data + + y : array-like, shape = (n_samples,) + Target values, must be binary + + Returns + ------- + self : returns an instance of self. + """ + if self.kernel is None: # Use an RBF kernel as default + self.kernel_ = C(1.0, constant_value_bounds="fixed") \ + * RBF(1.0, length_scale_bounds="fixed") + else: + self.kernel_ = clone(self.kernel) + + self.rng = check_random_state(self.random_state) + + self.X_train_ = np.copy(X) if self.copy_X_train else X + + # Encode class labels and check that it is a binary classification + # problem + label_encoder = LabelEncoder() + self.y_train_ = label_encoder.fit_transform(y) + self.classes_ = label_encoder.classes_ + if self.classes_.size > 2: + raise ValueError("%s supports only binary classification. 
" + "y contains classes %s" + % (self.__class__.__name__, self.classes_)) + elif self.classes_.size == 1: + raise ValueError("{0:s} requires 2 classes.".format( + self.__class__.__name__)) + + if self.optimizer is not None and self.kernel_.n_dims > 0: + # Choose hyperparameters based on maximizing the log-marginal + # likelihood (potentially starting from several initial values) + def obj_func(theta, eval_gradient=True): + if eval_gradient: + lml, grad = self.log_marginal_likelihood( + theta, eval_gradient=True) + return -lml, -grad + else: + return -self.log_marginal_likelihood(theta) + + # First optimize starting from theta specified in kernel + optima = [self._constrained_optimization(obj_func, + self.kernel_.theta, + self.kernel_.bounds)] + + # Additional runs are performed from log-uniform chosen initial + # theta + if self.n_restarts_optimizer > 0: + if not np.isfinite(self.kernel_.bounds).all(): + raise ValueError( + "Multiple optimizer restarts (n_restarts_optimizer>0) " + "requires that all bounds are finite.") + bounds = self.kernel_.bounds + for iteration in range(self.n_restarts_optimizer): + theta_initial = np.exp(self.rng.uniform(bounds[:, 0], + bounds[:, 1])) + optima.append( + self._constrained_optimization(obj_func, theta_initial, + bounds)) + # Select result from run with minimal (negative) log-marginal + # likelihood + lml_values = list(map(itemgetter(1), optima)) + self.kernel_.theta = optima[np.argmin(lml_values)][0] + self.log_marginal_likelihood_value_ = -np.min(lml_values) + else: + self.log_marginal_likelihood_value_ = \ + self.log_marginal_likelihood(self.kernel_.theta) + + # Precompute quantities required for predictions which are independent + # of actual query points + K = self.kernel_(self.X_train_) + + _, (self.pi_, self.W_sr_, self.L_, _, _) = \ + self._posterior_mode(K, return_temporaries=True) + + return self + + def predict(self, X): + """Perform classification on an array of test vectors X. + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + + Returns + ------- + C : array, shape = (n_samples,) + Predicted target values for X, values are from classes_ + """ + check_is_fitted(self, ["X_train_", "y_train_", "pi_", "W_sr_", "L_"]) + + # As discussed on Section 3.4.2 of GPML, for making hard binary + # decisions, it is enough to compute the MAP of the posterior and + # pass it through the link function + K_star = self.kernel_(self.X_train_, X) # K_star =k(x_star) + f_star = K_star.T.dot(self.y_train_ - self.pi_) # Algorithm 3.2,Line 4 + + return np.where(f_star > 0, self.classes_[1], self.classes_[0]) + + def predict_proba(self, X): + """Return probability estimates for the test vector X. + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + + Returns + ------- + C : array-like, shape = (n_samples, n_classes) + Returns the probability of the samples for each class in + the model. The columns correspond to the classes in sorted + order, as they appear in the attribute `classes_`. 
+ """ + check_is_fitted(self, ["X_train_", "y_train_", "pi_", "W_sr_", "L_"]) + + # Based on Algorithm 3.2 of GPML + K_star = self.kernel_(self.X_train_, X) # K_star =k(x_star) + f_star = K_star.T.dot(self.y_train_ - self.pi_) # Line 4 + v = solve(self.L_, self.W_sr_[:, np.newaxis] * K_star) # Line 5 + # Line 6 (compute np.diag(v.T.dot(v)) via einsum) + var_f_star = self.kernel_.diag(X) - np.einsum("ij,ij->j", v, v) + + # Line 7: + # Approximate \int log(z) * N(z | f_star, var_f_star) + # Approximation is due to Williams & Barber, "Bayesian Classification + # with Gaussian Processes", Appendix A: Approximate the logistic + # sigmoid by a linear combination of 5 error functions. + # For information on how this integral can be computed see + # blitiri.blogspot.de/2012/11/gaussian-integral-of-error-function.html + alpha = 1 / (2 * var_f_star) + gamma = LAMBDAS * f_star + integrals = np.sqrt(np.pi / alpha) \ + * erf(gamma * np.sqrt(alpha / (alpha + LAMBDAS**2))) \ + / (2 * np.sqrt(var_f_star * 2 * np.pi)) + pi_star = (COEFS * integrals).sum(axis=0) + .5 * COEFS.sum() + + return np.vstack((1 - pi_star, pi_star)).T + + def log_marginal_likelihood(self, theta=None, eval_gradient=False): + """Returns log-marginal likelihood of theta for training data. + + Parameters + ---------- + theta : array-like, shape = (n_kernel_params,) or None + Kernel hyperparameters for which the log-marginal likelihood is + evaluated. If None, the precomputed log_marginal_likelihood + of self.kernel_.theta is returned. + + eval_gradient : bool, default: False + If True, the gradient of the log-marginal likelihood with respect + to the kernel hyperparameters at position theta is returned + additionally. If True, theta must not be None. + + Returns + ------- + log_likelihood : float + Log-marginal likelihood of theta for training data. + + log_likelihood_gradient : array, shape = (n_kernel_params,), optional + Gradient of the log-marginal likelihood with respect to the kernel + hyperparameters at position theta. + Only returned when eval_gradient is True. + """ + if theta is None: + if eval_gradient: + raise ValueError( + "Gradient can only be evaluated for theta!=None") + return self.log_marginal_likelihood_value_ + + kernel = self.kernel_.clone_with_theta(theta) + + if eval_gradient: + K, K_gradient = kernel(self.X_train_, eval_gradient=True) + else: + K = kernel(self.X_train_) + + # Compute log-marginal-likelihood Z and also store some temporaries + # which can be reused for computing Z's gradient + Z, (pi, W_sr, L, b, a) = \ + self._posterior_mode(K, return_temporaries=True) + + if not eval_gradient: + return Z + + # Compute gradient based on Algorithm 5.1 of GPML + d_Z = np.empty(theta.shape[0]) + # XXX: Get rid of the np.diag() in the next line + R = W_sr[:, np.newaxis] * cho_solve((L, True), np.diag(W_sr)) # Line 7 + C = solve(L, W_sr[:, np.newaxis] * K) # Line 8 + # Line 9: (use einsum to compute np.diag(C.T.dot(C)))) + s_2 = -0.5 * (np.diag(K) - np.einsum('ij, ij -> j', C, C)) \ + * (pi * (1 - pi) * (1 - 2 * pi)) # third derivative + + for j in range(d_Z.shape[0]): + C = K_gradient[:, :, j] # Line 11 + # Line 12: (R.T.ravel().dot(C.ravel()) = np.trace(R.dot(C))) + s_1 = .5 * a.T.dot(C).dot(a) - .5 * R.T.ravel().dot(C.ravel()) + + b = C.dot(self.y_train_ - pi) # Line 13 + s_3 = b - K.dot(R.dot(b)) # Line 14 + + d_Z[j] = s_1 + s_2.T.dot(s_3) # Line 15 + + return Z, d_Z + + def _posterior_mode(self, K, return_temporaries=False): + """Mode-finding for binary Laplace GPC and fixed kernel. 
+ + This approximates the posterior of the latent function values for given + inputs and target observations with a Gaussian approximation and uses + Newton's iteration to find the mode of this approximation. + """ + # Based on Algorithm 3.1 of GPML + + # If warm_start are enabled, we reuse the last solution for the + # posterior mode as initialization; otherwise, we initialize with 0 + if self.warm_start and hasattr(self, "f_cached") \ + and self.f_cached.shape == self.y_train_.shape: + f = self.f_cached + else: + f = np.zeros_like(self.y_train_, dtype=np.float64) + + # Use Newton's iteration method to find mode of Laplace approximation + log_marginal_likelihood = -np.inf + for _ in range(self.max_iter_predict): + # Line 4 + pi = 1 / (1 + np.exp(-f)) + W = pi * (1 - pi) + # Line 5 + W_sr = np.sqrt(W) + W_sr_K = W_sr[:, np.newaxis] * K + B = np.eye(W.shape[0]) + W_sr_K * W_sr + L = cholesky(B, lower=True) + # Line 6 + b = W * f + (self.y_train_ - pi) + # Line 7 + a = b - W_sr * cho_solve((L, True), W_sr_K.dot(b)) + # Line 8 + f = K.dot(a) + + # Line 10: Compute log marginal likelihood in loop and use as + # convergence criterion + lml = -0.5 * a.T.dot(f) \ + - np.log(1 + np.exp(-(self.y_train_ * 2 - 1) * f)).sum() \ + - np.log(np.diag(L)).sum() + # Check if we have converged (log marginal likelihood does + # not decrease) + # XXX: more complex convergence criterion + if lml - log_marginal_likelihood < 1e-10: + break + log_marginal_likelihood = lml + + self.f_cached = f # Remember solution for later warm-starts + if return_temporaries: + return log_marginal_likelihood, (pi, W_sr, L, b, a) + else: + return log_marginal_likelihood + + def _constrained_optimization(self, obj_func, initial_theta, bounds): + if self.optimizer == "fmin_l_bfgs_b": + theta_opt, func_min, convergence_dict = \ + fmin_l_bfgs_b(obj_func, initial_theta, bounds=bounds) + if convergence_dict["warnflag"] != 0: + warnings.warn("fmin_l_bfgs_b terminated abnormally with the " + " state: %s" % convergence_dict) + elif callable(self.optimizer): + theta_opt, func_min = \ + self.optimizer(obj_func, initial_theta, bounds=bounds) + else: + raise ValueError("Unknown optimizer %s." % self.optimizer) + + return theta_opt, func_min + + +class GaussianProcessClassifier(BaseEstimator, ClassifierMixin): + """Gaussian process classification (GPC) based on Laplace approximation. + + The implementation is based on Algorithm 3.1, 3.2, and 5.1 of + ``Gaussian Processes for Machine Learning'' (GPML) by Rasmussen and + Williams. + + Internally, the Laplace approximation is used for approximating the + non-Gaussian posterior by a Gaussian. + + Currently, the implementation is restricted to using the logistic link + function. For multi-class classification, several binary one-versus rest + classifiers are fitted. Note that this class thus does not implement + a true multi-class Laplace approximation. + + Parameters + ---------- + kernel : kernel object + The kernel specifying the covariance function of the GP. If None is + passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that + the kernel's hyperparameters are optimized during fitting. + + optimizer : string or callable, optional (default: "fmin_l_bfgs_b") + Can either be one of the internally supported optimizers for optimizing + the kernel's parameters, specified by a string, or an externally + defined optimizer passed as a callable. 
If a callable is passed, it + must have the signature:: + + def optimizer(obj_func, initial_theta, bounds): + # * 'obj_func' is the objective function to be maximized, which + # takes the hyperparameters theta as parameter and an + # optional flag eval_gradient, which determines if the + # gradient is returned additionally to the function value + # * 'initial_theta': the initial value for theta, which can be + # used by local optimizers + # * 'bounds': the bounds on the values of theta + .... + # Returned are the best found hyperparameters theta and + # the corresponding value of the target function. + return theta_opt, func_min + + Per default, the 'fmin_l_bfgs_b' algorithm from scipy.optimize + is used. If None is passed, the kernel's parameters are kept fixed. + Available internal optimizers are:: + + 'fmin_l_bfgs_b' + + n_restarts_optimizer: int, optional (default: 0) + The number of restarts of the optimizer for finding the kernel's + parameters which maximize the log-marginal likelihood. The first run + of the optimizer is performed from the kernel's initial parameters, + the remaining ones (if any) from thetas sampled log-uniform randomly + from the space of allowed theta-values. If greater than 0, all bounds + must be finite. Note that n_restarts_optimizer=0 implies that one + run is performed. + + max_iter_predict: int, optional (default: 100) + The maximum number of iterations in Newton's method for approximating + the posterior during predict. Smaller values will reduce computation + time at the cost of worse results. + + warm_start : bool, optional (default: False) + If warm-starts are enabled, the solution of the last Newton iteration + on the Laplace approximation of the posterior mode is used as + initialization for the next call of _posterior_mode(). This can speed + up convergence when _posterior_mode is called several times on similar + problems as in hyperparameter optimization. + + copy_X_train : bool, optional (default: True) + If True, a persistent copy of the training data is stored in the + object. Otherwise, just a reference to the training data is stored, + which might cause predictions to change if the data is modified + externally. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + multi_class: string, default: "one_vs_rest" + Specifies how multi-class classification problems are handled. + Supported are "one_vs_rest" and "one_vs_one". In "one_vs_rest", + one binary Gaussian process classifier is fitted for each class, which + is trained to separate this class from the rest. In "one_vs_one", one + binary Gaussian process classifier is fitted for each pair of classes, + which is trained to separate these two classes. The predictions of + these binary predictors are combined into multi-class predictions. + Note that "one_vs_one" does not support predicting probability + estimates. + + n_jobs : int, optional, default: 1 + The number of jobs to use for the computation. If -1 all CPUs are used. + If 1 is given, no parallel computing code is used at all, which is + useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are + used. Thus for n_jobs = -2, all CPUs but one are used. + + Attributes + ---------- + kernel_ : kernel object + The kernel used for prediction. 
In case of binary classification, + the structure of the kernel is the same as the one passed as parameter + but with optimized hyperparameters. In case of multi-class + classification, a CompoundKernel is returned which consists of the + different kernels used in the one-versus-rest classifiers. + + log_marginal_likelihood_value_: float + The log-marginal-likelihood of self.kernel_.theta + + classes_ : array-like, shape = (n_classes,) + Unique class labels. + + n_classes_ : int + The number of classes in the training data + """ + def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b", + n_restarts_optimizer=0, max_iter_predict=100, + warm_start=False, copy_X_train=True, random_state=None, + multi_class="one_vs_rest", n_jobs=1): + self.kernel = kernel + self.optimizer = optimizer + self.n_restarts_optimizer = n_restarts_optimizer + self.max_iter_predict = max_iter_predict + self.warm_start = warm_start + self.copy_X_train = copy_X_train + self.random_state = random_state + self.multi_class = multi_class + self.n_jobs = n_jobs + + def fit(self, X, y): + """Fit Gaussian process classification model + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + Training data + + y : array-like, shape = (n_samples,) + Target values, must be binary + + Returns + ------- + self : returns an instance of self. + """ + X, y = check_X_y(X, y, multi_output=False) + + self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace( + self.kernel, self.optimizer, self.n_restarts_optimizer, + self.max_iter_predict, self.warm_start, self.copy_X_train, + self.random_state) + + self.classes_ = np.unique(y) + self.n_classes_ = self.classes_.size + if self.n_classes_ == 1: + raise ValueError("GaussianProcessClassifier requires 2 or more " + "distinct classes. Only class %s present." + % self.classes_[0]) + if self.n_classes_ > 2: + if self.multi_class == "one_vs_rest": + self.base_estimator_ = \ + OneVsRestClassifier(self.base_estimator_, + n_jobs=self.n_jobs) + elif self.multi_class == "one_vs_one": + self.base_estimator_ = \ + OneVsOneClassifier(self.base_estimator_, + n_jobs=self.n_jobs) + else: + raise ValueError("Unknown multi-class mode %s" + % self.multi_class) + + self.base_estimator_.fit(X, y) + + if self.n_classes_ > 2: + self.log_marginal_likelihood_value_ = np.mean( + [estimator.log_marginal_likelihood() + for estimator in self.base_estimator_.estimators_]) + else: + self.log_marginal_likelihood_value_ = \ + self.base_estimator_.log_marginal_likelihood() + + return self + + def predict(self, X): + """Perform classification on an array of test vectors X. + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + + Returns + ------- + C : array, shape = (n_samples,) + Predicted target values for X, values are from classes_ + """ + check_is_fitted(self, ["classes_", "n_classes_"]) + X = check_array(X) + return self.base_estimator_.predict(X) + + def predict_proba(self, X): + """Return probability estimates for the test vector X. + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + + Returns + ------- + C : array-like, shape = (n_samples, n_classes) + Returns the probability of the samples for each class in + the model. The columns correspond to the classes in sorted + order, as they appear in the attribute `classes_`. 
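For orientation, a minimal sketch of the multi-class case (three-class iris
data; illustrative only)::

    from sklearn.datasets import load_iris
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.gaussian_process.kernels import RBF

    iris = load_iris()
    gpc = GaussianProcessClassifier(kernel=1.0 * RBF(1.0)).fit(iris.data,
                                                               iris.target)

    print(gpc.classes_)                            # [0 1 2]
    print(gpc.predict_proba(iris.data[:2]).shape)  # (2, 3): one column per class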
+ """ + check_is_fitted(self, ["classes_", "n_classes_"]) + if self.n_classes_ > 2 and self.multi_class == "one_vs_one": + raise ValueError("one_vs_one multi-class mode does not support " + "predicting probability estimates. Use " + "one_vs_rest mode instead.") + X = check_array(X) + return self.base_estimator_.predict_proba(X) + + @property + def kernel_(self): + if self.n_classes_ == 2: + return self.base_estimator_.kernel_ + else: + return CompoundKernel( + [estimator.kernel_ + for estimator in self.base_estimator_.estimators_]) + + def log_marginal_likelihood(self, theta=None, eval_gradient=False): + """Returns log-marginal likelihood of theta for training data. + + In the case of multi-class classification, the mean log-marginal + likelihood of the one-versus-rest classifiers are returned. + + Parameters + ---------- + theta : array-like, shape = (n_kernel_params,) or none + Kernel hyperparameters for which the log-marginal likelihood is + evaluated. In the case of multi-class classification, theta may + be the hyperparameters of the compound kernel or of an individual + kernel. In the latter case, all individual kernel get assigned the + same theta values. If None, the precomputed log_marginal_likelihood + of self.kernel_.theta is returned. + + eval_gradient : bool, default: False + If True, the gradient of the log-marginal likelihood with respect + to the kernel hyperparameters at position theta is returned + additionally. Note that gradient computation is not supported + for non-binary classification. If True, theta must not be None. + + Returns + ------- + log_likelihood : float + Log-marginal likelihood of theta for training data. + + log_likelihood_gradient : array, shape = (n_kernel_params,), optional + Gradient of the log-marginal likelihood with respect to the kernel + hyperparameters at position theta. + Only returned when eval_gradient is True. + """ + check_is_fitted(self, ["classes_", "n_classes_"]) + + if theta is None: + if eval_gradient: + raise ValueError( + "Gradient can only be evaluated for theta!=None") + return self.log_marginal_likelihood_value_ + + theta = np.asarray(theta) + if self.n_classes_ == 2: + return self.base_estimator_.log_marginal_likelihood( + theta, eval_gradient) + else: + if eval_gradient: + raise NotImplementedError( + "Gradient of log-marginal-likelhood not implemented for " + "multi-class GPC.") + estimators = self.base_estimator_.estimators_ + n_dims = estimators[0].kernel_.n_dims + if theta.shape[0] == n_dims: # use same theta for all sub-kernels + return np.mean( + [estimator.log_marginal_likelihood(theta) + for i, estimator in enumerate(estimators)]) + elif theta.shape[0] == n_dims * self.classes_.shape[0]: + # theta for compound kernel + return np.mean( + [estimator.log_marginal_likelihood( + theta[n_dims*i:n_dims*(i+1)]) + for i, estimator in enumerate(estimators)]) + else: + raise ValueError("Shape of theta must be either %d or %d. " + "Obtained theta with shape %d." + % (n_dims, n_dims * self.classes_.shape[0], + theta.shape[0])) diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py new file mode 100644 index 0000000000000..85ff65b8a84b2 --- /dev/null +++ b/sklearn/gaussian_process/gpr.py @@ -0,0 +1,428 @@ +"""Gaussian processes regression. 
""" + +# Authors: Jan Hendrik Metzen +# +# License: BSD 3 clause + +import warnings +from operator import itemgetter + +import numpy as np +from scipy.linalg import cholesky, cho_solve, solve_triangular +from scipy.optimize import fmin_l_bfgs_b + +from sklearn.base import BaseEstimator, RegressorMixin, clone +from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C +from sklearn.utils import check_random_state +from sklearn.utils.validation import check_X_y, check_array + + +class GaussianProcessRegressor(BaseEstimator, RegressorMixin): + """Gaussian process regression (GPR). + + The implementation is based on Algorithm 2.1 of ``Gaussian Processes + for Machine Learning'' (GPML) by Rasmussen and Williams. + + In addition to standard sklearn estimator API, GaussianProcessRegressor: + + * allows prediction without prior fitting (based on the GP prior) + * provides an additional method sample_y(X), which evaluates samples + drawn from the GPR (prior or posterior) at given inputs + * exposes a method log_marginal_likelihood(theta), which can be used + externally for other ways of selecting hyperparameters, e.g., via + Markov chain Monte Carlo. + + Parameters + ---------- + kernel : kernel object + The kernel specifying the covariance function of the GP. If None is + passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that + the kernel's hyperparameters are optimized during fitting. + + alpha : float or array-like, optional (default: 1e-10) + Value added to the diagonal of the kernel matrix during fitting. + Larger values correspond to increased noise level in the observations + and reduce potential numerical issue during fitting. If an array is + passed, it must have the same number of entries as the data used for + fitting and is used as datapoint-dependent noise level. Note that this + is equivalent to adding a WhiteKernel with c=alpha. Allowing to specify + the noise level directly as a parameter is mainly for convenience and + for consistency with Ridge. + + optimizer : string or callable, optional (default: "fmin_l_bfgs_b") + Can either be one of the internally supported optimizers for optimizing + the kernel's parameters, specified by a string, or an externally + defined optimizer passed as a callable. If a callable is passed, it + must have the signature:: + + def optimizer(obj_func, initial_theta, bounds): + # * 'obj_func' is the objective function to be maximized, which + # takes the hyperparameters theta as parameter and an + # optional flag eval_gradient, which determines if the + # gradient is returned additionally to the function value + # * 'initial_theta': the initial value for theta, which can be + # used by local optimizers + # * 'bounds': the bounds on the values of theta + .... + # Returned are the best found hyperparameters theta and + # the corresponding value of the target function. + return theta_opt, func_min + + Per default, the 'fmin_l_bfgs_b' algorithm from scipy.optimize + is used. If None is passed, the kernel's parameters are kept fixed. + Available internal optimizers are:: + + 'fmin_l_bfgs_b' + + n_restarts_optimizer: int, optional (default: 0) + The number of restarts of the optimizer for finding the kernel's + parameters which maximize the log-marginal likelihood. The first run + of the optimizer is performed from the kernel's initial parameters, + the remaining ones (if any) from thetas sampled log-uniform randomly + from the space of allowed theta-values. If greater than 0, all bounds + must be finite. 
Note that n_restarts_optimizer == 0 implies that one + run is performed. + + normalize_y: boolean, optional (default: False) + Whether the target values y are normalized, i.e., the mean of the + observed target values become zero. This parameter should be set to + True if the target values' mean is expected to differ considerable from + zero. When enabled, the normalization effectively modifies the GP's + prior based on the data, which contradicts the likelihood principle; + normalization is thus disabled per default. + + copy_X_train : bool, optional (default: True) + If True, a persistent copy of the training data is stored in the + object. Otherwise, just a reference to the training data is stored, + which might cause predictions to change if the data is modified + externally. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + Attributes + ---------- + X_train_ : array-like, shape = (n_samples, n_features) + Feature values in training data (also required for prediction) + + y_train_: array-like, shape = (n_samples, [n_output_dims]) + Target values in training data (also required for prediction) + + kernel_: kernel object + The kernel used for prediction. The structure of the kernel is the + same as the one passed as parameter but with optimized hyperparameters + + L_: array-like, shape = (n_samples, n_samples) + Lower-triangular Cholesky decomposition of the kernel in X_train_ + + alpha_: array-like, shape = (n_samples,) + Dual coefficients of training data points in kernel space + + log_marginal_likelihood_value_: float + The log-marginal-likelihood of self.kernel_.theta + """ + def __init__(self, kernel=None, alpha=1e-10, + optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0, + normalize_y=False, copy_X_train=True, random_state=None): + self.kernel = kernel + self.alpha = alpha + self.optimizer = optimizer + self.n_restarts_optimizer = n_restarts_optimizer + self.normalize_y = normalize_y + self.copy_X_train = copy_X_train + self.random_state = random_state + + def fit(self, X, y): + """Fit Gaussian process regression model + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + Training data + + y : array-like, shape = (n_samples, [n_output_dims]) + Target values + + Returns + ------- + self : returns an instance of self. 
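A minimal sketch of fitting and then inspecting the optimized kernel and the
corresponding log-marginal-likelihood (toy data, illustrative only)::

    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import RBF, WhiteKernel

    rng = np.random.RandomState(0)
    X = rng.uniform(0, 5, 20)[:, np.newaxis]
    y = 0.5 * np.sin(3 * X[:, 0]) + rng.normal(0, 0.5, X.shape[0])

    kernel = 1.0 * RBF(length_scale=1.0) + WhiteKernel(noise_level=1.0)
    gp = GaussianProcessRegressor(kernel=kernel).fit(X, y)

    print(gp.kernel_)                         # kernel with optimized hyperparameters
    print(gp.log_marginal_likelihood_value_)  # LML at the optimized theta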
+ """ + if self.kernel is None: # Use an RBF kernel as default + self.kernel_ = C(1.0, constant_value_bounds="fixed") \ + * RBF(1.0, length_scale_bounds="fixed") + else: + self.kernel_ = clone(self.kernel) + + self.rng = check_random_state(self.random_state) + + X, y = check_X_y(X, y, multi_output=True, y_numeric=True) + + # Normalize target value + if self.normalize_y: + self.y_train_mean = np.mean(y, axis=0) + # demean y + y = y - self.y_train_mean + else: + self.y_train_mean = np.zeros(1) + + if np.iterable(self.alpha) \ + and self.alpha.shape[0] != y.shape[0]: + if self.alpha.shape[0] == 1: + self.alpha = self.alpha[0] + else: + raise ValueError("alpha must be a scalar or an array" + " with same number of entries as y.(%d != %d)" + % (self.alpha.shape[0], y.shape[0])) + + self.X_train_ = np.copy(X) if self.copy_X_train else X + self.y_train_ = np.copy(y) if self.copy_X_train else y + + if self.optimizer is not None and self.kernel_.n_dims > 0: + # Choose hyperparameters based on maximizing the log-marginal + # likelihood (potentially starting from several initial values) + def obj_func(theta, eval_gradient=True): + if eval_gradient: + lml, grad = self.log_marginal_likelihood( + theta, eval_gradient=True) + return -lml, -grad + else: + return -self.log_marginal_likelihood(theta) + + # First optimize starting from theta specified in kernel + optima = [(self._constrained_optimization(obj_func, + self.kernel_.theta, + self.kernel_.bounds))] + + # Additional runs are performed from log-uniform chosen initial + # theta + if self.n_restarts_optimizer > 0: + if not np.isfinite(self.kernel_.bounds).all(): + raise ValueError( + "Multiple optimizer restarts (n_restarts_optimizer>0) " + "requires that all bounds are finite.") + bounds = self.kernel_.bounds + for iteration in range(self.n_restarts_optimizer): + theta_initial = \ + self.rng.uniform(bounds[:, 0], bounds[:, 1]) + optima.append( + self._constrained_optimization(obj_func, theta_initial, + bounds)) + # Select result from run with minimal (negative) log-marginal + # likelihood + lml_values = list(map(itemgetter(1), optima)) + self.kernel_.theta = optima[np.argmin(lml_values)][0] + self.log_marginal_likelihood_value_ = -np.min(lml_values) + else: + self.log_marginal_likelihood_value_ = \ + self.log_marginal_likelihood(self.kernel_.theta) + + # Precompute quantities required for predictions which are independent + # of actual query points + K = self.kernel_(self.X_train_) + K[np.diag_indices_from(K)] += self.alpha + self.L_ = cholesky(K, lower=True) # Line 2 + self.alpha_ = cho_solve((self.L_, True), self.y_train_) # Line 3 + + return self + + def predict(self, X, return_std=False, return_cov=False): + """Predict using the Gaussian process regression model + + We can also predict based on an unfitted model by using the GP prior. + In addition to the mean of the predictive distribution, also its + standard deviation (return_std=True) or covariance (return_cov=True). + Note that at most one of the two can be requested. + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + Query points where the GP is evaluated + + return_std : bool, default: False + If True, the standard-deviation of the predictive distribution at + the query points is returned along with the mean. 
+ + return_cov : bool, default: False + If True, the covariance of the joint predictive distribution at + the query points is returned along with the mean + + Returns + ------- + y_mean : array, shape = (n_samples, [n_output_dims]) + Mean of predictive distribution a query points + + y_std : array, shape = (n_samples,), optional + Standard deviation of predictive distribution at query points. + Only returned when return_std is True. + + y_cov : array, shape = (n_samples, n_samples), optional + Covariance of joint predictive distribution a query points. + Only returned when return_cov is True. + """ + if return_std and return_cov: + raise RuntimeError( + "Not returning standard deviation of predictions when " + "returning full covariance.") + + X = check_array(X) + + if not hasattr(self, "X_train_"): # Unfitted;predict based on GP prior + y_mean = np.zeros(X.shape[0]) + if return_cov: + y_cov = self.kernel(X) + return y_mean, y_cov + elif return_std: + y_var = self.kernel.diag(X) + return y_mean, np.sqrt(y_var) + else: + return y_mean + else: # Predict based on GP posterior + K_trans = self.kernel_(X, self.X_train_) + y_mean = K_trans.dot(self.alpha_) # Line 4 (y_mean = f_star) + y_mean = self.y_train_mean + y_mean # undo normal. + if return_cov: + v = cho_solve((self.L_, True), K_trans.T) # Line 5 + y_cov = self.kernel_(X) - K_trans.dot(v) # Line 6 + return y_mean, y_cov + elif return_std: + # compute inverse K_inv of K based on its Cholesky + # decomposition L and its inverse L_inv + L_inv = solve_triangular(self.L_.T, np.eye(self.L_.shape[0])) + K_inv = L_inv.dot(L_inv.T) + # Compute variance of predictive distribution + y_var = self.kernel_.diag(X) + y_var -= np.einsum("ki,kj,ij->k", K_trans, K_trans, K_inv) + + # Check if any of the variances is negative because of + # numerical issues. If yes: set the variance to 0. + y_var_negative = y_var < 0 + if np.any(y_var_negative): + warnings.warn("Predicted variances smaller than 0. " + "Setting those variances to 0.") + y_var[y_var_negative] = 0.0 + return y_mean, np.sqrt(y_var) + else: + return y_mean + + def sample_y(self, X, n_samples=1, random_state=0): + """Draw samples from Gaussian process and evaluate at X. + + Parameters + ---------- + X : array-like, shape = (n_samples_X, n_features) + Query points where the GP samples are evaluated + + n_samples : int, default: 1 + The number of samples drawn from the Gaussian process + + random_state: RandomState or an int seed (0 by default) + A random number generator instance + + Returns + ------- + y_samples : array, shape = (n_samples_X, [n_output_dims], n_samples) + Values of n_samples samples drawn from Gaussian process and + evaluated at query points. + """ + rng = check_random_state(random_state) + + y_mean, y_cov = self.predict(X, return_cov=True) + if y_mean.ndim == 1: + y_samples = rng.multivariate_normal(y_mean, y_cov, n_samples).T + else: + y_samples = \ + [rng.multivariate_normal(y_mean[:, i], y_cov, + n_samples).T[:, np.newaxis] + for i in range(y_mean.shape[1])] + y_samples = np.hstack(y_samples) + return y_samples + + def log_marginal_likelihood(self, theta=None, eval_gradient=False): + """Returns log-marginal likelihood of theta for training data. + + Parameters + ---------- + theta : array-like, shape = (n_kernel_params,) or None + Kernel hyperparameters for which the log-marginal likelihood is + evaluated. If None, the precomputed log_marginal_likelihood + of self.kernel_.theta is returned. 
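A sketch of the prediction and sampling API on arbitrary toy data: ``return_std`` and ``return_cov`` are mutually exclusive, an unfitted estimator predicts from the GP prior, and ``sample_y`` draws function values at the query points::

    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import RBF

    X = np.linspace(0, 5, 15)[:, np.newaxis]
    y = np.sin(X).ravel()
    X_test = np.linspace(0, 5, 50)[:, np.newaxis]

    # Prior prediction (no fit): zero mean, unit standard deviation for RBF
    prior_mean, prior_std = GaussianProcessRegressor(
        kernel=RBF(1.0)).predict(X_test, return_std=True)

    gpr = GaussianProcessRegressor(kernel=RBF(1.0), alpha=1e-6).fit(X, y)
    y_mean, y_std = gpr.predict(X_test, return_std=True)
    y_mean, y_cov = gpr.predict(X_test, return_cov=True)
    samples = gpr.sample_y(X_test, n_samples=3)   # shape (50, 3)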
+ + eval_gradient : bool, default: False + If True, the gradient of the log-marginal likelihood with respect + to the kernel hyperparameters at position theta is returned + additionally. If True, theta must not be None. + + Returns + ------- + log_likelihood : float + Log-marginal likelihood of theta for training data. + + log_likelihood_gradient : array, shape = (n_kernel_params,), optional + Gradient of the log-marginal likelihood with respect to the kernel + hyperparameters at position theta. + Only returned when eval_gradient is True. + """ + if theta is None: + if eval_gradient: + raise ValueError( + "Gradient can only be evaluated for theta!=None") + return self.log_marginal_likelihood_value_ + + kernel = self.kernel_.clone_with_theta(theta) + + if eval_gradient: + K, K_gradient = kernel(self.X_train_, eval_gradient=True) + else: + K = kernel(self.X_train_) + + K[np.diag_indices_from(K)] += self.alpha + try: + L = cholesky(K, lower=True) # Line 2 + except np.linalg.LinAlgError: + return (-np.inf, np.zeros_like(theta)) \ + if eval_gradient else -np.inf + + # Support multi-dimensional output of self.y_train_ + y_train = self.y_train_ + if y_train.ndim == 1: + y_train = y_train[:, np.newaxis] + + alpha = cho_solve((L, True), y_train) # Line 3 + + # Compute log-likelihood (compare line 7) + log_likelihood_dims = -0.5 * np.einsum("ik,ik->k", y_train, alpha) + log_likelihood_dims -= np.log(np.diag(L)).sum() + log_likelihood_dims -= K.shape[0] / 2 * np.log(2 * np.pi) + log_likelihood = log_likelihood_dims.sum(-1) # sum over dimensions + + if eval_gradient: # compare Equation 5.9 from GPML + tmp = np.einsum("ik,jk->ijk", alpha, alpha) # k: output-dimension + tmp -= cho_solve((L, True), np.eye(K.shape[0]))[:, :, np.newaxis] + # Compute "0.5 * trace(tmp.dot(K_gradient))" without + # constructing the full matrix tmp.dot(K_gradient) since only + # its diagonal is required + log_likelihood_gradient_dims = \ + 0.5 * np.einsum("ijl,ijk->kl", tmp, K_gradient) + log_likelihood_gradient = log_likelihood_gradient_dims.sum(-1) + + if eval_gradient: + return log_likelihood, log_likelihood_gradient + else: + return log_likelihood + + def _constrained_optimization(self, obj_func, initial_theta, bounds): + if self.optimizer == "fmin_l_bfgs_b": + theta_opt, func_min, convergence_dict = \ + fmin_l_bfgs_b(obj_func, initial_theta, bounds=bounds) + if convergence_dict["warnflag"] != 0: + warnings.warn("fmin_l_bfgs_b terminated abnormally with the " + " state: %s" % convergence_dict) + elif callable(self.optimizer): + theta_opt, func_min = \ + self.optimizer(obj_func, initial_theta, bounds=bounds) + else: + raise ValueError("Unknown optimizer %s." % self.optimizer) + + return theta_opt, func_min diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py new file mode 100644 index 0000000000000..d3583fe19f944 --- /dev/null +++ b/sklearn/gaussian_process/kernels.py @@ -0,0 +1,1783 @@ +"""Kernels for Gaussian process regression and classification. + +The kernels in this module allow kernel-engineering, i.e., they can be +combined via the "+" and "*" operators or be exponentiated with a scalar +via "**". These sum and product expressions can also contain scalar values, +which are automatically converted to a constant kernel. + +All kernels allow (analytic) gradient-based hyperparameter optimization. +The space of hyperparameters can be specified by giving lower und upper +boundaries for the value of each hyperparameter (the search space is thus +rectangular). 
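Since ``log_marginal_likelihood`` accepts arbitrary theta values (in log-space), it can also be used for external model selection, e.g. scanning the LML over a grid of length scales; the grid below is an arbitrary illustration::

    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import RBF

    X = np.linspace(0, 5, 25)[:, np.newaxis]
    y = np.sin(X).ravel()
    gpr = GaussianProcessRegressor(kernel=RBF(1.0), alpha=1e-6).fit(X, y)

    length_scales = np.logspace(-1, 1, 20)
    lml = [gpr.log_marginal_likelihood(np.log([ls])) for ls in length_scales]
    best = length_scales[int(np.argmax(lml))]

    # the gradient w.r.t. theta is available as well:
    value, grad = gpr.log_marginal_likelihood(np.log([1.0]),
                                              eval_gradient=True)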
Instead of specifying bounds, hyperparameters can also be +declared to be "fixed", which causes these hyperparameters to be excluded from +optimization. +""" + +# Author: Jan Hendrik Metzen +# Licence: BSD 3 clause + +# Note: this module is strongly inspired by the kernel module of the george +# package. + +from abc import ABCMeta, abstractmethod +from collections import namedtuple +import inspect +import math + +import numpy as np +from scipy.special import kv, gamma +from scipy.spatial.distance import pdist, cdist, squareform + +from ..metrics.pairwise import pairwise_kernels +from ..externals import six +from ..base import clone + + +class Hyperparameter(namedtuple('Hyperparameter', + ('name', 'value_type', 'bounds', + 'n_elements', 'fixed'))): + """A kernel hyperparameter's specification in form of a namedtuple. + + Entries + ------- + name : string + The name of the hyperparameter. Note that a kernel using a + hyperparameter with name "x" must have the attributes self.x and + self.x_bounds + + value_type : string + The type of the hyperparameter. Currently, only "numeric" + hyperparameters are supported. + + bounds : pair of floats >= 0 or "fixed" + The lower and upper bound on the parameter. If n_elements>1, a pair + of 1d array with n_elements each may be given alternatively. If + the string "fixed" is passed as bounds, the hyperparameter's value + cannot be changed. + + n_elements : int, default=1 + The number of elements of the hyperparameter value. Defaults to 1, + which corresponds to a scalar hyperparameter. n_elements > 1 + corresponds to a hyperparameter which is vector-valued, + such as, e.g., anisotropic length-scales. + + fixed : bool, default: None + Whether the value of this hyperparameter is fixed, i.e., cannot be + changed during hyperparameter tuning. If None is passed, the "fixed" is + derived based on the given bounds. + """ + # A raw namedtuple is very memory efficient as it packs the attributes + # in a struct to get rid of the __dict__ of attributes in particular it + # does not copy the string for the keys on each instance. + # By deriving a namedtuple class just to introduce the __init__ method we + # would also reintroduce the __dict__ on the instance. By telling the + # Python interpreter that this subclass uses static __slots__ instead of + # dynamic attributes. Furthermore we don't need any additional slot in the + # subclass so we set __slots__ to the empty tuple. + __slots__ = () + + def __new__(cls, name, value_type, bounds, n_elements=1, fixed=None): + if bounds != "fixed": + bounds = np.atleast_2d(bounds) + if n_elements > 1: # vector-valued parameter + if bounds.shape[0] == 1: + bounds = np.repeat(bounds, n_elements, 0) + elif bounds.shape[0] != n_elements: + raise ValueError("Bounds on %s should have either 1 or " + "%d dimensions. Given are %d" + % (name, n_elements, bounds.shape[0])) + + if fixed is None: + fixed = (bounds == "fixed") + return super(Hyperparameter, cls).__new__( + cls, name, value_type, bounds, n_elements, fixed) + + +class Kernel(six.with_metaclass(ABCMeta)): + """Base class for all kernels.""" + + def get_params(self, deep=True): + """Get parameters of this kernel. + + Parameters + ---------- + deep: boolean, optional + If True, will return the parameters for this estimator and + contained subobjects that are estimators. + + Returns + ------- + params : mapping of string to any + Parameter names mapped to their values. 
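A short sketch of the kernel algebra described in the module docstring: kernels are combined with ``+``, ``*`` and ``**``, and scalars are promoted to a ConstantKernel automatically (the specific kernels and values below are arbitrary)::

    from sklearn.gaussian_process.kernels import (RBF, WhiteKernel,
                                                  ConstantKernel, DotProduct)

    k1 = 1.0 * RBF(length_scale=1.0) + WhiteKernel(noise_level=0.1)
    k2 = ConstantKernel(2.0) * RBF(0.5)   # explicit constant factor
    k3 = DotProduct(sigma_0=1.0) ** 2     # exponentiation by a scalar

    print(k1)
    # e.g. 1**2 * RBF(length_scale=1) + WhiteKernel(noise_level=0.1)
    print(k1.theta)    # log-transformed, non-fixed hyperparameters
    print(k1.bounds)   # log-transformed bounds, shape (n_dims, 2)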
+ """ + params = dict() + + # introspect the constructor arguments to find the model parameters + # to represent + cls = self.__class__ + init = getattr(cls.__init__, 'deprecated_original', cls.__init__) + args, varargs, kw, default = inspect.getargspec(init) + if varargs is not None: + raise RuntimeError("scikit-learn kernels should always " + "specify their parameters in the signature" + " of their __init__ (no varargs)." + " %s doesn't follow this convention." + % (cls, )) + # Remove 'self' and store remaining arguments in params + args = args[1:] + for arg in args: + params[arg] = getattr(self, arg, None) + return params + + def set_params(self, **params): + """Set the parameters of this kernel. + + The method works on simple kernels as well as on nested kernels. + The latter have parameters of the form ``__`` + so that it's possible to update each component of a nested object. + + Returns + ------- + self + """ + if not params: + # Simple optimisation to gain speed (inspect is slow) + return self + valid_params = self.get_params(deep=True) + for key, value in six.iteritems(params): + split = key.split('__', 1) + if len(split) > 1: + # nested objects case + name, sub_name = split + if name not in valid_params: + raise ValueError('Invalid parameter %s for kernel %s. ' + 'Check the list of available parameters ' + 'with `kernel.get_params().keys()`.' % + (name, self)) + sub_object = valid_params[name] + sub_object.set_params(**{sub_name: value}) + else: + # simple objects case + if key not in valid_params: + raise ValueError('Invalid parameter %s for kernel %s. ' + 'Check the list of available parameters ' + 'with `kernel.get_params().keys()`.' % + (key, self.__class__.__name__)) + setattr(self, key, value) + return self + + def clone_with_theta(self, theta): + """Returns a clone of self with given hyperparameters theta. """ + cloned = clone(self) + cloned.theta = theta + return cloned + + @property + def n_dims(self): + """Returns the number of non-fixed hyperparameters of the kernel.""" + return self.theta.shape[0] + + @property + def hyperparameters(self): + """Returns a list of all hyperparameter specifications.""" + r = [] + for attr, value in sorted(self.__dict__.items()): + if attr.startswith("hyperparameter_"): + r.append(value) + return r + + @property + def theta(self): + """Returns the (flattened, log-transformed) non-fixed hyperparameters. + + Note that theta are typically the log-transformed values of the + kernel's hyperparameters as this representation of the search space + is more amenable for hyperparameter search, as hyperparameters like + length-scales naturally live on a log-scale. + + Returns + ------- + theta : array, shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + theta = [] + for hyperparameter in self.hyperparameters: + if not hyperparameter.fixed: + theta.append(getattr(self, hyperparameter.name)) + if len(theta) > 0: + return np.log(np.hstack(theta)) + else: + return np.array([]) + + @theta.setter + def theta(self, theta): + """Sets the (flattened, log-transformed) non-fixed hyperparameters. 
+ + Parameters + ---------- + theta : array, shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + i = 0 + for hyperparameter in self.hyperparameters: + if hyperparameter.fixed: + continue + if hyperparameter.n_elements > 1: + # vector-valued parameter + setattr(self, hyperparameter.name, + np.exp(theta[i:i + hyperparameter.n_elements])) + i += hyperparameter.n_elements + else: + setattr(self, hyperparameter.name, np.exp(theta[i])) + i += 1 + + if i != len(theta): + raise ValueError("theta has not the correct number of entries." + " Should be %d; given are %d" + % (i, len(theta))) + + @property + def bounds(self): + """Returns the log-transformed bounds on the theta. + + Returns + ------- + bounds : array, shape (n_dims, 2) + The log-transformed bounds on the kernel's hyperparameters theta + """ + bounds = [] + for hyperparameter in self.hyperparameters: + if not hyperparameter.fixed: + bounds.append(hyperparameter.bounds) + if len(bounds) > 0: + return np.log(np.vstack(bounds)) + else: + return np.array([]) + + def __add__(self, b): + if not isinstance(b, Kernel): + return Sum(self, ConstantKernel(b)) + return Sum(self, b) + + def __radd__(self, b): + if not isinstance(b, Kernel): + return Sum(ConstantKernel(b), self) + return Sum(b, self) + + def __mul__(self, b): + if not isinstance(b, Kernel): + return Product(self, ConstantKernel(b)) + return Product(self, b) + + def __rmul__(self, b): + if not isinstance(b, Kernel): + return Product(ConstantKernel(b), self) + return Product(b, self) + + def __pow__(self, b): + return Exponentiation(self, b) + + def __eq__(self, b): + if type(self) != type(b): + return False + params_a = self.get_params() + params_b = b.get_params() + for key in set(list(params_a.keys()) + list(params_b.keys())): + if np.any(params_a.get(key, None) != params_b.get(key, None)): + return False + return True + + def __repr__(self): + return "{0}({1})".format(self.__class__.__name__, + ", ".join(map("{0:.3g}".format, self.theta))) + + @abstractmethod + def __call__(self, X, Y=None, eval_gradient=False): + """Evaluate the kernel.""" + + @abstractmethod + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Returns + ------- + K_diag : array, shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + + @abstractmethod + def is_stationary(self): + """Returns whether the kernel is stationary. """ + + +class NormalizedKernelMixin(object): + """Mixin for kernels which are normalized: k(X, X)=1.""" + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Returns + ------- + K_diag : array, shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return np.ones(X.shape[0]) + + +class StationaryKernelMixin(object): + """Mixin for kernels which are stationary: k(X, Y)= f(X-Y).""" + + def is_stationary(self): + """Returns whether the kernel is stationary. 
""" + return True + + +class CompoundKernel(Kernel): + """Kernel which is composed of a set of other kernels.""" + + def __init__(self, kernels): + self.kernels = kernels + + def get_params(self, deep=True): + """Get parameters of this kernel. + + Parameters + ---------- + deep: boolean, optional + If True, will return the parameters for this estimator and + contained subobjects that are estimators. + + Returns + ------- + params : mapping of string to any + Parameter names mapped to their values. + """ + return dict(kernels=self.kernels) + + @property + def theta(self): + """Returns the (flattened, log-transformed) non-fixed hyperparameters. + + Note that theta are typically the log-transformed values of the + kernel's hyperparameters as this representation of the search space + is more amenable for hyperparameter search, as hyperparameters like + length-scales naturally live on a log-scale. + + Returns + ------- + theta : array, shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + return np.hstack([kernel.theta for kernel in self.kernels]) + + @theta.setter + def theta(self, theta): + """Sets the (flattened, log-transformed) non-fixed hyperparameters. + + Parameters + ---------- + theta : array, shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + k_dims = self.k1.n_dims + for i, kernel in enumerate(self.kernels): + kernel.theta = theta[i*k_dims:(i+1)*k_dims] + + @property + def bounds(self): + """Returns the log-transformed bounds on the theta. + + Returns + ------- + bounds : array, shape (n_dims, 2) + The log-transformed bounds on the kernel's hyperparameters theta + """ + return np.vstack([kernel.bounds for kernel in self.kernels]) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Note that this compound kernel returns the results of all simple kernel + stacked along an additional axis. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : array, shape (n_samples_Y, n_features), (optional, default=None) + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool (optional, default=False) + Determines whether the gradient with respect to the kernel + hyperparameter is determined. + + Returns + ------- + K : array, shape (n_samples_X, n_samples_Y, n_kernels) + Kernel k(X, Y) + + K_gradient : array, shape (n_samples_X, n_samples_X, n_dims, n_kernels) + The gradient of the kernel k(X, X) with respect to the + hyperparameter of the kernel. Only returned when eval_gradient + is True. + """ + if eval_gradient: + K = [] + K_grad = [] + for kernel in self.kernels: + K_single, K_grad_single = kernel(X, Y, eval_gradient) + K.append(K_single) + K_grad.append(K_grad_single[..., np.newaxis]) + return np.dstack(K), np.concatenate(K_grad, 3) + else: + return np.dstack([kernel(X, Y, eval_gradient) + for kernel in self.kernels]) + + def __eq__(self, b): + if type(self) != type(b) or len(self.kernels) != len(b.kernels): + return False + return np.all([self.kernels[i] == b.kernels[i] + for i in range(len(self.kernels))]) + + def is_stationary(self): + """Returns whether the kernel is stationary. """ + return np.all([kernel.is_stationary() for kernel in self.kernels]) + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). 
+ + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Returns + ------- + K_diag : array, shape (n_samples_X, n_kernels) + Diagonal of kernel k(X, X) + """ + return np.vstack([kernel.diag(X) for kernel in self.kernels]).T + + +class KernelOperator(Kernel): + """Base class for all kernel operators. """ + + def __init__(self, k1, k2): + self.k1 = k1 + self.k2 = k2 + + def get_params(self, deep=True): + """Get parameters of this kernel. + + Parameters + ---------- + deep: boolean, optional + If True, will return the parameters for this estimator and + contained subobjects that are estimators. + + Returns + ------- + params : mapping of string to any + Parameter names mapped to their values. + """ + params = dict(k1=self.k1, k2=self.k2) + if deep: + deep_items = self.k1.get_params().items() + params.update(('k1__' + k, val) for k, val in deep_items) + deep_items = self.k2.get_params().items() + params.update(('k2__' + k, val) for k, val in deep_items) + + return params + + @property + def hyperparameters(self): + """Returns a list of all hyperparameter.""" + r = [] + for hyperparameter in self.k1.hyperparameters: + r.append(Hyperparameter("k1__" + hyperparameter.name, + hyperparameter.value_type, + hyperparameter.bounds, + hyperparameter.n_elements)) + for hyperparameter in self.k2.hyperparameters: + r.append(Hyperparameter("k2__" + hyperparameter.name, + hyperparameter.value_type, + hyperparameter.bounds, + hyperparameter.n_elements)) + return r + + @property + def theta(self): + """Returns the (flattened, log-transformed) non-fixed hyperparameters. + + Note that theta are typically the log-transformed values of the + kernel's hyperparameters as this representation of the search space + is more amenable for hyperparameter search, as hyperparameters like + length-scales naturally live on a log-scale. + + Returns + ------- + theta : array, shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + return np.append(self.k1.theta, self.k2.theta) + + @theta.setter + def theta(self, theta): + """Sets the (flattened, log-transformed) non-fixed hyperparameters. + + Parameters + ---------- + theta : array, shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + k1_dims = self.k1.n_dims + self.k1.theta = theta[:k1_dims] + self.k2.theta = theta[k1_dims:] + + @property + def bounds(self): + """Returns the log-transformed bounds on the theta. + + Returns + ------- + bounds : array, shape (n_dims, 2) + The log-transformed bounds on the kernel's hyperparameters theta + """ + if self.k1.bounds.size == 0: + return self.k2.bounds + if self.k2.bounds.size == 0: + return self.k1.bounds + return np.vstack((self.k1.bounds, self.k2.bounds)) + + def __eq__(self, b): + if type(self) != type(b): + return False + return (self.k1 == b.k1 and self.k2 == b.k2) \ + or (self.k1 == b.k2 and self.k2 == b.k1) + + def is_stationary(self): + """Returns whether the kernel is stationary. """ + return self.k1.is_stationary() and self.k2.is_stationary() + + +class Sum(KernelOperator): + """Sum-kernel k1 + k2 of two kernels k1 and k2. 
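Composite kernels follow the usual scikit-learn convention for nested parameters, so the components of a Sum or Product can be inspected and updated by name (a small sketch with arbitrary kernels)::

    from sklearn.gaussian_process.kernels import RBF, WhiteKernel

    k = RBF(length_scale=1.0) + WhiteKernel(noise_level=0.1)
    print(sorted(k.get_params().keys()))
    # ['k1', 'k1__length_scale', 'k1__length_scale_bounds',
    #  'k2', 'k2__noise_level', 'k2__noise_level_bounds']

    k.set_params(k1__length_scale=3.0, k2__noise_level=0.5)
    print(k.k1.length_scale, k.k2.noise_level)   # 3.0 0.5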
+ + The resulting kernel is defined as + k_sum(X, Y) = k1(X, Y) + k2(X, Y) + + Parameters + ---------- + k1 : Kernel object + The first base-kernel of the sum-kernel + + k2 : Kernel object + The second base-kernel of the sum-kernel + """ + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : array, shape (n_samples_Y, n_features), (optional, default=None) + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool (optional, default=False) + Determines whether the gradient with respect to the kernel + hyperparameter is determined. + + Returns + ------- + K : array, shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims) + The gradient of the kernel k(X, X) with respect to the + hyperparameter of the kernel. Only returned when eval_gradient + is True. + """ + if eval_gradient: + K1, K1_gradient = self.k1(X, Y, eval_gradient=True) + K2, K2_gradient = self.k2(X, Y, eval_gradient=True) + return K1 + K2, np.dstack((K1_gradient, K2_gradient)) + else: + return self.k1(X, Y) + self.k2(X, Y) + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Returns + ------- + K_diag : array, shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return self.k1.diag(X) + self.k2.diag(X) + + def __repr__(self): + return "{0} + {1}".format(self.k1, self.k2) + + +class Product(KernelOperator): + """Product-kernel k1 * k2 of two kernels k1 and k2. + + The resulting kernel is defined as + k_prod(X, Y) = k1(X, Y) * k2(X, Y) + + Parameters + ---------- + k1 : Kernel object + The first base-kernel of the product-kernel + + k2 : Kernel object + The second base-kernel of the product-kernel + """ + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : array, shape (n_samples_Y, n_features), (optional, default=None) + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool (optional, default=False) + Determines whether the gradient with respect to the kernel + hyperparameter is determined. + + Returns + ------- + K : array, shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims) + The gradient of the kernel k(X, X) with respect to the + hyperparameter of the kernel. Only returned when eval_gradient + is True. + """ + if eval_gradient: + K1, K1_gradient = self.k1(X, Y, eval_gradient=True) + K2, K2_gradient = self.k2(X, Y, eval_gradient=True) + return K1 * K2, np.dstack((K1_gradient * K2[:, :, np.newaxis], + K2_gradient * K1[:, :, np.newaxis])) + else: + return self.k1(X, Y) * self.k2(X, Y) + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. 
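Calling a (composite) kernel with ``eval_gradient=True`` returns the kernel matrix together with its gradient with respect to the log-hyperparameters, one slice per non-fixed hyperparameter; ``diag`` gives the same values as the diagonal of the full matrix, only cheaper (arbitrary toy input below)::

    import numpy as np
    from sklearn.gaussian_process.kernels import RBF, WhiteKernel

    X = np.random.RandomState(0).rand(5, 2)
    k = RBF(1.0) + WhiteKernel(0.1)

    K = k(X)                                   # shape (5, 5)
    K, K_gradient = k(X, eval_gradient=True)   # K_gradient shape (5, 5, 2)
    print(np.allclose(np.diag(K), k.diag(X)))  # True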
+ + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Returns + ------- + K_diag : array, shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return self.k1.diag(X) * self.k2.diag(X) + + def __repr__(self): + return "{0} * {1}".format(self.k1, self.k2) + + +class Exponentiation(Kernel): + """Exponentiate kernel by given exponent. + + The resulting kernel is defined as + k_exp(X, Y) = k(X, Y) ** exponent + + Parameters + ---------- + kernel : Kernel object + The base kernel + + exponent : float + The exponent for the base kernel + + """ + def __init__(self, kernel, exponent): + self.kernel = kernel + self.exponent = exponent + + def get_params(self, deep=True): + """Get parameters of this kernel. + + Parameters + ---------- + deep: boolean, optional + If True, will return the parameters for this estimator and + contained subobjects that are estimators. + + Returns + ------- + params : mapping of string to any + Parameter names mapped to their values. + """ + params = dict(kernel=self.kernel, exponent=self.exponent) + if deep: + deep_items = self.kernel.get_params().items() + params.update(('kernel__' + k, val) for k, val in deep_items) + return params + + @property + def hyperparameters(self): + """Returns a list of all hyperparameter.""" + r = [] + for hyperparameter in self.kernel.hyperparameters: + r.append(Hyperparameter("kernel__" + hyperparameter.name, + hyperparameter.value_type, + hyperparameter.bounds, + hyperparameter.n_elements)) + return r + + @property + def theta(self): + """Returns the (flattened, log-transformed) non-fixed hyperparameters. + + Note that theta are typically the log-transformed values of the + kernel's hyperparameters as this representation of the search space + is more amenable for hyperparameter search, as hyperparameters like + length-scales naturally live on a log-scale. + + Returns + ------- + theta : array, shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + return self.kernel.theta + + @theta.setter + def theta(self, theta): + """Sets the (flattened, log-transformed) non-fixed hyperparameters. + + Parameters + ---------- + theta : array, shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + self.kernel.theta = theta + + @property + def bounds(self): + """Returns the log-transformed bounds on the theta. + + Returns + ------- + bounds : array, shape (n_dims, 2) + The log-transformed bounds on the kernel's hyperparameters theta + """ + return self.kernel.bounds + + def __eq__(self, b): + if type(self) != type(b): + return False + return (self.kernel == b.kernel and self.exponent == b.exponent) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : array, shape (n_samples_Y, n_features), (optional, default=None) + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool (optional, default=False) + Determines whether the gradient with respect to the kernel + hyperparameter is determined. + + Returns + ------- + K : array, shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims) + The gradient of the kernel k(X, X) with respect to the + hyperparameter of the kernel. Only returned when eval_gradient + is True. 
+ """ + if eval_gradient: + K, K_gradient = self.kernel(X, Y, eval_gradient=True) + K_gradient *= \ + self.exponent * K[:, :, np.newaxis] ** (self.exponent - 1) + return K ** self.exponent, K_gradient + else: + K = self.kernel(X, Y, eval_gradient=False) + return K ** self.exponent + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Returns + ------- + K_diag : array, shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return self.kernel.diag(X) ** self.exponent + + def __repr__(self): + return "{0} ** {1}".format(self.kernel, self.exponent) + + def is_stationary(self): + """Returns whether the kernel is stationary. """ + return self.kernel.is_stationary() + + +class ConstantKernel(StationaryKernelMixin, Kernel): + """Constant kernel. + + Can be used as part of a product-kernel where it scales the magnitude of + the other factor (kernel) or as part of a sum-kernel, where it modifies + the mean of the Gaussian process. + + k(x_1, x_2) = constant_value for all x_1, x_2 + + Parameters + ---------- + constant_value : float, default: 1.0 + The constant value which defines the covariance: + k(x_1, x_2) = constant_value + + constant_value_bounds : pair of floats >= 0, default: (1e-5, 1e5) + The lower and upper bound on constant_value + """ + def __init__(self, constant_value=1.0, constant_value_bounds=(1e-5, 1e5)): + self.constant_value = constant_value + self.constant_value_bounds = constant_value_bounds + + self.hyperparameter_constant_value = \ + Hyperparameter("constant_value", "numeric", constant_value_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : array, shape (n_samples_Y, n_features), (optional, default=None) + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool (optional, default=False) + Determines whether the gradient with respect to the kernel + hyperparameter is determined. Only supported when Y is None. + + Returns + ------- + K : array, shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims) + The gradient of the kernel k(X, X) with respect to the + hyperparameter of the kernel. Only returned when eval_gradient + is True. + """ + X = np.atleast_2d(X) + if Y is None: + Y = X + elif eval_gradient: + raise ValueError("Gradient can only be evaluated when Y is None.") + + K = self.constant_value * np.ones((X.shape[0], Y.shape[0])) + if eval_gradient: + if not self.hyperparameter_constant_value.fixed: + return (K, self.constant_value + * np.ones((X.shape[0], X.shape[0], 1))) + else: + return K, np.empty((X.shape[0], X.shape[0], 0)) + else: + return K + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. 
+ + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Returns + ------- + K_diag : array, shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return self.constant_value * np.ones(X.shape[0]) + + def __repr__(self): + return "{0:.3g}**2".format(np.sqrt(self.constant_value)) + + +class WhiteKernel(StationaryKernelMixin, Kernel): + """White kernel. + + The main use-case of this kernel is as part of a sum-kernel where it + explains the noise-component of the signal. Tuning its parameter + corresponds to estimating the noise-level. + + k(x_1, x_2) = noise_level if x_1 == x_2 else 0 + + Parameters + ---------- + noise_level : float, default: 1.0 + Parameter controlling the noise level + + noise_level_bounds : pair of floats >= 0, default: (1e-5, 1e5) + The lower and upper bound on noise_level + """ + def __init__(self, noise_level=1.0, noise_level_bounds=(1e-5, 1e5)): + self.noise_level = noise_level + self.noise_level_bounds = noise_level_bounds + + self.hyperparameter_noise_level = \ + Hyperparameter("noise_level", "numeric", noise_level_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : array, shape (n_samples_Y, n_features), (optional, default=None) + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool (optional, default=False) + Determines whether the gradient with respect to the kernel + hyperparameter is determined. Only supported when Y is None. + + Returns + ------- + K : array, shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims) + The gradient of the kernel k(X, X) with respect to the + hyperparameter of the kernel. Only returned when eval_gradient + is True. + """ + X = np.atleast_2d(X) + if Y is not None and eval_gradient: + raise ValueError("Gradient can only be evaluated when Y is None.") + + if Y is None: + K = self.noise_level * np.eye(X.shape[0]) + if eval_gradient: + if not self.hyperparameter_noise_level.fixed: + return (K, self.noise_level + * np.eye(X.shape[0])[:, :, np.newaxis]) + else: + return K, np.empty((X.shape[0], X.shape[0], 0)) + else: + return K + else: + return np.zeros((X.shape[0], Y.shape[0])) + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Returns + ------- + K_diag : array, shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return self.noise_level * np.ones(X.shape[0]) + + def __repr__(self): + return "{0}(noise_level={1:.3g})".format(self.__class__.__name__, + self.noise_level) + + +class RBF(StationaryKernelMixin, NormalizedKernelMixin, Kernel): + """Radial-basis function kernel (aka squared-exponential kernel). + + The RBF kernel is a stationary kernel. It is also known as the + "squared exponential" kernel. It is parameterized by a length-scale + parameter length_scale>0, which can either be a scalar (isotropic variant + of the kernel) or a vector with the same number of dimensions as the inputs + X (anisotropic variant of the kernel). 
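Two small sanity checks on the kernels above (arbitrary inputs): WhiteKernel contributes noise only on k(X, X) while its cross-covariance k(X, Y) is zero, and ConstantKernel yields a constant matrix::

    import numpy as np
    from sklearn.gaussian_process.kernels import ConstantKernel, WhiteKernel

    X = np.arange(3, dtype=float)[:, np.newaxis]
    Y = np.arange(2, dtype=float)[:, np.newaxis]

    white = WhiteKernel(noise_level=0.5)
    print(white(X))       # 0.5 * identity
    print(white(X, Y))    # all zeros: noise does not correlate across inputs

    const = ConstantKernel(constant_value=2.0)
    print(const(X, Y))    # 2.0 everywhere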
The kernel is given by: + + k(x_i, x_j) = exp(-1 / 2 d(x_i / length_scale, x_j / length_scale)^2) + + This kernel is infinitely differentiable, which implies that GPs with this + kernel as covariance function have mean square derivatives of all orders, + and are thus very smooth. + + Parameters + ----------- + length_scale : float or array with shape (n_features,), default: 1.0 + The length scale of the kernel. If a float, an isotropic kernel is + used. If an array, an anisotropic kernel is used where each dimension + of l defines the length-scale of the respective feature dimension. + + length_scale_bounds : pair of floats >= 0, default: (1e-5, 1e5) + The lower and upper bound on length_scale + """ + def __init__(self, length_scale=1.0, length_scale_bounds=(1e-5, 1e5)): + if np.iterable(length_scale): + if len(length_scale) > 1: + self.anisotropic = True + self.length_scale = np.asarray(length_scale, dtype=np.float) + else: + self.anisotropic = False + self.length_scale = float(length_scale[0]) + else: + self.anisotropic = False + self.length_scale = float(length_scale) + self.length_scale_bounds = length_scale_bounds + + if self.anisotropic: # anisotropic length_scale + self.hyperparameter_length_scale = \ + Hyperparameter("length_scale", "numeric", length_scale_bounds, + len(length_scale)) + else: + self.hyperparameter_length_scale = \ + Hyperparameter("length_scale", "numeric", length_scale_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : array, shape (n_samples_Y, n_features), (optional, default=None) + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool (optional, default=False) + Determines whether the gradient with respect to the kernel + hyperparameter is determined. Only supported when Y is None. + + Returns + ------- + K : array, shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims) + The gradient of the kernel k(X, X) with respect to the + hyperparameter of the kernel. Only returned when eval_gradient + is True. 
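A sketch of the isotropic versus anisotropic (per-feature length scales) variants; the numbers are arbitrary, and a larger length scale lets the kernel vary more slowly along that feature::

    import numpy as np
    from sklearn.gaussian_process.kernels import RBF

    X = np.random.RandomState(1).rand(4, 2)

    iso = RBF(length_scale=1.0)
    ard = RBF(length_scale=[1.0, 10.0])   # one length scale per feature

    print(iso(X).shape, ard(X).shape)     # (4, 4) (4, 4)
    print(ard)                            # RBF(length_scale=[1, 10])
    print(ard.theta.shape)                # (2,): one entry per length scale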
+ """ + X = np.atleast_2d(X) + if self.anisotropic and X.shape[1] != self.length_scale.shape[0]: + raise Exception("Anisotropic kernel must have the same number of " + "dimensions as data (%d!=%d)" + % (self.length_scale.shape[0], X.shape[1])) + + if Y is None: + dists = pdist(X / self.length_scale, metric='sqeuclidean') + K = np.exp(-.5 * dists) + # convert from upper-triangular matrix to square matrix + K = squareform(K) + np.fill_diagonal(K, 1) + else: + if eval_gradient: + raise ValueError( + "Gradient can only be evaluated when Y is None.") + dists = cdist(X / self.length_scale, Y / self.length_scale, + metric='sqeuclidean') + K = np.exp(-.5 * dists) + + if eval_gradient: + if self.hyperparameter_length_scale.fixed: + # Hyperparameter l kept fixed + return K, np.empty((X.shape[0], X.shape[0], 0)) + elif not self.anisotropic or self.length_scale.shape[0] == 1: + K_gradient = \ + (K * squareform(dists))[:, :, np.newaxis] + return K, K_gradient + elif self.anisotropic: + # We need to recompute the pairwise dimension-wise distances + K_gradient = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2 \ + / (self.length_scale ** 2) + K_gradient *= K[..., np.newaxis] + return K, K_gradient + else: + raise Exception("Anisotropic kernels require that the number " + "of length scales and features match.") + else: + return K + + def __repr__(self): + if self.anisotropic: + return "{0}(length_scale=[{1}])".format( + self.__class__.__name__, ", ".join(map("{0:.3g}".format, + self.length_scale))) + else: # isotropic + return "{0}(length_scale={1:.3g})".format( + self.__class__.__name__, self.length_scale) + + +class Matern(RBF): + """ Matern kernel. + + The class of Matern kernels is a generalization of the RBF and the + absolute exponential kernel parameterized by an additional parameter + nu. The smaller nu, the less smooth the approximated function is. + For nu=inf, the kernel becomes equivalent to the RBF kernel and for nu=0.5 + to the absolute exponential kernel. Important intermediate values are + nu=1.5 (once differentiable functions) and nu=2.5 (twice differentiable + functions). + + See Rasmussen and Williams 2006, pp84 for details regarding the + different variants of the Matern kernel. + + Parameters + ----------- + length_scale : float or array with shape (n_features,), default: 1.0 + The length scale of the kernel. If a float, an isotropic kernel is + used. If an array, an anisotropic kernel is used where each dimension + of l defines the length-scale of the respective feature dimension. + + length_scale_bounds : pair of floats >= 0, default: (1e-5, 1e5) + The lower and upper bound on length_scale + + nu: float, default: 1.5 + The parameter nu controlling the smoothness of the learned function. + The smaller nu, the less smooth the approximated function is. + For nu=inf, the kernel becomes equivalent to the RBF kernel and for + nu=0.5 to the absolute exponential kernel. Important intermediate + values are nu=1.5 (once differentiable functions) and nu=2.5 + (twice differentiable functions). Note that values of nu not in + [0.5, 1.5, 2.5, inf] incur a considerably higher computational cost + (appr. 10 times higher) since they require to evaluate the modified + Bessel function. Furthermore, in contrast to l, nu is kept fixed to + its initial value and not optimized. 
+ """ + def __init__(self, length_scale=1.0, length_scale_bounds=(1e-5, 1e5), + nu=1.5): + super(Matern, self).__init__(length_scale, length_scale_bounds) + self.nu = nu + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : array, shape (n_samples_Y, n_features), (optional, default=None) + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool (optional, default=False) + Determines whether the gradient with respect to the kernel + hyperparameter is determined. Only supported when Y is None. + + Returns + ------- + K : array, shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims) + The gradient of the kernel k(X, X) with respect to the + hyperparameter of the kernel. Only returned when eval_gradient + is True. + """ + X = np.atleast_2d(X) + if self.anisotropic and X.shape[1] != self.length_scale.shape[0]: + raise Exception("Anisotropic kernel must have the same number of " + "dimensions as data (%d!=%d)" + % (self.length_scale.shape[0], X.shape[1])) + + if Y is None: + dists = pdist(X / self.length_scale, metric='euclidean') + else: + if eval_gradient: + raise ValueError( + "Gradient can only be evaluated when Y is None.") + dists = cdist(X / self.length_scale, Y / self.length_scale, + metric='euclidean') + + if self.nu == 0.5: + K = np.exp(-dists) + elif self.nu == 1.5: + K = dists * math.sqrt(3) + K = (1. + K) * np.exp(-K) + elif self.nu == 2.5: + K = dists * math.sqrt(5) + K = (1. + K + K ** 2 / 3.0) * np.exp(-K) + else: # general case; expensive to evaluate + K = dists + K[K == 0.0] += np.finfo(float).eps # strict zeros result in nan + tmp = (math.sqrt(2 * self.nu) * K) + K.fill((2 ** (1. 
- self.nu)) / gamma(self.nu)) + K *= tmp ** self.nu + K *= kv(self.nu, tmp) + + if Y is None: + # convert from upper-triangular matrix to square matrix + K = squareform(K) + np.fill_diagonal(K, 1) + + if eval_gradient: + if self.hyperparameter_length_scale.fixed: + # Hyperparameter l kept fixed + K_gradient = np.empty((X.shape[0], X.shape[0], 0)) + return K, K_gradient + + # We need to recompute the pairwise dimension-wise distances + if self.anisotropic: + D = (X[:, np.newaxis, :] - X[np.newaxis, :, :])**2 \ + / (self.length_scale ** 2) + else: + D = squareform(dists**2)[:, :, np.newaxis] + + if self.nu == 0.5: + K_gradient = K[..., np.newaxis] * D \ + / np.sqrt(D.sum(2))[:, :, np.newaxis] + K_gradient[~np.isfinite(K_gradient)] = 0 + elif self.nu == 1.5: + K_gradient = \ + 3 * D * np.exp(-np.sqrt(3 * D.sum(-1)))[..., np.newaxis] + elif self.nu == 2.5: + tmp = np.sqrt(5 * D.sum(-1))[..., np.newaxis] + K_gradient = 5.0/3.0 * D * (tmp + 1) * np.exp(-tmp) + else: + # approximate gradient numerically + def f(theta): # helper function + return self.clone_with_theta(theta)(X, Y) + return K, _approx_fprime(self.theta, f, 1e-10) + + if not self.anisotropic: + return K, K_gradient[:, :].sum(-1)[:, :, np.newaxis] + else: + return K, K_gradient + else: + return K + + def __repr__(self): + if self.anisotropic: + return "{0}(length_scale=[{1}], nu={2:.3g})".format( + self.__class__.__name__, + ", ".join(map("{0:.3g}".format, self.length_scale)), + self.nu) + else: # isotropic + return "{0}(length_scale={1:.3g}, nu={2:.3g})".format( + self.__class__.__name__, self.length_scale, self.nu) + + +class RationalQuadratic(StationaryKernelMixin, NormalizedKernelMixin, Kernel): + """Rational Quadratic kernel. + + The RationalQuadratic kernel can be seen as a scale mixture (an infinite + sum) of RBF kernels with different characteristic length-scales. It is + parameterized by a length-scale parameter length_scale>0 and a scale + mixture parameter alpha>0. Only the isotropic variant where length_scale is + a scalar is supported at the moment. The kernel given by: + + k(x_i, x_j) = (1 + d(x_i, x_j)^2 / (2*alpha * length_scale^2))^-alpha + + Parameters + ---------- + length_scale : float > 0, default: 1.0 + The length scale of the kernel. + + alpha : float > 0, default: 1.0 + Scale mixture parameter + + length_scale_bounds : pair of floats >= 0, default: (1e-5, 1e5) + The lower and upper bound on length_scale + + alpha_bounds : pair of floats >= 0, default: (1e-5, 1e5) + The lower and upper bound on alpha + """ + def __init__(self, length_scale=1.0, alpha=1.0, + length_scale_bounds=(1e-5, 1e5), alpha_bounds=(1e-5, 1e5)): + self.length_scale = length_scale + self.alpha = alpha + self.length_scale_bounds = length_scale_bounds + self.alpha_bounds = alpha_bounds + + self.hyperparameter_length_scale = \ + Hyperparameter("length_scale", "numeric", length_scale_bounds) + self.hyperparameter_alpha = \ + Hyperparameter("alpha", "numeric", alpha_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : array, shape (n_samples_Y, n_features), (optional, default=None) + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool (optional, default=False) + Determines whether the gradient with respect to the kernel + hyperparameter is determined. Only supported when Y is None. 
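Two numerical sanity checks on the stationary kernels above (arbitrary inputs): Matern with nu=0.5 reduces to the absolute-exponential kernel exp(-d), and RationalQuadratic approaches the RBF kernel for large alpha, consistent with its interpretation as a scale mixture of RBF kernels::

    import numpy as np
    from scipy.spatial.distance import cdist
    from sklearn.gaussian_process.kernels import (Matern, RationalQuadratic,
                                                  RBF)

    X = np.random.RandomState(0).rand(6, 2)

    print(np.allclose(Matern(length_scale=1.0, nu=0.5)(X),
                      np.exp(-cdist(X, X))))                     # True

    print(np.allclose(RationalQuadratic(length_scale=1.0, alpha=1e8)(X),
                      RBF(length_scale=1.0)(X)))                 # True (approx.)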
+ + Returns + ------- + K : array, shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims) + The gradient of the kernel k(X, X) with respect to the + hyperparameter of the kernel. Only returned when eval_gradient + is True. + """ + X = np.atleast_2d(X) + if Y is None: + dists = squareform(pdist(X, metric='sqeuclidean')) + tmp = dists / (2 * self.alpha * self.length_scale ** 2) + base = (1 + tmp) + K = base ** -self.alpha + np.fill_diagonal(K, 1) + else: + if eval_gradient: + raise ValueError( + "Gradient can only be evaluated when Y is None.") + dists = cdist(X, Y, metric='sqeuclidean') + K = (1 + dists / (2 * self.alpha * self.length_scale ** 2)) \ + ** -self.alpha + + if eval_gradient: + # gradient with respect to length_scale + if not self.hyperparameter_length_scale.fixed: + length_scale_gradient = \ + dists * K / (self.length_scale ** 2 * base) + length_scale_gradient = length_scale_gradient[:, :, np.newaxis] + else: # l is kept fixed + length_scale_gradient = np.empty((K.shape[0], K.shape[1], 0)) + + # gradient with respect to alpha + if not self.hyperparameter_alpha.fixed: + alpha_gradient = \ + K * (-self.alpha * np.log(base) + + dists / (2 * self.length_scale ** 2 * base)) + alpha_gradient = alpha_gradient[:, :, np.newaxis] + else: # alpha is kept fixed + alpha_gradient = np.empty((K.shape[0], K.shape[1], 0)) + + return K, np.dstack((alpha_gradient, length_scale_gradient)) + else: + return K + + def __repr__(self): + return "{0}(alpha={1:.3g}, length_scale={2:.3g})".format( + self.__class__.__name__, self.alpha, self.length_scale) + + +class ExpSineSquared(StationaryKernelMixin, NormalizedKernelMixin, Kernel): + """Exp-Sine-Squared kernel. + + The ExpSineSquared kernel allows modeling periodic functions. It is + parameterized by a length-scale parameter length_scale>0 and a periodicity + parameter periodicity>0. Only the isotropic variant where l is a scalar is + supported at the moment. The kernel given by: + + k(x_i, x_j) = exp(-2 sin(\pi / periodicity * d(x_i, x_j)) / length_scale)^2 + + Parameters + ---------- + length_scale : float > 0, default: 1.0 + The length scale of the kernel. + + periodicity : float > 0, default: 1.0 + The periodicity of the kernel. + + length_scale_bounds : pair of floats >= 0, default: (1e-5, 1e5) + The lower and upper bound on length_scale + + periodicity_bounds : pair of floats >= 0, default: (1e-5, 1e5) + The lower and upper bound on periodicity + """ + def __init__(self, length_scale=1.0, periodicity=1.0, + length_scale_bounds=(1e-5, 1e5), + periodicity_bounds=(1e-5, 1e5)): + self.length_scale = length_scale + self.periodicity = periodicity + self.length_scale_bounds = length_scale_bounds + self.periodicity_bounds = periodicity_bounds + + self.hyperparameter_length_scale = \ + Hyperparameter("length_scale", "numeric", length_scale_bounds) + self.hyperparameter_periodicity = \ + Hyperparameter("periodicity", "numeric", periodicity_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : array, shape (n_samples_Y, n_features), (optional, default=None) + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool (optional, default=False) + Determines whether the gradient with respect to the kernel + hyperparameter is determined. 
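A quick check of the periodicity property (arbitrary 1-D inputs): points exactly one period apart have correlation (numerically) one, while points half a period apart do not::

    import numpy as np
    from sklearn.gaussian_process.kernels import ExpSineSquared

    k = ExpSineSquared(length_scale=1.0, periodicity=3.0)
    x = np.array([[0.0], [3.0], [1.5]])   # 0 and 3 are one period apart
    K = k(x)
    print(np.round(K, 3))   # K[0, 1] ~= 1.0, K[0, 2] = exp(-2) ~= 0.135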
Only supported when Y is None. + + Returns + ------- + K : array, shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims) + The gradient of the kernel k(X, X) with respect to the + hyperparameter of the kernel. Only returned when eval_gradient + is True. + """ + X = np.atleast_2d(X) + if Y is None: + dists = squareform(pdist(X, metric='euclidean')) + arg = np.pi * dists / self.periodicity + sin_of_arg = np.sin(arg) + K = np.exp(- 2 * (sin_of_arg / self.length_scale) ** 2) + else: + if eval_gradient: + raise ValueError( + "Gradient can only be evaluated when Y is None.") + dists = cdist(X, Y, metric='euclidean') + K = np.exp(- 2 * (np.sin(np.pi / self.periodicity * dists) + / self.length_scale) ** 2) + + if eval_gradient: + cos_of_arg = np.cos(arg) + # gradient with respect to length_scale + if not self.hyperparameter_length_scale.fixed: + length_scale_gradient = \ + 4 / self.length_scale**2 * sin_of_arg**2 * K + length_scale_gradient = length_scale_gradient[:, :, np.newaxis] + else: # length_scale is kept fixed + length_scale_gradient = np.empty((K.shape[0], K.shape[1], 0)) + # gradient with respect to p + if not self.hyperparameter_periodicity.fixed: + periodicity_gradient = \ + 4 * arg / self.length_scale**2 * cos_of_arg \ + * sin_of_arg * K + periodicity_gradient = periodicity_gradient[:, :, np.newaxis] + else: # p is kept fixed + periodicity_gradient = np.empty((K.shape[0], K.shape[1], 0)) + + return K, np.dstack((length_scale_gradient, periodicity_gradient)) + else: + return K + + def __repr__(self): + return "{0}(length_scale={1:.3g}, periodicity={2:.3g})".format( + self.__class__.__name__, self.length_scale, self.periodicity) + + +class DotProduct(Kernel): + """Dot-Product kernel. + + The DotProduct kernel is non-stationary and can be obtained from linear + regression by putting N(0, 1) priors on the coefficients of x_d (d = 1, . . + . , D) and a prior of N(0, \sigma_0^2) on the bias. The DotProduct kernel + is invariant to a rotation of the coordinates about the origin, but not + translations. It is parameterized by a parameter sigma_0^2. For + sigma_0^2 =0, the kernel is called the homogeneous linear kernel, otherwise + it is inhomogeneous. The kernel is given by + + k(x_i, x_j) = sigma_0 ^ 2 + x_i \cdot x_j + + The DotProduct kernel is commonly combined with exponentiation. + + Parameters + ---------- + sigma_0 : float >= 0, default: 1.0 + Parameter controlling the inhomogenity of the kernel. If sigma_0=0, + the kernel is homogenous. + + sigma_0_bounds : pair of floats >= 0, default: (1e-5, 1e5) + The lower and upper bound on l + """ + + def __init__(self, sigma_0=1.0, sigma_0_bounds=(1e-5, 1e5)): + self.sigma_0 = sigma_0 + self.sigma_0_bounds = sigma_0_bounds + + self.hyperparameter_sigma_0 = \ + Hyperparameter("sigma_0", "numeric", sigma_0_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : array, shape (n_samples_Y, n_features), (optional, default=None) + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool (optional, default=False) + Determines whether the gradient with respect to the kernel + hyperparameter is determined. Only supported when Y is None. 
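A brief sketch of the DotProduct kernel (arbitrary inputs): with sigma_0=0 it is the plain linear kernel X.dot(X.T), and, as noted above, it is often combined with exponentiation to obtain polynomial-style kernels::

    import numpy as np
    from sklearn.gaussian_process.kernels import DotProduct

    X = np.random.RandomState(2).rand(4, 3)

    linear = DotProduct(sigma_0=0.0)              # homogeneous linear kernel
    print(np.allclose(linear(X), X.dot(X.T)))     # True

    quadratic = DotProduct(sigma_0=1.0) ** 2      # (1 + x_i . x_j) ** 2
    print(quadratic(X).shape)                     # (4, 4)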
+ + Returns + ------- + K : array, shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims) + The gradient of the kernel k(X, X) with respect to the + hyperparameter of the kernel. Only returned when eval_gradient + is True. + """ + X = np.atleast_2d(X) + if Y is None: + K = np.inner(X, X) + self.sigma_0 ** 2 + else: + if eval_gradient: + raise ValueError( + "Gradient can only be evaluated when Y is None.") + K = np.inner(X, Y) + self.sigma_0 ** 2 + + if eval_gradient: + if not self.hyperparameter_sigma_0.fixed: + K_gradient = np.empty((K.shape[0], K.shape[1], 1)) + K_gradient[..., 0] = 2 * self.sigma_0 ** 2 + return K, K_gradient + else: + return K, np.empty((X.shape[0], X.shape[0], 0)) + else: + return K + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Returns + ------- + K_diag : array, shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return np.einsum('ij,ij->i', X, X) + self.sigma_0 ** 2 + + def is_stationary(self): + """Returns whether the kernel is stationary. """ + return False + + def __repr__(self): + return "{0}(sigma_0={1:.3g})".format( + self.__class__.__name__, self.sigma_0) + + +# adapted from scipy/optimize/optimize.py for functions with 2d output +def _approx_fprime(xk, f, epsilon, args=()): + f0 = f(*((xk,) + args)) + grad = np.zeros((f0.shape[0], f0.shape[1], len(xk)), float) + ei = np.zeros((len(xk), ), float) + for k in range(len(xk)): + ei[k] = 1.0 + d = epsilon * ei + grad[:, :, k] = (f(*((xk + d,) + args)) - f0) / d[k] + ei[k] = 0.0 + return grad + + +class PairwiseKernel(Kernel): + """Wrapper for kernels in sklearn.metrics.pairwise. + + A thin wrapper around the functionality of the kernels in + sklearn.metrics.pairwise. + + Note: Evaluation of eval_gradient is not analytic but numeric and all + kernels support only isotropic distances. The parameter gamma is + considered to be a hyperparameter and may be optimized. The other + kernel parameters are set directly at initialization and are kept + fixed. + + Parameters + ---------- + gamma: float >= 0, default: 1.0 + Parameter gamma of the pairwise kernel specified by metric + + gamma_bounds : pair of floats >= 0, default: (1e-5, 1e5) + The lower and upper bound on gamma + + metric : string, or callable, default: "linear" + The metric to use when calculating kernel between instances in a + feature array. If metric is a string, it must be one of the metrics + in pairwise.PAIRWISE_KERNEL_FUNCTIONS. + If metric is "precomputed", X is assumed to be a kernel matrix. + Alternatively, if metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays from X as input and return a value indicating + the distance between them. + + pairwise_kernels_kwargs : dict, default: None + All entries of this dict (if any) are passed as keyword arguments to + the pairwise kernel function. 
+ """ + + def __init__(self, gamma=1.0, gamma_bounds=(1e-5, 1e5), metric="linear", + pairwise_kernels_kwargs=None): + self.gamma = gamma + self.gamma_bounds = gamma_bounds + + self.hyperparameter_gamma = \ + Hyperparameter("gamma", "numeric", gamma_bounds) + + self.metric = metric + if pairwise_kernels_kwargs is not None: + self.pairwise_kernels_kwargs = pairwise_kernels_kwargs + else: + self.pairwise_kernels_kwargs = {} + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : array, shape (n_samples_Y, n_features), (optional, default=None) + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool (optional, default=False) + Determines whether the gradient with respect to the kernel + hyperparameter is determined. Only supported when Y is None. + + Returns + ------- + K : array, shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims) + The gradient of the kernel k(X, X) with respect to the + hyperparameter of the kernel. Only returned when eval_gradient + is True. + """ + X = np.atleast_2d(X) + K = pairwise_kernels(X, Y, metric=self.metric, gamma=self.gamma, + filter_params=True, + **self.pairwise_kernels_kwargs) + if eval_gradient: + if self.hyperparameter_gamma.fixed: + return K, np.empty((X.shape[0], X.shape[0], 0)) + else: + # approximate gradient numerically + def f(gamma): # helper function + return pairwise_kernels( + X, Y, metric=self.metric, gamma=np.exp(gamma), + filter_params=True, **self.pairwise_kernels_kwargs) + return K, _approx_fprime(self.theta, f, 1e-10) + else: + return K + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array, shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Returns + ------- + K_diag : array, shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + # We have to fall back to slow way of computing diagonal + return np.apply_along_axis(self, 1, X)[:, 0] + + def is_stationary(self): + """Returns whether the kernel is stationary. 
""" + return self.metric in ["rbf"] + + def __repr__(self): + return "{0}(gamma={1}, metric={2})".format( + self.__class__.__name__, self.gamma, self.metric) diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py new file mode 100644 index 0000000000000..3cc1a2ea429e3 --- /dev/null +++ b/sklearn/gaussian_process/tests/test_gpc.py @@ -0,0 +1,163 @@ +"""Testing for Gaussian process classification """ + +# Author: Jan Hendrik Metzen +# Licence: BSD 3 clause + +import numpy as np + +from scipy.optimize import approx_fprime + +from sklearn.gaussian_process import GaussianProcessClassifier +from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C + +from sklearn.utils.testing import (assert_true, assert_greater, assert_equal, + assert_almost_equal, assert_array_equal) + + +def f(x): + return np.sin(x) +X = np.atleast_2d(np.linspace(0, 10, 30)).T +X2 = np.atleast_2d([2., 4., 5.5, 6.5, 7.5]).T +y = np.array(f(X).ravel() > 0, dtype=int) +fX = f(X).ravel() +y_mc = np.empty(y.shape, dtype=int) # multi-class +y_mc[fX < -0.35] = 0 +y_mc[(fX >= -0.35) & (fX < 0.35)] = 1 +y_mc[fX > 0.35] = 2 + + +fixed_kernel = RBF(length_scale=1.0, length_scale_bounds="fixed") +kernels = [RBF(length_scale=0.1), fixed_kernel, + RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)), + C(1.0, (1e-2, 1e2)) + * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3))] + + +def test_predict_consistent(): + """ Check binary predict decision has also predicted probability above 0.5. + """ + for kernel in kernels: + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + assert_array_equal(gpc.predict(X), + gpc.predict_proba(X)[:, 1] >= 0.5) + + +def test_lml_improving(): + """ Test that hyperparameter-tuning improves log-marginal likelihood. """ + for kernel in kernels: + if kernel == fixed_kernel: continue + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + assert_greater(gpc.log_marginal_likelihood(gpc.kernel_.theta), + gpc.log_marginal_likelihood(kernel.theta)) + + +def test_lml_precomputed(): + """ Test that lml of optimized kernel is stored correctly. """ + for kernel in kernels: + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + assert_almost_equal(gpc.log_marginal_likelihood(gpc.kernel_.theta), + gpc.log_marginal_likelihood(), 7) + + +def test_converged_to_local_maximum(): + """ Test that we are in local maximum after hyperparameter-optimization.""" + for kernel in kernels: + if kernel == fixed_kernel: continue + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + + lml, lml_gradient = \ + gpc.log_marginal_likelihood(gpc.kernel_.theta, True) + + assert_true(np.all((np.abs(lml_gradient) < 1e-4) + | (gpc.kernel_.theta == gpc.kernel_.bounds[:, 0]) + | (gpc.kernel_.theta == gpc.kernel_.bounds[:, 1]))) + + +def test_lml_gradient(): + """ Compare analytic and numeric gradient of log marginal likelihood. """ + for kernel in kernels: + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + + lml, lml_gradient = gpc.log_marginal_likelihood(kernel.theta, True) + lml_gradient_approx = \ + approx_fprime(kernel.theta, + lambda theta: gpc.log_marginal_likelihood(theta, + False), + 1e-10) + + assert_almost_equal(lml_gradient, lml_gradient_approx, 3) + + +def test_random_starts(): + """ + Test that an increasing number of random-starts of GP fitting only + increases the log marginal likelihood of the chosen theta. 
+ """ + n_samples, n_features = 25, 3 + np.random.seed(0) + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) * 2 - 1 + y = (np.sin(X).sum(axis=1) + np.sin(3 * X).sum(axis=1)) > 0 + + kernel = C(1.0, (1e-2, 1e2)) \ + * RBF(length_scale=[1e-3] * n_features, + length_scale_bounds=[(1e-4, 1e+2)] * n_features) + last_lml = -np.inf + for n_restarts_optimizer in range(9): + gp = GaussianProcessClassifier( + kernel=kernel, n_restarts_optimizer=n_restarts_optimizer, + random_state=0).fit(X, y) + lml = gp.log_marginal_likelihood(gp.kernel_.theta) + assert_greater(lml, last_lml - np.finfo(np.float32).eps) + last_lml = lml + + +def test_custom_optimizer(): + """ Test that GPC can use externally defined optimizers. """ + # Define a dummy optimizer that simply tests 1000 random hyperparameters + def optimizer(obj_func, initial_theta, bounds): + rng = np.random.RandomState(0) + theta_opt, func_min = \ + initial_theta, obj_func(initial_theta, eval_gradient=False) + for _ in range(1000): + theta = np.atleast_1d(rng.uniform(np.maximum(-2, bounds[:, 0]), + np.minimum(1, bounds[:, 1]))) + f = obj_func(theta, eval_gradient=False) + if f < func_min: + theta_opt, func_min = theta, f + return theta_opt, func_min + + for kernel in kernels: + if kernel == fixed_kernel: continue + gpc = GaussianProcessClassifier(kernel=kernel, optimizer=optimizer) + gpc.fit(X, y_mc) + # Checks that optimizer improved marginal likelihood + assert_greater(gpc.log_marginal_likelihood(gpc.kernel_.theta), + gpc.log_marginal_likelihood(kernel.theta)) + + +def test_multi_class(): + """ Test GPC for multi-class classification problems. """ + for kernel in kernels: + gpc = GaussianProcessClassifier(kernel=kernel) + gpc.fit(X, y_mc) + + y_prob = gpc.predict_proba(X2) + assert_almost_equal(y_prob.sum(1), 1) + + y_pred = gpc.predict(X2) + assert_array_equal(np.argmax(y_prob, 1), y_pred) + + +def test_multi_class_n_jobs(): + """ Test that multi-class GPC produces identical results with n_jobs>1. 
""" + for kernel in kernels: + gpc = GaussianProcessClassifier(kernel=kernel) + gpc.fit(X, y_mc) + + gpc_2 = GaussianProcessClassifier(kernel=kernel, n_jobs=2) + gpc_2.fit(X, y_mc) + + y_prob = gpc.predict_proba(X2) + y_prob_2 = gpc_2.predict_proba(X2) + assert_almost_equal(y_prob, y_prob_2) diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py new file mode 100644 index 0000000000000..218054dbbb52f --- /dev/null +++ b/sklearn/gaussian_process/tests/test_gpr.py @@ -0,0 +1,315 @@ +"""Testing for Gaussian process regression """ + +# Author: Jan Hendrik Metzen +# Licence: BSD 3 clause + +import numpy as np + +from scipy.optimize import approx_fprime + +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels \ + import RBF, ConstantKernel as C, WhiteKernel + +from sklearn.utils.testing \ + import (assert_true, assert_greater, assert_array_less, + assert_almost_equal, assert_equal) + + +def f(x): + return x * np.sin(x) +X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T +X2 = np.atleast_2d([2., 4., 5.5, 6.5, 7.5]).T +y = f(X).ravel() + +fixed_kernel = RBF(length_scale=1.0, length_scale_bounds="fixed") +kernels = [RBF(length_scale=1.0), fixed_kernel, + RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)), + C(1.0, (1e-2, 1e2)) + * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)), + C(1.0, (1e-2, 1e2)) + * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)) + + C(1e-5, (1e-5, 1e2)), + C(0.1, (1e-2, 1e2)) + * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)) + + C(1e-5, (1e-5, 1e2))] + + +def test_gpr_interpolation(): + """Test the interpolating property for different kernels.""" + for kernel in kernels: + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + y_pred, y_cov = gpr.predict(X, return_cov=True) + + assert_true(np.allclose(y_pred, y)) + assert_true(np.allclose(np.diag(y_cov), 0.)) + + +def test_lml_improving(): + """ Test that hyperparameter-tuning improves log-marginal likelihood. """ + for kernel in kernels: + if kernel == fixed_kernel: continue + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + assert_greater(gpr.log_marginal_likelihood(gpr.kernel_.theta), + gpr.log_marginal_likelihood(kernel.theta)) + + +def test_lml_precomputed(): + """ Test that lml of optimized kernel is stored correctly. 
""" + for kernel in kernels: + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + assert_equal(gpr.log_marginal_likelihood(gpr.kernel_.theta), + gpr.log_marginal_likelihood()) + + +def test_converged_to_local_maximum(): + """ Test that we are in local maximum after hyperparameter-optimization.""" + for kernel in kernels: + if kernel == fixed_kernel: continue + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + + lml, lml_gradient = \ + gpr.log_marginal_likelihood(gpr.kernel_.theta, True) + + assert_true(np.all((np.abs(lml_gradient) < 1e-4) + | (gpr.kernel_.theta == gpr.kernel_.bounds[:, 0]) + | (gpr.kernel_.theta == gpr.kernel_.bounds[:, 1]))) + + +def test_solution_inside_bounds(): + """ Test that hyperparameter-optimization remains in bounds""" + for kernel in kernels: + if kernel == fixed_kernel: continue + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + + bounds = gpr.kernel_.bounds + max_ = np.finfo(gpr.kernel_.theta.dtype).max + tiny = 1e-10 + bounds[~np.isfinite(bounds[:, 1]), 1] = max_ + + assert_array_less(bounds[:, 0], gpr.kernel_.theta + tiny) + assert_array_less(gpr.kernel_.theta, bounds[:, 1] + tiny) + + +def test_lml_gradient(): + """ Compare analytic and numeric gradient of log marginal likelihood. """ + for kernel in kernels: + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + + lml, lml_gradient = gpr.log_marginal_likelihood(kernel.theta, True) + lml_gradient_approx = \ + approx_fprime(kernel.theta, + lambda theta: gpr.log_marginal_likelihood(theta, + False), + 1e-10) + + assert_almost_equal(lml_gradient, lml_gradient_approx, 3) + + +def test_prior(): + """ Test that GP prior has mean 0 and identical variances.""" + for kernel in kernels: + gpr = GaussianProcessRegressor(kernel=kernel) + + y_mean, y_cov = gpr.predict(X, return_cov=True) + + assert_almost_equal(y_mean, 0, 5) + if len(gpr.kernel.theta) > 1: + # XXX: quite hacky, works only for current kernels + assert_almost_equal(np.diag(y_cov), np.exp(kernel.theta[0]), 5) + else: + assert_almost_equal(np.diag(y_cov), 1, 5) + + +def test_sample_statistics(): + """ Test that statistics of samples drawn from GP are correct.""" + for kernel in kernels: + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + + y_mean, y_cov = gpr.predict(X2, return_cov=True) + + samples = gpr.sample_y(X2, 1000000) + + # More digits accuracy would require many more samples + assert_almost_equal(y_mean, np.mean(samples, 1), 2) + assert_almost_equal(np.diag(y_cov) / np.diag(y_cov).max(), + np.var(samples, 1) / np.diag(y_cov).max(), 1) + + +def test_no_optimizer(): + """ Test that kernel parameters are unmodified when optimizer is None.""" + kernel = RBF(1.0) + gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None).fit(X, y) + assert_equal(np.exp(gpr.kernel_.theta), 1.0) + + +def test_predict_cov_vs_std(): + """ Test that predicted std.-dev. is consistent with cov's diagonal.""" + for kernel in kernels: + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + y_mean, y_cov = gpr.predict(X2, return_cov=True) + y_mean, y_std = gpr.predict(X2, return_std=True) + assert_almost_equal(np.sqrt(np.diag(y_cov)), y_std) + + +def test_anisotropic_kernel(): + """ Test that GPR can identify meaningful anisotropic length-scales. """ + # We learn a function which varies in one dimension ten-times slower + # than in the other. 
The corresponding length-scales should differ by at + # least a factor 5 + rng = np.random.RandomState(0) + X = rng.uniform(-1, 1, (50, 2)) + y = X[:, 0] + 0.1 * X[:, 1] + + kernel = RBF([1.0, 1.0]) + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + assert_greater(np.exp(gpr.kernel_.theta[1]), + np.exp(gpr.kernel_.theta[0]) * 5) + + +def test_random_starts(): + """ + Test that an increasing number of random-starts of GP fitting only + increases the log marginal likelihood of the chosen theta. + """ + n_samples, n_features = 25, 3 + np.random.seed(0) + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) * 2 - 1 + y = np.sin(X).sum(axis=1) + np.sin(3 * X).sum(axis=1) \ + + rng.normal(scale=0.1, size=n_samples) + + kernel = C(1.0, (1e-2, 1e2)) \ + * RBF(length_scale=[1.0] * n_features, + length_scale_bounds=[(1e-4, 1e+2)] * n_features) \ + + WhiteKernel(noise_level=1e-5, noise_level_bounds=(1e-5, 1e1)) + last_lml = -np.inf + for n_restarts_optimizer in range(9): + gp = GaussianProcessRegressor( + kernel=kernel, n_restarts_optimizer=n_restarts_optimizer, + random_state=0,).fit(X, y) + lml = gp.log_marginal_likelihood(gp.kernel_.theta) + assert_greater(lml, last_lml - np.finfo(np.float32).eps) + last_lml = lml + + +def test_y_normalization(): + """ Test normalization of the target values in GP + + Fitting non-normalizing GP on normalized y and fitting normalizing GP + on unnormalized y should yield identical results + """ + y_mean = y.mean(0) + y_norm = y - y_mean + for kernel in kernels: + # Fit non-normalizing GP on normalized y + gpr = GaussianProcessRegressor(kernel=kernel) + gpr.fit(X, y_norm) + # Fit normalizing GP on unnormalized y + gpr_norm = GaussianProcessRegressor(kernel=kernel, normalize_y=True) + gpr_norm.fit(X, y) + + # Compare predicted mean, std-devs and covariances + y_pred, y_pred_std = gpr.predict(X2, return_std=True) + y_pred = y_mean + y_pred + y_pred_norm, y_pred_std_norm = gpr_norm.predict(X2, return_std=True) + + assert_almost_equal(y_pred, y_pred_norm) + assert_almost_equal(y_pred_std, y_pred_std_norm) + + _, y_cov = gpr.predict(X2, return_cov=True) + _, y_cov_norm = gpr_norm.predict(X2, return_cov=True) + assert_almost_equal(y_cov, y_cov_norm) + + +def test_y_multioutput(): + """ Test that GPR can deal with multi-dimensional target values""" + y_2d = np.vstack((y, y*2)).T + + # Test for fixed kernel that first dimension of 2d GP equals the output + # of 1d GP and that second dimension is twice as large + kernel = RBF(length_scale=1.0) + + gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None, + normalize_y=False) + gpr.fit(X, y) + + gpr_2d = GaussianProcessRegressor(kernel=kernel, optimizer=None, + normalize_y=False) + gpr_2d.fit(X, y_2d) + + y_pred_1d, y_std_1d = gpr.predict(X2, return_std=True) + y_pred_2d, y_std_2d = gpr_2d.predict(X2, return_std=True) + _, y_cov_1d = gpr.predict(X2, return_cov=True) + _, y_cov_2d = gpr_2d.predict(X2, return_cov=True) + + assert_almost_equal(y_pred_1d, y_pred_2d[:, 0]) + assert_almost_equal(y_pred_1d, y_pred_2d[:, 1] / 2) + + # Standard deviation and covariance do not depend on output + assert_almost_equal(y_std_1d, y_std_2d) + assert_almost_equal(y_cov_1d, y_cov_2d) + + y_sample_1d = gpr.sample_y(X2, n_samples=10) + y_sample_2d = gpr_2d.sample_y(X2, n_samples=10) + assert_almost_equal(y_sample_1d, y_sample_2d[:, 0]) + + # Test hyperparameter optimization + for kernel in kernels: + gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True) + gpr.fit(X, y) + + gpr_2d = 
GaussianProcessRegressor(kernel=kernel, normalize_y=True) + gpr_2d.fit(X, np.vstack((y, y)).T) + + assert_almost_equal(gpr.kernel_.theta, gpr_2d.kernel_.theta, 4) + + +def test_custom_optimizer(): + """ Test that GPR can use externally defined optimizers. """ + # Define a dummy optimizer that simply tests 1000 random hyperparameters + def optimizer(obj_func, initial_theta, bounds): + rng = np.random.RandomState(0) + theta_opt, func_min = \ + initial_theta, obj_func(initial_theta, eval_gradient=False) + for _ in range(1000): + theta = np.atleast_1d(rng.uniform(np.maximum(-2, bounds[:, 0]), + np.minimum(1, bounds[:, 1]))) + f = obj_func(theta, eval_gradient=False) + if f < func_min: + theta_opt, func_min = theta, f + return theta_opt, func_min + + for kernel in kernels: + if kernel == fixed_kernel: continue + gpr = GaussianProcessRegressor(kernel=kernel, optimizer=optimizer) + gpr.fit(X, y) + # Checks that optimizer improved marginal likelihood + assert_greater(gpr.log_marginal_likelihood(gpr.kernel_.theta), + gpr.log_marginal_likelihood(gpr.kernel.theta)) + + +def test_duplicate_input(): + """ Test GPR can handle two different output-values for the same input. """ + for kernel in kernels: + gpr_equal_inputs = \ + GaussianProcessRegressor(kernel=kernel, alpha=1e-2) + gpr_similar_inputs = \ + GaussianProcessRegressor(kernel=kernel, alpha=1e-2) + + X_ = np.vstack((X, X[0])) + y_ = np.hstack((y, y[0] + 1)) + gpr_equal_inputs.fit(X_, y_) + + X_ = np.vstack((X, X[0] + 1e-15)) + y_ = np.hstack((y, y[0] + 1)) + gpr_similar_inputs.fit(X_, y_) + + X_test = np.linspace(0, 10, 100)[:, None] + y_pred_equal, y_std_equal = \ + gpr_equal_inputs.predict(X_test, return_std=True) + y_pred_similar, y_std_similar = \ + gpr_similar_inputs.predict(X_test, return_std=True) + + assert_almost_equal(y_pred_equal, y_pred_similar) + assert_almost_equal(y_std_equal, y_std_similar) diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py new file mode 100644 index 0000000000000..260fba97fedec --- /dev/null +++ b/sklearn/gaussian_process/tests/test_kernels.py @@ -0,0 +1,290 @@ +"""Testing for kernels for Gaussian processes.""" + +# Author: Jan Hendrik Metzen +# Licence: BSD 3 clause + +from collections import Hashable +import inspect + +import numpy as np + +from scipy.optimize import approx_fprime + +from sklearn.metrics.pairwise \ + import PAIRWISE_KERNEL_FUNCTIONS, euclidean_distances, pairwise_kernels +from sklearn.gaussian_process.kernels \ + import (RBF, Matern, RationalQuadratic, ExpSineSquared, DotProduct, + ConstantKernel, WhiteKernel, PairwiseKernel, KernelOperator, + Exponentiation) +from sklearn.base import clone + +from sklearn.utils.testing import (assert_equal, assert_almost_equal, + assert_not_equal, assert_array_equal, + assert_array_almost_equal) + + +X = np.random.RandomState(0).normal(0, 1, (10, 2)) +Y = np.random.RandomState(0).normal(0, 1, (11, 2)) + +kernel_white = RBF(length_scale=2.0) + WhiteKernel(noise_level=3.0) +kernels = [RBF(length_scale=2.0), RBF(length_scale_bounds=(0.5, 2.0)), + ConstantKernel(constant_value=10.0), + 2.0 * RBF(length_scale=0.33, length_scale_bounds="fixed"), + 2.0 * RBF(length_scale=0.5), kernel_white, + 2.0 * RBF(length_scale=[0.5, 2.0]), + 2.0 * Matern(length_scale=0.33, length_scale_bounds="fixed"), + 2.0 * Matern(length_scale=0.5, nu=0.5), + 2.0 * Matern(length_scale=1.5, nu=1.5), + 2.0 * Matern(length_scale=2.5, nu=2.5), + 2.0 * Matern(length_scale=[0.5, 2.0], nu=0.5), + 3.0 * Matern(length_scale=[2.0, 0.5], 
nu=1.5), + 4.0 * Matern(length_scale=[0.5, 0.5], nu=2.5), + RationalQuadratic(length_scale=0.5, alpha=1.5), + ExpSineSquared(length_scale=0.5, periodicity=1.5), + DotProduct(sigma_0=2.0), DotProduct(sigma_0=2.0) ** 2] +for metric in PAIRWISE_KERNEL_FUNCTIONS: + if metric in ["additive_chi2", "chi2"]: + continue + kernels.append(PairwiseKernel(gamma=1.0, metric=metric)) + + +def test_kernel_gradient(): + """ Compare analytic and numeric gradient of kernels. """ + for kernel in kernels: + K, K_gradient = kernel(X, eval_gradient=True) + + assert_equal(K_gradient.shape[0], X.shape[0]) + assert_equal(K_gradient.shape[1], X.shape[0]) + assert_equal(K_gradient.shape[2], kernel.theta.shape[0]) + + K_gradient_approx = np.empty_like(K_gradient) + for i in range(K.shape[0]): + for j in range(K.shape[1]): + def eval_kernel_ij_for_theta(theta): + kernel_clone = kernel.clone_with_theta(theta) + K = kernel_clone(X, eval_gradient=False) + return K[i, j] + K_gradient_approx[i, j] = \ + approx_fprime(kernel.theta, eval_kernel_ij_for_theta, + 1e-10) + + assert_almost_equal(K_gradient, K_gradient_approx, 4) + + +def test_kernel_theta(): + """ Check that parameter vector theta of kernel is set correctly. """ + for kernel in kernels: + if isinstance(kernel, KernelOperator) \ + or isinstance(kernel, Exponentiation): # skip non-basic kernels + continue + theta = kernel.theta + _, K_gradient = kernel(X, eval_gradient=True) + + # Determine kernel parameters that contribute to theta + args, varargs, kw, default = \ + inspect.getargspec(kernel.__class__.__init__) + theta_vars = map(lambda s: s.rstrip("_bounds"), + filter(lambda s: s.endswith("_bounds"), args)) + assert_equal( + set(hyperparameter.name + for hyperparameter in kernel.hyperparameters), + set(theta_vars)) + + # Check that values returned in theta are consistent with + # hyperparameter values (being their logarithms) + for i, hyperparameter in enumerate(kernel.hyperparameters): + assert_equal(theta[i], + np.log(getattr(kernel, hyperparameter.name))) + + # Fixed kernel parameters must be excluded from theta and gradient. + for i, hyperparameter in enumerate(kernel.hyperparameters): + # create copy with certain hyperparameter fixed + params = kernel.get_params() + params[hyperparameter.name + "_bounds"] = "fixed" + kernel_class = kernel.__class__ + new_kernel = kernel_class(**params) + # Check that theta and K_gradient are identical with the fixed + # dimension left out + _, K_gradient_new = new_kernel(X, eval_gradient=True) + assert_equal(theta.shape[0], new_kernel.theta.shape[0] + 1) + assert_equal(K_gradient.shape[2], K_gradient_new.shape[2] + 1) + if i > 0: + assert_equal(theta[:i], new_kernel.theta[:i]) + assert_array_equal(K_gradient[..., :i], + K_gradient_new[..., :i]) + if i + 1 < len(kernel.hyperparameters): + assert_equal(theta[i+1:], new_kernel.theta[i:]) + assert_array_equal(K_gradient[..., i+1:], + K_gradient_new[..., i:]) + + # Check that values of theta are modified correctly + for i, hyperparameter in enumerate(kernel.hyperparameters): + theta[i] = np.log(42) + kernel.theta = theta + assert_almost_equal(getattr(kernel, hyperparameter.name), 42) + + setattr(kernel, hyperparameter.name, 43) + assert_almost_equal(kernel.theta[i], np.log(43)) + + +def test_auto_vs_cross(): + """ Auto-correlation and cross-correlation should be consistent. 
""" + for kernel in kernels: + if kernel == kernel_white: + continue # Identity is not satisfied on diagonal + K_auto = kernel(X) + K_cross = kernel(X, X) + assert_almost_equal(K_auto, K_cross, 5) + + +def test_kernel_diag(): + """ Test that diag method of kernel returns consistent results. """ + for kernel in kernels: + K_call_diag = np.diag(kernel(X)) + K_diag = kernel.diag(X) + assert_almost_equal(K_call_diag, K_diag, 5) + + +def test_kernel_operator_commutative(): + """ Adding kernels and multiplying kernels should be commutative. """ + # Check addition + assert_almost_equal((RBF(2.0) + 1.0)(X), + (1.0 + RBF(2.0))(X)) + + # Check multiplication + assert_almost_equal((3.0 * RBF(2.0))(X), + (RBF(2.0) * 3.0)(X)) + + +def test_kernel_anisotropic(): + """ Anisotropic kernel should be consistent with isotropic kernels.""" + kernel = 3.0 * RBF([0.5, 2.0]) + + K = kernel(X) + X1 = np.array(X) + X1[:, 0] *= 4 + K1 = 3.0 * RBF(2.0)(X1) + assert_almost_equal(K, K1) + + X2 = np.array(X) + X2[:, 1] /= 4 + K2 = 3.0 * RBF(0.5)(X2) + assert_almost_equal(K, K2) + + # Check getting and setting via theta + kernel.theta = kernel.theta + np.log(2) + assert_array_equal(kernel.theta, np.log([6.0, 1.0, 4.0])) + assert_array_equal(kernel.k2.length_scale, [1.0, 4.0]) + + +def test_kernel_stationary(): + """ Test stationarity of kernels.""" + for kernel in kernels: + if not kernel.is_stationary(): + continue + K = kernel(X, X + 1) + assert_almost_equal(K[0, 0], np.diag(K)) + + +def test_kernel_clone(): + """ Test that sklearn's clone works correctly on kernels. """ + for kernel in kernels: + kernel_cloned = clone(kernel) + + assert_equal(kernel, kernel_cloned) + assert_not_equal(id(kernel), id(kernel_cloned)) + for attr in kernel.__dict__.keys(): + attr_value = getattr(kernel, attr) + attr_value_cloned = getattr(kernel_cloned, attr) + if attr.startswith("hyperparameter_"): + assert_equal(attr_value.name, attr_value_cloned.name) + assert_equal(attr_value.value_type, + attr_value_cloned.value_type) + assert_array_equal(attr_value.bounds, + attr_value_cloned.bounds) + assert_equal(attr_value.n_elements, + attr_value_cloned.n_elements) + elif np.iterable(attr_value): + for i in range(len(attr_value)): + if np.iterable(attr_value[i]): + assert_array_equal(attr_value[i], + attr_value_cloned[i]) + else: + assert_equal(attr_value[i], attr_value_cloned[i]) + else: + assert_equal(attr_value, attr_value_cloned) + if not isinstance(attr_value, Hashable): + # modifiable attributes must not be identical + assert_not_equal(id(attr_value), id(attr_value_cloned)) + + +def test_matern_kernel(): + """ Test consistency of Matern kernel for special values of nu. 
""" + K = Matern(nu=1.5, length_scale=1.0)(X) + # the diagonal elements of a matern kernel are 1 + assert_array_almost_equal(np.diag(K), np.ones(X.shape[0])) + # matern kernel for coef0==0.5 is equal to absolute exponential kernel + K_absexp = np.exp(-euclidean_distances(X, X, squared=False)) + K = Matern(nu=0.5, length_scale=1.0)(X) + assert_array_almost_equal(K, K_absexp) + # test that special cases of matern kernel (coef0 in [0.5, 1.5, 2.5]) + # result in nearly identical results as the general case for coef0 in + # [0.5 + tiny, 1.5 + tiny, 2.5 + tiny] + tiny = 1e-10 + for nu in [0.5, 1.5, 2.5]: + K1 = Matern(nu=nu, length_scale=1.0)(X) + K2 = Matern(nu=nu + tiny, length_scale=1.0)(X) + assert_array_almost_equal(K1, K2) + + +def test_kernel_versus_pairwise(): + """Check that GP kernels can also be used as pairwise kernels.""" + for kernel in kernels: + # Test auto-kernel + if kernel != kernel_white: + # For WhiteKernel: k(X) != k(X,X). This is assumed by + # pairwise_kernels + K1 = kernel(X) + K2 = pairwise_kernels(X, metric=kernel) + assert_array_almost_equal(K1, K2) + + # Test cross-kernel + K1 = kernel(X, Y) + K2 = pairwise_kernels(X, Y, metric=kernel) + assert_array_almost_equal(K1, K2) + + +def test_set_get_params(): + """Check that set_params()/get_params() is consistent with kernel.theta.""" + for kernel in kernels: + # Test get_params() + index = 0 + params = kernel.get_params() + for hyperparameter in kernel.hyperparameters: + if hyperparameter.bounds is "fixed": + continue + size = hyperparameter.n_elements + if size > 1: # anisotropic kernels + assert_almost_equal(np.exp(kernel.theta[index:index+size]), + params[hyperparameter.name]) + index += size + else: + assert_almost_equal(np.exp(kernel.theta[index]), + params[hyperparameter.name]) + index += 1 + # Test set_params() + index = 0 + value = 10 # arbitrary value + for hyperparameter in kernel.hyperparameters: + if hyperparameter.bounds is "fixed": + continue + size = hyperparameter.n_elements + if size > 1: # anisotropic kernels + kernel.set_params(**{hyperparameter.name: [value]*size}) + assert_almost_equal(np.exp(kernel.theta[index:index+size]), + [value]*size) + index += size + else: + kernel.set_params(**{hyperparameter.name: value}) + assert_almost_equal(np.exp(kernel.theta[index]), value) + index += 1 diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index a5120f2cd0cec..e3290dfe343c5 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1270,8 +1270,13 @@ def pairwise_kernels(X, Y=None, metric="linear", filter_params=False, If metric is 'precomputed', Y is ignored and X is returned. 
""" + # import GPKernel locally to prevent circular imports + from ..gaussian_process.kernels import Kernel as GPKernel + if metric == "precomputed": return X + elif isinstance(metric, GPKernel): + func = metric.__call__ elif metric in PAIRWISE_KERNEL_FUNCTIONS: if filter_params: kwds = dict((k, kwds[k]) for k in kwds diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 3255d9c58790a..6dd7ce2ea0dc5 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -50,6 +50,7 @@ CROSS_DECOMPOSITION = ['PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'] MULTI_OUTPUT = ['CCA', 'DecisionTreeRegressor', 'ElasticNet', 'ExtraTreeRegressor', 'ExtraTreesRegressor', 'GaussianProcess', + 'GaussianProcessRegressor', 'KNeighborsRegressor', 'KernelRidge', 'Lars', 'Lasso', 'LassoLars', 'LinearRegression', 'MultiTaskElasticNet', 'MultiTaskElasticNetCV', 'MultiTaskLasso', 'MultiTaskLassoCV', @@ -127,8 +128,9 @@ def _yield_regressor_checks(name, Regressor): if name != 'CCA': # check that the regressor handles int input yield check_regressors_int - # Test if NotFittedError is raised - yield check_estimators_unfitted + if name != "GaussianProcessRegressor": + # Test if NotFittedError is raised + yield check_estimators_unfitted def _yield_transformer_checks(name, Transformer):