diff --git a/sklearn/__init__.py b/sklearn/__init__.py index dbb7862d8839e..1fa157fb68d4d 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -71,9 +71,10 @@ 'isotonic', 'kernel_approximation', 'kernel_ridge', 'learning_curve', 'linear_model', 'manifold', 'metrics', 'mixture', 'model_selection', 'multiclass', 'multioutput', - 'naive_bayes', 'neighbors', 'neural_network', 'pipeline', - 'preprocessing', 'random_projection', 'semi_supervised', - 'svm', 'tree', 'discriminant_analysis', 'impute', 'compose', + 'naive_bayes', 'neighbors', 'neural_network', + 'partial_dependence', 'pipeline', 'preprocessing', + 'random_projection', 'semi_supervised', 'svm', 'tree', + 'discriminant_analysis', 'impute', 'compose', # Non-modules: 'clone', 'get_config', 'set_config', 'config_context'] diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index e8bfc2110bb90..63d397d86c8a0 100644 --- a/sklearn/ensemble/partial_dependence.py +++ b/sklearn/ensemble/partial_dependence.py @@ -3,70 +3,9 @@ # Authors: Peter Prettenhofer # License: BSD 3 clause -from itertools import count -import numbers - -import numpy as np -from scipy.stats.mstats import mquantiles - -from ..utils.extmath import cartesian -from ..externals.joblib import Parallel, delayed -from ..externals import six -from ..externals.six.moves import map, range, zip -from ..utils import check_array -from ..utils.validation import check_is_fitted -from ..tree._tree import DTYPE - -from ._gradient_boosting import _partial_dependence_tree -from .gradient_boosting import BaseGradientBoosting - - -def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): - """Generate a grid of points based on the ``percentiles of ``X``. - - The grid is generated by placing ``grid_resolution`` equally - spaced points between the ``percentiles`` of each column - of ``X``. - - Parameters - ---------- - X : ndarray - The data - percentiles : tuple of floats - The percentiles which are used to construct the extreme - values of the grid axes. - grid_resolution : int - The number of equally spaced points that are placed - on the grid. - - Returns - ------- - grid : ndarray - All data points on the grid; ``grid.shape[1] == X.shape[1]`` - and ``grid.shape[0] == grid_resolution * X.shape[1]``. - axes : seq of ndarray - The axes with which the grid has been created. - """ - if len(percentiles) != 2: - raise ValueError('percentile must be tuple of len 2') - if not all(0. <= x <= 1. 
for x in percentiles): - raise ValueError('percentile values must be in [0, 1]') - - axes = [] - emp_percentiles = mquantiles(X, prob=percentiles, axis=0) - for col in range(X.shape[1]): - uniques = np.unique(X[:, col]) - if uniques.shape[0] < grid_resolution: - # feature has low resolution use unique vals - axis = uniques - else: - # create axis based on percentiles and grid resolution - axis = np.linspace(emp_percentiles[0, col], - emp_percentiles[1, col], - num=grid_resolution, endpoint=True) - axes.append(axis) - - return cartesian(axes), axes +import warnings +from ..partial_dependence import partial_dependence as new_pd +from ..partial_dependence import plot_partial_dependence as new_ppd def partial_dependence(gbrt, target_variables, grid=None, X=None, @@ -120,47 +59,17 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) """ - if not isinstance(gbrt, BaseGradientBoosting): - raise ValueError('gbrt has to be an instance of BaseGradientBoosting') - check_is_fitted(gbrt, 'estimators_') - if (grid is None and X is None) or (grid is not None and X is not None): - raise ValueError('Either grid or X must be specified') - - target_variables = np.asarray(target_variables, dtype=np.int32, - order='C').ravel() - - if any([not (0 <= fx < gbrt.n_features_) for fx in target_variables]): - raise ValueError('target_variables must be in [0, %d]' - % (gbrt.n_features_ - 1)) - - if X is not None: - X = check_array(X, dtype=DTYPE, order='C') - grid, axes = _grid_from_X(X[:, target_variables], percentiles, - grid_resolution) - else: - assert grid is not None - # dont return axes if grid is given - axes = None - # grid must be 2d - if grid.ndim == 1: - grid = grid[:, np.newaxis] - if grid.ndim != 2: - raise ValueError('grid must be 2d but is %dd' % grid.ndim) - - grid = np.asarray(grid, dtype=DTYPE, order='C') - assert grid.shape[1] == target_variables.shape[0] - - n_trees_per_stage = gbrt.estimators_.shape[1] - n_estimators = gbrt.estimators_.shape[0] - pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64, - order='C') - for stage in range(n_estimators): - for k in range(n_trees_per_stage): - tree = gbrt.estimators_[stage, k].tree_ - _partial_dependence_tree(tree, grid, target_variables, - gbrt.learning_rate, pdp[k]) - - return pdp, axes + warnings.warn("The function ensemble.partial_dependence has been moved to " + "partial_dependence in 0.20 and will be removed in 0.22.", + DeprecationWarning) + return new_pd(est=gbrt, + target_variables=target_variables, + grid=grid, + X=X, + output=None, + percentiles=percentiles, + grid_resolution=grid_resolution, + method='recursion') def plot_partial_dependence(gbrt, X, features, feature_names=None, @@ -237,159 +146,22 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP ... 
""" - import matplotlib.pyplot as plt - from matplotlib import transforms - from matplotlib.ticker import MaxNLocator - from matplotlib.ticker import ScalarFormatter - - if not isinstance(gbrt, BaseGradientBoosting): - raise ValueError('gbrt has to be an instance of BaseGradientBoosting') - check_is_fitted(gbrt, 'estimators_') - - # set label_idx for multi-class GBRT - if hasattr(gbrt, 'classes_') and np.size(gbrt.classes_) > 2: - if label is None: - raise ValueError('label is not given for multi-class PDP') - label_idx = np.searchsorted(gbrt.classes_, label) - if gbrt.classes_[label_idx] != label: - raise ValueError('label %s not in ``gbrt.classes_``' % str(label)) - else: - # regression and binary classification - label_idx = 0 - - X = check_array(X, dtype=DTYPE, order='C') - if gbrt.n_features_ != X.shape[1]: - raise ValueError('X.shape[1] does not match gbrt.n_features_') - - if line_kw is None: - line_kw = {'color': 'green'} - if contour_kw is None: - contour_kw = {} - - # convert feature_names to list - if feature_names is None: - # if not feature_names use fx indices as name - feature_names = [str(i) for i in range(gbrt.n_features_)] - elif isinstance(feature_names, np.ndarray): - feature_names = feature_names.tolist() - - def convert_feature(fx): - if isinstance(fx, six.string_types): - try: - fx = feature_names.index(fx) - except ValueError: - raise ValueError('Feature %s not in feature_names' % fx) - return fx - - # convert features into a seq of int tuples - tmp_features = [] - for fxs in features: - if isinstance(fxs, (numbers.Integral,) + six.string_types): - fxs = (fxs,) - try: - fxs = np.array([convert_feature(fx) for fx in fxs], dtype=np.int32) - except TypeError: - raise ValueError('features must be either int, str, or tuple ' - 'of int/str') - if not (1 <= np.size(fxs) <= 2): - raise ValueError('target features must be either one or two') - - tmp_features.append(fxs) - - features = tmp_features - - names = [] - try: - for fxs in features: - l = [] - # explicit loop so "i" is bound for exception below - for i in fxs: - l.append(feature_names[i]) - names.append(l) - except IndexError: - raise ValueError('All entries of features must be less than ' - 'len(feature_names) = {0}, got {1}.' 
- .format(len(feature_names), i)) - - # compute PD functions - pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(partial_dependence)(gbrt, fxs, X=X, - grid_resolution=grid_resolution, - percentiles=percentiles) - for fxs in features) - - # get global min and max values of PD grouped by plot type - pdp_lim = {} - for pdp, axes in pd_result: - min_pd, max_pd = pdp[label_idx].min(), pdp[label_idx].max() - n_fx = len(axes) - old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) - min_pd = min(min_pd, old_min_pd) - max_pd = max(max_pd, old_max_pd) - pdp_lim[n_fx] = (min_pd, max_pd) - - # create contour levels for two-way plots - if 2 in pdp_lim: - Z_level = np.linspace(*pdp_lim[2], num=8) - - if ax is None: - fig = plt.figure(**fig_kw) - else: - fig = ax.get_figure() - fig.clear() - - n_cols = min(n_cols, len(features)) - n_rows = int(np.ceil(len(features) / float(n_cols))) - axs = [] - for i, fx, name, (pdp, axes) in zip(count(), features, names, - pd_result): - ax = fig.add_subplot(n_rows, n_cols, i + 1) - - if len(axes) == 1: - ax.plot(axes[0], pdp[label_idx].ravel(), **line_kw) - else: - # make contour plot - assert len(axes) == 2 - XX, YY = np.meshgrid(axes[0], axes[1]) - Z = pdp[label_idx].reshape(list(map(np.size, axes))).T - CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, - colors='k') - ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], - vmin=Z_level[0], alpha=0.75, **contour_kw) - ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True) - - # plot data deciles + axes labels - deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1)) - trans = transforms.blended_transform_factory(ax.transData, - ax.transAxes) - ylim = ax.get_ylim() - ax.vlines(deciles, [0], 0.05, transform=trans, color='k') - ax.set_xlabel(name[0]) - ax.set_ylim(ylim) - - # prevent x-axis ticks from overlapping - ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower')) - tick_formatter = ScalarFormatter() - tick_formatter.set_powerlimits((-3, 4)) - ax.xaxis.set_major_formatter(tick_formatter) - - if len(axes) > 1: - # two-way PDP - y-axis deciles + labels - deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1)) - trans = transforms.blended_transform_factory(ax.transAxes, - ax.transData) - xlim = ax.get_xlim() - ax.hlines(deciles, [0], 0.05, transform=trans, color='k') - ax.set_ylabel(name[1]) - # hline erases xlim - ax.set_xlim(xlim) - else: - ax.set_ylabel('Partial dependence') - - if len(axes) == 1: - ax.set_ylim(pdp_lim[1]) - axs.append(ax) - - fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4, - hspace=0.3) - return fig, axs + warnings.warn("The function ensemble.plot_partial_dependence has been " + "moved to partial_dependence in 0.20 and will be removed " + "in 0.22.", + DeprecationWarning) + return new_ppd(est=gbrt, + X=X, + features=features, + feature_names=feature_names, + label=label, + n_cols=n_cols, + grid_resolution=grid_resolution, + method='recursion', + percentiles=percentiles, + n_jobs=n_jobs, + verbose=verbose, + ax=ax, + line_kw=line_kw, + contour_kw=contour_kw, + **fig_kw) diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py index cec7efc46f03b..3a45ade617f9e 100644 --- a/sklearn/ensemble/tests/test_partial_dependence.py +++ b/sklearn/ensemble/tests/test_partial_dependence.py @@ -12,6 +12,8 @@ from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor from sklearn import datasets +from 
sklearn.utils.testing import ignore_warnings +from sklearn.utils.testing import assert_warns_message # toy sample @@ -27,6 +29,7 @@ iris = datasets.load_iris() +@ignore_warnings(category=DeprecationWarning) def test_partial_dependence_classifier(): # Test partial dependence for classifier clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -47,6 +50,7 @@ def test_partial_dependence_classifier(): assert_array_equal(pdp, pdp_2) +@ignore_warnings(category=DeprecationWarning) def test_partial_dependence_multiclass(): # Test partial dependence for multi-class classifier clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -62,6 +66,7 @@ def test_partial_dependence_multiclass(): assert axes[0].shape[0] == grid_resolution +@ignore_warnings(category=DeprecationWarning) def test_partial_dependence_regressor(): # Test partial dependence for regressor clf = GradientBoostingRegressor(n_estimators=10, random_state=1) @@ -75,6 +80,7 @@ def test_partial_dependence_regressor(): assert axes[0].shape[0] == grid_resolution +@ignore_warnings(category=DeprecationWarning) def test_partial_dependecy_input(): # Test input validation of partial dependence. clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -103,6 +109,7 @@ def test_partial_dependecy_input(): assert_raises(ValueError, partial_dependence, clf, [0], grid=grid) +@ignore_warnings(category=DeprecationWarning) @if_matplotlib def test_plot_partial_dependence(): # Test partial dependence plot function. @@ -136,6 +143,7 @@ def test_plot_partial_dependence(): @if_matplotlib +@ignore_warnings(category=DeprecationWarning) def test_plot_partial_dependence_input(): # Test partial dependence plot function input checks. clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -171,6 +179,7 @@ def test_plot_partial_dependence_input(): @if_matplotlib +@ignore_warnings(category=DeprecationWarning) def test_plot_partial_dependence_multiclass(): # Test partial dependence plot function on multi-class input. 
     clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
@@ -204,3 +213,31 @@
     assert_raises(ValueError, plot_partial_dependence,
                   clf, iris.data, [0, 1],
                   grid_resolution=grid_resolution)
+
+
+def test_warning_raised_for_partial_dependence():
+    # Test that running the old partial_dependence function warns
+    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
+    clf.fit(boston.data, boston.target)
+    grid_resolution = 25
+
+    assert_warns_message(DeprecationWarning, "The function "
+                         "ensemble.partial_dependence has been moved to "
+                         "partial_dependence in 0.20 and will be removed in "
+                         "0.22.", partial_dependence, clf, [0], X=boston.data,
+                         grid_resolution=grid_resolution)
+
+
+@if_matplotlib
+def test_warning_raised_for_plot_partial_dependence():
+    # Test that running the old plot_partial_dependence function warns
+    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
+    clf.fit(boston.data, boston.target)
+    grid_resolution = 25
+
+    assert_warns_message(DeprecationWarning, "The function "
+                         "ensemble.plot_partial_dependence has been moved to "
+                         "partial_dependence in 0.20 and will be removed in "
+                         "0.22.", plot_partial_dependence, clf, boston.data,
+                         [0, 1, (0, 1)], grid_resolution=grid_resolution,
+                         feature_names=boston.feature_names)
diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py
new file mode 100644
index 0000000000000..436bc6f2b1134
--- /dev/null
+++ b/sklearn/partial_dependence.py
@@ -0,0 +1,614 @@
+"""Partial dependence plots for regression and classification models."""
+
+# Authors: Peter Prettenhofer
+#          Trevor Stephens
+# License: BSD 3 clause
+
+from itertools import count
+import numbers
+
+import numpy as np
+from scipy.stats.mstats import mquantiles
+
+from .utils.extmath import cartesian
+from .externals.joblib import Parallel, delayed
+from .externals import six
+from .externals.six.moves import map, range, zip
+from .utils import check_array
+from .utils.validation import check_is_fitted
+from .tree._tree import DTYPE
+
+from .exceptions import NotFittedError
+
+
+__all__ = ['partial_dependence', 'plot_partial_dependence']
+
+
+def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100):
+    """Generate a grid of points based on the ``percentiles`` of ``X``.
+
+    The grid is generated by placing ``grid_resolution`` equally
+    spaced points between the ``percentiles`` of each column
+    of ``X``.
+
+    Parameters
+    ----------
+    X : ndarray
+        The data.
+    percentiles : tuple of floats
+        The percentiles which are used to construct the extreme
+        values of the grid axes.
+    grid_resolution : int
+        The number of equally spaced points that are placed
+        on the grid.
+
+    Returns
+    -------
+    grid : ndarray
+        All data points on the grid; ``grid.shape[1] == X.shape[1]``
+        and ``grid.shape[0] == grid_resolution ** X.shape[1]``.
+    axes : seq of ndarray
+        The axes with which the grid has been created.
+    """
+    if len(percentiles) != 2:
+        raise ValueError('percentiles must be a tuple of len 2')
+    if not all(0. <= x <= 1. for x in percentiles):
+        raise ValueError('percentile values must be in [0, 1]')
+
+    axes = []
+    emp_percentiles = mquantiles(X, prob=percentiles, axis=0)
+    for col in range(X.shape[1]):
+        uniques = np.unique(X[:, col])
+        if uniques.shape[0] < grid_resolution:
+            # feature has low resolution: use its unique values
+            axis = uniques
+        else:
+            # create axis based on percentiles and grid resolution
+            axis = np.linspace(emp_percentiles[0, col],
+                               emp_percentiles[1, col],
+                               num=grid_resolution, endpoint=True)
+        axes.append(axis)
+
+    return cartesian(axes), axes
+
+
+def _predict(est, X_eval, output=None):
+    """Evaluate ``est`` on the grid points in ``X_eval``.
+
+    The values are obtained by calling the ``predict_proba`` method of
+    ``est`` for classification or the ``predict`` method for regression
+    on ``X_eval``.
+
+    Parameters
+    ----------
+    est : BaseEstimator
+        A fitted classification or regression model.
+    X_eval : array-like, shape=(n_samples, n_features)
+        The data on which the partial dependence of ``est`` should be
+        predicted.
+    output : int, optional (default=None)
+        The output index to use for multi-output estimators.
+
+    Returns
+    -------
+    out : array, shape=(n_samples,) or (n_samples, n_classes)
+        The predictions of ``est`` on ``X_eval``; for classification,
+        the log-transformed and centered class probabilities.
+    """
+    if est._estimator_type == 'regressor':
+        try:
+            out = est.predict(X_eval)
+        except NotFittedError:
+            raise ValueError('Call %s.fit before partial_dependence' %
+                             est.__class__.__name__)
+        if out.ndim != 1 and out.shape[1] == 1:
+            # Column output
+            out = out.ravel()
+        if out.ndim != 1 and out.shape[1] != 1:
+            # Multi-output
+            if output is None or not 0 <= output < out.shape[1]:
+                raise ValueError('Valid output must be specified for '
+                                 'multi-output models.')
+            out = out[:, output]
+    elif est._estimator_type == 'classifier':
+        try:
+            out = est.predict_proba(X_eval)
+        except NotFittedError:
+            raise ValueError('Call %s.fit before partial_dependence' %
+                             est.__class__.__name__)
+        if isinstance(out, list):
+            # Multi-output
+            if output is None or not 0 <= output < len(out):
+                raise ValueError('Valid output must be specified for '
+                                 'multi-output models.')
+            out = out[output]
+        out = np.log(np.clip(out, 1e-16, 1))
+        out = np.subtract(out, np.mean(out, 1)[:, np.newaxis])
+    else:
+        raise ValueError('est must be a fitted regressor or classifier '
+                         'model.')
+    return out
+
+
+def partial_dependence(est, target_variables, grid=None, X=None, output=None,
+                       percentiles=(0.05, 0.95), grid_resolution=100,
+                       method='auto'):
+    """Partial dependence of ``target_variables``.
+
+    Partial dependence plots show the dependence between the joint values
+    of the ``target_variables`` and the function represented
+    by ``est``.
+
+    Read more in the :ref:`User Guide `.
+
+    Parameters
+    ----------
+    est : BaseEstimator
+        A fitted classification or regression model.
+    target_variables : array-like, dtype=int
+        The target features for which the partial dependence should be
+        computed (size should be smaller than 3 for visual renderings).
+    grid : array-like, shape=(n_points, len(target_variables))
+        The grid of ``target_variables`` values for which the
+        partial dependence should be evaluated (either ``grid`` or ``X``
+        must be specified).
+    X : array-like, shape=(n_samples, n_features)
+        The data on which ``est`` was trained. It is used to generate
+        a ``grid`` for the ``target_variables``. The ``grid`` comprises
+        ``grid_resolution`` equally spaced points between the two
+        ``percentiles``.
+    output : int, optional (default=None)
+        The output index to use for multi-output estimators.
+    percentiles : (low, high), default=(0.05, 0.95)
+        The lower and upper percentile used to create the extreme values
+        for the ``grid``. Only used if ``X`` is not None.
+    grid_resolution : int, default=100
+        The number of equally spaced points on the ``grid``.
+    method : {'recursion', 'exact', 'estimated', 'auto'}, default='auto'
+        The method to use to calculate the partial dependence function:
+
+        - If 'recursion', the underlying trees of ``est`` will be recursed to
+          calculate the function. Only supported for BaseGradientBoosting and
+          ForestRegressor.
+        - If 'exact', the function will be calculated by calling the
+          ``predict_proba`` method of ``est`` for classification or
+          ``predict`` for regression on ``X`` for every point in the grid. To
+          speed up this method, you can use a subset of ``X`` or a coarser
+          grid.
+        - If 'estimated', the function will be calculated by calling the
+          ``predict_proba`` method of ``est`` for classification or
+          ``predict`` for regression on the median of ``X``.
+        - If 'auto', then 'recursion' will be used if ``est`` is
+          BaseGradientBoosting or ForestRegressor, and 'exact' used for other
+          estimators.
+
+    Returns
+    -------
+    pdp : array, shape=(n_classes, n_points)
+        The partial dependence function evaluated on the ``grid``.
+        For regression and binary classification ``n_classes==1``.
+    axes : seq of ndarray or None
+        The axes with which the grid has been created or None if
+        the grid has been given.
+
+    Examples
+    --------
+    >>> samples = [[0, 0, 2], [1, 0, 0]]
+    >>> labels = [0, 1]
+    >>> from sklearn.ensemble import GradientBoostingClassifier
+    >>> gb = GradientBoostingClassifier(random_state=0).fit(samples, labels)
+    >>> kwargs = dict(X=samples, percentiles=(0, 1), grid_resolution=2)
+    >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP
+    (array([[-4.52..., 4.52...]]), [array([ 0., 1.])])
+    """
+    # TODO: The pattern below is required to avoid a namespace collision.
+    # TODO: Move below imports to module level import at 0.22 release.
+    from .ensemble._gradient_boosting import _partial_dependence_tree
+    from .ensemble.gradient_boosting import BaseGradientBoosting
+    from .ensemble.forest import ForestRegressor
+
+    if method == 'auto':
+        if isinstance(est, (BaseGradientBoosting, ForestRegressor)):
+            method = 'recursion'
+        else:
+            method = 'exact'
+    if (not isinstance(est, (BaseGradientBoosting, ForestRegressor)) and
+            method == 'recursion'):
+        raise ValueError('est has to be an instance of BaseGradientBoosting '
+                         'or ForestRegressor for the "recursion" method. Try '
+                         'using method="exact" or "estimated".')
+    if (not hasattr(est, '_estimator_type') or
+            est._estimator_type not in ('classifier', 'regressor')):
+        raise ValueError('est must be a fitted regressor or classifier model.')
+    if (method != 'recursion' and est._estimator_type == 'classifier' and
+            not hasattr(est, 'predict_proba')):
+        raise ValueError('est requires a predict_proba method for '
+                         'method="exact" or "estimated" for classification.')
+    if X is not None:
+        X = check_array(X, dtype=DTYPE, order='C')
+    if method == 'recursion':
+        check_is_fitted(est, 'estimators_',
+                        msg='Call %s.fit before partial_dependence' %
+                        est.__class__.__name__)
+        n_features = est.n_features_
+    elif X is None:
+        raise ValueError('X is required for method="exact" or "estimated".')
+    else:
+        n_features = X.shape[1]
+    if (grid is None and X is None) or (grid is not None and X is not None):
+        raise ValueError('Exactly one of grid and X must be specified')
+
+    target_variables = np.asarray(target_variables, dtype=np.int32,
+                                  order='C').ravel()
+
+    if any([not (0 <= fx < n_features) for fx in target_variables]):
+        raise ValueError('target_variables must be in [0, %d]'
+                         % (n_features - 1))
+
+    if X is not None:
+        grid, axes = _grid_from_X(X[:, target_variables], percentiles,
+                                  grid_resolution)
+    else:
+        assert grid is not None
+        # don't return axes if grid is given
+        axes = None
+        # grid must be 2d
+        if grid.ndim == 1:
+            grid = grid[:, np.newaxis]
+        if grid.ndim != 2:
+            raise ValueError('grid must be 2d but is %dd' % grid.ndim)
+
+    grid = np.asarray(grid, dtype=DTYPE, order='C')
+    assert grid.shape[1] == target_variables.shape[0]
+
+    if method == 'recursion':
+        if isinstance(est, BaseGradientBoosting):
+            n_trees_per_stage = est.estimators_.shape[1]
+            n_estimators = est.estimators_.shape[0]
+            learning_rate = est.learning_rate
+        else:
+            n_trees_per_stage = 1
+            n_estimators = len(est.estimators_)
+            learning_rate = 1.
+        pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64,
+                       order='C')
+        for stage in range(n_estimators):
+            for k in range(n_trees_per_stage):
+                if isinstance(est, BaseGradientBoosting):
+                    tree = est.estimators_[stage, k].tree_
+                else:
+                    tree = est.estimators_[stage].tree_
+                _partial_dependence_tree(tree, grid, target_variables,
+                                         learning_rate, pdp[k])
+        if isinstance(est, ForestRegressor):
+            pdp /= n_estimators
+    elif method == 'exact':
+        n_samples = X.shape[0]
+        pdp = []
+        for row in range(grid.shape[0]):
+            X_eval = X.copy()
+            for i, variable in enumerate(target_variables):
+                X_eval[:, variable] = np.repeat(grid[row, i], n_samples)
+            pdp_row = _predict(est, X_eval, output=output)
+            if est._estimator_type == 'regressor':
+                pdp.append(np.mean(pdp_row))
+            else:
+                pdp.append(np.mean(pdp_row, 0))
+        pdp = np.array(pdp).transpose()
+        if pdp.shape[0] == 2:
+            # Binary classification
+            pdp = pdp[1, :][np.newaxis]
+        elif pdp.ndim == 1:
+            # Regression
+            pdp = pdp[np.newaxis]
+    elif method == 'estimated':
+        n_samples = grid.shape[0]
+        X_eval = np.tile(np.median(X, 0), [n_samples, 1])
+        for i, variable in enumerate(target_variables):
+            X_eval[:, variable] = grid[:, i]
+        pdp = _predict(est, X_eval, output=output)
+        if est._estimator_type == 'classifier' and pdp.shape[1] == 2:
+            # Binary classification
+            pdp = pdp[:, 1][np.newaxis]
+        elif est._estimator_type == 'classifier' and pdp.shape[1] > 2:
+            # Multi-class classification
+            pdp = pdp.T
+        if est._estimator_type == 'regressor':
+            pdp = pdp[np.newaxis]
+    else:
+        raise ValueError('method "%s" is invalid. Use "recursion", "exact", '
+                         '"estimated", or "auto".' % method)
+    return pdp, axes
+
+
+def plot_partial_dependence(est, X, features, feature_names=None,
+                            label=None, output=None,
+                            n_cols=3, grid_resolution=100,
+                            percentiles=(0.05, 0.95), method='auto', n_jobs=1,
+                            verbose=0, ax=None, line_kw=None,
+                            contour_kw=None, **fig_kw):
+    """Partial dependence plots for ``features``.
+
+    The ``len(features)`` plots are arranged in a grid with ``n_cols``
+    columns. Two-way partial dependence plots are plotted as contour
+    plots.
+
+    Read more in the :ref:`User Guide `.
+
+    Parameters
+    ----------
+    est : BaseEstimator
+        A fitted classification or regression model.
+    X : array-like, shape=(n_samples, n_features)
+        The data on which ``est`` was trained.
+    features : seq of ints, strings, or tuples of ints or strings
+        If seq[i] is an int or a tuple with one int value, a one-way
+        PDP is created; if seq[i] is a tuple of two ints, a two-way
+        PDP is created.
+        If feature_names is specified and seq[i] is an int, seq[i]
+        must be < len(feature_names).
+        If seq[i] is a string, feature_names must be specified, and
+        seq[i] must be in feature_names.
+    feature_names : seq of str
+        Name of each feature; feature_names[i] holds
+        the name of the feature with index i.
+    label : object
+        The class label for which the PDPs should be computed.
+        Only used if est is a multi-class model. Must be in ``est.classes_``.
+    output : int, optional (default=None)
+        The output index to use for multi-output estimators.
+    n_cols : int
+        The number of columns in the grid plot (default: 3).
+    grid_resolution : int, default=100
+        The number of equally spaced points on the axes.
+    percentiles : (low, high), default=(0.05, 0.95)
+        The lower and upper percentile used to create the extreme values
+        for the PDP axes.
+    method : {'recursion', 'exact', 'estimated', 'auto'}, default='auto'
+        The method to use to calculate the partial dependence function:
+
+        - If 'recursion', the underlying trees of ``est`` will be recursed to
+          calculate the function. Only supported for BaseGradientBoosting and
+          ForestRegressor.
+        - If 'exact', the function will be calculated by calling the
+          ``predict_proba`` method of ``est`` for classification or
+          ``predict`` for regression on ``X`` for every point in the grid. To
+          speed up this method, you can use a subset of ``X`` or a coarser
+          grid.
+        - If 'estimated', the function will be calculated by calling the
+          ``predict_proba`` method of ``est`` for classification or
+          ``predict`` for regression on the median of ``X``.
+        - If 'auto', then 'recursion' will be used if ``est`` is
+          BaseGradientBoosting or ForestRegressor, and 'exact' used for other
+          estimators.
+    n_jobs : int
+        The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
+        Defaults to 1.
+    verbose : int
+        Verbose output during PD computations. Defaults to 0.
+    ax : Matplotlib axis object, default None
+        An axis object onto which the plots will be drawn.
+    line_kw : dict
+        Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.
+        For one-way partial dependence plots.
+    contour_kw : dict
+        Dict with keywords passed to the ``matplotlib.pyplot.contourf`` call.
+        For two-way partial dependence plots.
+    **fig_kw : dict
+        Dict with keywords passed to the figure() call.
+        Note that all keywords not recognized above will be automatically
+        included here.
+
+    Returns
+    -------
+    fig : figure
+        The Matplotlib Figure object.
+    axs : seq of Axis objects
+        A seq of Axis objects, one for each subplot.
+
+    Examples
+    --------
+    >>> from sklearn.datasets import make_friedman1
+    >>> from sklearn.ensemble import GradientBoostingRegressor
+    >>> X, y = make_friedman1()
+    >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)
+    >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP
+    ...
+    """
+    import matplotlib.pyplot as plt
+    from matplotlib import transforms
+    from matplotlib.ticker import MaxNLocator
+    from matplotlib.ticker import ScalarFormatter
+    # TODO: The pattern below is required to avoid a namespace collision.
+    # TODO: Move below imports to module level import at 0.22 release.
+    from .ensemble.gradient_boosting import BaseGradientBoosting
+    from .ensemble.forest import ForestRegressor
+
+    if method == 'auto':
+        if isinstance(est, (BaseGradientBoosting, ForestRegressor)):
+            method = 'recursion'
+        else:
+            method = 'exact'
+    if (not isinstance(est, (BaseGradientBoosting, ForestRegressor)) and
+            method == 'recursion'):
+        raise ValueError('est has to be an instance of BaseGradientBoosting '
+                         'or ForestRegressor for the "recursion" method. Try '
+                         'using method="exact" or "estimated".')
+    if (not hasattr(est, '_estimator_type') or
+            est._estimator_type not in ('classifier', 'regressor')):
+        raise ValueError('est must be a fitted regressor or classifier model.')
+    if (method != 'recursion' and est._estimator_type == 'classifier' and
+            not hasattr(est, 'predict_proba')):
+        raise ValueError('est requires a predict_proba method for '
+                         'method="exact" or "estimated" for classification.')
+    if X is not None:
+        X = check_array(X, dtype=DTYPE, order='C')
+    if method == 'recursion':
+        check_is_fitted(est, 'estimators_',
+                        msg='Call %s.fit before partial_dependence' %
+                        est.__class__.__name__)
+        n_features = est.n_features_
+    elif X is None:
+        raise ValueError('X is required for method="exact" or "estimated".')
+    else:
+        n_features = X.shape[1]
+
+    # set label_idx for multi-class estimators
+    if hasattr(est, 'classes_') and np.size(est.classes_) > 2:
+        if label is None:
+            raise ValueError('label is not given for multi-class PDP')
+        if isinstance(est.classes_, list):
+            # multi-output classification
+            if output is None:
+                raise ValueError('output is required for multi-output '
+                                 'estimators')
+            if output >= len(est.classes_):
+                raise ValueError('output %d exceeds number of outputs in est, '
+                                 '%d' % (output, len(est.classes_)))
+            label_idx = np.searchsorted(est.classes_[output], label)
+            if est.classes_[output][label_idx] != label:
+                raise ValueError('label %s not in ``est.classes_``' %
+                                 str(label))
+        else:
+            label_idx = np.searchsorted(est.classes_, label)
+            if est.classes_[label_idx] != label:
+                raise ValueError('label %s not in ``est.classes_``' %
+                                 str(label))
+    else:
+        # regression and binary classification
+        label_idx = 0
+
+    if hasattr(est, 'n_features_') and est.n_features_ != X.shape[1]:
+        raise ValueError('X.shape[1] does not match est.n_features_')
+
+    if line_kw is None:
+        line_kw = {'color': 'green'}
+    if contour_kw is None:
+        contour_kw = {}
+
+    # convert feature_names to list
+    if feature_names is None:
+        # if no feature_names are given, use feature indices as names
+        feature_names = [str(i) for i in range(n_features)]
+    elif isinstance(feature_names, np.ndarray):
+        feature_names = feature_names.tolist()
+
+    def convert_feature(fx):
+        if isinstance(fx, six.string_types):
+            try:
+                fx = feature_names.index(fx)
+            except ValueError:
+                raise ValueError('Feature %s not in feature_names' % fx)
+        return fx
+
+    # convert features into a seq of int tuples
+    tmp_features = []
+    for
fxs in features: + if isinstance(fxs, (numbers.Integral,) + six.string_types): + fxs = (fxs,) + try: + fxs = np.array([convert_feature(fx) for fx in fxs], dtype=np.int32) + except TypeError: + raise ValueError('features must be either int, str, or tuple ' + 'of int/str') + if not (1 <= np.size(fxs) <= 2): + raise ValueError('target features must be either one or two') + + tmp_features.append(fxs) + + features = tmp_features + + names = [] + try: + for fxs in features: + l = [] + # explicit loop so "i" is bound for exception below + for i in fxs: + l.append(feature_names[i]) + names.append(l) + except IndexError: + raise ValueError('All entries of features must be less than ' + 'len(feature_names) = {0}, got {1}.' + .format(len(feature_names), i)) + + # compute PD functions + pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( + delayed(partial_dependence)(est, fxs, X=X, output=output, + method=method, + grid_resolution=grid_resolution, + percentiles=percentiles) + for fxs in features) + + # get global min and max values of PD grouped by plot type + pdp_lim = {} + for pdp, axes in pd_result: + min_pd, max_pd = pdp[label_idx].min(), pdp[label_idx].max() + n_fx = len(axes) + old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) + min_pd = min(min_pd, old_min_pd) + max_pd = max(max_pd, old_max_pd) + pdp_lim[n_fx] = (min_pd, max_pd) + + # create contour levels for two-way plots + if 2 in pdp_lim: + Z_level = np.linspace(*pdp_lim[2], num=8) + + if ax is None: + fig = plt.figure(**fig_kw) + else: + fig = ax.get_figure() + fig.clear() + + n_cols = min(n_cols, len(features)) + n_rows = int(np.ceil(len(features) / float(n_cols))) + axs = [] + for i, fx, name, (pdp, axes) in zip(count(), features, names, + pd_result): + ax = fig.add_subplot(n_rows, n_cols, i + 1) + + if len(axes) == 1: + ax.plot(axes[0], pdp[label_idx].ravel(), **line_kw) + else: + # make contour plot + assert len(axes) == 2 + XX, YY = np.meshgrid(axes[0], axes[1]) + Z = pdp[label_idx].reshape(list(map(np.size, axes))).T + CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, + colors='k') + ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], + vmin=Z_level[0], alpha=0.75, **contour_kw) + ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True) + + # plot data deciles + axes labels + deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1)) + trans = transforms.blended_transform_factory(ax.transData, + ax.transAxes) + ylim = ax.get_ylim() + ax.vlines(deciles, [0], 0.05, transform=trans, color='k') + ax.set_xlabel(name[0]) + ax.set_ylim(ylim) + + # prevent x-axis ticks from overlapping + ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower')) + tick_formatter = ScalarFormatter() + tick_formatter.set_powerlimits((-3, 4)) + ax.xaxis.set_major_formatter(tick_formatter) + + if len(axes) > 1: + # two-way PDP - y-axis deciles + labels + deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1)) + trans = transforms.blended_transform_factory(ax.transAxes, + ax.transData) + xlim = ax.get_xlim() + ax.hlines(deciles, [0], 0.05, transform=trans, color='k') + ax.set_ylabel(name[1]) + # hline erases xlim + ax.set_xlim(xlim) + else: + ax.set_ylabel('Partial dependence') + + if len(axes) == 1: + ax.set_ylim(pdp_lim[1]) + axs.append(ax) + + fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4, + hspace=0.3) + return fig, axs diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py new file mode 100644 index 0000000000000..aaa2e539cf594 --- 
/dev/null +++ b/sklearn/tests/test_partial_dependence.py @@ -0,0 +1,328 @@ +""" +Testing for the partial dependence module. +""" + +import numpy as np +from numpy.testing import assert_array_equal + +from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import if_matplotlib +from sklearn.utils.testing import all_estimators +from sklearn.utils.testing import ignore_warnings +from sklearn.partial_dependence import partial_dependence +from sklearn.partial_dependence import plot_partial_dependence +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.ensemble import GradientBoostingRegressor +from sklearn.ensemble.gradient_boosting import BaseGradientBoosting +from sklearn.ensemble.forest import ForestRegressor +from sklearn.datasets import load_boston, load_iris +from sklearn.datasets import make_classification, make_regression + +# toy sample +X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] +y = [-1, -1, -1, 1, 1, 1] + +# Make some sample data to test output shapes +X_c, y_c = make_classification(n_features=10, n_informative=5, random_state=0) +# Non-negative for MultinomialNB +X_c = X_c + np.abs(X_c.min()) +X_r, y_r = make_regression(n_features=10, n_informative=5, random_state=0) + +# Load the boston & iris datasets +boston = load_boston() +iris = load_iris() + + +@ignore_warnings() +def test_output_shape_recursion(): + # Test recursion partial_dependence has same output shape for everything + for name, Estimator in all_estimators(): + est = Estimator() + if not (isinstance(est, BaseGradientBoosting) or + isinstance(est, ForestRegressor)): + continue + if est._estimator_type == 'classifier': + est.fit(X_c, y_c) + pdp, axes = partial_dependence(est, + target_variables=[1], + X=X_c, + method='recursion', + grid_resolution=10) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_c, + method='recursion', + grid_resolution=10) + assert (pdp.shape == (1, 100)) + elif est._estimator_type == 'regressor': + est.fit(X_r, y_r) + pdp, axes = partial_dependence(est, + target_variables=[1], + X=X_r, + method='recursion', + grid_resolution=10) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_r, + method='recursion', + grid_resolution=10) + assert (pdp.shape == (1, 100)) + + +@ignore_warnings() +def test_output_shape_exact(): + # Test exact partial_dependence has same output shape for everything + for name, Estimator in all_estimators(): + est = Estimator() + if not hasattr(est, '_estimator_type') or 'MultiTask' in name: + continue + if est._estimator_type == 'classifier': + if not hasattr(est, 'predict_proba'): + continue + est.fit(X_c, y_c) + pdp, axes = partial_dependence(est, + target_variables=[1], + X=X_c, + method='exact', + grid_resolution=10) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_c, + method='exact', + grid_resolution=10) + assert (pdp.shape == (1, 100)) + elif est._estimator_type == 'regressor': + est.fit(X_r, y_r) + pdp, axes = partial_dependence(est, + target_variables=[1], + X=X_r, + method='exact', + grid_resolution=10) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_r, + method='exact', + grid_resolution=10) + assert (pdp.shape == (1, 100)) + + +@ignore_warnings() +def test_output_shape_estimated(): + # Test estimated partial_dependence has same output shape for everything + for name, Estimator in all_estimators(): + est 
= Estimator()
+        if not hasattr(est, '_estimator_type') or 'MultiTask' in name:
+            continue
+        if est._estimator_type == 'classifier':
+            if not hasattr(est, 'predict_proba'):
+                continue
+            est.fit(X_c, y_c)
+            pdp, axes = partial_dependence(est,
+                                           target_variables=[1],
+                                           X=X_c,
+                                           method='estimated',
+                                           grid_resolution=10)
+            assert (pdp.shape == (1, 10))
+            pdp, axes = partial_dependence(est,
+                                           target_variables=[1, 2],
+                                           X=X_c,
+                                           method='estimated',
+                                           grid_resolution=10)
+            assert (pdp.shape == (1, 100))
+        elif est._estimator_type == 'regressor':
+            est.fit(X_r, y_r)
+            pdp, axes = partial_dependence(est,
+                                           target_variables=[1],
+                                           X=X_r,
+                                           method='estimated',
+                                           grid_resolution=10)
+            assert (pdp.shape == (1, 10))
+            pdp, axes = partial_dependence(est,
+                                           target_variables=[1, 2],
+                                           X=X_r,
+                                           method='estimated',
+                                           grid_resolution=10)
+            assert (pdp.shape == (1, 100))
+
+
+def test_partial_dependence_classifier():
+    # Test partial dependence for classifier
+    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
+    clf.fit(X, y)
+
+    pdp, axes = partial_dependence(clf, [0], X=X, grid_resolution=5)
+
+    # only 4 grid points instead of 5 because only 4 unique X[:,0] vals
+    assert pdp.shape == (1, 4)
+    assert axes[0].shape[0] == 4
+
+    # now with our own grid
+    X_ = np.asarray(X)
+    grid = np.unique(X_[:, 0])
+    pdp_2, axes = partial_dependence(clf, [0], grid=grid)
+
+    assert axes is None
+    assert_array_equal(pdp, pdp_2)
+
+
+def test_partial_dependence_multiclass():
+    # Test partial dependence for multi-class classifier
+    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
+    clf.fit(iris.data, iris.target)
+
+    grid_resolution = 25
+    n_classes = clf.n_classes_
+    pdp, axes = partial_dependence(
+        clf, [0], X=iris.data, grid_resolution=grid_resolution)
+
+    assert pdp.shape == (n_classes, grid_resolution)
+    assert len(axes) == 1
+    assert axes[0].shape[0] == grid_resolution
+
+
+def test_partial_dependence_regressor():
+    # Test partial dependence for regressor
+    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
+    clf.fit(boston.data, boston.target)
+
+    grid_resolution = 25
+    pdp, axes = partial_dependence(
+        clf, [0], X=boston.data, grid_resolution=grid_resolution)
+
+    assert pdp.shape == (1, grid_resolution)
+    assert axes[0].shape[0] == grid_resolution
+
+
+def test_partial_dependency_input():
+    # Test input validation of partial dependence.
+    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
+    clf.fit(X, y)
+
+    assert_raises(ValueError, partial_dependence,
+                  clf, [0], grid=None, X=None)
+
+    assert_raises(ValueError, partial_dependence,
+                  clf, [0], grid=[0, 1], X=X)
+
+    # first argument must be a classifier or regressor model
+    assert_raises(ValueError, partial_dependence,
+                  {}, [0], X=X)
+
+    # estimator must be fitted before computing partial dependence
+    assert_raises(ValueError, partial_dependence,
+                  GradientBoostingClassifier(), [0], X=X)
+
+    assert_raises(ValueError, partial_dependence, clf, [-1], X=X)
+
+    assert_raises(ValueError, partial_dependence, clf, [100], X=X)
+
+    # wrong ndim for grid
+    grid = np.random.rand(10, 2, 1)
+    assert_raises(ValueError, partial_dependence, clf, [0], grid=grid)
+
+
+@if_matplotlib
+def test_plot_partial_dependence():
+    # Test partial dependence plot function.
+    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
+    clf.fit(boston.data, boston.target)
+
+    grid_resolution = 25
+    fig, axs = plot_partial_dependence(clf, boston.data, [0, 1, (0, 1)],
+                                       grid_resolution=grid_resolution,
+                                       feature_names=boston.feature_names)
+    assert len(axs) == 3
+    assert all(ax.has_data() for ax in axs)
+
+    # check with str features and array feature names
+    fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN',
+                                                          ('CRIM', 'ZN')],
+                                       grid_resolution=grid_resolution,
+                                       feature_names=boston.feature_names)
+
+    assert len(axs) == 3
+    assert all(ax.has_data() for ax in axs)
+
+    # check with list feature_names
+    feature_names = boston.feature_names.tolist()
+    fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN',
+                                                          ('CRIM', 'ZN')],
+                                       grid_resolution=grid_resolution,
+                                       feature_names=feature_names)
+    assert len(axs) == 3
+    assert all(ax.has_data() for ax in axs)
+
+
+@if_matplotlib
+def test_plot_partial_dependence_input():
+    # Test partial dependence plot function input checks.
+    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
+
+    # not fitted yet
+    assert_raises(ValueError, plot_partial_dependence,
+                  clf, X, [0])
+
+    clf.fit(X, y)
+
+    assert_raises(ValueError, plot_partial_dependence,
+                  clf, np.array(X)[:, :0], [0])
+
+    # first argument must be a classifier or regressor model
+    assert_raises(ValueError, plot_partial_dependence,
+                  {}, X, [0])
+
+    # must be larger than -1
+    assert_raises(ValueError, plot_partial_dependence,
+                  clf, X, [-1])
+
+    # too large feature value
+    assert_raises(ValueError, plot_partial_dependence,
+                  clf, X, [100])
+
+    # str feature but no feature_names
+    assert_raises(ValueError, plot_partial_dependence,
+                  clf, X, ['foobar'])
+
+    # not valid features value
+    assert_raises(ValueError, plot_partial_dependence,
+                  clf, X, [{'foo': 'bar'}])
+
+
+@if_matplotlib
+def test_plot_partial_dependence_multiclass():
+    # Test partial dependence plot function on multi-class input.
+    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
+    clf.fit(iris.data, iris.target)
+
+    grid_resolution = 25
+    fig, axs = plot_partial_dependence(clf, iris.data, [0, 1],
+                                       label=0,
+                                       grid_resolution=grid_resolution)
+    assert len(axs) == 2
+    assert all(ax.has_data() for ax in axs)
+
+    # now with symbol labels
+    target = iris.target_names[iris.target]
+    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
+    clf.fit(iris.data, target)
+
+    grid_resolution = 25
+    fig, axs = plot_partial_dependence(clf, iris.data, [0, 1],
+                                       label='setosa',
+                                       grid_resolution=grid_resolution)
+    assert len(axs) == 2
+    assert all(ax.has_data() for ax in axs)
+
+    # label not in est.classes_
+    assert_raises(ValueError, plot_partial_dependence,
+                  clf, iris.data, [0, 1], label='foobar',
+                  grid_resolution=grid_resolution)
+
+    # label not provided
+    assert_raises(ValueError, plot_partial_dependence,
+                  clf, iris.data, [0, 1],
+                  grid_resolution=grid_resolution)
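
A minimal usage sketch of the new top-level API introduced by this diff. The estimator, dataset, and feature indices below are illustrative only (any fitted regressor or classifier works for method='exact'), and the plotting call assumes matplotlib is installed:

# Sketch of the new sklearn.partial_dependence API added in this diff.
# RandomForestRegressor and the Boston features RM (5) / LSTAT (12) are
# illustrative choices, not part of the change itself.
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.partial_dependence import (partial_dependence,
                                        plot_partial_dependence)

boston = load_boston()
est = RandomForestRegressor(n_estimators=10, random_state=0)
est.fit(boston.data, boston.target)

# method='auto' resolves to the fast 'recursion' path for forests and
# gradient boosting; pdp has shape (1, n_points) for regression.
pdp, axes = partial_dependence(est, target_variables=[5], X=boston.data,
                               grid_resolution=50)
print(pdp.shape)  # (1, 50)

# Model-agnostic path: average real predict() calls over X for each grid
# point, at the cost of one predict pass per point.
pdp_exact, _ = partial_dependence(est, target_variables=[5], X=boston.data,
                                  method='exact', grid_resolution=50)

# One-way PDPs for features 5 and 12, plus their two-way contour plot.
fig, axs = plot_partial_dependence(est, boston.data, [5, 12, (5, 12)],
                                   feature_names=boston.feature_names,
                                   grid_resolution=50)

The 'estimated' variant evaluates the model on the median of X instead of averaging over all rows, trading fidelity for speed.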