From 57f9a6f1c1a47d91f049dd716a03edaf42a9a036 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Mon, 19 Oct 2015 07:54:45 -0700 Subject: [PATCH 001/113] general partial dependence plots --- sklearn/partial_dependence.py | 165 ++++++++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 sklearn/partial_dependence.py diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py new file mode 100644 index 0000000000000..f971978dbd7c7 --- /dev/null +++ b/sklearn/partial_dependence.py @@ -0,0 +1,165 @@ +"""Partial dependence plots for tree ensembles. """ + +# Authors: Peter Prettenhofer +# License: BSD 3 clause + +from itertools import count +import numbers + +import numpy as np +from scipy.stats.mstats import mquantiles + +from .utils.extmath import cartesian +from .externals.joblib import Parallel, delayed +from .externals import six +from .externals.six.moves import map, range, zip +from .utils import check_array +from .tree._tree import DTYPE + +from .base import ClassifierMixin, RegressorMixin +from .ensemble._gradient_boosting import _partial_dependence_tree +from .ensemble.gradient_boosting import BaseGradientBoosting + + +def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): + """Generate a grid of points based on the ``percentiles of ``X``. + + The grid is generated by placing ``grid_resolution`` equally + spaced points between the ``percentiles`` of each column + of ``X``. + + Parameters + ---------- + X : ndarray + The data + percentiles : tuple of floats + The percentiles which are used to construct the extreme + values of the grid axes. + grid_resolution : int + The number of equally spaced points that are placed + on the grid. + + Returns + ------- + grid : ndarray + All data points on the grid; ``grid.shape[1] == X.shape[1]`` + and ``grid.shape[0] == grid_resolution * X.shape[1]``. + axes : seq of ndarray + The axes with which the grid has been created. + """ + if len(percentiles) != 2: + raise ValueError('percentile must be tuple of len 2') + if not all(0. <= x <= 1. for x in percentiles): + raise ValueError('percentile values must be in [0, 1]') + + axes = [] + for col in range(X.shape[1]): + uniques = np.unique(X[:, col]) + if uniques.shape[0] < grid_resolution: + # feature has low resolution use unique vals + axis = uniques + else: + emp_percentiles = mquantiles(X, prob=percentiles, axis=0) + # create axis based on percentiles and grid resolution + axis = np.linspace(emp_percentiles[0, col], + emp_percentiles[1, col], + num=grid_resolution, endpoint=True) + axes.append(axis) + + return cartesian(axes), axes + + +def partial_dependence(gbrt, target_variables, grid=None, X=None, + percentiles=(0.05, 0.95), grid_resolution=100): + """Partial dependence of ``target_variables``. + + Partial dependence plots show the dependence between the joint values + of the ``target_variables`` and the function represented + by the ``gbrt``. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + gbrt : BaseGradientBoosting + A fitted gradient boosting model. + target_variables : array-like, dtype=int + The target features for which the partial dependecy should be + computed (size should be smaller than 3 for visual renderings). + grid : array-like, shape=(n_points, len(target_variables)) + The grid of ``target_variables`` values for which the + partial dependecy should be evaluated (either ``grid`` or ``X`` + must be specified). + X : array-like, shape=(n_samples, n_features) + The data on which ``gbrt`` was trained. 
It is used to generate + a ``grid`` for the ``target_variables``. The ``grid`` comprises + ``grid_resolution`` equally spaced points between the two + ``percentiles``. + percentiles : (low, high), default=(0.05, 0.95) + The lower and upper percentile used create the extreme values + for the ``grid``. Only if ``X`` is not None. + grid_resolution : int, default=100 + The number of equally spaced points on the ``grid``. + + Returns + ------- + pdp : array, shape=(n_classes, n_points) + The partial dependence function evaluated on the ``grid``. + For regression and binary classification ``n_classes==1``. + axes : seq of ndarray or None + The axes with which the grid has been created or None if + the grid has been given. + + Examples + -------- + >>> samples = [[0, 0, 2], [1, 0, 0]] + >>> labels = [0, 1] + >>> from sklearn.ensemble import GradientBoostingClassifier + >>> gb = GradientBoostingClassifier(random_state=0).fit(samples, labels) + >>> kwargs = dict(X=samples, percentiles=(0, 1), grid_resolution=2) + >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP + (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) + """ + if not isinstance(gbrt, BaseGradientBoosting): + raise ValueError('gbrt has to be an instance of BaseGradientBoosting') + if gbrt.estimators_.shape[0] == 0: + raise ValueError('Call %s.fit before partial_dependence' % + gbrt.__class__.__name__) + if (grid is None and X is None) or (grid is not None and X is not None): + raise ValueError('Either grid or X must be specified') + + target_variables = np.asarray(target_variables, dtype=np.int32, + order='C').ravel() + + if any([not (0 <= fx < gbrt.n_features) for fx in target_variables]): + raise ValueError('target_variables must be in [0, %d]' + % (gbrt.n_features - 1)) + + if X is not None: + X = check_array(X, dtype=DTYPE, order='C') + grid, axes = _grid_from_X(X[:, target_variables], percentiles, + grid_resolution) + else: + assert grid is not None + # dont return axes if grid is given + axes = None + # grid must be 2d + if grid.ndim == 1: + grid = grid[:, np.newaxis] + if grid.ndim != 2: + raise ValueError('grid must be 2d but is %dd' % grid.ndim) + + grid = np.asarray(grid, dtype=DTYPE, order='C') + assert grid.shape[1] == target_variables.shape[0] + + n_trees_per_stage = gbrt.estimators_.shape[1] + n_estimators = gbrt.estimators_.shape[0] + pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64, + order='C') + for stage in range(n_estimators): + for k in range(n_trees_per_stage): + tree = gbrt.estimators_[stage, k].tree_ + _partial_dependence_tree(tree, grid, target_variables, + gbrt.learning_rate, pdp[k]) + + return pdp, axes From 9a09888e27fec5929195954d327c3a74f84b542c Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Tue, 20 Oct 2015 19:34:36 -0700 Subject: [PATCH 002/113] add init --- sklearn/__init__.py | 7 ++++--- sklearn/partial_dependence.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/__init__.py b/sklearn/__init__.py index dbb7862d8839e..1fa157fb68d4d 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -71,9 +71,10 @@ 'isotonic', 'kernel_approximation', 'kernel_ridge', 'learning_curve', 'linear_model', 'manifold', 'metrics', 'mixture', 'model_selection', 'multiclass', 'multioutput', - 'naive_bayes', 'neighbors', 'neural_network', 'pipeline', - 'preprocessing', 'random_projection', 'semi_supervised', - 'svm', 'tree', 'discriminant_analysis', 'impute', 'compose', + 'naive_bayes', 'neighbors', 'neural_network', + 'partial_dependence', 'pipeline', 
'preprocessing', + 'random_projection', 'semi_supervised', 'svm', 'tree', + 'discriminant_analysis', 'impute', 'compose', # Non-modules: 'clone', 'get_config', 'set_config', 'config_context'] diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index f971978dbd7c7..7646d51743cf4 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -1,4 +1,4 @@ -"""Partial dependence plots for tree ensembles. """ +"""Partial dependence plots for regressors and classifiers. """ # Authors: Peter Prettenhofer # License: BSD 3 clause From e714e16d56fd2e0a7125a78f1793378bdb58d408 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sun, 1 Nov 2015 11:36:15 -0800 Subject: [PATCH 003/113] implement exact and estimated methods --- sklearn/partial_dependence.py | 461 ++++++++++++++++++++++++++++++++-- 1 file changed, 435 insertions(+), 26 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 7646d51743cf4..0afdfe06cf819 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -1,6 +1,7 @@ -"""Partial dependence plots for regressors and classifiers. """ +"""Partial dependence plots for regression and classification models. """ # Authors: Peter Prettenhofer +# Trevor Stephens # License: BSD 3 clause from itertools import count @@ -19,6 +20,10 @@ from .base import ClassifierMixin, RegressorMixin from .ensemble._gradient_boosting import _partial_dependence_tree from .ensemble.gradient_boosting import BaseGradientBoosting +from .ensemble.forest import ForestRegressor + + +__all__ = ['partial_dependence', 'plot_partial_dependence'] def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): @@ -69,29 +74,147 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): return cartesian(axes), axes -def partial_dependence(gbrt, target_variables, grid=None, X=None, - percentiles=(0.05, 0.95), grid_resolution=100): +def _exact_partial_dependence(est, target_variables, grid, X): + """Calculate the partial dependence of ``target_variables``. + + The function will be calculated by calling the ``predict_proba`` method of + ``est`` for classification or ``predict`` for regression on ``X`` for every + point in the grid. + + Parameters + ---------- + est : BaseEstimator + A fitted classification or regression model. + target_variables : array-like, dtype=int + The target features for which the partial dependency should be + computed (size should be smaller than 3 for visual renderings). + grid : array-like, shape=(n_points, len(target_variables)) + The grid of ``target_variables`` values for which the + partial dependency should be evaluated (either ``grid`` or ``X`` + must be specified). + X : array-like, shape=(n_samples, n_features) + The data on which ``est`` was trained. + + Returns + ------- + pdp : array, shape=(n_classes, n_points) + The partial dependence function evaluated on the ``grid``. + For regression and binary classification ``n_classes==1``. 
+ """ + n_samples = X.shape[0] + pdp = [] + for row in range(grid.shape[0]): + X_eval = X.copy() + for i, variable in enumerate(target_variables): + X_eval[:, variable] = np.repeat(grid[row, i], n_samples) + if isinstance(est, RegressorMixin): + try: + pdp.append(np.mean(est.predict(X_eval))) + except: + raise ValueError('Call %s.fit before partial_dependence' % + est.__class__.__name__) + elif isinstance(est, ClassifierMixin): + try: + pdp_row = est.predict_proba(X_eval) + except: + raise ValueError('Call %s.fit before partial_dependence' % + est.__class__.__name__) + pdp_row = np.log(np.clip(pdp_row, 1e-16, 1)) + pdp_row = np.subtract(pdp_row, + np.mean(pdp_row, 1)[:, np.newaxis]) + pdp.append(np.mean(pdp_row, 0)) + else: + raise ValueError('est must be a fitted regressor or classifier ' + 'model.') + pdp = np.array(pdp).transpose() + if pdp.shape[0] == 2: + # Binary classification + pdp = pdp[1, :][np.newaxis] + elif len(pdp.shape) == 1: + # Regression + pdp = pdp[np.newaxis] + return pdp + + +def _estimated_partial_dependence(est, target_variables, grid, X): + """Calculate the partial dependence of ``target_variables``. + + The function will be calculated by calling the ``predict_proba`` method of + ``est`` for classification or ``predict`` for regression on the mean of + ``X``. + + Parameters + ---------- + est : BaseEstimator + A fitted classification or regression model. + target_variables : array-like, dtype=int + The target features for which the partial dependency should be + computed (size should be smaller than 3 for visual renderings). + grid : array-like, shape=(n_points, len(target_variables)) + The grid of ``target_variables`` values for which the + partial dependency should be evaluated (either ``grid`` or ``X`` + must be specified). + X : array-like, shape=(n_samples, n_features) + The data on which ``est`` was trained. + + Returns + ------- + pdp : array, shape=(n_classes, n_points) + The partial dependence function evaluated on the ``grid``. + For regression and binary classification ``n_classes==1``. + """ + n_samples = grid.shape[0] + X_eval = np.tile(X.mean(0), [n_samples, 1]) + for i, variable in enumerate(target_variables): + X_eval[:, variable] = grid[:, i] + if isinstance(est, RegressorMixin): + try: + pdp = est.predict(X_eval) + except: + raise ValueError('Call %s.fit before partial_dependence' % + est.__class__.__name__) + pdp = pdp[np.newaxis] + elif isinstance(est, ClassifierMixin): + try: + pdp = est.predict_proba(X_eval) + except: + raise ValueError('Call %s.fit before partial_dependence' % + est.__class__.__name__) + pdp = np.log(np.clip(pdp, 1e-16, 1)) + pdp = np.subtract(pdp, np.mean(pdp, 1)[:, np.newaxis]) + pdp = pdp.transpose() + else: + raise ValueError('est must be a fitted regressor or classifier model.') + if pdp.shape[0] == 2: + # Binary classification + pdp = pdp[1, :][np.newaxis] + return pdp + + +def partial_dependence(est, target_variables, grid=None, X=None, + percentiles=(0.05, 0.95), grid_resolution=100, + method=None): """Partial dependence of ``target_variables``. Partial dependence plots show the dependence between the joint values of the ``target_variables`` and the function represented - by the ``gbrt``. + by the ``est``. Read more in the :ref:`User Guide `. Parameters ---------- - gbrt : BaseGradientBoosting - A fitted gradient boosting model. + est : BaseEstimator + A fitted classification or regression model. 
target_variables : array-like, dtype=int - The target features for which the partial dependecy should be + The target features for which the partial dependency should be computed (size should be smaller than 3 for visual renderings). grid : array-like, shape=(n_points, len(target_variables)) The grid of ``target_variables`` values for which the - partial dependecy should be evaluated (either ``grid`` or ``X`` + partial dependency should be evaluated (either ``grid`` or ``X`` must be specified). X : array-like, shape=(n_samples, n_features) - The data on which ``gbrt`` was trained. It is used to generate + The data on which ``est`` was trained. It is used to generate a ``grid`` for the ``target_variables``. The ``grid`` comprises ``grid_resolution`` equally spaced points between the two ``percentiles``. @@ -100,6 +223,22 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, for the ``grid``. Only if ``X`` is not None. grid_resolution : int, default=100 The number of equally spaced points on the ``grid``. + method : {'recursion', 'exact', 'estimated', None}, optional (default=None) + The method to use to calculate the partial dependence function: + + - If 'recursion', the underlying trees of ``est`` will be recursed to + calculate the function. Only supported for BaseGradientBoosting and + ForestRegressor. + - If 'exact', the function will be calculated by calling the + ``predict_proba`` method of ``est`` for classification or ``predict`` + for regression on ``X``for every point in the grid. To speed up this + method, you can use a subset of ``X`` or a more coarse grid. + - If 'estimated', the function will be calculated by calling the + ``predict_proba`` method of ``est`` for classification or ``predict`` + for regression on the mean of ``X``. + - If None, then 'recursion' will be used if ``est`` is + BaseGradientBoosting or ForestRegressor, and 'exact' used for other + estimators. Returns ------- @@ -120,20 +259,42 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) """ - if not isinstance(gbrt, BaseGradientBoosting): - raise ValueError('gbrt has to be an instance of BaseGradientBoosting') - if gbrt.estimators_.shape[0] == 0: - raise ValueError('Call %s.fit before partial_dependence' % - gbrt.__class__.__name__) + if method is None: + if isinstance(est, (BaseGradientBoosting, ForestRegressor)): + method = 'recursion' + else: + method = 'exact' + if (not isinstance(est, (BaseGradientBoosting, ForestRegressor)) and + method == 'recursion'): + raise ValueError('est has to be an instance of BaseGradientBoosting or' + ' ForestRegressor for the "recursion" method. 
Try ' + 'using method="exact" or "estimated".') + if (method != 'recursion' and + not hasattr(est, 'predict_proba') and + isinstance(est, ClassifierMixin)): + raise ValueError('est requires a predict_proba method for ' + 'method="exact" or "estimated".') + if method == 'recursion': + if len(est.estimators_) == 0: + raise ValueError('Call %s.fit before partial_dependence' % + est.__class__.__name__) + if isinstance(est, BaseGradientBoosting): + n_features = est.n_features + else: + n_features = est.n_features_ + elif X is None: + raise ValueError('X is required for method="exact" or "estimated".') + else: + n_features = X.shape[1] if (grid is None and X is None) or (grid is not None and X is not None): raise ValueError('Either grid or X must be specified') target_variables = np.asarray(target_variables, dtype=np.int32, order='C').ravel() - if any([not (0 <= fx < gbrt.n_features) for fx in target_variables]): + if any([not (0 <= fx < n_features) for fx in target_variables]): raise ValueError('target_variables must be in [0, %d]' - % (gbrt.n_features - 1)) + % (n_features - 1)) if X is not None: X = check_array(X, dtype=DTYPE, order='C') @@ -141,7 +302,7 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, grid_resolution) else: assert grid is not None - # dont return axes if grid is given + # don't return axes if grid is given axes = None # grid must be 2d if grid.ndim == 1: @@ -152,14 +313,262 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, grid = np.asarray(grid, dtype=DTYPE, order='C') assert grid.shape[1] == target_variables.shape[0] - n_trees_per_stage = gbrt.estimators_.shape[1] - n_estimators = gbrt.estimators_.shape[0] - pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64, - order='C') - for stage in range(n_estimators): - for k in range(n_trees_per_stage): - tree = gbrt.estimators_[stage, k].tree_ - _partial_dependence_tree(tree, grid, target_variables, - gbrt.learning_rate, pdp[k]) + if method == 'recursion': + if isinstance(est, BaseGradientBoosting): + n_trees_per_stage = est.estimators_.shape[1] + n_estimators = est.estimators_.shape[0] + learning_rate = est.learning_rate + else: + n_trees_per_stage = 1 + n_estimators = len(est.estimators_) + learning_rate = 1. + pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64, + order='C') + for stage in range(n_estimators): + for k in range(n_trees_per_stage): + if isinstance(est, BaseGradientBoosting): + tree = est.estimators_[stage, k].tree_ + else: + tree = est.estimators_[stage].tree_ + _partial_dependence_tree(tree, grid, target_variables, + learning_rate, pdp[k]) + if isinstance(est, ForestRegressor): + pdp /= n_estimators + elif method == 'exact': + pdp = _exact_partial_dependence(est, target_variables, grid, X) + elif method == 'estimated': + pdp = _estimated_partial_dependence(est, target_variables, grid, X) + else: + raise ValueError('method "%s" is invalid. Use "recursion", "exact", ' + '"estimated", or None.' % method) return pdp, axes + + +def plot_partial_dependence(gbrt, X, features, feature_names=None, + label=None, n_cols=3, grid_resolution=100, + percentiles=(0.05, 0.95), n_jobs=1, + verbose=0, ax=None, line_kw=None, + contour_kw=None, **fig_kw): + """Partial dependence plots for ``features``. + + The ``len(features)`` plots are arranged in a grid with ``n_cols`` + columns. Two-way partial dependence plots are plotted as contour + plots. + + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + gbrt : BaseGradientBoosting + A fitted gradient boosting model. + X : array-like, shape=(n_samples, n_features) + The data on which ``gbrt`` was trained. + features : seq of tuples or ints + If seq[i] is an int or a tuple with one int value, a one-way + PDP is created; if seq[i] is a tuple of two ints, a two-way + PDP is created. + feature_names : seq of str + Name of each feature; feature_names[i] holds + the name of the feature with index i. + label : object + The class label for which the PDPs should be computed. + Only if gbrt is a multi-class model. Must be in ``gbrt.classes_``. + n_cols : int + The number of columns in the grid plot (default: 3). + percentiles : (low, high), default=(0.05, 0.95) + The lower and upper percentile used to create the extreme values + for the PDP axes. + grid_resolution : int, default=100 + The number of equally spaced points on the axes. + n_jobs : int + The number of CPUs to use to compute the PDs. -1 means 'all CPUs'. + Defaults to 1. + verbose : int + Verbose output during PD computations. Defaults to 0. + ax : Matplotlib axis object, default None + An axis object onto which the plots will be drawn. + line_kw : dict + Dict with keywords passed to the ``pylab.plot`` call. + For one-way partial dependence plots. + contour_kw : dict + Dict with keywords passed to the ``pylab.plot`` call. + For two-way partial dependence plots. + fig_kw : dict + Dict with keywords passed to the figure() call. + Note that all keywords not recognized above will be automatically + included here. + + Returns + ------- + fig : figure + The Matplotlib Figure object. + axs : seq of Axis objects + A seq of Axis objects, one for each subplot. + + Examples + -------- + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.ensemble import GradientBoostingRegressor + >>> X, y = make_friedman1() + >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y) + >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP + ... 
+ """ + import matplotlib.pyplot as plt + from matplotlib import transforms + from matplotlib.ticker import MaxNLocator + from matplotlib.ticker import ScalarFormatter + + if not isinstance(gbrt, BaseGradientBoosting): + raise ValueError('gbrt has to be an instance of BaseGradientBoosting') + if gbrt.estimators_.shape[0] == 0: + raise ValueError('Call %s.fit before partial_dependence' % + gbrt.__class__.__name__) + + # set label_idx for multi-class GBRT + if hasattr(gbrt, 'classes_') and np.size(gbrt.classes_) > 2: + if label is None: + raise ValueError('label is not given for multi-class PDP') + label_idx = np.searchsorted(gbrt.classes_, label) + if gbrt.classes_[label_idx] != label: + raise ValueError('label %s not in ``gbrt.classes_``' % str(label)) + else: + # regression and binary classification + label_idx = 0 + + X = check_array(X, dtype=DTYPE, order='C') + if gbrt.n_features != X.shape[1]: + raise ValueError('X.shape[1] does not match gbrt.n_features') + + if line_kw is None: + line_kw = {'color': 'green'} + if contour_kw is None: + contour_kw = {} + + # convert feature_names to list + if feature_names is None: + # if not feature_names use fx indices as name + feature_names = [str(i) for i in range(gbrt.n_features)] + elif isinstance(feature_names, np.ndarray): + feature_names = feature_names.tolist() + + def convert_feature(fx): + if isinstance(fx, six.string_types): + try: + fx = feature_names.index(fx) + except ValueError: + raise ValueError('Feature %s not in feature_names' % fx) + return fx + + # convert features into a seq of int tuples + tmp_features = [] + for fxs in features: + if isinstance(fxs, (numbers.Integral,) + six.string_types): + fxs = (fxs,) + try: + fxs = np.array([convert_feature(fx) for fx in fxs], dtype=np.int32) + except TypeError: + raise ValueError('features must be either int, str, or tuple ' + 'of int/str') + if not (1 <= np.size(fxs) <= 2): + raise ValueError('target features must be either one or two') + + tmp_features.append(fxs) + + features = tmp_features + + names = [] + try: + for fxs in features: + l = [] + # explicit loop so "i" is bound for exception below + for i in fxs: + l.append(feature_names[i]) + names.append(l) + except IndexError: + raise ValueError('features[i] must be in [0, n_features) ' + 'but was %d' % i) + + # compute PD functions + pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( + delayed(partial_dependence)(gbrt, fxs, X=X, + grid_resolution=grid_resolution, + percentiles=percentiles) + for fxs in features) + + # get global min and max values of PD grouped by plot type + pdp_lim = {} + for pdp, axes in pd_result: + min_pd, max_pd = pdp[label_idx].min(), pdp[label_idx].max() + n_fx = len(axes) + old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) + min_pd = min(min_pd, old_min_pd) + max_pd = max(max_pd, old_max_pd) + pdp_lim[n_fx] = (min_pd, max_pd) + + # create contour levels for two-way plots + if 2 in pdp_lim: + Z_level = np.linspace(*pdp_lim[2], num=8) + + if ax is None: + fig = plt.figure(**fig_kw) + else: + fig = ax.get_figure() + fig.clear() + + n_cols = min(n_cols, len(features)) + n_rows = int(np.ceil(len(features) / float(n_cols))) + axs = [] + for i, fx, name, (pdp, axes) in zip(count(), features, names, + pd_result): + ax = fig.add_subplot(n_rows, n_cols, i + 1) + + if len(axes) == 1: + ax.plot(axes[0], pdp[label_idx].ravel(), **line_kw) + else: + # make contour plot + assert len(axes) == 2 + XX, YY = np.meshgrid(axes[0], axes[1]) + Z = pdp[label_idx].reshape(list(map(np.size, axes))).T + CS = 
ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, + colors='k') + ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], + vmin=Z_level[0], alpha=0.75, **contour_kw) + ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True) + + # plot data deciles + axes labels + deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1)) + trans = transforms.blended_transform_factory(ax.transData, + ax.transAxes) + ylim = ax.get_ylim() + ax.vlines(deciles, [0], 0.05, transform=trans, color='k') + ax.set_xlabel(name[0]) + ax.set_ylim(ylim) + + # prevent x-axis ticks from overlapping + ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower')) + tick_formatter = ScalarFormatter() + tick_formatter.set_powerlimits((-3, 4)) + ax.xaxis.set_major_formatter(tick_formatter) + + if len(axes) > 1: + # two-way PDP - y-axis deciles + labels + deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1)) + trans = transforms.blended_transform_factory(ax.transAxes, + ax.transData) + xlim = ax.get_xlim() + ax.hlines(deciles, [0], 0.05, transform=trans, color='k') + ax.set_ylabel(name[1]) + # hline erases xlim + ax.set_xlim(xlim) + else: + ax.set_ylabel('Partial dependence') + + if len(axes) == 1: + ax.set_ylim(pdp_lim[1]) + axs.append(ax) + + fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4, + hspace=0.3) + return fig, axs From 19ed28ebf670c1be1aa64a1c9a089336f7f21d86 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Thu, 5 Nov 2015 22:08:18 -0800 Subject: [PATCH 004/113] support for Pipeline and GridSearchCV type estimators --- sklearn/partial_dependence.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 0afdfe06cf819..262f909ff100b 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -17,7 +17,6 @@ from .utils import check_array from .tree._tree import DTYPE -from .base import ClassifierMixin, RegressorMixin from .ensemble._gradient_boosting import _partial_dependence_tree from .ensemble.gradient_boosting import BaseGradientBoosting from .ensemble.forest import ForestRegressor @@ -74,7 +73,7 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): return cartesian(axes), axes -def _exact_partial_dependence(est, target_variables, grid, X): +def _exact_partial_dependence(est, target_variables, grid, X, ouput=None): """Calculate the partial dependence of ``target_variables``. The function will be calculated by calling the ``predict_proba`` method of @@ -94,6 +93,8 @@ def _exact_partial_dependence(est, target_variables, grid, X): must be specified). X : array-like, shape=(n_samples, n_features) The data on which ``est`` was trained. + output : int, optional (default=None) + The output index to use for multi-output estimators. 
Returns ------- @@ -107,13 +108,13 @@ def _exact_partial_dependence(est, target_variables, grid, X): X_eval = X.copy() for i, variable in enumerate(target_variables): X_eval[:, variable] = np.repeat(grid[row, i], n_samples) - if isinstance(est, RegressorMixin): + if est._estimator_type == 'regressor': try: pdp.append(np.mean(est.predict(X_eval))) except: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) - elif isinstance(est, ClassifierMixin): + elif est._estimator_type == 'classifier': try: pdp_row = est.predict_proba(X_eval) except: @@ -136,7 +137,7 @@ def _exact_partial_dependence(est, target_variables, grid, X): return pdp -def _estimated_partial_dependence(est, target_variables, grid, X): +def _estimated_partial_dependence(est, target_variables, grid, X, ouput=None): """Calculate the partial dependence of ``target_variables``. The function will be calculated by calling the ``predict_proba`` method of @@ -156,6 +157,8 @@ def _estimated_partial_dependence(est, target_variables, grid, X): must be specified). X : array-like, shape=(n_samples, n_features) The data on which ``est`` was trained. + output : int, optional (default=None) + The output index to use for multi-output estimators. Returns ------- @@ -167,14 +170,14 @@ def _estimated_partial_dependence(est, target_variables, grid, X): X_eval = np.tile(X.mean(0), [n_samples, 1]) for i, variable in enumerate(target_variables): X_eval[:, variable] = grid[:, i] - if isinstance(est, RegressorMixin): + if est._estimator_type == 'regressor': try: pdp = est.predict(X_eval) except: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) pdp = pdp[np.newaxis] - elif isinstance(est, ClassifierMixin): + elif est._estimator_type == 'classifier': try: pdp = est.predict_proba(X_eval) except: @@ -191,7 +194,7 @@ def _estimated_partial_dependence(est, target_variables, grid, X): return pdp -def partial_dependence(est, target_variables, grid=None, X=None, +def partial_dependence(est, target_variables, grid=None, X=None, output=None, percentiles=(0.05, 0.95), grid_resolution=100, method=None): """Partial dependence of ``target_variables``. @@ -218,6 +221,8 @@ def partial_dependence(est, target_variables, grid=None, X=None, a ``grid`` for the ``target_variables``. The ``grid`` comprises ``grid_resolution`` equally spaced points between the two ``percentiles``. + output : int, optional (default=None) + The output index to use for multi-output estimators. percentiles : (low, high), default=(0.05, 0.95) The lower and upper percentile used create the extreme values for the ``grid``. Only if ``X`` is not None. @@ -269,11 +274,12 @@ def partial_dependence(est, target_variables, grid=None, X=None, raise ValueError('est has to be an instance of BaseGradientBoosting or' ' ForestRegressor for the "recursion" method. 
Try ' 'using method="exact" or "estimated".') - if (method != 'recursion' and - not hasattr(est, 'predict_proba') and - isinstance(est, ClassifierMixin)): + if (not hasattr(est, '_estimator_type') or + est._estimator_type not in ('classifier', 'regressor')): + raise ValueError('est must be a fitted regressor or classifier model.') + if method != 'recursion' and est._estimator_type == 'classifier': raise ValueError('est requires a predict_proba method for ' - 'method="exact" or "estimated".') + 'method="exact" or "estimated" for classification.') if method == 'recursion': if len(est.estimators_) == 0: raise ValueError('Call %s.fit before partial_dependence' % From 6fafc5e585369268fd1b100b470e3ad8687a1c9e Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Mon, 16 Nov 2015 14:56:11 -0800 Subject: [PATCH 005/113] add multioutput support --- sklearn/partial_dependence.py | 39 ++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 262f909ff100b..9dacd3d29631b 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -73,7 +73,7 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): return cartesian(axes), axes -def _exact_partial_dependence(est, target_variables, grid, X, ouput=None): +def _exact_partial_dependence(est, target_variables, grid, X, output=None): """Calculate the partial dependence of ``target_variables``. The function will be calculated by calling the ``predict_proba`` method of @@ -110,16 +110,29 @@ def _exact_partial_dependence(est, target_variables, grid, X, ouput=None): X_eval[:, variable] = np.repeat(grid[row, i], n_samples) if est._estimator_type == 'regressor': try: - pdp.append(np.mean(est.predict(X_eval))) + pdp_row = est.predict(X_eval) except: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) + if pdp_row.ndim != 1 and pdp_row.shape[1] != 1: + # Multi-output + if not 0 <= output < pdp_row.shape[1]: + raise ValueError('Valid output must be specified for ' + 'multi-output models.') + pdp_row = pdp_row[:, output] + pdp.append(np.mean(pdp_row)) elif est._estimator_type == 'classifier': try: pdp_row = est.predict_proba(X_eval) except: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) + if isinstance(pdp_row, list): + # Multi-output + if not 0 <= output < len(pdp_row): + raise ValueError('Valid output must be specified for ' + 'multi-output models.') + pdp_row = pdp_row[output] pdp_row = np.log(np.clip(pdp_row, 1e-16, 1)) pdp_row = np.subtract(pdp_row, np.mean(pdp_row, 1)[:, np.newaxis]) @@ -137,7 +150,7 @@ def _exact_partial_dependence(est, target_variables, grid, X, ouput=None): return pdp -def _estimated_partial_dependence(est, target_variables, grid, X, ouput=None): +def _estimated_partial_dependence(est, target_variables, grid, X, output=None): """Calculate the partial dependence of ``target_variables``. 
The function will be calculated by calling the ``predict_proba`` method of @@ -176,6 +189,15 @@ def _estimated_partial_dependence(est, target_variables, grid, X, ouput=None): except: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) + if pdp.ndim != 1 and pdp.shape[1] == 1: + # Column output + pdp = pdp.ravel() + if pdp.ndim != 1 and pdp.shape[1] != 1: + # Multi-output + if not 0 <= output < pdp.shape[1]: + raise ValueError('Valid output must be specified for ' + 'multi-output models.') + pdp = pdp[:, output] pdp = pdp[np.newaxis] elif est._estimator_type == 'classifier': try: @@ -183,6 +205,12 @@ def _estimated_partial_dependence(est, target_variables, grid, X, ouput=None): except: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) + if isinstance(pdp, list): + # Multi-output + if not 0 <= output < len(pdp): + raise ValueError('Valid output must be specified for ' + 'multi-output models.') + pdp = pdp[output] pdp = np.log(np.clip(pdp, 1e-16, 1)) pdp = np.subtract(pdp, np.mean(pdp, 1)[:, np.newaxis]) pdp = pdp.transpose() @@ -341,9 +369,10 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, if isinstance(est, ForestRegressor): pdp /= n_estimators elif method == 'exact': - pdp = _exact_partial_dependence(est, target_variables, grid, X) + pdp = _exact_partial_dependence(est, target_variables, grid, X, output) elif method == 'estimated': - pdp = _estimated_partial_dependence(est, target_variables, grid, X) + pdp = _estimated_partial_dependence(est, target_variables, grid, X, + output) else: raise ValueError('method "%s" is invalid. Use "recursion", "exact", ' '"estimated", or None.' % method) From 152a190927d60a90a9c59c8f1f343b3d092758e1 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sat, 22 Jul 2017 15:54:14 +1000 Subject: [PATCH 006/113] rebase and catch up to #6762, #7673, #7846 --- sklearn/partial_dependence.py | 37 +++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 9dacd3d29631b..3faa3f54f04b9 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -20,6 +20,7 @@ from .ensemble._gradient_boosting import _partial_dependence_tree from .ensemble.gradient_boosting import BaseGradientBoosting from .ensemble.forest import ForestRegressor +from .exceptions import NotFittedError __all__ = ['partial_dependence', 'plot_partial_dependence'] @@ -111,7 +112,7 @@ def _exact_partial_dependence(est, target_variables, grid, X, output=None): if est._estimator_type == 'regressor': try: pdp_row = est.predict(X_eval) - except: + except NotFittedError: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) if pdp_row.ndim != 1 and pdp_row.shape[1] != 1: @@ -124,7 +125,7 @@ def _exact_partial_dependence(est, target_variables, grid, X, output=None): elif est._estimator_type == 'classifier': try: pdp_row = est.predict_proba(X_eval) - except: + except NotFittedError: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) if isinstance(pdp_row, list): @@ -186,7 +187,7 @@ def _estimated_partial_dependence(est, target_variables, grid, X, output=None): if est._estimator_type == 'regressor': try: pdp = est.predict(X_eval) - except: + except NotFittedError: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) if pdp.ndim != 1 and pdp.shape[1] == 1: @@ -202,7 +203,7 @@ def 
_estimated_partial_dependence(est, target_variables, grid, X, output=None): elif est._estimator_type == 'classifier': try: pdp = est.predict_proba(X_eval) - except: + except NotFittedError: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) if isinstance(pdp, list): @@ -252,7 +253,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, output : int, optional (default=None) The output index to use for multi-output estimators. percentiles : (low, high), default=(0.05, 0.95) - The lower and upper percentile used create the extreme values + The lower and upper percentile used to create the extreme values for the ``grid``. Only if ``X`` is not None. grid_resolution : int, default=100 The number of equally spaced points on the ``grid``. @@ -312,10 +313,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, if len(est.estimators_) == 0: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) - if isinstance(est, BaseGradientBoosting): - n_features = est.n_features - else: - n_features = est.n_features_ + n_features = est.n_features_ elif X is None: raise ValueError('X is required for method="exact" or "estimated".') else: @@ -399,10 +397,14 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, A fitted gradient boosting model. X : array-like, shape=(n_samples, n_features) The data on which ``gbrt`` was trained. - features : seq of tuples or ints + features : seq of ints, strings, or tuples of ints or strings If seq[i] is an int or a tuple with one int value, a one-way PDP is created; if seq[i] is a tuple of two ints, a two-way PDP is created. + If feature_names is specified and seq[i] is an int, seq[i] + must be < len(feature_names). + If seq[i] is a string, feature_names must be specified, and + seq[i] must be in feature_names. feature_names : seq of str Name of each feature; feature_names[i] holds the name of the feature with index i. @@ -424,10 +426,10 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, ax : Matplotlib axis object, default None An axis object onto which the plots will be drawn. line_kw : dict - Dict with keywords passed to the ``pylab.plot`` call. + Dict with keywords passed to the ``matplotlib.pyplot.plot`` call. For one-way partial dependence plots. contour_kw : dict - Dict with keywords passed to the ``pylab.plot`` call. + Dict with keywords passed to the ``matplotlib.pyplot.plot`` call. For two-way partial dependence plots. fig_kw : dict Dict with keywords passed to the figure() call. 
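
A minimal usage sketch of the ``line_kw`` and ``contour_kw`` keywords
documented in the hunk above, assuming a build with this patch series
applied; the styling values are arbitrary illustrations, not defaults.

# Hedged sketch, not part of the patch: style one-way curves via
# line_kw and the two-way filled contours via contour_kw.
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.partial_dependence import plot_partial_dependence

X, y = make_friedman1(random_state=0)
est = GradientBoostingRegressor(n_estimators=10).fit(X, y)

# line_kw is forwarded to the plot() call for one-way PDPs;
# contour_kw is forwarded to the filled contours of the two-way PDP.
fig, axs = plot_partial_dependence(
    est, X, [0, 1, (0, 1)],
    line_kw={'color': 'red', 'linewidth': 2},
    contour_kw={'cmap': 'viridis'})
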
@@ -473,8 +475,8 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, label_idx = 0 X = check_array(X, dtype=DTYPE, order='C') - if gbrt.n_features != X.shape[1]: - raise ValueError('X.shape[1] does not match gbrt.n_features') + if gbrt.n_features_ != X.shape[1]: + raise ValueError('X.shape[1] does not match gbrt.n_features_') if line_kw is None: line_kw = {'color': 'green'} @@ -484,7 +486,7 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, # convert feature_names to list if feature_names is None: # if not feature_names use fx indices as name - feature_names = [str(i) for i in range(gbrt.n_features)] + feature_names = [str(i) for i in range(gbrt.n_features_)] elif isinstance(feature_names, np.ndarray): feature_names = feature_names.tolist() @@ -522,8 +524,9 @@ def convert_feature(fx): l.append(feature_names[i]) names.append(l) except IndexError: - raise ValueError('features[i] must be in [0, n_features) ' - 'but was %d' % i) + raise ValueError('All entries of features must be less than ' + 'len(feature_names) = {0}, got {1}.' + .format(len(feature_names), i)) # compute PD functions pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( From 2cdc5ea15d072601b21b96da23763c2168ec4169 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Wed, 26 Jul 2017 17:18:55 +1000 Subject: [PATCH 007/113] catch up on #9434 --- sklearn/partial_dependence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 3faa3f54f04b9..b39d924294ebb 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -58,13 +58,13 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): raise ValueError('percentile values must be in [0, 1]') axes = [] + emp_percentiles = mquantiles(X, prob=percentiles, axis=0) for col in range(X.shape[1]): uniques = np.unique(X[:, col]) if uniques.shape[0] < grid_resolution: # feature has low resolution use unique vals axis = uniques else: - emp_percentiles = mquantiles(X, prob=percentiles, axis=0) # create axis based on percentiles and grid resolution axis = np.linspace(emp_percentiles[0, col], emp_percentiles[1, col], From ba1f8daa9c245287d058db91ee63cc3667609b37 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Wed, 26 Jul 2017 18:44:48 +1000 Subject: [PATCH 008/113] initial update of plot_partial_dependence --- sklearn/partial_dependence.py | 72 ++++++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 18 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index b39d924294ebb..da33ec1202b79 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -378,9 +378,9 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, return pdp, axes -def plot_partial_dependence(gbrt, X, features, feature_names=None, +def plot_partial_dependence(est, X, features, feature_names=None, label=None, n_cols=3, grid_resolution=100, - percentiles=(0.05, 0.95), n_jobs=1, + method=None, percentiles=(0.05, 0.95), n_jobs=1, verbose=0, ax=None, line_kw=None, contour_kw=None, **fig_kw): """Partial dependence plots for ``features``. @@ -393,10 +393,10 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, Parameters ---------- - gbrt : BaseGradientBoosting - A fitted gradient boosting model. + est : BaseEstimator + A fitted classification or regression model. X : array-like, shape=(n_samples, n_features) - The data on which ``gbrt`` was trained. 
+ The data on which ``est`` was trained. features : seq of ints, strings, or tuples of ints or strings If seq[i] is an int or a tuple with one int value, a one-way PDP is created; if seq[i] is a tuple of two ints, a two-way @@ -410,7 +410,7 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, the name of the feature with index i. label : object The class label for which the PDPs should be computed. - Only if gbrt is a multi-class model. Must be in ``gbrt.classes_``. + Only if est is a multi-class model. Must be in ``est.classes_``. n_cols : int The number of columns in the grid plot (default: 3). percentiles : (low, high), default=(0.05, 0.95) @@ -418,6 +418,22 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, for the PDP axes. grid_resolution : int, default=100 The number of equally spaced points on the axes. + method : {'recursion', 'exact', 'estimated', None}, optional (default=None) + The method to use to calculate the partial dependence function: + + - If 'recursion', the underlying trees of ``est`` will be recursed to + calculate the function. Only supported for BaseGradientBoosting and + ForestRegressor. + - If 'exact', the function will be calculated by calling the + ``predict_proba`` method of ``est`` for classification or ``predict`` + for regression on ``X``for every point in the grid. To speed up this + method, you can use a subset of ``X`` or a more coarse grid. + - If 'estimated', the function will be calculated by calling the + ``predict_proba`` method of ``est`` for classification or ``predict`` + for regression on the mean of ``X``. + - If None, then 'recursion' will be used if ``est`` is + BaseGradientBoosting or ForestRegressor, and 'exact' used for other + estimators. n_jobs : int The number of CPUs to use to compute the PDs. -1 means 'all CPUs'. Defaults to 1. @@ -457,26 +473,46 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, from matplotlib.ticker import MaxNLocator from matplotlib.ticker import ScalarFormatter - if not isinstance(gbrt, BaseGradientBoosting): - raise ValueError('gbrt has to be an instance of BaseGradientBoosting') - if gbrt.estimators_.shape[0] == 0: - raise ValueError('Call %s.fit before partial_dependence' % - gbrt.__class__.__name__) + if method is None: + if isinstance(est, (BaseGradientBoosting, ForestRegressor)): + method = 'recursion' + else: + method = 'exact' + if (not isinstance(est, (BaseGradientBoosting, ForestRegressor)) and + method == 'recursion'): + raise ValueError('est has to be an instance of BaseGradientBoosting or' + ' ForestRegressor for the "recursion" method. 
Try ' + 'using method="exact" or "estimated".') + if (not hasattr(est, '_estimator_type') or + est._estimator_type not in ('classifier', 'regressor')): + raise ValueError('est must be a fitted regressor or classifier model.') + if method != 'recursion' and est._estimator_type == 'classifier': + raise ValueError('est requires a predict_proba method for ' + 'method="exact" or "estimated" for classification.') + if method == 'recursion': + if len(est.estimators_) == 0: + raise ValueError('Call %s.fit before partial_dependence' % + est.__class__.__name__) + n_features = est.n_features_ + elif X is None: + raise ValueError('X is required for method="exact" or "estimated".') + else: + n_features = X.shape[1] # set label_idx for multi-class GBRT - if hasattr(gbrt, 'classes_') and np.size(gbrt.classes_) > 2: + if hasattr(est, 'classes_') and np.size(est.classes_) > 2: if label is None: raise ValueError('label is not given for multi-class PDP') - label_idx = np.searchsorted(gbrt.classes_, label) - if gbrt.classes_[label_idx] != label: + label_idx = np.searchsorted(est.classes_, label) + if est.classes_[label_idx] != label: raise ValueError('label %s not in ``gbrt.classes_``' % str(label)) else: # regression and binary classification label_idx = 0 X = check_array(X, dtype=DTYPE, order='C') - if gbrt.n_features_ != X.shape[1]: - raise ValueError('X.shape[1] does not match gbrt.n_features_') + if est.n_features_ != X.shape[1]: + raise ValueError('X.shape[1] does not match est.n_features_') if line_kw is None: line_kw = {'color': 'green'} @@ -486,7 +522,7 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, # convert feature_names to list if feature_names is None: # if not feature_names use fx indices as name - feature_names = [str(i) for i in range(gbrt.n_features_)] + feature_names = [str(i) for i in range(est.n_features_)] elif isinstance(feature_names, np.ndarray): feature_names = feature_names.tolist() @@ -530,7 +566,7 @@ def convert_feature(fx): # compute PD functions pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(partial_dependence)(gbrt, fxs, X=X, + delayed(partial_dependence)(est, fxs, X=X, method=method, grid_resolution=grid_resolution, percentiles=percentiles) for fxs in features) From 1b1d8f0304d250e84bdf61625f5bf8309a073115 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sat, 12 Aug 2017 15:21:40 +1000 Subject: [PATCH 009/113] deprecate ensemble.partial_dependence --- sklearn/ensemble/partial_dependence.py | 294 ++---------------- .../ensemble/tests/test_partial_dependence.py | 37 +++ sklearn/partial_dependence.py | 13 +- sklearn/tests/test_partial_dependence.py | 206 ++++++++++++ 4 files changed, 283 insertions(+), 267 deletions(-) create mode 100644 sklearn/tests/test_partial_dependence.py diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index e8bfc2110bb90..63d397d86c8a0 100644 --- a/sklearn/ensemble/partial_dependence.py +++ b/sklearn/ensemble/partial_dependence.py @@ -3,70 +3,9 @@ # Authors: Peter Prettenhofer # License: BSD 3 clause -from itertools import count -import numbers - -import numpy as np -from scipy.stats.mstats import mquantiles - -from ..utils.extmath import cartesian -from ..externals.joblib import Parallel, delayed -from ..externals import six -from ..externals.six.moves import map, range, zip -from ..utils import check_array -from ..utils.validation import check_is_fitted -from ..tree._tree import DTYPE - -from ._gradient_boosting import _partial_dependence_tree -from .gradient_boosting 
import BaseGradientBoosting - - -def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): - """Generate a grid of points based on the ``percentiles of ``X``. - - The grid is generated by placing ``grid_resolution`` equally - spaced points between the ``percentiles`` of each column - of ``X``. - - Parameters - ---------- - X : ndarray - The data - percentiles : tuple of floats - The percentiles which are used to construct the extreme - values of the grid axes. - grid_resolution : int - The number of equally spaced points that are placed - on the grid. - - Returns - ------- - grid : ndarray - All data points on the grid; ``grid.shape[1] == X.shape[1]`` - and ``grid.shape[0] == grid_resolution * X.shape[1]``. - axes : seq of ndarray - The axes with which the grid has been created. - """ - if len(percentiles) != 2: - raise ValueError('percentile must be tuple of len 2') - if not all(0. <= x <= 1. for x in percentiles): - raise ValueError('percentile values must be in [0, 1]') - - axes = [] - emp_percentiles = mquantiles(X, prob=percentiles, axis=0) - for col in range(X.shape[1]): - uniques = np.unique(X[:, col]) - if uniques.shape[0] < grid_resolution: - # feature has low resolution use unique vals - axis = uniques - else: - # create axis based on percentiles and grid resolution - axis = np.linspace(emp_percentiles[0, col], - emp_percentiles[1, col], - num=grid_resolution, endpoint=True) - axes.append(axis) - - return cartesian(axes), axes +import warnings +from ..partial_dependence import partial_dependence as new_pd +from ..partial_dependence import plot_partial_dependence as new_ppd def partial_dependence(gbrt, target_variables, grid=None, X=None, @@ -120,47 +59,17 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) """ - if not isinstance(gbrt, BaseGradientBoosting): - raise ValueError('gbrt has to be an instance of BaseGradientBoosting') - check_is_fitted(gbrt, 'estimators_') - if (grid is None and X is None) or (grid is not None and X is not None): - raise ValueError('Either grid or X must be specified') - - target_variables = np.asarray(target_variables, dtype=np.int32, - order='C').ravel() - - if any([not (0 <= fx < gbrt.n_features_) for fx in target_variables]): - raise ValueError('target_variables must be in [0, %d]' - % (gbrt.n_features_ - 1)) - - if X is not None: - X = check_array(X, dtype=DTYPE, order='C') - grid, axes = _grid_from_X(X[:, target_variables], percentiles, - grid_resolution) - else: - assert grid is not None - # dont return axes if grid is given - axes = None - # grid must be 2d - if grid.ndim == 1: - grid = grid[:, np.newaxis] - if grid.ndim != 2: - raise ValueError('grid must be 2d but is %dd' % grid.ndim) - - grid = np.asarray(grid, dtype=DTYPE, order='C') - assert grid.shape[1] == target_variables.shape[0] - - n_trees_per_stage = gbrt.estimators_.shape[1] - n_estimators = gbrt.estimators_.shape[0] - pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64, - order='C') - for stage in range(n_estimators): - for k in range(n_trees_per_stage): - tree = gbrt.estimators_[stage, k].tree_ - _partial_dependence_tree(tree, grid, target_variables, - gbrt.learning_rate, pdp[k]) - - return pdp, axes + warnings.warn("The function ensemble.partial_dependence has been moved to " + "partial_dependence in 0.20 and will be removed in 0.22.", + DeprecationWarning) + return new_pd(est=gbrt, + target_variables=target_variables, + 
grid=grid, + X=X, + output=None, + percentiles=percentiles, + grid_resolution=grid_resolution, + method='recursion') def plot_partial_dependence(gbrt, X, features, feature_names=None, @@ -237,159 +146,22 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP ... """ - import matplotlib.pyplot as plt - from matplotlib import transforms - from matplotlib.ticker import MaxNLocator - from matplotlib.ticker import ScalarFormatter - - if not isinstance(gbrt, BaseGradientBoosting): - raise ValueError('gbrt has to be an instance of BaseGradientBoosting') - check_is_fitted(gbrt, 'estimators_') - - # set label_idx for multi-class GBRT - if hasattr(gbrt, 'classes_') and np.size(gbrt.classes_) > 2: - if label is None: - raise ValueError('label is not given for multi-class PDP') - label_idx = np.searchsorted(gbrt.classes_, label) - if gbrt.classes_[label_idx] != label: - raise ValueError('label %s not in ``gbrt.classes_``' % str(label)) - else: - # regression and binary classification - label_idx = 0 - - X = check_array(X, dtype=DTYPE, order='C') - if gbrt.n_features_ != X.shape[1]: - raise ValueError('X.shape[1] does not match gbrt.n_features_') - - if line_kw is None: - line_kw = {'color': 'green'} - if contour_kw is None: - contour_kw = {} - - # convert feature_names to list - if feature_names is None: - # if not feature_names use fx indices as name - feature_names = [str(i) for i in range(gbrt.n_features_)] - elif isinstance(feature_names, np.ndarray): - feature_names = feature_names.tolist() - - def convert_feature(fx): - if isinstance(fx, six.string_types): - try: - fx = feature_names.index(fx) - except ValueError: - raise ValueError('Feature %s not in feature_names' % fx) - return fx - - # convert features into a seq of int tuples - tmp_features = [] - for fxs in features: - if isinstance(fxs, (numbers.Integral,) + six.string_types): - fxs = (fxs,) - try: - fxs = np.array([convert_feature(fx) for fx in fxs], dtype=np.int32) - except TypeError: - raise ValueError('features must be either int, str, or tuple ' - 'of int/str') - if not (1 <= np.size(fxs) <= 2): - raise ValueError('target features must be either one or two') - - tmp_features.append(fxs) - - features = tmp_features - - names = [] - try: - for fxs in features: - l = [] - # explicit loop so "i" is bound for exception below - for i in fxs: - l.append(feature_names[i]) - names.append(l) - except IndexError: - raise ValueError('All entries of features must be less than ' - 'len(feature_names) = {0}, got {1}.' 
- .format(len(feature_names), i)) - - # compute PD functions - pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(partial_dependence)(gbrt, fxs, X=X, - grid_resolution=grid_resolution, - percentiles=percentiles) - for fxs in features) - - # get global min and max values of PD grouped by plot type - pdp_lim = {} - for pdp, axes in pd_result: - min_pd, max_pd = pdp[label_idx].min(), pdp[label_idx].max() - n_fx = len(axes) - old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) - min_pd = min(min_pd, old_min_pd) - max_pd = max(max_pd, old_max_pd) - pdp_lim[n_fx] = (min_pd, max_pd) - - # create contour levels for two-way plots - if 2 in pdp_lim: - Z_level = np.linspace(*pdp_lim[2], num=8) - - if ax is None: - fig = plt.figure(**fig_kw) - else: - fig = ax.get_figure() - fig.clear() - - n_cols = min(n_cols, len(features)) - n_rows = int(np.ceil(len(features) / float(n_cols))) - axs = [] - for i, fx, name, (pdp, axes) in zip(count(), features, names, - pd_result): - ax = fig.add_subplot(n_rows, n_cols, i + 1) - - if len(axes) == 1: - ax.plot(axes[0], pdp[label_idx].ravel(), **line_kw) - else: - # make contour plot - assert len(axes) == 2 - XX, YY = np.meshgrid(axes[0], axes[1]) - Z = pdp[label_idx].reshape(list(map(np.size, axes))).T - CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, - colors='k') - ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], - vmin=Z_level[0], alpha=0.75, **contour_kw) - ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True) - - # plot data deciles + axes labels - deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1)) - trans = transforms.blended_transform_factory(ax.transData, - ax.transAxes) - ylim = ax.get_ylim() - ax.vlines(deciles, [0], 0.05, transform=trans, color='k') - ax.set_xlabel(name[0]) - ax.set_ylim(ylim) - - # prevent x-axis ticks from overlapping - ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower')) - tick_formatter = ScalarFormatter() - tick_formatter.set_powerlimits((-3, 4)) - ax.xaxis.set_major_formatter(tick_formatter) - - if len(axes) > 1: - # two-way PDP - y-axis deciles + labels - deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1)) - trans = transforms.blended_transform_factory(ax.transAxes, - ax.transData) - xlim = ax.get_xlim() - ax.hlines(deciles, [0], 0.05, transform=trans, color='k') - ax.set_ylabel(name[1]) - # hline erases xlim - ax.set_xlim(xlim) - else: - ax.set_ylabel('Partial dependence') - - if len(axes) == 1: - ax.set_ylim(pdp_lim[1]) - axs.append(ax) - - fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4, - hspace=0.3) - return fig, axs + warnings.warn("The function ensemble.plot_partial_dependence has been " + "moved to partial_dependence in 0.20 and will be removed " + "in 0.22.", + DeprecationWarning) + return new_ppd(est=gbrt, + X=X, + features=features, + feature_names=feature_names, + label=label, + n_cols=n_cols, + grid_resolution=grid_resolution, + method='recursion', + percentiles=percentiles, + n_jobs=n_jobs, + verbose=verbose, + ax=ax, + line_kw=line_kw, + contour_kw=contour_kw, + **fig_kw) diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py index cec7efc46f03b..3a45ade617f9e 100644 --- a/sklearn/ensemble/tests/test_partial_dependence.py +++ b/sklearn/ensemble/tests/test_partial_dependence.py @@ -12,6 +12,8 @@ from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor from sklearn import datasets +from 
sklearn.utils.testing import ignore_warnings +from sklearn.utils.testing import assert_warns_message # toy sample @@ -27,6 +29,7 @@ iris = datasets.load_iris() +@ignore_warnings(category=DeprecationWarning) def test_partial_dependence_classifier(): # Test partial dependence for classifier clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -47,6 +50,7 @@ def test_partial_dependence_classifier(): assert_array_equal(pdp, pdp_2) +@ignore_warnings(category=DeprecationWarning) def test_partial_dependence_multiclass(): # Test partial dependence for multi-class classifier clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -62,6 +66,7 @@ def test_partial_dependence_multiclass(): assert axes[0].shape[0] == grid_resolution +@ignore_warnings(category=DeprecationWarning) def test_partial_dependence_regressor(): # Test partial dependence for regressor clf = GradientBoostingRegressor(n_estimators=10, random_state=1) @@ -75,6 +80,7 @@ def test_partial_dependence_regressor(): assert axes[0].shape[0] == grid_resolution +@ignore_warnings(category=DeprecationWarning) def test_partial_dependecy_input(): # Test input validation of partial dependence. clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -103,6 +109,7 @@ def test_partial_dependecy_input(): assert_raises(ValueError, partial_dependence, clf, [0], grid=grid) +@ignore_warnings(category=DeprecationWarning) @if_matplotlib def test_plot_partial_dependence(): # Test partial dependence plot function. @@ -136,6 +143,7 @@ def test_plot_partial_dependence(): @if_matplotlib +@ignore_warnings(category=DeprecationWarning) def test_plot_partial_dependence_input(): # Test partial dependence plot function input checks. clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -171,6 +179,7 @@ def test_plot_partial_dependence_input(): @if_matplotlib +@ignore_warnings(category=DeprecationWarning) def test_plot_partial_dependence_multiclass(): # Test partial dependence plot function on multi-class input. 
     clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
@@ -204,3 +213,31 @@ def test_plot_partial_dependence_multiclass():
     assert_raises(ValueError, plot_partial_dependence,
                   clf, iris.data, [0, 1],
                   grid_resolution=grid_resolution)
+
+
+def test_warning_raised_for_partial_dependence():
+    # Test that running the old partial_dependence function warns
+    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
+    clf.fit(boston.data, boston.target)
+    grid_resolution = 25
+
+    assert_warns_message(DeprecationWarning, "The function "
+                         "ensemble.partial_dependence has been moved to "
+                         "partial_dependence in 0.20 and will be removed in "
+                         "0.22.", partial_dependence, clf, [0], X=boston.data,
+                         grid_resolution=grid_resolution)
+
+
+@if_matplotlib
+def test_warning_raised_for_plot_partial_dependence():
+    # Test that running the old plot_partial_dependence function warns
+    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
+    clf.fit(boston.data, boston.target)
+    grid_resolution = 25
+
+    assert_warns_message(DeprecationWarning, "The function "
+                         "ensemble.plot_partial_dependence has been moved to "
+                         "partial_dependence in 0.20 and will be removed in "
+                         "0.22.", plot_partial_dependence, clf, boston.data,
+                         [0, 1, (0, 1)], grid_resolution=grid_resolution,
+                         feature_names=boston.feature_names)
diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py
index da33ec1202b79..f4966b35f056d 100644
--- a/sklearn/partial_dependence.py
+++ b/sklearn/partial_dependence.py
@@ -15,6 +15,7 @@
 from .externals import six
 from .externals.six.moves import map, range, zip
 from .utils import check_array
+from .utils.validation import check_is_fitted
 from .tree._tree import DTYPE
 
 from .ensemble._gradient_boosting import _partial_dependence_tree
@@ -310,9 +311,9 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None,
         raise ValueError('est requires a predict_proba method for '
                          'method="exact" or "estimated" for classification.')
     if method == 'recursion':
-        if len(est.estimators_) == 0:
-            raise ValueError('Call %s.fit before partial_dependence' %
-                             est.__class__.__name__)
+        check_is_fitted(est, 'estimators_', msg='Call %s.fit before '
+                        'partial_dependence' %
+                        est.__class__.__name__)
         n_features = est.n_features_
     elif X is None:
         raise ValueError('X is required for method="exact" or "estimated".')
@@ -490,9 +491,9 @@ def plot_partial_dependence(est, X, features, feature_names=None,
         raise ValueError('est requires a predict_proba method for '
                          'method="exact" or "estimated" for classification.')
     if method == 'recursion':
-        if len(est.estimators_) == 0:
-            raise ValueError('Call %s.fit before partial_dependence' %
-                             est.__class__.__name__)
+        check_is_fitted(est, 'estimators_', msg='Call %s.fit before '
+                        'partial_dependence' %
+                        est.__class__.__name__)
         n_features = est.n_features_
     elif X is None:
         raise ValueError('X is required for method="exact" or "estimated".')
diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py
new file mode 100644
index 0000000000000..13a12f4f809c8
--- /dev/null
+++ b/sklearn/tests/test_partial_dependence.py
@@ -0,0 +1,206 @@
+"""
+Testing for the partial dependence module.
+""" + +import numpy as np +from numpy.testing import assert_array_equal + +from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import if_matplotlib +from sklearn.partial_dependence import partial_dependence +from sklearn.partial_dependence import plot_partial_dependence +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.ensemble import GradientBoostingRegressor +from sklearn import datasets + + +# toy sample +X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] +y = [-1, -1, -1, 1, 1, 1] +T = [[-1, -1], [2, 2], [3, 2]] +true_result = [-1, 1, 1] + +# also load the boston dataset +boston = datasets.load_boston() + +# also load the iris dataset +iris = datasets.load_iris() + + +def test_partial_dependence_classifier(): + # Test partial dependence for classifier + clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + clf.fit(X, y) + + pdp, axes = partial_dependence(clf, [0], X=X, grid_resolution=5) + + # only 4 grid points instead of 5 because only 4 unique X[:,0] vals + assert pdp.shape == (1, 4) + assert axes[0].shape[0] == 4 + + # now with our own grid + X_ = np.asarray(X) + grid = np.unique(X_[:, 0]) + pdp_2, axes = partial_dependence(clf, [0], grid=grid) + + assert axes is None + assert_array_equal(pdp, pdp_2) + + +def test_partial_dependence_multiclass(): + # Test partial dependence for multi-class classifier + clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + clf.fit(iris.data, iris.target) + + grid_resolution = 25 + n_classes = clf.n_classes_ + pdp, axes = partial_dependence( + clf, [0], X=iris.data, grid_resolution=grid_resolution) + + assert pdp.shape == (n_classes, grid_resolution) + assert len(axes) == 1 + assert axes[0].shape[0] == grid_resolution + + +def test_partial_dependence_regressor(): + # Test partial dependence for regressor + clf = GradientBoostingRegressor(n_estimators=10, random_state=1) + clf.fit(boston.data, boston.target) + + grid_resolution = 25 + pdp, axes = partial_dependence( + clf, [0], X=boston.data, grid_resolution=grid_resolution) + + assert pdp.shape == (1, grid_resolution) + assert axes[0].shape[0] == grid_resolution + + +def test_partial_dependecy_input(): + # Test input validation of partial dependence. + clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + clf.fit(X, y) + + assert_raises(ValueError, partial_dependence, + clf, [0], grid=None, X=None) + + assert_raises(ValueError, partial_dependence, + clf, [0], grid=[0, 1], X=X) + + # first argument must be an instance of BaseGradientBoosting + assert_raises(ValueError, partial_dependence, + {}, [0], X=X) + + # Gradient boosting estimator must be fit + assert_raises(ValueError, partial_dependence, + GradientBoostingClassifier(), [0], X=X) + + assert_raises(ValueError, partial_dependence, clf, [-1], X=X) + + assert_raises(ValueError, partial_dependence, clf, [100], X=X) + + # wrong ndim for grid + grid = np.random.rand(10, 2, 1) + assert_raises(ValueError, partial_dependence, clf, [0], grid=grid) + + +@if_matplotlib +def test_plot_partial_dependence(): + # Test partial dependence plot function. 
+ clf = GradientBoostingRegressor(n_estimators=10, random_state=1) + clf.fit(boston.data, boston.target) + + grid_resolution = 25 + fig, axs = plot_partial_dependence(clf, boston.data, [0, 1, (0, 1)], + grid_resolution=grid_resolution, + feature_names=boston.feature_names) + assert len(axs) == 3 + assert all(ax.has_data for ax in axs) + + # check with str features and array feature names + fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', + ('CRIM', 'ZN')], + grid_resolution=grid_resolution, + feature_names=boston.feature_names) + + assert len(axs) == 3 + assert all(ax.has_data for ax in axs) + + # check with list feature_names + feature_names = boston.feature_names.tolist() + fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', + ('CRIM', 'ZN')], + grid_resolution=grid_resolution, + feature_names=feature_names) + assert len(axs) == 3 + assert all(ax.has_data for ax in axs) + + +@if_matplotlib +def test_plot_partial_dependence_input(): + # Test partial dependence plot function input checks. + clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + + # not fitted yet + assert_raises(ValueError, plot_partial_dependence, + clf, X, [0]) + + clf.fit(X, y) + + assert_raises(ValueError, plot_partial_dependence, + clf, np.array(X)[:, :0], [0]) + + # first argument must be an instance of BaseGradientBoosting + assert_raises(ValueError, plot_partial_dependence, + {}, X, [0]) + + # must be larger than -1 + assert_raises(ValueError, plot_partial_dependence, + clf, X, [-1]) + + # too large feature value + assert_raises(ValueError, plot_partial_dependence, + clf, X, [100]) + + # str feature but no feature_names + assert_raises(ValueError, plot_partial_dependence, + clf, X, ['foobar']) + + # not valid features value + assert_raises(ValueError, plot_partial_dependence, + clf, X, [{'foo': 'bar'}]) + + +@if_matplotlib +def test_plot_partial_dependence_multiclass(): + # Test partial dependence plot function on multi-class input. 
+ clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + clf.fit(iris.data, iris.target) + + grid_resolution = 25 + fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], + label=0, + grid_resolution=grid_resolution) + assert len(axs) == 2 + assert all(ax.has_data for ax in axs) + + # now with symbol labels + target = iris.target_names[iris.target] + clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + clf.fit(iris.data, target) + + grid_resolution = 25 + fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], + label='setosa', + grid_resolution=grid_resolution) + assert len(axs) == 2 + assert all(ax.has_data for ax in axs) + + # label not in gbrt.classes_ + assert_raises(ValueError, plot_partial_dependence, + clf, iris.data, [0, 1], label='foobar', + grid_resolution=grid_resolution) + + # label not provided + assert_raises(ValueError, plot_partial_dependence, + clf, iris.data, [0, 1], + grid_resolution=grid_resolution) From 9095305f5ebaea0a51c0aff8797773e5ae6d661d Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Tue, 15 Aug 2017 22:29:24 +1000 Subject: [PATCH 010/113] refactor estimated and exact functions to _predict --- sklearn/partial_dependence.py | 173 ++++++++++++---------------------- 1 file changed, 59 insertions(+), 114 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index f4966b35f056d..cbc56fb775643 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -75,8 +75,8 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): return cartesian(axes), axes -def _exact_partial_dependence(est, target_variables, grid, X, output=None): - """Calculate the partial dependence of ``target_variables``. +def _predict(est, X_eval, method, output=None): + """Calculate part of the partial dependence of ``target_variables``. The function will be calculated by calling the ``predict_proba`` method of ``est`` for classification or ``predict`` for regression on ``X`` for every @@ -86,142 +86,68 @@ def _exact_partial_dependence(est, target_variables, grid, X, output=None): ---------- est : BaseEstimator A fitted classification or regression model. - target_variables : array-like, dtype=int - The target features for which the partial dependency should be - computed (size should be smaller than 3 for visual renderings). - grid : array-like, shape=(n_points, len(target_variables)) - The grid of ``target_variables`` values for which the - partial dependency should be evaluated (either ``grid`` or ``X`` - must be specified). - X : array-like, shape=(n_samples, n_features) - The data on which ``est`` was trained. - output : int, optional (default=None) - The output index to use for multi-output estimators. - - Returns - ------- - pdp : array, shape=(n_classes, n_points) - The partial dependence function evaluated on the ``grid``. - For regression and binary classification ``n_classes==1``. 
- """ - n_samples = X.shape[0] - pdp = [] - for row in range(grid.shape[0]): - X_eval = X.copy() - for i, variable in enumerate(target_variables): - X_eval[:, variable] = np.repeat(grid[row, i], n_samples) - if est._estimator_type == 'regressor': - try: - pdp_row = est.predict(X_eval) - except NotFittedError: - raise ValueError('Call %s.fit before partial_dependence' % - est.__class__.__name__) - if pdp_row.ndim != 1 and pdp_row.shape[1] != 1: - # Multi-output - if not 0 <= output < pdp_row.shape[1]: - raise ValueError('Valid output must be specified for ' - 'multi-output models.') - pdp_row = pdp_row[:, output] - pdp.append(np.mean(pdp_row)) - elif est._estimator_type == 'classifier': - try: - pdp_row = est.predict_proba(X_eval) - except NotFittedError: - raise ValueError('Call %s.fit before partial_dependence' % - est.__class__.__name__) - if isinstance(pdp_row, list): - # Multi-output - if not 0 <= output < len(pdp_row): - raise ValueError('Valid output must be specified for ' - 'multi-output models.') - pdp_row = pdp_row[output] - pdp_row = np.log(np.clip(pdp_row, 1e-16, 1)) - pdp_row = np.subtract(pdp_row, - np.mean(pdp_row, 1)[:, np.newaxis]) - pdp.append(np.mean(pdp_row, 0)) - else: - raise ValueError('est must be a fitted regressor or classifier ' - 'model.') - pdp = np.array(pdp).transpose() - if pdp.shape[0] == 2: - # Binary classification - pdp = pdp[1, :][np.newaxis] - elif len(pdp.shape) == 1: - # Regression - pdp = pdp[np.newaxis] - return pdp - - -def _estimated_partial_dependence(est, target_variables, grid, X, output=None): - """Calculate the partial dependence of ``target_variables``. - - The function will be calculated by calling the ``predict_proba`` method of - ``est`` for classification or ``predict`` for regression on the mean of - ``X``. + X_eval : array-like, shape=(n_samples, n_features) + The data on which the partial dependence of ``est`` should be + predicted. + method : {'exact', 'estimated'} + The method to use to calculate the partial dependence function: - Parameters - ---------- - est : BaseEstimator - A fitted classification or regression model. - target_variables : array-like, dtype=int - The target features for which the partial dependency should be - computed (size should be smaller than 3 for visual renderings). - grid : array-like, shape=(n_points, len(target_variables)) - The grid of ``target_variables`` values for which the - partial dependency should be evaluated (either ``grid`` or ``X`` - must be specified). - X : array-like, shape=(n_samples, n_features) - The data on which ``est`` was trained. + - If 'exact', the function will be calculated by calling the + ``predict_proba`` method of ``est`` for classification or ``predict`` + for regression on ``X``for every point in the grid. To speed up this + method, you can use a subset of ``X`` or a more coarse grid. + - If 'estimated', the function will be calculated by calling the + ``predict_proba`` method of ``est`` for classification or ``predict`` + for regression on the mean of ``X``. output : int, optional (default=None) The output index to use for multi-output estimators. Returns ------- - pdp : array, shape=(n_classes, n_points) + out : array, shape=(n_classes, n_points) The partial dependence function evaluated on the ``grid``. For regression and binary classification ``n_classes==1``. 
""" - n_samples = grid.shape[0] - X_eval = np.tile(X.mean(0), [n_samples, 1]) - for i, variable in enumerate(target_variables): - X_eval[:, variable] = grid[:, i] if est._estimator_type == 'regressor': try: - pdp = est.predict(X_eval) + out = est.predict(X_eval) except NotFittedError: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) - if pdp.ndim != 1 and pdp.shape[1] == 1: + if out.ndim != 1 and out.shape[1] == 1: # Column output - pdp = pdp.ravel() - if pdp.ndim != 1 and pdp.shape[1] != 1: + out = out.ravel() + if out.ndim != 1 and out.shape[1] != 1: # Multi-output - if not 0 <= output < pdp.shape[1]: + if not 0 <= output < out.shape[1]: raise ValueError('Valid output must be specified for ' 'multi-output models.') - pdp = pdp[:, output] - pdp = pdp[np.newaxis] + out = out[:, output] + if method == 'exact': + return np.mean(out) + else: + return out[np.newaxis] elif est._estimator_type == 'classifier': try: - pdp = est.predict_proba(X_eval) + out = est.predict_proba(X_eval) except NotFittedError: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) - if isinstance(pdp, list): + if isinstance(out, list): # Multi-output - if not 0 <= output < len(pdp): + if not 0 <= output < len(out): raise ValueError('Valid output must be specified for ' 'multi-output models.') - pdp = pdp[output] - pdp = np.log(np.clip(pdp, 1e-16, 1)) - pdp = np.subtract(pdp, np.mean(pdp, 1)[:, np.newaxis]) - pdp = pdp.transpose() + out = out[output] + out = np.log(np.clip(out, 1e-16, 1)) + out = np.subtract(out, np.mean(out, 1)[:, np.newaxis]) + if method == 'exact': + return np.mean(out, 0) + else: + return out.transpose() else: - raise ValueError('est must be a fitted regressor or classifier model.') - if pdp.shape[0] == 2: - # Binary classification - pdp = pdp[1, :][np.newaxis] - return pdp + raise ValueError('est must be a fitted regressor or classifier ' + 'model.') def partial_dependence(est, target_variables, grid=None, X=None, output=None, @@ -368,10 +294,29 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, if isinstance(est, ForestRegressor): pdp /= n_estimators elif method == 'exact': - pdp = _exact_partial_dependence(est, target_variables, grid, X, output) + n_samples = X.shape[0] + pdp = [] + for row in range(grid.shape[0]): + X_eval = X.copy() + for i, variable in enumerate(target_variables): + X_eval[:, variable] = np.repeat(grid[row, i], n_samples) + pdp.append(_predict(est, X_eval, method, output=None)) + pdp = np.array(pdp).transpose() + if pdp.shape[0] == 2: + # Binary classification + pdp = pdp[1, :][np.newaxis] + elif len(pdp.shape) == 1: + # Regression + pdp = pdp[np.newaxis] elif method == 'estimated': - pdp = _estimated_partial_dependence(est, target_variables, grid, X, - output) + n_samples = grid.shape[0] + X_eval = np.tile(X.mean(0), [n_samples, 1]) + for i, variable in enumerate(target_variables): + X_eval[:, variable] = grid[:, i] + pdp = _predict(est, X_eval, method, output=None) + if pdp.shape[0] == 2: + # Binary classification + pdp = pdp[1, :][np.newaxis] else: raise ValueError('method "%s" is invalid. Use "recursion", "exact", ' '"estimated", or None.' 
% method) From 3fc1727ecdb62002bbfba454fb11a09decda8479 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Tue, 15 Aug 2017 22:36:33 +1000 Subject: [PATCH 011/113] make "auto" the default rather than None for method --- sklearn/partial_dependence.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index cbc56fb775643..bc3d7a351db6c 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -152,7 +152,7 @@ def _predict(est, X_eval, method, output=None): def partial_dependence(est, target_variables, grid=None, X=None, output=None, percentiles=(0.05, 0.95), grid_resolution=100, - method=None): + method='auto'): """Partial dependence of ``target_variables``. Partial dependence plots show the dependence between the joint values @@ -184,7 +184,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, for the ``grid``. Only if ``X`` is not None. grid_resolution : int, default=100 The number of equally spaced points on the ``grid``. - method : {'recursion', 'exact', 'estimated', None}, optional (default=None) + method : {'recursion', 'exact', 'estimated', 'auto'}, default='auto' The method to use to calculate the partial dependence function: - If 'recursion', the underlying trees of ``est`` will be recursed to @@ -197,7 +197,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, - If 'estimated', the function will be calculated by calling the ``predict_proba`` method of ``est`` for classification or ``predict`` for regression on the mean of ``X``. - - If None, then 'recursion' will be used if ``est`` is + - If 'auto', then 'recursion' will be used if ``est`` is BaseGradientBoosting or ForestRegressor, and 'exact' used for other estimators. @@ -220,7 +220,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) """ - if method is None: + if method == 'auto': if isinstance(est, (BaseGradientBoosting, ForestRegressor)): method = 'recursion' else: @@ -326,7 +326,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, def plot_partial_dependence(est, X, features, feature_names=None, label=None, n_cols=3, grid_resolution=100, - method=None, percentiles=(0.05, 0.95), n_jobs=1, + method='auto', percentiles=(0.05, 0.95), n_jobs=1, verbose=0, ax=None, line_kw=None, contour_kw=None, **fig_kw): """Partial dependence plots for ``features``. @@ -364,7 +364,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, for the PDP axes. grid_resolution : int, default=100 The number of equally spaced points on the axes. - method : {'recursion', 'exact', 'estimated', None}, optional (default=None) + method : {'recursion', 'exact', 'estimated', 'auto'}, default='auto' The method to use to calculate the partial dependence function: - If 'recursion', the underlying trees of ``est`` will be recursed to @@ -377,7 +377,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, - If 'estimated', the function will be calculated by calling the ``predict_proba`` method of ``est`` for classification or ``predict`` for regression on the mean of ``X``. - - If None, then 'recursion' will be used if ``est`` is + - If 'auto', then 'recursion' will be used if ``est`` is BaseGradientBoosting or ForestRegressor, and 'exact' used for other estimators. 
n_jobs : int @@ -419,7 +419,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, from matplotlib.ticker import MaxNLocator from matplotlib.ticker import ScalarFormatter - if method is None: + if method == 'auto': if isinstance(est, (BaseGradientBoosting, ForestRegressor)): method = 'recursion' else: From 259ec9946927d331586def5b6ce4417f7d6e8fe8 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Wed, 16 Aug 2017 21:49:33 +1000 Subject: [PATCH 012/113] some more refactoring --- sklearn/partial_dependence.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index bc3d7a351db6c..a3af46443a0cb 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -114,19 +114,12 @@ def _predict(est, X_eval, method, output=None): except NotFittedError: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) - if out.ndim != 1 and out.shape[1] == 1: - # Column output - out = out.ravel() if out.ndim != 1 and out.shape[1] != 1: # Multi-output if not 0 <= output < out.shape[1]: raise ValueError('Valid output must be specified for ' 'multi-output models.') out = out[:, output] - if method == 'exact': - return np.mean(out) - else: - return out[np.newaxis] elif est._estimator_type == 'classifier': try: out = est.predict_proba(X_eval) @@ -141,13 +134,10 @@ def _predict(est, X_eval, method, output=None): out = out[output] out = np.log(np.clip(out, 1e-16, 1)) out = np.subtract(out, np.mean(out, 1)[:, np.newaxis]) - if method == 'exact': - return np.mean(out, 0) - else: - return out.transpose() else: raise ValueError('est must be a fitted regressor or classifier ' 'model.') + return out def partial_dependence(est, target_variables, grid=None, X=None, output=None, @@ -308,6 +298,10 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, elif len(pdp.shape) == 1: # Regression pdp = pdp[np.newaxis] + if est._estimator_type == 'regressor': + pdp = np.mean(pdp) + else: + pdp = np.mean(pdp, 0) elif method == 'estimated': n_samples = grid.shape[0] X_eval = np.tile(X.mean(0), [n_samples, 1]) @@ -317,6 +311,10 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, if pdp.shape[0] == 2: # Binary classification pdp = pdp[1, :][np.newaxis] + if est._estimator_type == 'regressor': + pdp = pdp[np.newaxis] + else: + pdp = pdp.transpose() else: raise ValueError('method "%s" is invalid. Use "recursion", "exact", ' '"estimated", or None.' 
% method) From cbc20af0448f219f73cbebd90f39af3b20571de4 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Mon, 21 Aug 2017 18:52:01 +1000 Subject: [PATCH 013/113] avoid namespace collision --- sklearn/partial_dependence.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index a3af46443a0cb..2dba915cb98a4 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -18,9 +18,6 @@ from .utils.validation import check_is_fitted from .tree._tree import DTYPE -from .ensemble._gradient_boosting import _partial_dependence_tree -from .ensemble.gradient_boosting import BaseGradientBoosting -from .ensemble.forest import ForestRegressor from .exceptions import NotFittedError @@ -210,6 +207,12 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) """ + # TODO: The pattern below required to avoid a namespace collision. + # TODO: Move below imports to module level import at 0.22 release. + from .ensemble._gradient_boosting import _partial_dependence_tree + from .ensemble.gradient_boosting import BaseGradientBoosting + from .ensemble.forest import ForestRegressor + if method == 'auto': if isinstance(est, (BaseGradientBoosting, ForestRegressor)): method = 'recursion' @@ -223,7 +226,8 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, if (not hasattr(est, '_estimator_type') or est._estimator_type not in ('classifier', 'regressor')): raise ValueError('est must be a fitted regressor or classifier model.') - if method != 'recursion' and est._estimator_type == 'classifier': + if (method != 'recursion' and est._estimator_type == 'classifier' and + not hasattr(est, 'predict_proba')): raise ValueError('est requires a predict_proba method for ' 'method="exact" or "estimated" for classification.') if method == 'recursion': @@ -416,6 +420,10 @@ def plot_partial_dependence(est, X, features, feature_names=None, from matplotlib import transforms from matplotlib.ticker import MaxNLocator from matplotlib.ticker import ScalarFormatter + # TODO: The pattern below required to avoid a namespace collision. + # TODO: Move below imports to module level import at 0.22 release. 
+ from .ensemble.gradient_boosting import BaseGradientBoosting + from .ensemble.forest import ForestRegressor if method == 'auto': if isinstance(est, (BaseGradientBoosting, ForestRegressor)): @@ -455,7 +463,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, label_idx = 0 X = check_array(X, dtype=DTYPE, order='C') - if est.n_features_ != X.shape[1]: + if hasattr(est, 'n_features_') and est.n_features_ != X.shape[1]: raise ValueError('X.shape[1] does not match est.n_features_') if line_kw is None: From 63da115c613b4a6fbaf787bc51832863ad1b85fb Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Tue, 22 Aug 2017 21:12:43 +1000 Subject: [PATCH 014/113] fix output shapes of all estimators --- sklearn/partial_dependence.py | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 2dba915cb98a4..2582affad5714 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -72,7 +72,7 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): return cartesian(axes), axes -def _predict(est, X_eval, method, output=None): +def _predict(est, X_eval, output=None): """Calculate part of the partial dependence of ``target_variables``. The function will be calculated by calling the ``predict_proba`` method of @@ -86,16 +86,6 @@ def _predict(est, X_eval, method, output=None): X_eval : array-like, shape=(n_samples, n_features) The data on which the partial dependence of ``est`` should be predicted. - method : {'exact', 'estimated'} - The method to use to calculate the partial dependence function: - - - If 'exact', the function will be calculated by calling the - ``predict_proba`` method of ``est`` for classification or ``predict`` - for regression on ``X``for every point in the grid. To speed up this - method, you can use a subset of ``X`` or a more coarse grid. - - If 'estimated', the function will be calculated by calling the - ``predict_proba`` method of ``est`` for classification or ``predict`` - for regression on the mean of ``X``. output : int, optional (default=None) The output index to use for multi-output estimators. 
@@ -111,6 +101,9 @@ def _predict(est, X_eval, method, output=None): except NotFittedError: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) + if out.ndim != 1 and out.shape[1] == 1: + # Column output + out = out.ravel() if out.ndim != 1 and out.shape[1] != 1: # Multi-output if not 0 <= output < out.shape[1]: @@ -294,7 +287,11 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, X_eval = X.copy() for i, variable in enumerate(target_variables): X_eval[:, variable] = np.repeat(grid[row, i], n_samples) - pdp.append(_predict(est, X_eval, method, output=None)) + pdp_row = _predict(est, X_eval, output=output) + if est._estimator_type == 'regressor': + pdp.append(np.mean(pdp_row)) + else: + pdp.append(np.mean(pdp_row, 0)) pdp = np.array(pdp).transpose() if pdp.shape[0] == 2: # Binary classification @@ -302,23 +299,17 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, elif len(pdp.shape) == 1: # Regression pdp = pdp[np.newaxis] - if est._estimator_type == 'regressor': - pdp = np.mean(pdp) - else: - pdp = np.mean(pdp, 0) elif method == 'estimated': n_samples = grid.shape[0] X_eval = np.tile(X.mean(0), [n_samples, 1]) for i, variable in enumerate(target_variables): X_eval[:, variable] = grid[:, i] - pdp = _predict(est, X_eval, method, output=None) - if pdp.shape[0] == 2: + pdp = _predict(est, X_eval, output=output) + if pdp.shape[1] == 2: # Binary classification - pdp = pdp[1, :][np.newaxis] + pdp = pdp[:, 1][np.newaxis] if est._estimator_type == 'regressor': pdp = pdp[np.newaxis] - else: - pdp = pdp.transpose() else: raise ValueError('method "%s" is invalid. Use "recursion", "exact", ' '"estimated", or None.' % method) From 8f7d2b0a9e44de42c5d019fab6302aba9a934baa Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Tue, 22 Aug 2017 22:33:42 +1000 Subject: [PATCH 015/113] add tests to ensure all estimators output same shape --- sklearn/partial_dependence.py | 3 +- sklearn/tests/test_partial_dependence.py | 57 ++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 2582affad5714..b9401cec77dd7 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -305,7 +305,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, for i, variable in enumerate(target_variables): X_eval[:, variable] = grid[:, i] pdp = _predict(est, X_eval, output=output) - if pdp.shape[1] == 2: + if est._estimator_type == 'classifier' and pdp.shape[1] == 2: # Binary classification pdp = pdp[:, 1][np.newaxis] if est._estimator_type == 'regressor': @@ -313,7 +313,6 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, else: raise ValueError('method "%s" is invalid. Use "recursion", "exact", ' '"estimated", or None.' 
% method) - return pdp, axes diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 13a12f4f809c8..ce3d8a99a05eb 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -7,24 +7,73 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import if_matplotlib +from sklearn.utils.testing import all_estimators +from sklearn.utils.testing import ignore_warnings from sklearn.partial_dependence import partial_dependence from sklearn.partial_dependence import plot_partial_dependence from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor +from sklearn.ensemble.gradient_boosting import BaseGradientBoosting +from sklearn.ensemble.forest import ForestRegressor from sklearn import datasets - # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] y = [-1, -1, -1, 1, 1, 1] T = [[-1, -1], [2, 2], [3, 2]] true_result = [-1, 1, 1] -# also load the boston dataset +# Load the boston, iris & breast cancer datasets boston = datasets.load_boston() - -# also load the iris dataset iris = datasets.load_iris() +breast_cancer = datasets.load_breast_cancer() + + +@ignore_warnings() +def test_output_shape_classifier(): + # Test that partial_dependence has same output shape for all classifiers + for name, Estimator in all_estimators(): + clf = Estimator() + if (not hasattr(clf, '_estimator_type') or + 'MultiTask' in name or + clf._estimator_type != 'classifier' or + not hasattr(clf, 'predict_proba')): + continue + clf.fit(breast_cancer.data, breast_cancer.target) + for method in ['recursion', 'exact', 'estimated']: + if (method == 'recursion' and not + (isinstance(clf, BaseGradientBoosting) or + isinstance(clf, ForestRegressor))): + continue + pdp, axes = partial_dependence(clf, + target_variables=[1], + X=breast_cancer.data, + method=method, + grid_resolution=20) + assert(pdp.shape == (1, 20)) + + +@ignore_warnings() +def test_output_shape_regressor(): + # Test that partial_dependence has same output shape for all regressors + for name, Estimator in all_estimators(): + clf = Estimator() + if (not hasattr(clf, '_estimator_type') or + 'MultiTask' in name or + clf._estimator_type != 'regressor'): + continue + clf.fit(boston.data, boston.target) + for method in ['recursion', 'exact', 'estimated']: + if (method == 'recursion' and not + (isinstance(clf, BaseGradientBoosting) or + isinstance(clf, ForestRegressor))): + continue + pdp, axes = partial_dependence(clf, + target_variables=[1], + X=boston.data, + method=method, + grid_resolution=20) + assert(pdp.shape == (1, 20)) def test_partial_dependence_classifier(): From 6fc3a497e861f72695d69dd1ee5d677445888161 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Wed, 30 Aug 2017 20:25:18 +1000 Subject: [PATCH 016/113] quick fixes --- sklearn/partial_dependence.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index b9401cec77dd7..7e6635e4120e1 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -428,7 +428,8 @@ def plot_partial_dependence(est, X, features, feature_names=None, if (not hasattr(est, '_estimator_type') or est._estimator_type not in ('classifier', 'regressor')): raise ValueError('est must be a fitted regressor or classifier model.') - if method != 'recursion' and est._estimator_type == 'classifier': + if (method != 'recursion' and est._estimator_type == 
'classifier' and + not hasattr(est, 'predict_proba')): raise ValueError('est requires a predict_proba method for ' 'method="exact" or "estimated" for classification.') if method == 'recursion': @@ -464,7 +465,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, # convert feature_names to list if feature_names is None: # if not feature_names use fx indices as name - feature_names = [str(i) for i in range(est.n_features_)] + feature_names = [str(i) for i in range(n_features)] elif isinstance(feature_names, np.ndarray): feature_names = feature_names.tolist() From b1f8bfcddd49e7eb7ba71e30b7d4bb8b24ea9056 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Fri, 1 Sep 2017 20:31:12 +1000 Subject: [PATCH 017/113] fix docstring, test fails --- sklearn/partial_dependence.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 7e6635e4120e1..91d7ac234bc7c 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -176,7 +176,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, method, you can use a subset of ``X`` or a more coarse grid. - If 'estimated', the function will be calculated by calling the ``predict_proba`` method of ``est`` for classification or ``predict`` - for regression on the mean of ``X``. + for regression on the median of ``X``. - If 'auto', then 'recursion' will be used if ``est`` is BaseGradientBoosting or ForestRegressor, and 'exact' used for other estimators. @@ -301,7 +301,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, pdp = pdp[np.newaxis] elif method == 'estimated': n_samples = grid.shape[0] - X_eval = np.tile(X.mean(0), [n_samples, 1]) + X_eval = np.tile(np.median(X, 0), [n_samples, 1]) for i, variable in enumerate(target_variables): X_eval[:, variable] = grid[:, i] pdp = _predict(est, X_eval, output=output) @@ -318,7 +318,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, def plot_partial_dependence(est, X, features, feature_names=None, label=None, n_cols=3, grid_resolution=100, - method='auto', percentiles=(0.05, 0.95), n_jobs=1, + percentiles=(0.05, 0.95), method='auto', n_jobs=1, verbose=0, ax=None, line_kw=None, contour_kw=None, **fig_kw): """Partial dependence plots for ``features``. @@ -351,11 +351,11 @@ def plot_partial_dependence(est, X, features, feature_names=None, Only if est is a multi-class model. Must be in ``est.classes_``. n_cols : int The number of columns in the grid plot (default: 3). + grid_resolution : int, default=100 + The number of equally spaced points on the axes. percentiles : (low, high), default=(0.05, 0.95) The lower and upper percentile used to create the extreme values for the PDP axes. - grid_resolution : int, default=100 - The number of equally spaced points on the axes. method : {'recursion', 'exact', 'estimated', 'auto'}, default='auto' The method to use to calculate the partial dependence function: @@ -368,7 +368,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, method, you can use a subset of ``X`` or a more coarse grid. - If 'estimated', the function will be calculated by calling the ``predict_proba`` method of ``est`` for classification or ``predict`` - for regression on the mean of ``X``. + for regression on the median of ``X``. - If 'auto', then 'recursion' will be used if ``est`` is BaseGradientBoosting or ForestRegressor, and 'exact' used for other estimators. 
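
For readers following the series, a minimal sketch of how the refactored function behaves at this point (a hypothetical usage example, not part of the patches: it assumes a build of this branch where ``sklearn.partial_dependence`` is importable; ``load_boston``, ``GradientBoostingRegressor`` and ``LinearRegression`` are the standard scikit-learn objects of this era):

    from sklearn.datasets import load_boston
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.partial_dependence import partial_dependence

    boston = load_boston()

    # method='auto' resolves to 'recursion' for gradient boosting and
    # ForestRegressor: the fitted trees are traversed directly, and X is
    # only used to build the grid of target-feature values.
    gbr = GradientBoostingRegressor(n_estimators=10, random_state=0)
    gbr.fit(boston.data, boston.target)
    pdp, axes = partial_dependence(gbr, [0], X=boston.data,
                                   grid_resolution=20)

    # Any other estimator falls back to 'exact': predict() is averaged
    # over all rows of X for every grid point.  'estimated' would instead
    # predict on the median row of X, trading accuracy for speed.
    lr = LinearRegression().fit(boston.data, boston.target)
    pdp_lr, _ = partial_dependence(lr, [0], X=boston.data,
                                   method='exact', grid_resolution=20)

    # After PATCH 014 both paths return the same shape for single-output
    # regression: one row per class (here 1) by n_points.
    assert pdp.shape == pdp_lr.shape == (1, 20)

This mirrors the shape contract the output-shape tests above enforce across estimators.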
From dc93b694aeb41f220203c552dfd5dbd07f049937 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Fri, 1 Sep 2017 21:27:05 +1000 Subject: [PATCH 018/113] refactor tests for easier debugging --- sklearn/partial_dependence.py | 2 +- sklearn/tests/test_partial_dependence.py | 97 ++++++++++++++++-------- 2 files changed, 67 insertions(+), 32 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 91d7ac234bc7c..7a51c35793bfc 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -385,7 +385,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, contour_kw : dict Dict with keywords passed to the ``matplotlib.pyplot.plot`` call. For two-way partial dependence plots. - fig_kw : dict + **fig_kw : dict Dict with keywords passed to the figure() call. Note that all keywords not recognized above will be automatically included here. diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index ce3d8a99a05eb..4d7855cae64aa 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -29,51 +29,86 @@ breast_cancer = datasets.load_breast_cancer() +def test_output_shape_recursion(): + # Test recursion partial_dependence has same output shape for everything + for name, Estimator in all_estimators(): + est = Estimator() + if not (isinstance(est, BaseGradientBoosting) or + isinstance(est, ForestRegressor)): + continue + if est._estimator_type == 'classifier': + est.fit(breast_cancer.data, breast_cancer.target) + pdp, axes = partial_dependence(est, + target_variables=[1], + X=breast_cancer.data, + method='recursion', + grid_resolution=10) + elif est._estimator_type == 'regressor': + est.fit(boston.data, boston.target) + pdp, axes = partial_dependence(est, + target_variables=[1], + X=boston.data, + method='recursion', + grid_resolution=10) + else: + continue + assert(pdp.shape == (1, 10)) + + @ignore_warnings() -def test_output_shape_classifier(): - # Test that partial_dependence has same output shape for all classifiers +def test_output_shape_exact(): + # Test exact partial_dependence has same output shape for everything for name, Estimator in all_estimators(): - clf = Estimator() - if (not hasattr(clf, '_estimator_type') or - 'MultiTask' in name or - clf._estimator_type != 'classifier' or - not hasattr(clf, 'predict_proba')): + est = Estimator() + if not hasattr(est, '_estimator_type') or 'MultiTask' in name: continue - clf.fit(breast_cancer.data, breast_cancer.target) - for method in ['recursion', 'exact', 'estimated']: - if (method == 'recursion' and not - (isinstance(clf, BaseGradientBoosting) or - isinstance(clf, ForestRegressor))): + if est._estimator_type == 'classifier': + if not hasattr(est, 'predict_proba'): continue - pdp, axes = partial_dependence(clf, + est.fit(breast_cancer.data, breast_cancer.target) + pdp, axes = partial_dependence(est, target_variables=[1], X=breast_cancer.data, - method=method, - grid_resolution=20) - assert(pdp.shape == (1, 20)) + method='exact', + grid_resolution=10) + elif est._estimator_type == 'regressor': + est.fit(boston.data, boston.target) + pdp, axes = partial_dependence(est, + target_variables=[1], + X=boston.data, + method='exact', + grid_resolution=10) + else: + continue + assert(pdp.shape == (1, 10)) @ignore_warnings() -def test_output_shape_regressor(): - # Test that partial_dependence has same output shape for all regressors +def test_output_shape_estimated(): + # Test exact partial_dependence has 
same output shape for everything for name, Estimator in all_estimators(): - clf = Estimator() - if (not hasattr(clf, '_estimator_type') or - 'MultiTask' in name or - clf._estimator_type != 'regressor'): + est = Estimator() + if not hasattr(est, '_estimator_type') or 'MultiTask' in name: continue - clf.fit(boston.data, boston.target) - for method in ['recursion', 'exact', 'estimated']: - if (method == 'recursion' and not - (isinstance(clf, BaseGradientBoosting) or - isinstance(clf, ForestRegressor))): + if est._estimator_type == 'classifier': + if not hasattr(est, 'predict_proba'): continue - pdp, axes = partial_dependence(clf, + est.fit(breast_cancer.data, breast_cancer.target) + pdp, axes = partial_dependence(est, + target_variables=[1], + X=breast_cancer.data, + method='estimated', + grid_resolution=10) + elif est._estimator_type == 'regressor': + est.fit(boston.data, boston.target) + pdp, axes = partial_dependence(est, target_variables=[1], X=boston.data, - method=method, - grid_resolution=20) - assert(pdp.shape == (1, 20)) + method='estimated', + grid_resolution=10) + else: + continue + assert(pdp.shape == (1, 10)) def test_partial_dependence_classifier(): From cd8f8de65b3c0d17de87abf9f228a126f4039ab9 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sat, 2 Sep 2017 11:48:54 +1000 Subject: [PATCH 019/113] speed up tests, add two-way plot test --- sklearn/tests/test_partial_dependence.py | 96 +++++++++++++++++------- 1 file changed, 68 insertions(+), 28 deletions(-) diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 4d7855cae64aa..46d12132a27ad 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -15,7 +15,8 @@ from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble.gradient_boosting import BaseGradientBoosting from sklearn.ensemble.forest import ForestRegressor -from sklearn import datasets +from sklearn.datasets import load_boston, load_iris +from sklearn.datasets import make_classification, make_regression # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] @@ -23,36 +24,53 @@ T = [[-1, -1], [2, 2], [3, 2]] true_result = [-1, 1, 1] -# Load the boston, iris & breast cancer datasets -boston = datasets.load_boston() -iris = datasets.load_iris() -breast_cancer = datasets.load_breast_cancer() +# Make some sample data to test output shapes +X_c, y_c = make_classification(n_features=10, n_informative=5, random_state=0) +# Non-negative for MultinomialNB +X_c = X_c + np.abs(X_c.min()) +X_r, y_r = make_regression(n_features=10, n_informative=5, random_state=0) +# Load the boston & iris datasets +boston = load_boston() +iris = load_iris() + +@ignore_warnings() def test_output_shape_recursion(): # Test recursion partial_dependence has same output shape for everything for name, Estimator in all_estimators(): est = Estimator() if not (isinstance(est, BaseGradientBoosting) or - isinstance(est, ForestRegressor)): + isinstance(est, ForestRegressor)): continue if est._estimator_type == 'classifier': - est.fit(breast_cancer.data, breast_cancer.target) + est.fit(X_c, y_c) pdp, axes = partial_dependence(est, target_variables=[1], - X=breast_cancer.data, + X=X_c, + method='recursion', + grid_resolution=10) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_c, method='recursion', grid_resolution=10) + assert (pdp.shape == (1, 100)) elif est._estimator_type == 'regressor': - est.fit(boston.data, boston.target) + 
est.fit(X_r, y_r) pdp, axes = partial_dependence(est, target_variables=[1], - X=boston.data, + X=X_r, method='recursion', grid_resolution=10) - else: - continue - assert(pdp.shape == (1, 10)) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_r, + method='recursion', + grid_resolution=10) + assert (pdp.shape == (1, 100)) @ignore_warnings() @@ -65,27 +83,38 @@ def test_output_shape_exact(): if est._estimator_type == 'classifier': if not hasattr(est, 'predict_proba'): continue - est.fit(breast_cancer.data, breast_cancer.target) + est.fit(X_c, y_c) pdp, axes = partial_dependence(est, target_variables=[1], - X=breast_cancer.data, + X=X_c, + method='exact', + grid_resolution=10) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_c, method='exact', grid_resolution=10) + assert (pdp.shape == (1, 100)) elif est._estimator_type == 'regressor': - est.fit(boston.data, boston.target) + est.fit(X_r, y_r) pdp, axes = partial_dependence(est, target_variables=[1], - X=boston.data, + X=X_r, method='exact', grid_resolution=10) - else: - continue - assert(pdp.shape == (1, 10)) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_r, + method='exact', + grid_resolution=10) + assert (pdp.shape == (1, 100)) @ignore_warnings() def test_output_shape_estimated(): - # Test exact partial_dependence has same output shape for everything + # Test estimated partial_dependence has same output shape for everything for name, Estimator in all_estimators(): est = Estimator() if not hasattr(est, '_estimator_type') or 'MultiTask' in name: @@ -93,22 +122,33 @@ def test_output_shape_estimated(): if est._estimator_type == 'classifier': if not hasattr(est, 'predict_proba'): continue - est.fit(breast_cancer.data, breast_cancer.target) + est.fit(X_c, y_c) pdp, axes = partial_dependence(est, target_variables=[1], - X=breast_cancer.data, + X=X_c, + method='estimated', + grid_resolution=10) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_r, method='estimated', grid_resolution=10) + assert (pdp.shape == (1, 100)) elif est._estimator_type == 'regressor': - est.fit(boston.data, boston.target) + est.fit(X_r, y_r) pdp, axes = partial_dependence(est, target_variables=[1], - X=boston.data, + X=X_r, method='estimated', grid_resolution=10) - else: - continue - assert(pdp.shape == (1, 10)) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_r, + method='estimated', + grid_resolution=10) + assert (pdp.shape == (1, 100)) def test_partial_dependence_classifier(): From 4eb1a8081a10438b464c8a1e259100b07faa4038 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sat, 2 Sep 2017 12:37:24 +1000 Subject: [PATCH 020/113] move input validation on X --- sklearn/partial_dependence.py | 6 ++++-- sklearn/tests/test_partial_dependence.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 7a51c35793bfc..78bbb617af15d 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -223,6 +223,8 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, not hasattr(est, 'predict_proba')): raise ValueError('est requires a predict_proba method for ' 'method="exact" or "estimated" for classification.') + if X is not None: + X = check_array(X, dtype=DTYPE, order='C') if method == 
'recursion': check_is_fitted(est, 'estimators_', msg='Call %s.fit before ' 'partial_dependence' % @@ -243,7 +245,6 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, % (n_features - 1)) if X is not None: - X = check_array(X, dtype=DTYPE, order='C') grid, axes = _grid_from_X(X[:, target_variables], percentiles, grid_resolution) else: @@ -432,6 +433,8 @@ def plot_partial_dependence(est, X, features, feature_names=None, not hasattr(est, 'predict_proba')): raise ValueError('est requires a predict_proba method for ' 'method="exact" or "estimated" for classification.') + if X is not None: + X = check_array(X, dtype=DTYPE, order='C') if method == 'recursion': check_is_fitted(est, 'estimators_', msg='Call %s.fit before ' 'partial_dependence' % @@ -453,7 +456,6 @@ def plot_partial_dependence(est, X, features, feature_names=None, # regression and binary classification label_idx = 0 - X = check_array(X, dtype=DTYPE, order='C') if hasattr(est, 'n_features_') and est.n_features_ != X.shape[1]: raise ValueError('X.shape[1] does not match est.n_features_') diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 46d12132a27ad..4c50efa67977e 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -41,7 +41,7 @@ def test_output_shape_recursion(): for name, Estimator in all_estimators(): est = Estimator() if not (isinstance(est, BaseGradientBoosting) or - isinstance(est, ForestRegressor)): + isinstance(est, ForestRegressor)): continue if est._estimator_type == 'classifier': est.fit(X_c, y_c) From 21544ce03cd709ff4294a4147a9f245d00857a84 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sat, 28 Oct 2017 13:56:52 +1100 Subject: [PATCH 021/113] fix output shape for multi-label classification --- sklearn/partial_dependence.py | 7 +++++-- sklearn/tests/test_partial_dependence.py | 2 -- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 78bbb617af15d..7e843dd4469eb 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -1,4 +1,4 @@ -"""Partial dependence plots for regression and classification models. 
""" +"""Partial dependence plots for regression and classification models.""" # Authors: Peter Prettenhofer # Trevor Stephens @@ -297,7 +297,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, if pdp.shape[0] == 2: # Binary classification pdp = pdp[1, :][np.newaxis] - elif len(pdp.shape) == 1: + elif pdp.ndim == 1: # Regression pdp = pdp[np.newaxis] elif method == 'estimated': @@ -309,6 +309,9 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, if est._estimator_type == 'classifier' and pdp.shape[1] == 2: # Binary classification pdp = pdp[:, 1][np.newaxis] + elif est._estimator_type == 'classifier' and pdp.shape[1] > 2: + # Multi-label classification + pdp = pdp.T if est._estimator_type == 'regressor': pdp = pdp[np.newaxis] else: diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 4c50efa67977e..aaa2e539cf594 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -21,8 +21,6 @@ # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] y = [-1, -1, -1, 1, 1, 1] -T = [[-1, -1], [2, 2], [3, 2]] -true_result = [-1, 1, 1] # Make some sample data to test output shapes X_c, y_c = make_classification(n_features=10, n_informative=5, random_state=0) From 610b5c572970fd5ecffad783036d9ff299196828 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sun, 29 Oct 2017 08:47:10 +1100 Subject: [PATCH 022/113] update plot helper to support multi-output --- sklearn/partial_dependence.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 7e843dd4469eb..932c7d39d4a9e 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -321,7 +321,8 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, def plot_partial_dependence(est, X, features, feature_names=None, - label=None, n_cols=3, grid_resolution=100, + label=None, output=None, + n_cols=3, grid_resolution=100, percentiles=(0.05, 0.95), method='auto', n_jobs=1, verbose=0, ax=None, line_kw=None, contour_kw=None, **fig_kw): @@ -353,6 +354,8 @@ def plot_partial_dependence(est, X, features, feature_names=None, label : object The class label for which the PDPs should be computed. Only if est is a multi-class model. Must be in ``est.classes_``. + output : int, optional (default=None) + The output index to use for multi-output estimators. n_cols : int The number of columns in the grid plot (default: 3). 
     grid_resolution : int, default=100
@@ -448,13 +451,27 @@ def plot_partial_dependence(est, X, features, feature_names=None,
     else:
         n_features = X.shape[1]
 
-    # set label_idx for multi-class GBRT
+    # set label_idx for multi-class estimators
     if hasattr(est, 'classes_') and np.size(est.classes_) > 2:
         if label is None:
             raise ValueError('label is not given for multi-class PDP')
-        label_idx = np.searchsorted(est.classes_, label)
-        if est.classes_[label_idx] != label:
-            raise ValueError('label %s not in ``gbrt.classes_``' % str(label))
+        if type(est.classes_) == list:
+            # multi-output classification
+            if output is None:
+                raise ValueError('output is required for multi-output '
+                                 'estimators')
+            if output >= len(est.classes_):
+                raise ValueError('output %d exceeds number of outputs in est, '
+                                 '%d' % (output, len(est.classes_)))
+            label_idx = np.searchsorted(est.classes_[output], label)
+            if est.classes_[output][label_idx] != label:
+                raise ValueError('label %s not in ``est.classes_``' %
+                                 str(label))
+        else:
+            label_idx = np.searchsorted(est.classes_, label)
+            if est.classes_[label_idx] != label:
+                raise ValueError('label %s not in ``est.classes_``' %
+                                 str(label))
     else:
         # regression and binary classification
         label_idx = 0
From dcbd0c6c9c86f754c3689e83a137c2252bc86646 Mon Sep 17 00:00:00 2001
From: trevorstephens
Date: Sun, 29 Oct 2017 09:45:45 +1100
Subject: [PATCH 023/113] update plot helper to pass-through output

---
 sklearn/partial_dependence.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py
index 932c7d39d4a9e..436bc6f2b1134 100644
--- a/sklearn/partial_dependence.py
+++ b/sklearn/partial_dependence.py
@@ -531,7 +531,8 @@ def convert_feature(fx):
 
     # compute PD functions
     pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)(
-        delayed(partial_dependence)(est, fxs, X=X, method=method,
+        delayed(partial_dependence)(est, fxs, X=X, output=output,
+                                    method=method,
                                     grid_resolution=grid_resolution,
                                     percentiles=percentiles)
         for fxs in features)
From 3f5c7f7af947dd2188caeaba472c12adbee57606 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Fri, 9 Nov 2018 15:05:00 -0500
Subject: [PATCH 024/113] removed estimated method, small refactoring

---
 sklearn/partial_dependence.py            | 211 ++++++++++-------------
 sklearn/tests/test_partial_dependence.py |  39 -----
 2 files changed, 89 insertions(+), 161 deletions(-)

diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py
index 436bc6f2b1134..8c5c68063f378 100644
--- a/sklearn/partial_dependence.py
+++ b/sklearn/partial_dependence.py
@@ -124,12 +124,61 @@ def _predict(est, X_eval, output=None):
         out = out[output]
         out = np.log(np.clip(out, 1e-16, 1))
         out = np.subtract(out, np.mean(out, 1)[:, np.newaxis])
-    else:
-        raise ValueError('est must be a fitted regressor or classifier '
-                         'model.')
     return out
 
 
+def _recursion(est, grid, target_variables, output=None, X=None):
+    # TODO: The pattern below required to avoid a namespace collision.
+    # TODO: Move below imports to module level import at 0.22 release.
+    from .ensemble._gradient_boosting import _partial_dependence_tree
+    from .ensemble.gradient_boosting import BaseGradientBoosting
+    from .ensemble.forest import ForestRegressor
+    if isinstance(est, BaseGradientBoosting):
+        n_trees_per_stage = est.estimators_.shape[1]
+        n_estimators = est.estimators_.shape[0]
+        learning_rate = est.learning_rate
+    else:
+        n_trees_per_stage = 1
+        n_estimators = len(est.estimators_)
+        learning_rate = 1.
+ pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64, + order='C') + for stage in range(n_estimators): + for k in range(n_trees_per_stage): + if isinstance(est, BaseGradientBoosting): + tree = est.estimators_[stage, k].tree_ + else: + tree = est.estimators_[stage].tree_ + _partial_dependence_tree(tree, grid, target_variables, + learning_rate, pdp[k]) + if isinstance(est, ForestRegressor): + pdp /= n_estimators + + return pdp + + +def _exact(est, grid, target_variables, output, X): + n_samples = X.shape[0] + pdp = [] + for row in range(grid.shape[0]): + X_eval = X.copy() + for i, variable in enumerate(target_variables): + X_eval[:, variable] = np.repeat(grid[row, i], n_samples) + pdp_row = _predict(est, X_eval, output=output) + if est._estimator_type == 'regressor': + pdp.append(np.mean(pdp_row)) + else: + pdp.append(np.mean(pdp_row, 0)) + pdp = np.array(pdp).transpose() + if pdp.shape[0] == 2: + # Binary classification + pdp = pdp[1, :][np.newaxis] + elif pdp.ndim == 1: + # Regression + pdp = pdp[np.newaxis] + + return pdp + def partial_dependence(est, target_variables, grid=None, X=None, output=None, percentiles=(0.05, 0.95), grid_resolution=100, method='auto'): @@ -164,7 +213,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, for the ``grid``. Only if ``X`` is not None. grid_resolution : int, default=100 The number of equally spaced points on the ``grid``. - method : {'recursion', 'exact', 'estimated', 'auto'}, default='auto' + method : {'recursion', 'exact', 'auto'}, default='auto' The method to use to calculate the partial dependence function: - If 'recursion', the underlying trees of ``est`` will be recursed to @@ -174,9 +223,6 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, ``predict_proba`` method of ``est`` for classification or ``predict`` for regression on ``X``for every point in the grid. To speed up this method, you can use a subset of ``X`` or a more coarse grid. - - If 'estimated', the function will be calculated by calling the - ``predict_proba`` method of ``est`` for classification or ``predict`` - for regression on the median of ``X``. - If 'auto', then 'recursion' will be used if ``est`` is BaseGradientBoosting or ForestRegressor, and 'exact' used for other estimators. @@ -200,51 +246,57 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) """ - # TODO: The pattern below required to avoid a namespace collision. - # TODO: Move below imports to module level import at 0.22 release. - from .ensemble._gradient_boosting import _partial_dependence_tree + from .ensemble.gradient_boosting import BaseGradientBoosting from .ensemble.forest import ForestRegressor + if (not hasattr(est, '_estimator_type') or + est._estimator_type not in ('classifier', 'regressor')): + raise ValueError('est must be a fitted regressor or classifier model.') + if method == 'auto': if isinstance(est, (BaseGradientBoosting, ForestRegressor)): method = 'recursion' else: method = 'exact' - if (not isinstance(est, (BaseGradientBoosting, ForestRegressor)) and - method == 'recursion'): - raise ValueError('est has to be an instance of BaseGradientBoosting or' - ' ForestRegressor for the "recursion" method. 
Try ' - 'using method="exact" or "estimated".') - if (not hasattr(est, '_estimator_type') or - est._estimator_type not in ('classifier', 'regressor')): - raise ValueError('est must be a fitted regressor or classifier model.') - if (method != 'recursion' and est._estimator_type == 'classifier' and - not hasattr(est, 'predict_proba')): - raise ValueError('est requires a predict_proba method for ' - 'method="exact" or "estimated" for classification.') - if X is not None: - X = check_array(X, dtype=DTYPE, order='C') + method_to_function = { + 'exact': _exact, + 'recursion': _recursion + } + if method not in method_to_function: + raise ValueError('method {} is invalid. Accepted method names are ' + '{}'.format(method, + ', '.join(method_to_function.keys()))) + if method == 'recursion': - check_is_fitted(est, 'estimators_', msg='Call %s.fit before ' - 'partial_dependence' % - est.__class__.__name__) + if not isinstance(est, (BaseGradientBoosting, ForestRegressor)): + raise ValueError( + 'est has to be an instance of BaseGradientBoosting or ' + 'ForestRegressor for the "recursion" method. Try ' + 'using method="exact".') + check_is_fitted(est, 'estimators_', + msg='Call fit() before partial_dependence()') n_features = est.n_features_ elif X is None: - raise ValueError('X is required for method="exact" or "estimated".') + raise ValueError('X is required for exact method') else: + if (est._estimator_type == 'classifier' and + not hasattr(est, 'predict_proba')): + raise ValueError('est requires a predict_proba() method for ' + 'method="exact" for classification.') n_features = X.shape[1] - if (grid is None and X is None) or (grid is not None and X is not None): - raise ValueError('Either grid or X must be specified') target_variables = np.asarray(target_variables, dtype=np.int32, order='C').ravel() - - if any([not (0 <= fx < n_features) for fx in target_variables]): + if any(not (0 <= fx < n_features) for fx in target_variables): raise ValueError('target_variables must be in [0, %d]' % (n_features - 1)) + if (grid is None and X is None) or (grid is not None and X is not None): + raise ValueError('Either grid or X must be specified.') + if X is not None: + X = check_array(X, dtype=DTYPE, order='C') grid, axes = _grid_from_X(X[:, target_variables], percentiles, grid_resolution) else: @@ -260,63 +312,8 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, grid = np.asarray(grid, dtype=DTYPE, order='C') assert grid.shape[1] == target_variables.shape[0] - if method == 'recursion': - if isinstance(est, BaseGradientBoosting): - n_trees_per_stage = est.estimators_.shape[1] - n_estimators = est.estimators_.shape[0] - learning_rate = est.learning_rate - else: - n_trees_per_stage = 1 - n_estimators = len(est.estimators_) - learning_rate = 1. 
- pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64, - order='C') - for stage in range(n_estimators): - for k in range(n_trees_per_stage): - if isinstance(est, BaseGradientBoosting): - tree = est.estimators_[stage, k].tree_ - else: - tree = est.estimators_[stage].tree_ - _partial_dependence_tree(tree, grid, target_variables, - learning_rate, pdp[k]) - if isinstance(est, ForestRegressor): - pdp /= n_estimators - elif method == 'exact': - n_samples = X.shape[0] - pdp = [] - for row in range(grid.shape[0]): - X_eval = X.copy() - for i, variable in enumerate(target_variables): - X_eval[:, variable] = np.repeat(grid[row, i], n_samples) - pdp_row = _predict(est, X_eval, output=output) - if est._estimator_type == 'regressor': - pdp.append(np.mean(pdp_row)) - else: - pdp.append(np.mean(pdp_row, 0)) - pdp = np.array(pdp).transpose() - if pdp.shape[0] == 2: - # Binary classification - pdp = pdp[1, :][np.newaxis] - elif pdp.ndim == 1: - # Regression - pdp = pdp[np.newaxis] - elif method == 'estimated': - n_samples = grid.shape[0] - X_eval = np.tile(np.median(X, 0), [n_samples, 1]) - for i, variable in enumerate(target_variables): - X_eval[:, variable] = grid[:, i] - pdp = _predict(est, X_eval, output=output) - if est._estimator_type == 'classifier' and pdp.shape[1] == 2: - # Binary classification - pdp = pdp[:, 1][np.newaxis] - elif est._estimator_type == 'classifier' and pdp.shape[1] > 2: - # Multi-label classification - pdp = pdp.T - if est._estimator_type == 'regressor': - pdp = pdp[np.newaxis] - else: - raise ValueError('method "%s" is invalid. Use "recursion", "exact", ' - '"estimated", or None.' % method) + pdp = method_to_function[method](est, grid, target_variables, output, X) + return pdp, axes @@ -363,7 +360,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, percentiles : (low, high), default=(0.05, 0.95) The lower and upper percentile used to create the extreme values for the PDP axes. - method : {'recursion', 'exact', 'estimated', 'auto'}, default='auto' + method : {'recursion', 'exact', 'auto'}, default='auto' The method to use to calculate the partial dependence function: - If 'recursion', the underlying trees of ``est`` will be recursed to @@ -373,9 +370,6 @@ def plot_partial_dependence(est, X, features, feature_names=None, ``predict_proba`` method of ``est`` for classification or ``predict`` for regression on ``X``for every point in the grid. To speed up this method, you can use a subset of ``X`` or a more coarse grid. - - If 'estimated', the function will be calculated by calling the - ``predict_proba`` method of ``est`` for classification or ``predict`` - for regression on the median of ``X``. - If 'auto', then 'recursion' will be used if ``est`` is BaseGradientBoosting or ForestRegressor, and 'exact' used for other estimators. @@ -422,35 +416,6 @@ def plot_partial_dependence(est, X, features, feature_names=None, from .ensemble.gradient_boosting import BaseGradientBoosting from .ensemble.forest import ForestRegressor - if method == 'auto': - if isinstance(est, (BaseGradientBoosting, ForestRegressor)): - method = 'recursion' - else: - method = 'exact' - if (not isinstance(est, (BaseGradientBoosting, ForestRegressor)) and - method == 'recursion'): - raise ValueError('est has to be an instance of BaseGradientBoosting or' - ' ForestRegressor for the "recursion" method. 
Try ' - 'using method="exact" or "estimated".') - if (not hasattr(est, '_estimator_type') or - est._estimator_type not in ('classifier', 'regressor')): - raise ValueError('est must be a fitted regressor or classifier model.') - if (method != 'recursion' and est._estimator_type == 'classifier' and - not hasattr(est, 'predict_proba')): - raise ValueError('est requires a predict_proba method for ' - 'method="exact" or "estimated" for classification.') - if X is not None: - X = check_array(X, dtype=DTYPE, order='C') - if method == 'recursion': - check_is_fitted(est, 'estimators_', msg='Call %s.fit before ' - 'partial_dependence' % - est.__class__.__name__) - n_features = est.n_features_ - elif X is None: - raise ValueError('X is required for method="exact" or "estimated".') - else: - n_features = X.shape[1] - # set label_idx for multi-class estimators if hasattr(est, 'classes_') and np.size(est.classes_) > 2: if label is None: @@ -476,8 +441,10 @@ def plot_partial_dependence(est, X, features, feature_names=None, # regression and binary classification label_idx = 0 + X = check_array(X, dtype=DTYPE, order='C') if hasattr(est, 'n_features_') and est.n_features_ != X.shape[1]: raise ValueError('X.shape[1] does not match est.n_features_') + n_features = X.shape[1] if line_kw is None: line_kw = {'color': 'green'} diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index aaa2e539cf594..39e84eb02f5ca 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -110,45 +110,6 @@ def test_output_shape_exact(): assert (pdp.shape == (1, 100)) -@ignore_warnings() -def test_output_shape_estimated(): - # Test estimated partial_dependence has same output shape for everything - for name, Estimator in all_estimators(): - est = Estimator() - if not hasattr(est, '_estimator_type') or 'MultiTask' in name: - continue - if est._estimator_type == 'classifier': - if not hasattr(est, 'predict_proba'): - continue - est.fit(X_c, y_c) - pdp, axes = partial_dependence(est, - target_variables=[1], - X=X_c, - method='estimated', - grid_resolution=10) - assert (pdp.shape == (1, 10)) - pdp, axes = partial_dependence(est, - target_variables=[1, 2], - X=X_r, - method='estimated', - grid_resolution=10) - assert (pdp.shape == (1, 100)) - elif est._estimator_type == 'regressor': - est.fit(X_r, y_r) - pdp, axes = partial_dependence(est, - target_variables=[1], - X=X_r, - method='estimated', - grid_resolution=10) - assert (pdp.shape == (1, 10)) - pdp, axes = partial_dependence(est, - target_variables=[1, 2], - X=X_r, - method='estimated', - grid_resolution=10) - assert (pdp.shape == (1, 100)) - - def test_partial_dependence_classifier(): # Test partial dependence for classifier clf = GradientBoostingClassifier(n_estimators=10, random_state=1) From 45e648d34ca30a4a66a3914f049d4166fe4dcbd9 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 9 Nov 2018 19:40:06 -0500 Subject: [PATCH 025/113] factorized some test --- sklearn/tests/test_partial_dependence.py | 171 +++++++++-------------- 1 file changed, 64 insertions(+), 107 deletions(-) diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 39e84eb02f5ca..6b2e5109beefa 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -4,6 +4,7 @@ import numpy as np from numpy.testing import assert_array_equal +import pytest from sklearn.utils.testing import assert_raises from sklearn.utils.testing import 
if_matplotlib @@ -33,81 +34,37 @@ iris = load_iris() -@ignore_warnings() -def test_output_shape_recursion(): - # Test recursion partial_dependence has same output shape for everything +@pytest.mark.parametrize('method', ('recursion', 'exact')) +@pytest.mark.parametrize('target_variables, expected_shape', + [([1], (1, 10)), + ([1, 2], (1, 100))]) +def test_output_shape(method, target_variables, expected_shape): + # Check that partial_dependence has consistent output shape for name, Estimator in all_estimators(): est = Estimator() - if not (isinstance(est, BaseGradientBoosting) or - isinstance(est, ForestRegressor)): - continue - if est._estimator_type == 'classifier': - est.fit(X_c, y_c) - pdp, axes = partial_dependence(est, - target_variables=[1], - X=X_c, - method='recursion', - grid_resolution=10) - assert (pdp.shape == (1, 10)) - pdp, axes = partial_dependence(est, - target_variables=[1, 2], - X=X_c, - method='recursion', - grid_resolution=10) - assert (pdp.shape == (1, 100)) - elif est._estimator_type == 'regressor': - est.fit(X_r, y_r) - pdp, axes = partial_dependence(est, - target_variables=[1], - X=X_r, - method='recursion', - grid_resolution=10) - assert (pdp.shape == (1, 10)) - pdp, axes = partial_dependence(est, - target_variables=[1, 2], - X=X_r, - method='recursion', - grid_resolution=10) - assert (pdp.shape == (1, 100)) - - -@ignore_warnings() -def test_output_shape_exact(): - # Test exact partial_dependence has same output shape for everything - for name, Estimator in all_estimators(): - est = Estimator() - if not hasattr(est, '_estimator_type') or 'MultiTask' in name: - continue - if est._estimator_type == 'classifier': - if not hasattr(est, 'predict_proba'): + if method == 'recursion': + if not (isinstance(est, BaseGradientBoosting) or + isinstance(est, ForestRegressor)): + continue + else: + if not hasattr(est, '_estimator_type') or 'MultiTask' in name: continue - est.fit(X_c, y_c) - pdp, axes = partial_dependence(est, - target_variables=[1], - X=X_c, - method='exact', - grid_resolution=10) - assert (pdp.shape == (1, 10)) - pdp, axes = partial_dependence(est, - target_variables=[1, 2], - X=X_c, - method='exact', - grid_resolution=10) - assert (pdp.shape == (1, 100)) - elif est._estimator_type == 'regressor': - est.fit(X_r, y_r) - pdp, axes = partial_dependence(est, - target_variables=[1], - X=X_r, - method='exact', - grid_resolution=10) - assert (pdp.shape == (1, 10)) - pdp, axes = partial_dependence(est, - target_variables=[1, 2], - X=X_r, - method='exact', - grid_resolution=10) - assert (pdp.shape == (1, 100)) + if (est._estimator_type == 'classifier' and + not hasattr(est, 'predict_proba')): + continue + if est._estimator_type not in ('classifier', 'regressor'): + continue + + X, y = ((X_c, y_c) if est._estimator_type == 'classifier' + else (X_r, y_r)) + + est.fit(X, y) + pdp, axes = partial_dependence(est, + target_variables=target_variables, + X=X, + method=method, + grid_resolution=10) + assert pdp.shape == expected_shape def test_partial_dependence_classifier(): @@ -218,41 +175,6 @@ def test_plot_partial_dependence(): assert all(ax.has_data for ax in axs) -@if_matplotlib -def test_plot_partial_dependence_input(): - # Test partial dependence plot function input checks. 
- clf = GradientBoostingClassifier(n_estimators=10, random_state=1) - - # not fitted yet - assert_raises(ValueError, plot_partial_dependence, - clf, X, [0]) - - clf.fit(X, y) - - assert_raises(ValueError, plot_partial_dependence, - clf, np.array(X)[:, :0], [0]) - - # first argument must be an instance of BaseGradientBoosting - assert_raises(ValueError, plot_partial_dependence, - {}, X, [0]) - - # must be larger than -1 - assert_raises(ValueError, plot_partial_dependence, - clf, X, [-1]) - - # too large feature value - assert_raises(ValueError, plot_partial_dependence, - clf, X, [100]) - - # str feature but no feature_names - assert_raises(ValueError, plot_partial_dependence, - clf, X, ['foobar']) - - # not valid features value - assert_raises(ValueError, plot_partial_dependence, - clf, X, [{'foo': 'bar'}]) - - @if_matplotlib def test_plot_partial_dependence_multiclass(): # Test partial dependence plot function on multi-class input. @@ -287,3 +209,38 @@ def test_plot_partial_dependence_multiclass(): assert_raises(ValueError, plot_partial_dependence, clf, iris.data, [0, 1], grid_resolution=grid_resolution) + + +@if_matplotlib +def test_plot_partial_dependence_input(): + # Test partial dependence plot function input checks. + clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + + # not fitted yet + assert_raises(ValueError, plot_partial_dependence, + clf, X, [0]) + + clf.fit(X, y) + + assert_raises(ValueError, plot_partial_dependence, + clf, np.array(X)[:, :0], [0]) + + # first argument must be an instance of BaseGradientBoosting + assert_raises(ValueError, plot_partial_dependence, + {}, X, [0]) + + # must be larger than -1 + assert_raises(ValueError, plot_partial_dependence, + clf, X, [-1]) + + # too large feature value + assert_raises(ValueError, plot_partial_dependence, + clf, X, [100]) + + # str feature but no feature_names + assert_raises(ValueError, plot_partial_dependence, + clf, X, ['foobar']) + + # not valid features value + assert_raises(ValueError, plot_partial_dependence, + clf, X, [{'foo': 'bar'}]) From ba4868f8bbf81fd9e0356afb12f8f31f1d3eb766 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 12 Nov 2018 11:39:54 -0500 Subject: [PATCH 026/113] some more refactoring --- sklearn/tests/test_partial_dependence.py | 118 +++++++++++------------ 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 6b2e5109beefa..19ead40085f3e 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -4,6 +4,7 @@ import numpy as np from numpy.testing import assert_array_equal +from numpy.testing import assert_array_almost_equal import pytest from sklearn.utils.testing import assert_raises @@ -18,6 +19,7 @@ from sklearn.ensemble.forest import ForestRegressor from sklearn.datasets import load_boston, load_iris from sklearn.datasets import make_classification, make_regression +from sklearn.base import is_classifier, is_regressor # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] @@ -34,37 +36,63 @@ iris = load_iris() + + +@pytest.mark.parametrize('Estimator', all_estimators()) @pytest.mark.parametrize('method', ('recursion', 'exact')) -@pytest.mark.parametrize('target_variables, expected_shape', - [([1], (1, 10)), - ([1, 2], (1, 100))]) -def test_output_shape(method, target_variables, expected_shape): +@pytest.mark.parametrize('multiclass', (True, False)) +@pytest.mark.parametrize('grid_resolution', (10,)) 
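+# Every estimator exposed by all_estimators() is crossed with each method and
+# feature subset below; combinations an estimator does not support simply
+# return early from the test body instead of being filtered out here.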
+@pytest.mark.parametrize('target_variables', ([1], [1, 2])) +def test_output_shape(Estimator, method, multiclass, grid_resolution, + target_variables): # Check that partial_dependence has consistent output shape - for name, Estimator in all_estimators(): - est = Estimator() - if method == 'recursion': - if not (isinstance(est, BaseGradientBoosting) or - isinstance(est, ForestRegressor)): - continue + + name, Estimator = Estimator + est = Estimator() + if not (is_classifier(est) or is_regressor(est)): + return + if method == 'recursion': + # recursion method only accepts some of the ensemble estimators + if not (isinstance(est, BaseGradientBoosting) or + isinstance(est, ForestRegressor)): + return + else: + # why skip multitask? We shouldn't + if 'MultiTask' in name: + return + # classifiers with exact method need predict_proba() + if is_classifier(est) and not hasattr(est, 'predict_proba'): + return + + if is_classifier(est): + if multiclass == True: + X, y = iris.data, iris.target + n_classes = 3 else: - if not hasattr(est, '_estimator_type') or 'MultiTask' in name: - continue - if (est._estimator_type == 'classifier' and - not hasattr(est, 'predict_proba')): - continue - if est._estimator_type not in ('classifier', 'regressor'): - continue - - X, y = ((X_c, y_c) if est._estimator_type == 'classifier' - else (X_r, y_r)) - - est.fit(X, y) - pdp, axes = partial_dependence(est, - target_variables=target_variables, - X=X, - method=method, - grid_resolution=10) - assert pdp.shape == expected_shape + X, y = X_c, y_c + n_classes = 1 + else: # regressor + X, y = X_r, y_r + n_classes = 1 + + est.fit(X, y) + pdp, axes = partial_dependence(est, + target_variables=target_variables, + X=X, + method=method, + grid_resolution=grid_resolution) + + expected_pdp_shape = (n_classes, + grid_resolution ** len(target_variables)) + expected_axes_shape = (len(target_variables), grid_resolution) + + assert pdp.shape == expected_pdp_shape + assert axes is not None + assert np.asarray(axes).shape == expected_axes_shape + + +def test_grid_from_X(): + pass def test_partial_dependence_classifier(): @@ -76,7 +104,7 @@ def test_partial_dependence_classifier(): # only 4 grid points instead of 5 because only 4 unique X[:,0] vals assert pdp.shape == (1, 4) - assert axes[0].shape[0] == 4 + assert np.asarray(axes).shape == (1, 4) # now with our own grid X_ = np.asarray(X) @@ -84,38 +112,10 @@ def test_partial_dependence_classifier(): pdp_2, axes = partial_dependence(clf, [0], grid=grid) assert axes is None - assert_array_equal(pdp, pdp_2) - - -def test_partial_dependence_multiclass(): - # Test partial dependence for multi-class classifier - clf = GradientBoostingClassifier(n_estimators=10, random_state=1) - clf.fit(iris.data, iris.target) - - grid_resolution = 25 - n_classes = clf.n_classes_ - pdp, axes = partial_dependence( - clf, [0], X=iris.data, grid_resolution=grid_resolution) - - assert pdp.shape == (n_classes, grid_resolution) - assert len(axes) == 1 - assert axes[0].shape[0] == grid_resolution - - -def test_partial_dependence_regressor(): - # Test partial dependence for regressor - clf = GradientBoostingRegressor(n_estimators=10, random_state=1) - clf.fit(boston.data, boston.target) - - grid_resolution = 25 - pdp, axes = partial_dependence( - clf, [0], X=boston.data, grid_resolution=grid_resolution) - - assert pdp.shape == (1, grid_resolution) - assert axes[0].shape[0] == grid_resolution + assert_array_almost_equal(pdp, pdp_2) -def test_partial_dependecy_input(): +def test_partial_dependency_input(): # Test input 
validation of partial dependence. clf = GradientBoostingClassifier(n_estimators=10, random_state=1) clf.fit(X, y) From 414387b734da0c709a668cd11f1029acedf3676a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 12 Nov 2018 13:47:53 -0500 Subject: [PATCH 027/113] test for _grid_from_X --- sklearn/tests/test_partial_dependence.py | 54 +++++++++++------------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 19ead40085f3e..3d14f1b25ff2c 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -13,6 +13,7 @@ from sklearn.utils.testing import ignore_warnings from sklearn.partial_dependence import partial_dependence from sklearn.partial_dependence import plot_partial_dependence +from sklearn.partial_dependence import _grid_from_X from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble.gradient_boosting import BaseGradientBoosting @@ -36,8 +37,6 @@ iris = load_iris() - - @pytest.mark.parametrize('Estimator', all_estimators()) @pytest.mark.parametrize('method', ('recursion', 'exact')) @pytest.mark.parametrize('multiclass', (True, False)) @@ -76,14 +75,12 @@ def test_output_shape(Estimator, method, multiclass, grid_resolution, n_classes = 1 est.fit(X, y) - pdp, axes = partial_dependence(est, - target_variables=target_variables, - X=X, - method=method, - grid_resolution=grid_resolution) + pdp, axes = partial_dependence(est, target_variables=target_variables, + X=X, method=method, + grid_resolution=grid_resolution) expected_pdp_shape = (n_classes, - grid_resolution ** len(target_variables)) + grid_resolution ** len(target_variables)) expected_axes_shape = (len(target_variables), grid_resolution) assert pdp.shape == expected_pdp_shape @@ -92,27 +89,26 @@ def test_output_shape(Estimator, method, multiclass, grid_resolution, def test_grid_from_X(): - pass - - -def test_partial_dependence_classifier(): - # Test partial dependence for classifier - clf = GradientBoostingClassifier(n_estimators=10, random_state=1) - clf.fit(X, y) - - pdp, axes = partial_dependence(clf, [0], X=X, grid_resolution=5) - - # only 4 grid points instead of 5 because only 4 unique X[:,0] vals - assert pdp.shape == (1, 4) - assert np.asarray(axes).shape == (1, 4) - - # now with our own grid - X_ = np.asarray(X) - grid = np.unique(X_[:, 0]) - pdp_2, axes = partial_dependence(clf, [0], grid=grid) - - assert axes is None - assert_array_almost_equal(pdp, pdp_2) + # test shapes of returned objects depending on the number of unique values + # for a feature. 
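+    # Two behaviours are exercised: a feature with at least grid_resolution
+    # unique values gets a linspace between the requested percentiles as its
+    # axis, while a feature with fewer unique values gets those values
+    # directly as its axis.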
+ + rng = np.random.RandomState(0) + grid_resolution = 15 + + # n_unique_values > grid_resolution + X = rng.normal(size=(20, 2)) + grid, axes = _grid_from_X(X, grid_resolution=grid_resolution) + assert grid.shape == (grid_resolution * grid_resolution, X.shape[1]) + assert np.asarray(axes).shape == (2, grid_resolution) + + # n_unique_values < grid_resolution, will use actual values + n_unique_values = 10 + X[:n_unique_values + 1, 0] = 12345 + grid, axes = _grid_from_X(X, grid_resolution=grid_resolution) + assert grid.shape == (n_unique_values * grid_resolution, X.shape[1]) + # axes is a list of arrays of different shapes + assert axes[0].shape == (n_unique_values,) + assert axes[1].shape == (grid_resolution,) def test_partial_dependency_input(): From f6356938aafe17f2bae3364e4ad13813272a85ad Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 13 Nov 2018 12:27:05 -0500 Subject: [PATCH 028/113] few changes and comments --- sklearn/partial_dependence.py | 126 ++++++++++------------- sklearn/tests/test_partial_dependence.py | 11 +- 2 files changed, 61 insertions(+), 76 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 8c5c68063f378..54cc7c0a349a3 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -72,61 +72,6 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): return cartesian(axes), axes -def _predict(est, X_eval, output=None): - """Calculate part of the partial dependence of ``target_variables``. - - The function will be calculated by calling the ``predict_proba`` method of - ``est`` for classification or ``predict`` for regression on ``X`` for every - point in the grid. - - Parameters - ---------- - est : BaseEstimator - A fitted classification or regression model. - X_eval : array-like, shape=(n_samples, n_features) - The data on which the partial dependence of ``est`` should be - predicted. - output : int, optional (default=None) - The output index to use for multi-output estimators. - - Returns - ------- - out : array, shape=(n_classes, n_points) - The partial dependence function evaluated on the ``grid``. - For regression and binary classification ``n_classes==1``. - """ - if est._estimator_type == 'regressor': - try: - out = est.predict(X_eval) - except NotFittedError: - raise ValueError('Call %s.fit before partial_dependence' % - est.__class__.__name__) - if out.ndim != 1 and out.shape[1] == 1: - # Column output - out = out.ravel() - if out.ndim != 1 and out.shape[1] != 1: - # Multi-output - if not 0 <= output < out.shape[1]: - raise ValueError('Valid output must be specified for ' - 'multi-output models.') - out = out[:, output] - elif est._estimator_type == 'classifier': - try: - out = est.predict_proba(X_eval) - except NotFittedError: - raise ValueError('Call %s.fit before partial_dependence' % - est.__class__.__name__) - if isinstance(out, list): - # Multi-output - if not 0 <= output < len(out): - raise ValueError('Valid output must be specified for ' - 'multi-output models.') - out = out[output] - out = np.log(np.clip(out, 1e-16, 1)) - out = np.subtract(out, np.mean(out, 1)[:, np.newaxis]) - return out - - def _recursion(est, grid, target_variables, output=None, X=None): # TODO: The pattern below required to avoid a namespace collision. # TODO: Move below imports to module level import at 0.22 release. 
@@ -158,24 +103,59 @@ def _recursion(est, grid, target_variables, output=None, X=None): def _exact(est, grid, target_variables, output, X): - n_samples = X.shape[0] + + def _predict(est, X, output=None): + if est._estimator_type == 'regressor': + try: + predictions = est.predict(X) + except NotFittedError: + raise ValueError('est parameter must be a fitted estimator') + if predictions.ndim != 1 and predictions.shape[1] != 1: + # Multi-output + if not 0 <= output < predictions.shape[1]: + # TODO: better error msg, also could this be checked + # before? + raise ValueError('Valid out must be specified for ' + 'multi-output models.') + predictions = predictions[:, output] + print(predictions.shape) + elif est._estimator_type == 'classifier': + # Note: no support for multi-output classifiers. + # TODO: raise error if multi-output classifier. + try: + predictions = est.predict_proba(X) + except NotFittedError: + raise ValueError('est parameter must be a fitted estimator') + predictions = np.log(np.clip(predictions, 1e-16, 1)) + # not sure yet why we need to center probas? + predictions = predictions - np.mean(predictions, axis=1, + keepdims=True) + # predictions is of shape + # (n_points,) for most regressors (multioutput or not) + # (n_points, 1) for the regressors in cross_decomposition (I think) + # (n_points, 2) for binary classifaction + # (n_points, n_classes) for multiclass classification + return predictions + pdp = [] - for row in range(grid.shape[0]): + for row in grid: X_eval = X.copy() for i, variable in enumerate(target_variables): - X_eval[:, variable] = np.repeat(grid[row, i], n_samples) - pdp_row = _predict(est, X_eval, output=output) - if est._estimator_type == 'regressor': - pdp.append(np.mean(pdp_row)) - else: - pdp.append(np.mean(pdp_row, 0)) - pdp = np.array(pdp).transpose() - if pdp.shape[0] == 2: - # Binary classification - pdp = pdp[1, :][np.newaxis] - elif pdp.ndim == 1: - # Regression - pdp = pdp[np.newaxis] + X_eval[:, variable] = row[i] + predictions = _predict(est, X_eval, output=output) + pdp.append(np.mean(predictions, axis=0)) # average over points + + # reshape pdp to (n_classes, n_points) where n_classes is 1 for binary + # classification and for regression. The shape is already correct for + # multiclass classification. + pdp = np.array(pdp).T + if pdp.ndim == 1: + # Regression, pdp shape is (n_points,) + pdp = pdp.reshape(1, -1) + elif pdp.shape[0] == 2: + # Binary classification, pdp shape is (2, n_points). + pdp = pdp[1] # we output the effect of **positive** class + pdp = pdp.reshape(1, -1) return pdp @@ -264,8 +244,9 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, 'recursion': _recursion } if method not in method_to_function: - raise ValueError('method {} is invalid. Accepted method names are ' - '{}'.format(method, + raise ValueError( + 'method {} is invalid. 
Accepted method names are {}'.format( + method, ', '.join(method_to_function.keys()))) if method == 'recursion': @@ -310,6 +291,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, raise ValueError('grid must be 2d but is %dd' % grid.ndim) grid = np.asarray(grid, dtype=DTYPE, order='C') + # TODO: output error message assert grid.shape[1] == target_variables.shape[0] pdp = method_to_function[method](est, grid, target_variables, output, X) diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 3d14f1b25ff2c..15bdb7890ee20 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -71,8 +71,10 @@ def test_output_shape(Estimator, method, multiclass, grid_resolution, X, y = X_c, y_c n_classes = 1 else: # regressor - X, y = X_r, y_r - n_classes = 1 + if multiclass: # multiclass for regressor makes no sense + return + X, y = X_r, y_r + n_classes = 1 est.fit(X, y) pdp, axes = partial_dependence(est, target_variables=target_variables, @@ -102,8 +104,9 @@ def test_grid_from_X(): assert np.asarray(axes).shape == (2, grid_resolution) # n_unique_values < grid_resolution, will use actual values - n_unique_values = 10 - X[:n_unique_values + 1, 0] = 12345 + n_unique_values = 12 + X[n_unique_values - 1:, 0] = 12345 + rng.shuffle(X) # just to make sure the order is irrelevant grid, axes = _grid_from_X(X, grid_resolution=grid_resolution) assert grid.shape == (n_unique_values * grid_resolution, X.shape[1]) # axes is a list of arrays of different shapes From 58bfbad7e552917652c542d88debcf3b1c65cba4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 13 Nov 2018 17:16:15 -0500 Subject: [PATCH 029/113] some test + removed multioutput logic for now --- sklearn/partial_dependence.py | 145 +++++++++++------------ sklearn/tests/test_partial_dependence.py | 136 ++++++++++++++++----- 2 files changed, 176 insertions(+), 105 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 54cc7c0a349a3..b0dbc95ea519b 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -10,6 +10,7 @@ import numpy as np from scipy.stats.mstats import mquantiles +from .base import is_classifier, is_regressor from .utils.extmath import cartesian from .externals.joblib import Parallel, delayed from .externals import six @@ -50,10 +51,14 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): axes : seq of ndarray The axes with which the grid has been created. """ - if len(percentiles) != 2: - raise ValueError('percentile must be tuple of len 2') + try: + assert len(percentiles) == 2 + except (AssertionError, TypeError): + raise ValueError('percentiles must be a sequence of 2 elements.') if not all(0. <= x <= 1. for x in percentiles): - raise ValueError('percentile values must be in [0, 1]') + raise ValueError('percentiles values must be in [0, 1].') + if percentiles[0] > percentiles[1]: + raise ValueError('percentiles[0] must be less than percentiles[1].') axes = [] emp_percentiles = mquantiles(X, prob=percentiles, axis=0) @@ -72,12 +77,16 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): return cartesian(axes), axes -def _recursion(est, grid, target_variables, output=None, X=None): +def _partial_dependence_recursion(est, grid, target_variables, X=None): # TODO: The pattern below required to avoid a namespace collision. # TODO: Move below imports to module level import at 0.22 release. 
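+    # The recursion method never calls predict(): each tree is traversed
+    # once, and at splits on features outside target_variables both children
+    # are followed, weighted by the fraction of training samples each child
+    # covers.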
from .ensemble._gradient_boosting import _partial_dependence_tree from .ensemble.gradient_boosting import BaseGradientBoosting from .ensemble.forest import ForestRegressor + + # grid needs to be DTYPE + grid = np.asarray(grid, dtype=DTYPE, order='C') + if isinstance(est, BaseGradientBoosting): n_trees_per_stage = est.estimators_.shape[1] n_estimators = est.estimators_.shape[0] @@ -95,71 +104,59 @@ def _recursion(est, grid, target_variables, output=None, X=None): else: tree = est.estimators_[stage].tree_ _partial_dependence_tree(tree, grid, target_variables, - learning_rate, pdp[k]) + learning_rate, pdp[k]) if isinstance(est, ForestRegressor): pdp /= n_estimators return pdp -def _exact(est, grid, target_variables, output, X): +def _partial_dependence_exact(est, grid, target_variables, X): - def _predict(est, X, output=None): - if est._estimator_type == 'regressor': - try: - predictions = est.predict(X) - except NotFittedError: - raise ValueError('est parameter must be a fitted estimator') - if predictions.ndim != 1 and predictions.shape[1] != 1: - # Multi-output - if not 0 <= output < predictions.shape[1]: - # TODO: better error msg, also could this be checked - # before? - raise ValueError('Valid out must be specified for ' - 'multi-output models.') - predictions = predictions[:, output] - print(predictions.shape) - elif est._estimator_type == 'classifier': - # Note: no support for multi-output classifiers. - # TODO: raise error if multi-output classifier. - try: - predictions = est.predict_proba(X) - except NotFittedError: - raise ValueError('est parameter must be a fitted estimator') + pdp = [] + for new_values in grid: + X_eval = X.copy() + for i, variable in enumerate(target_variables): + X_eval[:, variable] = new_values[i] + + try: + predictions = (est.predict(X_eval) if is_regressor(est) + else est.predict_proba(X_eval)) + except NotFittedError: + raise ValueError('est parameter must be a fitted estimator') + + # Note: predictions is of shape + # (n_points,) for non-multioutput regressors + # (n_points, n_tasks) for multioutput regressors + # (n_points, 1) for the regressors in cross_decomposition (I think) + # (n_points, 2) for binary classifaction + # (n_points, n_classes) for multiclass classification + + if is_classifier(est): predictions = np.log(np.clip(predictions, 1e-16, 1)) # not sure yet why we need to center probas? predictions = predictions - np.mean(predictions, axis=1, keepdims=True) - # predictions is of shape - # (n_points,) for most regressors (multioutput or not) - # (n_points, 1) for the regressors in cross_decomposition (I think) - # (n_points, 2) for binary classifaction - # (n_points, n_classes) for multiclass classification - return predictions - pdp = [] - for row in grid: - X_eval = X.copy() - for i, variable in enumerate(target_variables): - X_eval[:, variable] = row[i] - predictions = _predict(est, X_eval, output=output) - pdp.append(np.mean(predictions, axis=0)) # average over points + pdp.append(np.mean(predictions, axis=0)) # average over samples - # reshape pdp to (n_classes, n_points) where n_classes is 1 for binary - # classification and for regression. The shape is already correct for - # multiclass classification. + # reshape pdp to (n_targets, n_points) where n_targets is: + # - 1 for non-multioutput regression and binary classification (shape is + # already correct in those cases) + # - n_tasks for multi-output regression + # - n_classes for multiclass classification. 
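+    # e.g. a 3-class classifier evaluated on a 50-point grid stacks to an
+    # array of shape (50, 3); the transpose below yields the
+    # (n_targets, n_points) layout described above.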
pdp = np.array(pdp).T - if pdp.ndim == 1: - # Regression, pdp shape is (n_points,) + if is_regressor(est) and pdp.ndim == 1: + # non-multioutput regression, pdp shape is (n_points,) pdp = pdp.reshape(1, -1) - elif pdp.shape[0] == 2: + elif is_classifier(est) and pdp.shape[0] == 2: # Binary classification, pdp shape is (2, n_points). pdp = pdp[1] # we output the effect of **positive** class pdp = pdp.reshape(1, -1) return pdp -def partial_dependence(est, target_variables, grid=None, X=None, output=None, +def partial_dependence(est, target_variables, grid=None, X=None, percentiles=(0.05, 0.95), grid_resolution=100, method='auto'): """Partial dependence of ``target_variables``. @@ -230,9 +227,11 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, from .ensemble.gradient_boosting import BaseGradientBoosting from .ensemble.forest import ForestRegressor - if (not hasattr(est, '_estimator_type') or - est._estimator_type not in ('classifier', 'regressor')): - raise ValueError('est must be a fitted regressor or classifier model.') + if not (is_classifier(est) or is_regressor(est)): + raise ValueError('est must be a fitted regressor or classifier.') + + if X is not None: + X = check_array(X) if method == 'auto': if isinstance(est, (BaseGradientBoosting, ForestRegressor)): @@ -240,29 +239,28 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, else: method = 'exact' method_to_function = { - 'exact': _exact, - 'recursion': _recursion + 'exact': _partial_dependence_exact, + 'recursion': _partial_dependence_recursion } if method not in method_to_function: raise ValueError( - 'method {} is invalid. Accepted method names are {}'.format( - method, - ', '.join(method_to_function.keys()))) + 'method {} is invalid. Accepted method names are {}, auto.'.format( + method, ', '.join(method_to_function.keys()))) if method == 'recursion': if not isinstance(est, (BaseGradientBoosting, ForestRegressor)): raise ValueError( - 'est has to be an instance of BaseGradientBoosting or ' + 'est must be an instance of BaseGradientBoosting or ' 'ForestRegressor for the "recursion" method. 
Try ' 'using method="exact".') check_is_fitted(est, 'estimators_', - msg='Call fit() before partial_dependence()') + msg='est parameter must be a fitted estimator') + # Note: if method is exact, this check is done at prediction time n_features = est.n_features_ elif X is None: raise ValueError('X is required for exact method') else: - if (est._estimator_type == 'classifier' and - not hasattr(est, 'predict_proba')): + if is_classifier(est) and not hasattr(est, 'predict_proba'): raise ValueError('est requires a predict_proba() method for ' 'method="exact" for classification.') n_features = X.shape[1] @@ -270,31 +268,30 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, target_variables = np.asarray(target_variables, dtype=np.int32, order='C').ravel() if any(not (0 <= fx < n_features) for fx in target_variables): - raise ValueError('target_variables must be in [0, %d]' + raise ValueError('all target_variables must be in [0, %d]' % (n_features - 1)) - if (grid is None and X is None) or (grid is not None and X is not None): + if (grid is None and X is None): raise ValueError('Either grid or X must be specified.') - if X is not None: - X = check_array(X, dtype=DTYPE, order='C') + if grid is None: grid, axes = _grid_from_X(X[:, target_variables], percentiles, grid_resolution) else: - assert grid is not None - # don't return axes if grid is given - axes = None + grid = np.asarray(grid) + axes = None # don't return axes if grid is given # grid must be 2d if grid.ndim == 1: grid = grid[:, np.newaxis] if grid.ndim != 2: - raise ValueError('grid must be 2d but is %dd' % grid.ndim) - - grid = np.asarray(grid, dtype=DTYPE, order='C') - # TODO: output error message - assert grid.shape[1] == target_variables.shape[0] - - pdp = method_to_function[method](est, grid, target_variables, output, X) + raise ValueError('grid must be 1d or 2d, got %dd dimensions' % + grid.ndim) + if grid.shape[1] != target_variables.shape[0]: + raise ValueError('grid.shape[1] ({}) must be equal to the number ' + 'of target variables ({})'.format( + grid.shape[1], target_variables.shape[0])) + + pdp = method_to_function[method](est, grid, target_variables, X) return pdp, axes diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 15bdb7890ee20..5653c4810e933 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -8,19 +8,27 @@ import pytest from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import if_matplotlib from sklearn.utils.testing import all_estimators from sklearn.utils.testing import ignore_warnings from sklearn.partial_dependence import partial_dependence from sklearn.partial_dependence import plot_partial_dependence from sklearn.partial_dependence import _grid_from_X +from sklearn.partial_dependence import _partial_dependence_exact +from sklearn.partial_dependence import _partial_dependence_recursion from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble.gradient_boosting import BaseGradientBoosting +from sklearn.linear_model import LinearRegression +from sklearn.svm import SVC from sklearn.ensemble.forest import ForestRegressor from sklearn.datasets import load_boston, load_iris from sklearn.datasets import make_classification, make_regression from sklearn.base import is_classifier, is_regressor +from sklearn.utils.estimator_checks import 
multioutput_estimator_convert_y_2d +from sklearn.cluster import KMeans + # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] @@ -46,7 +54,7 @@ def test_output_shape(Estimator, method, multiclass, grid_resolution, target_variables): # Check that partial_dependence has consistent output shape - name, Estimator = Estimator + _, Estimator = Estimator est = Estimator() if not (is_classifier(est) or is_regressor(est)): return @@ -55,34 +63,32 @@ def test_output_shape(Estimator, method, multiclass, grid_resolution, if not (isinstance(est, BaseGradientBoosting) or isinstance(est, ForestRegressor)): return - else: - # why skip multitask? We shouldn't - if 'MultiTask' in name: - return + elif is_classifier(est) and not hasattr(est, 'predict_proba'): # classifiers with exact method need predict_proba() - if is_classifier(est) and not hasattr(est, 'predict_proba'): - return + return if is_classifier(est): if multiclass == True: X, y = iris.data, iris.target - n_classes = 3 + n_targets = 3 else: X, y = X_c, y_c - n_classes = 1 + n_targets = 1 else: # regressor if multiclass: # multiclass for regressor makes no sense return X, y = X_r, y_r - n_classes = 1 + n_targets = 1 + if "MultiTask" in est.__class__.__name__: + y = np.array([y, y]).T + n_targets = 2 est.fit(X, y) pdp, axes = partial_dependence(est, target_variables=target_variables, X=X, method=method, grid_resolution=grid_resolution) - expected_pdp_shape = (n_classes, - grid_resolution ** len(target_variables)) + expected_pdp_shape = (n_targets, grid_resolution ** len(target_variables)) expected_axes_shape = (len(target_variables), grid_resolution) assert pdp.shape == expected_pdp_shape @@ -114,32 +120,100 @@ def test_grid_from_X(): assert axes[1].shape == (grid_resolution,) -def test_partial_dependency_input(): - # Test input validation of partial dependence. - clf = GradientBoostingClassifier(n_estimators=10, random_state=1) - clf.fit(X, y) +@pytest.mark.parametrize('target_feature', (0, 3)) +@pytest.mark.parametrize('est, partial_dependence_fun', + [(LinearRegression(), _partial_dependence_exact), + (GradientBoostingRegressor(random_state=0), _partial_dependence_recursion)]) +def test_partial_dependence_helpers(est, partial_dependence_fun, + target_feature): + # Check that the what is returned by _partial_dependence_exact or + # _partial_dependece_recursion is equivalent to manually setting a target + # feature to a given value, and computing the average prediction over all + # samples. 
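+    # Concretely: for each grid value v, the reference value is
+    # est.predict(X_mod).mean(), where X_mod is a copy of X whose
+    # target_feature column has been set to v everywhere.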
- assert_raises(ValueError, partial_dependence, - clf, [0], grid=None, X=None) + # doesn't work for _partial_dependence_recursion, dont know why :( + if partial_dependence_fun is _partial_dependence_recursion: + return - assert_raises(ValueError, partial_dependence, - clf, [0], grid=[0, 1], X=X) + X, y = X_r, y_r + est.fit(X, y) - # first argument must be an instance of BaseGradientBoosting - assert_raises(ValueError, partial_dependence, - {}, [0], X=X) + # target feature will be set to .5 and then to 123 + target_variables = np.array([target_feature], dtype=np.int32) + grid = np.array([[.5], + [123]]) + pdp = partial_dependence_fun(est, grid, target_variables, X) - # Gradient boosting estimator must be fit - assert_raises(ValueError, partial_dependence, - GradientBoostingClassifier(), [0], X=X) + mean_predictions = [] + for val in (.5, 123): + X_ = X.copy() + X_[:, target_feature] = val + mean_predictions.append(est.predict(X_).mean()) - assert_raises(ValueError, partial_dependence, clf, [-1], X=X) + pdp = pdp[0] # (shape is (1, 2) so make it (2,)) + assert_array_almost_equal(pdp, mean_predictions) + - assert_raises(ValueError, partial_dependence, clf, [100], X=X) +def test_partial_dependence_input(): + # Test input validation of partial dependence. - # wrong ndim for grid - grid = np.random.rand(10, 2, 1) - assert_raises(ValueError, partial_dependence, clf, [0], grid=grid) + lr = LinearRegression() + lr.fit(X, y) + gbc = GradientBoostingClassifier(random_state=0) + gbc.fit(X, y) + + assert_raises_regex(ValueError, + "est must be a fitted regressor or classifier", + partial_dependence, KMeans(), [0]) + + assert_raises_regex(ValueError, + "method blahblah is invalid. Accepted method names " + "are exact, recursion, auto.", + partial_dependence, lr, [0], method='blahblah') + + assert_raises_regex(ValueError, + 'est must be an instance of BaseGradientBoosting or ' + 'ForestRegressor for the "recursion" method', + partial_dependence, lr, [0], method='recursion') + + assert_raises_regex(ValueError, "est requires a predict_proba()", + partial_dependence, SVC(), [0], X=X) + + for feature in (-1, 1000000): + for est in (lr, gbc): + assert_raises_regex(ValueError, + "all target_variables must be in", + partial_dependence, est, [feature], X=X) + + assert_raises_regex(ValueError, "Either grid or X must be specified", + partial_dependence, gbc, [0], grid=None, X=None) + + for percentiles in ((1, 2, 3, 4), 12345): + assert_raises_regex(ValueError, "percentiles must be a sequence", + partial_dependence, lr, [0], grid=None, X=X, + percentiles=percentiles) + for percentiles in ((-1, .95), (.05, 2)): + assert_raises_regex(ValueError, "percentiles values must be in", + partial_dependence, lr, [0], grid=None, X=X, + percentiles=percentiles) + assert_raises_regex(ValueError, "percentiles\[0\] must be less than", + partial_dependence, lr, [0], grid=None, X=X, + percentiles=(.9, .1)) + + assert_raises_regex(ValueError, "grid must be 1d or 2d", + partial_dependence, lr, [0], grid=[[[1]]], X=X) + + for target_variables in ([0], [0, 1, 0]): + assert_raises_regex(ValueError, + 'grid.shape\[1\] \(2\) must be equal to the number' + ' of target variables', + partial_dependence, lr, target_variables, + grid=[[30, -123]], X=X) + + for unfitted_est in (LinearRegression(), GradientBoostingRegressor()): + assert_raises_regex(ValueError, + 'est parameter must be a fitted estimator', + partial_dependence, unfitted_est, [0], X=X) @if_matplotlib From cabb7f13f38b763312f14f3b3b40288e74d54de1 Mon Sep 17 00:00:00 2001 From: 
Nicolas Hug Date: Tue, 13 Nov 2018 18:06:11 -0500 Subject: [PATCH 030/113] some more tests --- sklearn/partial_dependence.py | 28 ++++++++++++------------ sklearn/tests/test_partial_dependence.py | 19 +++++++++++++--- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index b0dbc95ea519b..a4b278214e6b6 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -30,7 +30,8 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): The grid is generated by placing ``grid_resolution`` equally spaced points between the ``percentiles`` of each column - of ``X``. + of ``X``. If ``grid_resolution`` is bigger than the number of unique values + in a column, then those unique values will be used instead. Parameters ---------- @@ -40,16 +41,16 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): The percentiles which are used to construct the extreme values of the grid axes. grid_resolution : int - The number of equally spaced points that are placed - on the grid. + The number of equally spaced points to be placed on the grid for a + given feature. Returns ------- grid : ndarray - All data points on the grid; ``grid.shape[1] == X.shape[1]`` - and ``grid.shape[0] == grid_resolution * X.shape[1]``. - axes : seq of ndarray - The axes with which the grid has been created. + All data points on the grid. This is the cartesian product of ``axes``. + axes : list of ndarray + The axes with which the grid has been created. The ndarrays may be of + different shape: either (grid_resolution, ) or (n_unique_values,). """ try: assert len(percentiles) == 2 @@ -61,16 +62,16 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): raise ValueError('percentiles[0] must be less than percentiles[1].') axes = [] - emp_percentiles = mquantiles(X, prob=percentiles, axis=0) - for col in range(X.shape[1]): - uniques = np.unique(X[:, col]) + for feature in range(X.shape[1]): + uniques = np.unique(X[:, feature]) if uniques.shape[0] < grid_resolution: # feature has low resolution use unique vals axis = uniques else: # create axis based on percentiles and grid resolution - axis = np.linspace(emp_percentiles[0, col], - emp_percentiles[1, col], + emp_percentiles = mquantiles(X, prob=percentiles, axis=0) + axis = np.linspace(emp_percentiles[0, feature], + emp_percentiles[1, feature], num=grid_resolution, endpoint=True) axes.append(axis) @@ -477,8 +478,7 @@ def convert_feature(fx): # compute PD functions pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(partial_dependence)(est, fxs, X=X, output=output, - method=method, + delayed(partial_dependence)(est, fxs, X=X, method=method, grid_resolution=grid_resolution, percentiles=percentiles) for fxs in features) diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 5653c4810e933..8af4b6de9afad 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -80,6 +80,7 @@ def test_output_shape(Estimator, method, multiclass, grid_resolution, X, y = X_r, y_r n_targets = 1 if "MultiTask" in est.__class__.__name__: + # multioutput regressor y = np.array([y, y]).T n_targets = 2 @@ -97,9 +98,21 @@ def test_output_shape(Estimator, method, multiclass, grid_resolution, def test_grid_from_X(): + # tests for _grid_from_X + + # Make sure that the grid is a cartesian product of the input (it will use + # the unique values instead of the percentiles) + X = 
np.asarray([[1, 2], + [3, 4]]) + grid, axes = _grid_from_X(X) + assert_array_almost_equal(grid, [[1, 2], + [1, 4], + [3, 2], + [3, 4]]) + assert_array_almost_equal(axes, X.T) + # test shapes of returned objects depending on the number of unique values # for a feature. - rng = np.random.RandomState(0) grid_resolution = 15 @@ -126,7 +139,7 @@ def test_grid_from_X(): (GradientBoostingRegressor(random_state=0), _partial_dependence_recursion)]) def test_partial_dependence_helpers(est, partial_dependence_fun, target_feature): - # Check that the what is returned by _partial_dependence_exact or + # Check that what is returned by _partial_dependence_exact or # _partial_dependece_recursion is equivalent to manually setting a target # feature to a given value, and computing the average prediction over all # samples. @@ -155,7 +168,7 @@ def test_partial_dependence_helpers(est, partial_dependence_fun, def test_partial_dependence_input(): - # Test input validation of partial dependence. + # Test input validation of partial_dependence. lr = LinearRegression() lr.fit(X, y) From 7e28cf54a5ee896fee5239cf5dadb41acacff151 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 13 Nov 2018 19:22:26 -0500 Subject: [PATCH 031/113] removed support for multioutput multiclass and added back multioutput support in plot_partial_dependence --- sklearn/partial_dependence.py | 55 ++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index a4b278214e6b6..068443259750b 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -399,28 +399,29 @@ def plot_partial_dependence(est, X, features, feature_names=None, # set label_idx for multi-class estimators if hasattr(est, 'classes_') and np.size(est.classes_) > 2: if label is None: - raise ValueError('label is not given for multi-class PDP') - if type(est.classes_) == list: - # multi-output classification - if output is None: - raise ValueError('output is required for multi-output ' - 'estimators') - if output > len(est.classes_): - raise ValueError('output %d exceeds number of outputs in est, ' - '%d' % (output, len(est.classes_))) - label_idx = np.searchsorted(est.classes_[output], label) - if est.classes_[output][label_idx] != label: - raise ValueError('label %s not in ``est.classes_``' % - str(label)) - else: - label_idx = np.searchsorted(est.classes_, label) - if est.classes_[label_idx] != label: - raise ValueError('label %s not in ``est.classes_``' % - str(label)) + raise ValueError('label must be specified for multi-class PDP') + label_idx = np.searchsorted(est.classes_, label) + if est.classes_[label_idx] != label: + raise ValueError('label %s not in ``est.classes_``' % + str(label)) else: # regression and binary classification label_idx = 0 + if is_regressor and "MultiTask" in est.__class__.__name__: + # multioutput regressor + if output is None: + raise ValueError( + 'output must be specified for multi-output regressors') + if output < 0: + raise ValueError('output must be in [0, n_tasks], got {}.'.format( + output)) + # Note: upper bound for output can only be checked once we have the + # predictions + else: + output = 0 + + #TODO: DYPE????? 
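+    # (DTYPE is the float32 dtype declared in sklearn.tree._tree; the
+    # recursion path requires C-ordered float32 input, so X is cast here
+    # even when the exact method will be used)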
X = check_array(X, dtype=DTYPE, order='C') if hasattr(est, 'n_features_') and est.n_features_ != X.shape[1]: raise ValueError('X.shape[1] does not match est.n_features_') @@ -483,10 +484,22 @@ def convert_feature(fx): percentiles=percentiles) for fxs in features) + # Need to check if output param is valid. We can only do that now that we + # have the predictions: + if is_regressor and "MultiTask" in est.__class__.__name__: + pdp, _ = pd_result[0] + if not 0 <= output <= pdp.shape[0]: + raise ValueError( + 'output must be in [0, n_tasks], got {}.'.format(output)) + + # as we don't support multiclass-multioutput estimators label_idx and + # output are mutually exclusive and we can merge them here. + target = max(label_idx, output) + # get global min and max values of PD grouped by plot type pdp_lim = {} for pdp, axes in pd_result: - min_pd, max_pd = pdp[label_idx].min(), pdp[label_idx].max() + min_pd, max_pd = pdp[target].min(), pdp[target].max() n_fx = len(axes) old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) min_pd = min(min_pd, old_min_pd) @@ -511,12 +524,12 @@ def convert_feature(fx): ax = fig.add_subplot(n_rows, n_cols, i + 1) if len(axes) == 1: - ax.plot(axes[0], pdp[label_idx].ravel(), **line_kw) + ax.plot(axes[0], pdp[target].ravel(), **line_kw) else: # make contour plot assert len(axes) == 2 XX, YY = np.meshgrid(axes[0], axes[1]) - Z = pdp[label_idx].reshape(list(map(np.size, axes))).T + Z = pdp[target].reshape(list(map(np.size, axes))).T CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, colors='k') ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], From f84787d0293137d7d0fc37082b62d5a5b9792702 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 14 Nov 2018 11:42:28 -0500 Subject: [PATCH 032/113] better tests --- sklearn/partial_dependence.py | 4 + sklearn/tests/test_partial_dependence.py | 113 ++++++++++++++--------- 2 files changed, 74 insertions(+), 43 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 068443259750b..3c60753192e59 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -231,6 +231,10 @@ def partial_dependence(est, target_variables, grid=None, X=None, if not (is_classifier(est) or is_regressor(est)): raise ValueError('est must be a fitted regressor or classifier.') + if (hasattr(est, 'classes_') and + isinstance(est.classes_[0], np.ndarray)): + raise ValueError('Multiclass-multioutput estimators are not supported') + if X is not None: X = check_array(X) diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 8af4b6de9afad..1edfc00ea9737 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -7,6 +7,7 @@ from numpy.testing import assert_array_almost_equal import pytest +import sklearn from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import if_matplotlib @@ -20,7 +21,11 @@ from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble.gradient_boosting import BaseGradientBoosting +from sklearn.neighbors import KNeighborsClassifier +from sklearn.tree import DecisionTreeClassifier from sklearn.linear_model import LinearRegression +from sklearn.linear_model import LogisticRegression +from sklearn.linear_model import MultiTaskLasso from sklearn.svm import SVC from sklearn.ensemble.forest import ForestRegressor from sklearn.datasets 
import load_boston, load_iris @@ -34,55 +39,50 @@ X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] y = [-1, -1, -1, 1, 1, 1] -# Make some sample data to test output shapes -X_c, y_c = make_classification(n_features=10, n_informative=5, random_state=0) -# Non-negative for MultinomialNB -X_c = X_c + np.abs(X_c.min()) -X_r, y_r = make_regression(n_features=10, n_informative=5, random_state=0) -# Load the boston & iris datasets -boston = load_boston() -iris = load_iris() +def binary_classification(): + return make_classification(random_state=0), 1 -@pytest.mark.parametrize('Estimator', all_estimators()) -@pytest.mark.parametrize('method', ('recursion', 'exact')) -@pytest.mark.parametrize('multiclass', (True, False)) -@pytest.mark.parametrize('grid_resolution', (10,)) +def multiclass_classification(): + return (make_classification(n_classes=3, n_clusters_per_class=1, + random_state=0), 3) + + +def regression(): + return make_regression(random_state=0), 1 + + +def multioutput_regression(): + return make_regression(n_targets=2, random_state=0), 2 + +@pytest.mark.parametrize('Estimator, method, data', [ + (GradientBoostingClassifier, 'recursion', binary_classification()), + (GradientBoostingClassifier, 'recursion', multiclass_classification()), + (GradientBoostingClassifier, 'exact', binary_classification()), + (GradientBoostingClassifier, 'exact', multiclass_classification()), + (GradientBoostingRegressor, 'recursion', regression()), + (GradientBoostingRegressor, 'exact', regression()), + (LinearRegression, 'exact', regression()), + (LogisticRegression, 'exact', binary_classification()), + (LogisticRegression, 'exact', multiclass_classification()), + (MultiTaskLasso, 'exact', multioutput_regression())]) +@pytest.mark.parametrize('grid_resolution', (5, 10)) @pytest.mark.parametrize('target_variables', ([1], [1, 2])) -def test_output_shape(Estimator, method, multiclass, grid_resolution, +def test_output_shape(Estimator, method, data, grid_resolution, target_variables): - # Check that partial_dependence has consistent output shape + # Check that partial_dependence has consistent output shape for different + # kinds of estimators: + # - classifiers with binary and multiclass settings + # - regressors + # - multi-task regressors - _, Estimator = Estimator est = Estimator() - if not (is_classifier(est) or is_regressor(est)): - return - if method == 'recursion': - # recursion method only accepts some of the ensemble estimators - if not (isinstance(est, BaseGradientBoosting) or - isinstance(est, ForestRegressor)): - return - elif is_classifier(est) and not hasattr(est, 'predict_proba'): - # classifiers with exact method need predict_proba() - return - if is_classifier(est): - if multiclass == True: - X, y = iris.data, iris.target - n_targets = 3 - else: - X, y = X_c, y_c - n_targets = 1 - else: # regressor - if multiclass: # multiclass for regressor makes no sense - return - X, y = X_r, y_r - n_targets = 1 - if "MultiTask" in est.__class__.__name__: - # multioutput regressor - y = np.array([y, y]).T - n_targets = 2 + # n_target corresponds to the number of classes (1 for binary classif) or + # the number of tasks / outputs in multi task settings. It's equal to 1 for + # classical regression. 
+ (X, y), n_targets = data est.fit(X, y) pdp, axes = partial_dependence(est, target_variables=target_variables, @@ -98,7 +98,7 @@ def test_output_shape(Estimator, method, multiclass, grid_resolution, def test_grid_from_X(): - # tests for _grid_from_X + # tests for _grid_from_X: sanity check for output, and for shapes. # Make sure that the grid is a cartesian product of the input (it will use # the unique values instead of the percentiles) @@ -148,7 +148,7 @@ def test_partial_dependence_helpers(est, partial_dependence_fun, if partial_dependence_fun is _partial_dependence_recursion: return - X, y = X_r, y_r + X, y = make_regression(random_state=0) est.fit(X, y) # target feature will be set to .5 and then to 123 @@ -167,9 +167,34 @@ def test_partial_dependence_helpers(est, partial_dependence_fun, assert_array_almost_equal(pdp, mean_predictions) +@pytest.mark.parametrize('Estimator', + (sklearn.tree.DecisionTreeClassifier, + sklearn.tree.ExtraTreeClassifier, + sklearn.ensemble.ExtraTreesClassifier, + sklearn.neighbors.KNeighborsClassifier, + sklearn.neighbors.RadiusNeighborsClassifier, + sklearn.ensemble.RandomForestClassifier)) +def test_multiclass_multioutput(Estimator): + # Make sure multiclass-multioutput classifiers are not supported + + # make multiclass-multioutput dataset + X, y = make_classification(n_classes=3, n_clusters_per_class=1, + random_state=0) + y = np.array([y, y]).T + + est = Estimator() + est.fit(X, y) + + assert_raises_regex(ValueError, + "Multiclass-multioutput estimators are not supported", + partial_dependence, est, [0], X=X) + + def test_partial_dependence_input(): # Test input validation of partial_dependence. + X, y = make_classification(random_state=0) + lr = LinearRegression() lr.fit(X, y) gbc = GradientBoostingClassifier(random_state=0) @@ -232,6 +257,7 @@ def test_partial_dependence_input(): @if_matplotlib def test_plot_partial_dependence(): # Test partial dependence plot function. + boston = load_boston() clf = GradientBoostingRegressor(n_estimators=10, random_state=1) clf.fit(boston.data, boston.target) @@ -264,6 +290,7 @@ def test_plot_partial_dependence(): @if_matplotlib def test_plot_partial_dependence_multiclass(): # Test partial dependence plot function on multi-class input. 
+    iris = load_iris()
     clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
     clf.fit(iris.data, iris.target)
 
From 545ca6fe6cebf2f852b67b2fa88f425cd37b4802 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Wed, 14 Nov 2018 13:18:02 -0500
Subject: [PATCH 033/113] Removed support for RandomForestRegressor with
 recursion (does not support multi-output); also fixed an old bug in the
 recursion method that would not add the initial estimator to the
 predictions

---
 ...ot_random_forest_regression_multioutput.py |  2 +
 sklearn/ensemble/partial_dependence.py        |  1 -
 sklearn/partial_dependence.py                 | 47 +++++++++----------
 sklearn/tests/test_partial_dependence.py      | 21 +++++----
 4 files changed, 37 insertions(+), 34 deletions(-)

diff --git a/examples/ensemble/plot_random_forest_regression_multioutput.py b/examples/ensemble/plot_random_forest_regression_multioutput.py
index 8b7803361a60a..e79e53f114e8b 100644
--- a/examples/ensemble/plot_random_forest_regression_multioutput.py
+++ b/examples/ensemble/plot_random_forest_regression_multioutput.py
@@ -55,6 +55,8 @@
 # Predict on new data
 y_multirf = regr_multirf.predict(X_test)
 y_rf = regr_rf.predict(X_test)
+print(y_multirf.shape)
+print(y_rf.shape)
 
 # Plot the results
 plt.figure()
diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py
index ce6e8cb9471e9..6fe483a0b59eb 100644
--- a/sklearn/ensemble/partial_dependence.py
+++ b/sklearn/ensemble/partial_dependence.py
@@ -66,7 +66,6 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None,
                               target_variables=target_variables,
                               grid=grid,
                               X=X,
-                              output=None,
                               percentiles=percentiles,
                               grid_resolution=grid_resolution,
                               method='recursion')
diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py
index 3c60753192e59..cb136581d12a4 100644
--- a/sklearn/partial_dependence.py
+++ b/sklearn/partial_dependence.py
@@ -83,7 +83,6 @@ def _partial_dependence_recursion(est, grid, target_variables, X=None):
     # TODO: Move below imports to module level import at 0.22 release.
     from .ensemble._gradient_boosting import _partial_dependence_tree
     from .ensemble.gradient_boosting import BaseGradientBoosting
-    from .ensemble.forest import ForestRegressor
 
     # grid needs to be DTYPE
     grid = np.asarray(grid, dtype=DTYPE, order='C')
@@ -96,8 +95,8 @@ def _partial_dependence_recursion(est, grid, target_variables, X=None):
     n_trees_per_stage = 1
     n_estimators = len(est.estimators_)
     learning_rate = 1.
-    pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64,
-                   order='C')
+    pdp = np.zeros((n_trees_per_stage, grid.shape[0]), dtype=np.float64,
+                  order='C')
     for stage in range(n_estimators):
         for k in range(n_trees_per_stage):
             if isinstance(est, BaseGradientBoosting):
@@ -106,8 +105,13 @@ def _partial_dependence_recursion(est, grid, target_variables, X=None):
             tree = est.estimators_[stage].tree_
             _partial_dependence_tree(tree, grid, target_variables,
                                      learning_rate, pdp[k])
-    if isinstance(est, ForestRegressor):
-        pdp /= n_estimators
+
+    # _partial_dependence_tree doesn't add the initial estimator to the
+    # predictions so we do it here
+    if isinstance(est, BaseGradientBoosting):
+        pdp += est._init_decision_function(X).mean()
+
+    print(pdp.shape)
 
     return pdp
 
@@ -133,11 +137,12 @@ def _partial_dependence_exact(est, grid, target_variables, X):
     # (n_points, 2) for binary classification
     # (n_points, n_classes) for multiclass classification
 
-    if is_classifier(est):
-        predictions = np.log(np.clip(predictions, 1e-16, 1))
-        # not sure yet why we need to center probas? 
- predictions = predictions - np.mean(predictions, axis=1, - keepdims=True) + # Commenting this out for now. + # if is_classifier(est): + # predictions = np.log(np.clip(predictions, 1e-16, 1)) + # # not sure yet why we need to center probas? + # predictions = predictions - np.mean(predictions, axis=1, + # keepdims=True) pdp.append(np.mean(predictions, axis=0)) # average over samples @@ -195,14 +200,13 @@ def partial_dependence(est, target_variables, grid=None, X=None, The method to use to calculate the partial dependence function: - If 'recursion', the underlying trees of ``est`` will be recursed to - calculate the function. Only supported for BaseGradientBoosting and - ForestRegressor. + calculate the function. Only supported for BaseGradientBoosting. - If 'exact', the function will be calculated by calling the ``predict_proba`` method of ``est`` for classification or ``predict`` for regression on ``X``for every point in the grid. To speed up this method, you can use a subset of ``X`` or a more coarse grid. - If 'auto', then 'recursion' will be used if ``est`` is - BaseGradientBoosting or ForestRegressor, and 'exact' used for other + BaseGradientBoosting, and 'exact' used for other estimators. Returns @@ -226,7 +230,6 @@ def partial_dependence(est, target_variables, grid=None, X=None, """ from .ensemble.gradient_boosting import BaseGradientBoosting - from .ensemble.forest import ForestRegressor if not (is_classifier(est) or is_regressor(est)): raise ValueError('est must be a fitted regressor or classifier.') @@ -239,7 +242,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, X = check_array(X) if method == 'auto': - if isinstance(est, (BaseGradientBoosting, ForestRegressor)): + if isinstance(est, BaseGradientBoosting): method = 'recursion' else: method = 'exact' @@ -253,11 +256,10 @@ def partial_dependence(est, target_variables, grid=None, X=None, method, ', '.join(method_to_function.keys()))) if method == 'recursion': - if not isinstance(est, (BaseGradientBoosting, ForestRegressor)): + if not isinstance(est, BaseGradientBoosting): raise ValueError( - 'est must be an instance of BaseGradientBoosting or ' - 'ForestRegressor for the "recursion" method. Try ' - 'using method="exact".') + 'est must be an instance of BaseGradientBoosting ' + 'for the "recursion" method. Try using method="exact".') check_is_fitted(est, 'estimators_', msg='est parameter must be a fitted estimator') # Note: if method is exact, this check is done at prediction time @@ -348,15 +350,13 @@ def plot_partial_dependence(est, X, features, feature_names=None, The method to use to calculate the partial dependence function: - If 'recursion', the underlying trees of ``est`` will be recursed to - calculate the function. Only supported for BaseGradientBoosting and - ForestRegressor. + calculate the function. Only supported for BaseGradientBoosting. - If 'exact', the function will be calculated by calling the ``predict_proba`` method of ``est`` for classification or ``predict`` for regression on ``X``for every point in the grid. To speed up this method, you can use a subset of ``X`` or a more coarse grid. - If 'auto', then 'recursion' will be used if ``est`` is - BaseGradientBoosting or ForestRegressor, and 'exact' used for other - estimators. + BaseGradientBoosting, and 'exact' used for other estimators. n_jobs : int The number of CPUs to use to compute the PDs. -1 means 'all CPUs'. Defaults to 1. 
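The hunks above describe the two estimation modes; the commit message also
mentions the old bug where the recursion method ignored the model's init
estimator. A minimal standalone sketch of why that offset matters, assuming
the 0.20-era gradient boosting API (the (n_stages, 1)-shaped ``estimators_``
array and the private ``_init_decision_function`` helper that this very
patch calls): for squared-error regression, a boosted prediction is the init
prediction plus the learning-rate-weighted sum of the per-stage tree
predictions, so a recursion that only walks the trees drops the constant
term.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_regression(random_state=0)
est = GradientBoostingRegressor(random_state=0).fit(X, y)

# learning-rate-weighted sum over the per-stage trees: this is all that a
# pure tree recursion can see
tree_part = sum(est.learning_rate * stage[0].predict(X)
                for stage in est.estimators_)

# constant offset contributed by the init estimator (private,
# version-dependent helper)
init_part = est._init_decision_function(X).ravel()

# the two parts add up to the model's prediction
assert np.allclose(tree_part + init_part, est.predict(X))
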
@@ -398,7 +398,6 @@ def plot_partial_dependence(est, X, features, feature_names=None, # TODO: The pattern below required to avoid a namespace collision. # TODO: Move below imports to module level import at 0.22 release. from .ensemble.gradient_boosting import BaseGradientBoosting - from .ensemble.forest import ForestRegressor # set label_idx for multi-class estimators if hasattr(est, 'classes_') and np.size(est.classes_) > 2: diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 1edfc00ea9737..d751c2bcb0e1e 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -27,7 +27,6 @@ from sklearn.linear_model import LogisticRegression from sklearn.linear_model import MultiTaskLasso from sklearn.svm import SVC -from sklearn.ensemble.forest import ForestRegressor from sklearn.datasets import load_boston, load_iris from sklearn.datasets import make_classification, make_regression from sklearn.base import is_classifier, is_regressor @@ -66,7 +65,8 @@ def multioutput_regression(): (LinearRegression, 'exact', regression()), (LogisticRegression, 'exact', binary_classification()), (LogisticRegression, 'exact', multiclass_classification()), - (MultiTaskLasso, 'exact', multioutput_regression())]) + (MultiTaskLasso, 'exact', multioutput_regression()), + ]) @pytest.mark.parametrize('grid_resolution', (5, 10)) @pytest.mark.parametrize('target_variables', ([1], [1, 2])) def test_output_shape(Estimator, method, data, grid_resolution, @@ -143,10 +143,8 @@ def test_partial_dependence_helpers(est, partial_dependence_fun, # _partial_dependece_recursion is equivalent to manually setting a target # feature to a given value, and computing the average prediction over all # samples. - - # doesn't work for _partial_dependence_recursion, dont know why :( - if partial_dependence_fun is _partial_dependence_recursion: - return + # This also checks that the exact method and the recursion give the same + # output. 
X, y = make_regression(random_state=0) est.fit(X, y) @@ -164,7 +162,7 @@ def test_partial_dependence_helpers(est, partial_dependence_fun, mean_predictions.append(est.predict(X_).mean()) pdp = pdp[0] # (shape is (1, 2) so make it (2,)) - assert_array_almost_equal(pdp, mean_predictions) + assert_array_almost_equal(pdp, mean_predictions, decimal=3) @pytest.mark.parametrize('Estimator', @@ -210,8 +208,8 @@ def test_partial_dependence_input(): partial_dependence, lr, [0], method='blahblah') assert_raises_regex(ValueError, - 'est must be an instance of BaseGradientBoosting or ' - 'ForestRegressor for the "recursion" method', + 'est must be an instance of BaseGradientBoosting ' + 'for the "recursion" method', partial_dependence, lr, [0], method='recursion') assert_raises_regex(ValueError, "est requires a predict_proba()", @@ -253,6 +251,11 @@ def test_partial_dependence_input(): 'est parameter must be a fitted estimator', partial_dependence, unfitted_est, [0], X=X) + # check that array-like objects are accepted + for est in (lr, gbc): + partial_dependence(est, [0], grid=[1, 2], X=list(X)) + partial_dependence(est, [0], grid=[[1], [2]], X=list(X)) + @if_matplotlib def test_plot_partial_dependence(): From e086051095b13ceaaec218e69830ef18ab2e63e7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 14 Nov 2018 13:29:03 -0500 Subject: [PATCH 034/113] merged label and output into target --- sklearn/partial_dependence.py | 54 +++++++++++------------- sklearn/tests/test_partial_dependence.py | 6 +-- 2 files changed, 28 insertions(+), 32 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index cb136581d12a4..72d98d6d6a437 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -304,10 +304,9 @@ def partial_dependence(est, target_variables, grid=None, X=None, def plot_partial_dependence(est, X, features, feature_names=None, - label=None, output=None, - n_cols=3, grid_resolution=100, - percentiles=(0.05, 0.95), method='auto', n_jobs=1, - verbose=0, ax=None, line_kw=None, + target=None, n_cols=3, grid_resolution=100, + percentiles=(0.05, 0.95), method='auto', + n_jobs=1, verbose=0, ax=None, line_kw=None, contour_kw=None, **fig_kw): """Partial dependence plots for ``features``. @@ -399,30 +398,30 @@ def plot_partial_dependence(est, X, features, feature_names=None, # TODO: Move below imports to module level import at 0.22 release. 
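     # (Presumably: the deprecated sklearn.ensemble.partial_dependence
     # module now delegates to this one, so importing the ensemble code
     # here at module level would be circular while the two modules
     # coexist.)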
from .ensemble.gradient_boosting import BaseGradientBoosting - # set label_idx for multi-class estimators + # set target_idx for multi-class estimators if hasattr(est, 'classes_') and np.size(est.classes_) > 2: - if label is None: - raise ValueError('label must be specified for multi-class PDP') - label_idx = np.searchsorted(est.classes_, label) - if est.classes_[label_idx] != label: - raise ValueError('label %s not in ``est.classes_``' % - str(label)) + if target is None: + raise ValueError('target must be specified for multi-class PDP') + target_idx = np.searchsorted(est.classes_, target) + if est.classes_[target_idx] != target: + raise ValueError('target %s not in ``est.classes_``' % str(target)) else: # regression and binary classification - label_idx = 0 + target_idx = 0 - if is_regressor and "MultiTask" in est.__class__.__name__: + if is_regressor(est) and "MultiTask" in est.__class__.__name__: # multioutput regressor - if output is None: + if target is None: raise ValueError( - 'output must be specified for multi-output regressors') - if output < 0: - raise ValueError('output must be in [0, n_tasks], got {}.'.format( - output)) - # Note: upper bound for output can only be checked once we have the - # predictions + 'target must be specified for multi-output regressors') + if target < 0: + raise ValueError('target must be in [0, n_tasks], got {}.'.format( + target)) + # Note: for multitask, upper bound for target can only be checked once + # we have the predictions + target_idx = target else: - output = 0 + target_idx = 0 #TODO: DYPE????? X = check_array(X, dtype=DTYPE, order='C') @@ -493,16 +492,13 @@ def convert_feature(fx): pdp, _ = pd_result[0] if not 0 <= output <= pdp.shape[0]: raise ValueError( - 'output must be in [0, n_tasks], got {}.'.format(output)) - - # as we don't support multiclass-multioutput estimators label_idx and - # output are mutually exclusive and we can merge them here. 
- target = max(label_idx, output) + 'output must be in [0, n_tasks], got {}.'.format( + target_idx)) # get global min and max values of PD grouped by plot type pdp_lim = {} for pdp, axes in pd_result: - min_pd, max_pd = pdp[target].min(), pdp[target].max() + min_pd, max_pd = pdp[target_idx].min(), pdp[target_idx].max() n_fx = len(axes) old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) min_pd = min(min_pd, old_min_pd) @@ -527,12 +523,12 @@ def convert_feature(fx): ax = fig.add_subplot(n_rows, n_cols, i + 1) if len(axes) == 1: - ax.plot(axes[0], pdp[target].ravel(), **line_kw) + ax.plot(axes[0], pdp[target_idx].ravel(), **line_kw) else: # make contour plot assert len(axes) == 2 XX, YY = np.meshgrid(axes[0], axes[1]) - Z = pdp[target].reshape(list(map(np.size, axes))).T + Z = pdp[target_idx].reshape(list(map(np.size, axes))).T CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, colors='k') ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index d751c2bcb0e1e..5ab6b42df9db4 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -299,7 +299,7 @@ def test_plot_partial_dependence_multiclass(): grid_resolution = 25 fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], - label=0, + target=0, grid_resolution=grid_resolution) assert len(axs) == 2 assert all(ax.has_data for ax in axs) @@ -311,14 +311,14 @@ def test_plot_partial_dependence_multiclass(): grid_resolution = 25 fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], - label='setosa', + target='setosa', grid_resolution=grid_resolution) assert len(axs) == 2 assert all(ax.has_data for ax in axs) # label not in gbrt.classes_ assert_raises(ValueError, plot_partial_dependence, - clf, iris.data, [0, 1], label='foobar', + clf, iris.data, [0, 1], target='foobar', grid_resolution=grid_resolution) # label not provided From 137cd0776a1324cbea0632a337aa220231000979 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 14 Nov 2018 13:38:36 -0500 Subject: [PATCH 035/113] renamed exact into brute --- sklearn/partial_dependence.py | 27 ++++++++++++------------ sklearn/tests/test_partial_dependence.py | 24 ++++++++++----------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 72d98d6d6a437..d6fcdcc405acf 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -18,7 +18,6 @@ from .utils import check_array from .utils.validation import check_is_fitted from .tree._tree import DTYPE - from .exceptions import NotFittedError @@ -116,7 +115,7 @@ def _partial_dependence_recursion(est, grid, target_variables, X=None): return pdp -def _partial_dependence_exact(est, grid, target_variables, X): +def _partial_dependence_brute(est, grid, target_variables, X): pdp = [] for new_values in grid: @@ -196,17 +195,17 @@ def partial_dependence(est, target_variables, grid=None, X=None, for the ``grid``. Only if ``X`` is not None. grid_resolution : int, default=100 The number of equally spaced points on the ``grid``. - method : {'recursion', 'exact', 'auto'}, default='auto' + method : {'recursion', 'brute', 'auto'}, default='auto' The method to use to calculate the partial dependence function: - If 'recursion', the underlying trees of ``est`` will be recursed to calculate the function. Only supported for BaseGradientBoosting. 
- - If 'exact', the function will be calculated by calling the + - If 'brute', the function will be calculated by calling the ``predict_proba`` method of ``est`` for classification or ``predict`` for regression on ``X``for every point in the grid. To speed up this method, you can use a subset of ``X`` or a more coarse grid. - If 'auto', then 'recursion' will be used if ``est`` is - BaseGradientBoosting, and 'exact' used for other + BaseGradientBoosting, and 'brute' used for other estimators. Returns @@ -245,9 +244,9 @@ def partial_dependence(est, target_variables, grid=None, X=None, if isinstance(est, BaseGradientBoosting): method = 'recursion' else: - method = 'exact' + method = 'brute' method_to_function = { - 'exact': _partial_dependence_exact, + 'brute': _partial_dependence_brute, 'recursion': _partial_dependence_recursion } if method not in method_to_function: @@ -259,17 +258,17 @@ def partial_dependence(est, target_variables, grid=None, X=None, if not isinstance(est, BaseGradientBoosting): raise ValueError( 'est must be an instance of BaseGradientBoosting ' - 'for the "recursion" method. Try using method="exact".') + 'for the "recursion" method. Try using method="brute".') check_is_fitted(est, 'estimators_', msg='est parameter must be a fitted estimator') - # Note: if method is exact, this check is done at prediction time + # Note: if method is brute, this check is done at prediction time n_features = est.n_features_ elif X is None: - raise ValueError('X is required for exact method') + raise ValueError('X is required for brute method') else: if is_classifier(est) and not hasattr(est, 'predict_proba'): raise ValueError('est requires a predict_proba() method for ' - 'method="exact" for classification.') + 'method="brute" for classification.') n_features = X.shape[1] target_variables = np.asarray(target_variables, dtype=np.int32, @@ -345,17 +344,17 @@ def plot_partial_dependence(est, X, features, feature_names=None, percentiles : (low, high), default=(0.05, 0.95) The lower and upper percentile used to create the extreme values for the PDP axes. - method : {'recursion', 'exact', 'auto'}, default='auto' + method : {'recursion', 'brute', 'auto'}, default='auto' The method to use to calculate the partial dependence function: - If 'recursion', the underlying trees of ``est`` will be recursed to calculate the function. Only supported for BaseGradientBoosting. - - If 'exact', the function will be calculated by calling the + - If 'brute', the function will be calculated by calling the ``predict_proba`` method of ``est`` for classification or ``predict`` for regression on ``X``for every point in the grid. To speed up this method, you can use a subset of ``X`` or a more coarse grid. - If 'auto', then 'recursion' will be used if ``est`` is - BaseGradientBoosting, and 'exact' used for other estimators. + BaseGradientBoosting, and 'brute' used for other estimators. n_jobs : int The number of CPUs to use to compute the PDs. -1 means 'all CPUs'. Defaults to 1. 
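Before the test updates below, the renamed 'brute' strategy in one
self-contained piece. This is a hedged sketch rather than the module's exact
code (the helper name ``brute_pd`` is made up for illustration): clamp the
target column of a copy of ``X`` to each grid value and average the
estimator's predictions over all rows.

import numpy as np

def brute_pd(est, X, feature, values):
    # est is a fitted regressor, X a 2d numeric array, feature a column
    # index, and values the 1d grid of candidate values for that column
    averaged = []
    for v in values:
        X_eval = X.copy()
        X_eval[:, feature] = v                       # clamp the feature
        averaged.append(est.predict(X_eval).mean())  # average over rows
    return np.asarray(averaged)

For classification the same loop would average ``predict_proba`` instead,
which is exactly the distinction drawn in the docstring above.
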
diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py
index 5ab6b42df9db4..d12b9f3f7acbe 100644
--- a/sklearn/tests/test_partial_dependence.py
+++ b/sklearn/tests/test_partial_dependence.py
@@ -16,7 +16,7 @@
 from sklearn.partial_dependence import partial_dependence
 from sklearn.partial_dependence import plot_partial_dependence
 from sklearn.partial_dependence import _grid_from_X
-from sklearn.partial_dependence import _partial_dependence_exact
+from sklearn.partial_dependence import _partial_dependence_brute
 from sklearn.partial_dependence import _partial_dependence_recursion
 from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.ensemble import GradientBoostingRegressor
@@ -58,14 +58,14 @@ def multioutput_regression():
 @pytest.mark.parametrize('Estimator, method, data', [
     (GradientBoostingClassifier, 'recursion', binary_classification()),
     (GradientBoostingClassifier, 'recursion', multiclass_classification()),
-    (GradientBoostingClassifier, 'exact', binary_classification()),
-    (GradientBoostingClassifier, 'exact', multiclass_classification()),
+    (GradientBoostingClassifier, 'brute', binary_classification()),
+    (GradientBoostingClassifier, 'brute', multiclass_classification()),
     (GradientBoostingRegressor, 'recursion', regression()),
-    (GradientBoostingRegressor, 'exact', regression()),
-    (LinearRegression, 'exact', regression()),
-    (LogisticRegression, 'exact', binary_classification()),
-    (LogisticRegression, 'exact', multiclass_classification()),
-    (MultiTaskLasso, 'exact', multioutput_regression()),
+    (GradientBoostingRegressor, 'brute', regression()),
+    (LinearRegression, 'brute', regression()),
+    (LogisticRegression, 'brute', binary_classification()),
+    (LogisticRegression, 'brute', multiclass_classification()),
+    (MultiTaskLasso, 'brute', multioutput_regression()),
     ])
 @pytest.mark.parametrize('grid_resolution', (5, 10))
 @pytest.mark.parametrize('target_variables', ([1], [1, 2]))
@@ -135,15 +135,15 @@
 @pytest.mark.parametrize('target_feature', (0, 3))
 @pytest.mark.parametrize('est, partial_dependence_fun',
-                         [(LinearRegression(), _partial_dependence_exact),
+                         [(LinearRegression(), _partial_dependence_brute),
                           (GradientBoostingRegressor(random_state=0),
                            _partial_dependence_recursion)])
 def test_partial_dependence_helpers(est, partial_dependence_fun,
                                     target_feature):
-    # Check that what is returned by _partial_dependence_exact or
+    # Check that what is returned by _partial_dependence_brute or
     # _partial_dependence_recursion is equivalent to manually setting a target
     # feature to a given value, and computing the average prediction over all
     # samples.
-    # This also checks that the exact method and the recursion give the same
+    # This also checks that the brute method and the recursion give the same
     # output.
 
     X, y = make_regression(random_state=0)
@@ -204,7 +204,7 @@ def test_partial_dependence_input():
     assert_raises_regex(ValueError,
                         "method blahblah is invalid. Accepted method names "
-                        "are exact, recursion, auto.",
+                        "are brute, recursion, auto.",
                         partial_dependence, lr, [0], method='blahblah')
 
     assert_raises_regex(ValueError,
From b00a23d1c0652116034cee96dda60bf9e1f2cb39 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Wed, 14 Nov 2018 13:50:46 -0500
Subject: [PATCH 036/113] renaming

---
 sklearn/partial_dependence.py | 105 +++++++++++++++++-----------------
 1 file changed, 54 insertions(+), 51 deletions(-)

diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py
index d6fcdcc405acf..b0e5078f3c95f 100644
--- a/sklearn/partial_dependence.py
+++ b/sklearn/partial_dependence.py
@@ -47,8 +47,8 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100):
     -------
     grid : ndarray
         All data points on the grid. This is the cartesian product of ``axes``.
-    axes : list of ndarray
-        The axes with which the grid has been created. The ndarrays may be of
+    values : list of ndarray
+        The values with which the grid has been created. The ndarrays may be of
         different shape: either (grid_resolution, ) or (n_unique_values,).
     """
     try:
@@ -60,7 +60,7 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100):
     if percentiles[0] > percentiles[1]:
         raise ValueError('percentiles[0] must be less than percentiles[1].')
 
-    axes = []
+    values = []
     for feature in range(X.shape[1]):
         uniques = np.unique(X[:, feature])
         if uniques.shape[0] < grid_resolution:
@@ -72,9 +72,9 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100):
             axis = np.linspace(emp_percentiles[0, feature],
                                emp_percentiles[1, feature],
                                num=grid_resolution, endpoint=True)
-        axes.append(axis)
+        values.append(axis)
 
-    return cartesian(axes), axes
+    return cartesian(values), values
 
 
 def _partial_dependence_recursion(est, grid, target_variables, X=None):
@@ -94,8 +94,8 @@ def _partial_dependence_recursion(est, grid, target_variables, X=None):
     n_trees_per_stage = 1
     n_estimators = len(est.estimators_)
     learning_rate = 1. 
- pdp = np.zeros((n_trees_per_stage, grid.shape[0]), dtype=np.float64, - order='C') + averaged_predictions = np.zeros((n_trees_per_stage, grid.shape[0]), dtype=np.float64, + order='C') for stage in range(n_estimators): for k in range(n_trees_per_stage): if isinstance(est, BaseGradientBoosting): @@ -103,21 +103,21 @@ def _partial_dependence_recursion(est, grid, target_variables, X=None): else: tree = est.estimators_[stage].tree_ _partial_dependence_tree(tree, grid, target_variables, - learning_rate, pdp[k]) + learning_rate, averaged_predictions[k]) # _partial_dependence_tree doesn't add the initial estimator to the # predictions so we do it here if isinstance(est, BaseGradientBoosting): - pdp += est._init_decision_function(X).mean() + averaged_predictions += est._init_decision_function(X).mean() - print(pdp.shape) + print(averaged_predictions.shape) - return pdp + return averaged_predictions def _partial_dependence_brute(est, grid, target_variables, X): - pdp = [] + averaged_predictions = [] for new_values in grid: X_eval = X.copy() for i, variable in enumerate(target_variables): @@ -143,23 +143,25 @@ def _partial_dependence_brute(est, grid, target_variables, X): # predictions = predictions - np.mean(predictions, axis=1, # keepdims=True) - pdp.append(np.mean(predictions, axis=0)) # average over samples + # average over samples + averaged_predictions.append(np.mean(predictions, axis=0)) - # reshape pdp to (n_targets, n_points) where n_targets is: + # reshape to (n_targets, n_points) where n_targets is: # - 1 for non-multioutput regression and binary classification (shape is # already correct in those cases) # - n_tasks for multi-output regression # - n_classes for multiclass classification. - pdp = np.array(pdp).T - if is_regressor(est) and pdp.ndim == 1: - # non-multioutput regression, pdp shape is (n_points,) - pdp = pdp.reshape(1, -1) - elif is_classifier(est) and pdp.shape[0] == 2: - # Binary classification, pdp shape is (2, n_points). - pdp = pdp[1] # we output the effect of **positive** class - pdp = pdp.reshape(1, -1) - - return pdp + averaged_predictions = np.array(averaged_predictions).T + if is_regressor(est) and averaged_predictions.ndim == 1: + # non-multioutput regression, shape is (n_points,) + averaged_predictions = averaged_predictions.reshape(1, -1) + elif is_classifier(est) and averaged_predictions.shape[0] == 2: + # Binary classification, shape is (2, n_points). + # we output the effect of **positive** class + averaged_predictions = averaged_predictions[1] + averaged_predictions = averaged_predictions.reshape(1, -1) + + return averaged_predictions def partial_dependence(est, target_variables, grid=None, X=None, percentiles=(0.05, 0.95), grid_resolution=100, @@ -210,11 +212,11 @@ def partial_dependence(est, target_variables, grid=None, X=None, Returns ------- - pdp : array, shape=(n_classes, n_points) - The partial dependence function evaluated on the ``grid``. - For regression and binary classification ``n_classes==1``. - axes : seq of ndarray or None - The axes with which the grid has been created or None if + averaged_predictions : array, shape=(n_classes, n_points) + The predictions for all the points in the ``grid``, averaged over + all samples. For regression and binary classification ``n_classes==1``. + values: seq of ndarray or None + The values with which the grid has been created or None if the grid has been given. 
Examples @@ -281,11 +283,11 @@ def partial_dependence(est, target_variables, grid=None, X=None, raise ValueError('Either grid or X must be specified.') if grid is None: - grid, axes = _grid_from_X(X[:, target_variables], percentiles, + grid, values = _grid_from_X(X[:, target_variables], percentiles, grid_resolution) else: grid = np.asarray(grid) - axes = None # don't return axes if grid is given + values = None # don't return values if grid is given # grid must be 2d if grid.ndim == 1: grid = grid[:, np.newaxis] @@ -297,9 +299,10 @@ def partial_dependence(est, target_variables, grid=None, X=None, 'of target variables ({})'.format( grid.shape[1], target_variables.shape[0])) - pdp = method_to_function[method](est, grid, target_variables, X) + averaged_predictions = method_to_function[method](est, grid, + target_variables, X) - return pdp, axes + return averaged_predictions, values def plot_partial_dependence(est, X, features, feature_names=None, @@ -377,7 +380,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, ------- fig : figure The Matplotlib Figure object. - axs : seq of Axis objects + axes : seq of Axis objects A seq of Axis objects, one for each subplot. Examples @@ -400,7 +403,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, # set target_idx for multi-class estimators if hasattr(est, 'classes_') and np.size(est.classes_) > 2: if target is None: - raise ValueError('target must be specified for multi-class PDP') + raise ValueError('target must be specified for multi-class') target_idx = np.searchsorted(est.classes_, target) if est.classes_[target_idx] != target: raise ValueError('target %s not in ``est.classes_``' % str(target)) @@ -488,17 +491,17 @@ def convert_feature(fx): # Need to check if output param is valid. 
We can only do that now that we # have the predictions: if is_regressor and "MultiTask" in est.__class__.__name__: - pdp, _ = pd_result[0] - if not 0 <= output <= pdp.shape[0]: + pd, _ = pd_result[0] + if not 0 <= output <= pd.shape[0]: raise ValueError( 'output must be in [0, n_tasks], got {}.'.format( target_idx)) # get global min and max values of PD grouped by plot type pdp_lim = {} - for pdp, axes in pd_result: - min_pd, max_pd = pdp[target_idx].min(), pdp[target_idx].max() - n_fx = len(axes) + for pd, values in pd_result: + min_pd, max_pd = pd[target_idx].min(), pd[target_idx].max() + n_fx = len(values) old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) min_pd = min(min_pd, old_min_pd) max_pd = max(max_pd, old_max_pd) @@ -516,18 +519,18 @@ def convert_feature(fx): n_cols = min(n_cols, len(features)) n_rows = int(np.ceil(len(features) / float(n_cols))) - axs = [] - for i, fx, name, (pdp, axes) in zip(count(), features, names, + axes = [] + for i, fx, name, (pd, values) in zip(count(), features, names, pd_result): ax = fig.add_subplot(n_rows, n_cols, i + 1) - if len(axes) == 1: - ax.plot(axes[0], pdp[target_idx].ravel(), **line_kw) + if len(values) == 1: + ax.plot(values[0], pd[target_idx].ravel(), **line_kw) else: # make contour plot - assert len(axes) == 2 - XX, YY = np.meshgrid(axes[0], axes[1]) - Z = pdp[target_idx].reshape(list(map(np.size, axes))).T + assert len(values) == 2 + XX, YY = np.meshgrid(values[0], values[1]) + Z = pd[target_idx].reshape(list(map(np.size, values))).T CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, colors='k') ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], @@ -549,7 +552,7 @@ def convert_feature(fx): tick_formatter.set_powerlimits((-3, 4)) ax.xaxis.set_major_formatter(tick_formatter) - if len(axes) > 1: + if len(values) > 1: # two-way PDP - y-axis deciles + labels deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1)) trans = transforms.blended_transform_factory(ax.transAxes, @@ -562,10 +565,10 @@ def convert_feature(fx): else: ax.set_ylabel('Partial dependence') - if len(axes) == 1: + if len(values) == 1: ax.set_ylim(pdp_lim[1]) - axs.append(ax) + axes.append(ax) fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4, hspace=0.3) - return fig, axs + return fig, axes From 787e07fa9fd99c890bde1dc4b75a50317f33e1ce Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 14 Nov 2018 18:18:08 -0500 Subject: [PATCH 037/113] some refactoring and tests --- sklearn/partial_dependence.py | 90 ++++++++++++------------ sklearn/tests/test_partial_dependence.py | 83 +++++++++++++++------- 2 files changed, 101 insertions(+), 72 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index b0e5078f3c95f..dd19c4b532f12 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -2,6 +2,7 @@ # Authors: Peter Prettenhofer # Trevor Stephens +# Nicolas Hug # License: BSD 3 clause from itertools import count @@ -69,6 +70,10 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): else: # create axis based on percentiles and grid resolution emp_percentiles = mquantiles(X, prob=percentiles, axis=0) + if np.allclose(emp_percentiles[0, feature], + emp_percentiles[1, feature]): + raise ValueError('percentiles are too close to each other, ' + 'not able to build the grid.') axis = np.linspace(emp_percentiles[0, feature], emp_percentiles[1, feature], num=grid_resolution, endpoint=True) @@ -107,11 +112,8 @@ def _partial_dependence_recursion(est, grid, 
target_variables, X=None): # _partial_dependence_tree doesn't add the initial estimator to the # predictions so we do it here - if isinstance(est, BaseGradientBoosting): - averaged_predictions += est._init_decision_function(X).mean() + averaged_predictions += est._init_decision_function(X).mean() - print(averaged_predictions.shape) - return averaged_predictions @@ -325,7 +327,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, X : array-like, shape=(n_samples, n_features) The data on which ``est`` was trained. features : seq of ints, strings, or tuples of ints or strings - If seq[i] is an int or a tuple with one int value, a one-way + If seq[i] is an int , a one-way PDP is created; if seq[i] is a tuple of two ints, a two-way PDP is created. If feature_names is specified and seq[i] is an int, seq[i] @@ -405,40 +407,21 @@ def plot_partial_dependence(est, X, features, feature_names=None, if target is None: raise ValueError('target must be specified for multi-class') target_idx = np.searchsorted(est.classes_, target) - if est.classes_[target_idx] != target: - raise ValueError('target %s not in ``est.classes_``' % str(target)) - else: - # regression and binary classification - target_idx = 0 - - if is_regressor(est) and "MultiTask" in est.__class__.__name__: - # multioutput regressor - if target is None: - raise ValueError( - 'target must be specified for multi-output regressors') - if target < 0: - raise ValueError('target must be in [0, n_tasks], got {}.'.format( + if (not (0 <= target_idx < len(est.classes_)) or + est.classes_[target_idx] != target): + raise ValueError('target not in est.classes_, got {}'.format( target)) - # Note: for multitask, upper bound for target can only be checked once - # we have the predictions - target_idx = target else: + # regression and binary classification target_idx = 0 #TODO: DYPE????? X = check_array(X, dtype=DTYPE, order='C') - if hasattr(est, 'n_features_') and est.n_features_ != X.shape[1]: - raise ValueError('X.shape[1] does not match est.n_features_') n_features = X.shape[1] - if line_kw is None: - line_kw = {'color': 'green'} - if contour_kw is None: - contour_kw = {} - # convert feature_names to list if feature_names is None: - # if not feature_names use fx indices as name + # if not feature_names use feature indices as name feature_names = [str(i) for i in range(n_features)] elif isinstance(feature_names, np.ndarray): feature_names = feature_names.tolist() @@ -449,20 +432,21 @@ def convert_feature(fx): fx = feature_names.index(fx) except ValueError: raise ValueError('Feature %s not in feature_names' % fx) - return fx + return int(fx) # convert features into a seq of int tuples tmp_features = [] for fxs in features: - if isinstance(fxs, (numbers.Integral,) + six.string_types): + if isinstance(fxs, (numbers.Integral, six.string_types)): fxs = (fxs,) try: - fxs = np.array([convert_feature(fx) for fx in fxs], dtype=np.int32) + fxs = [convert_feature(fx) for fx in fxs] except TypeError: - raise ValueError('features must be either int, str, or tuple ' - 'of int/str') + raise ValueError('Each entry in features must be either an int, ' + 'a string, or an iterable of size at most 2.') if not (1 <= np.size(fxs) <= 2): - raise ValueError('target features must be either one or two') + raise ValueError('Each entry in features must be either an int, ' + 'a string, or an iterable of size at most 2.') tmp_features.append(fxs) @@ -481,21 +465,30 @@ def convert_feature(fx): 'len(feature_names) = {0}, got {1}.' 
.format(len(feature_names), i)) - # compute PD functions + # compute averaged predictions pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( delayed(partial_dependence)(est, fxs, X=X, method=method, grid_resolution=grid_resolution, percentiles=percentiles) for fxs in features) - # Need to check if output param is valid. We can only do that now that we - # have the predictions: - if is_regressor and "MultiTask" in est.__class__.__name__: - pd, _ = pd_result[0] - if not 0 <= output <= pd.shape[0]: + # For multioutput regression, we can only check the validity of target + # now that we have the predictions. + # Also note: as multiclass-multioutput classifiers are not supported, + # multiand multioutput setting. So there is no risk of overwriting + # target_idx here. + pd, _ = pd_result[0] # checking the first result is enough + if is_regressor(est) and pd.shape[0] > 1: + if target is None: + raise ValueError( + 'target must be specified for multi-output regressors') + if not 0 <= target <= pd.shape[0]: raise ValueError( - 'output must be in [0, n_tasks], got {}.'.format( - target_idx)) + 'target must be in [0, n_tasks], got {}.'.format( + target)) + target_idx = target + else: + target_idx = 0 # get global min and max values of PD grouped by plot type pdp_lim = {} @@ -517,9 +510,14 @@ def convert_feature(fx): fig = ax.get_figure() fig.clear() + if line_kw is None: + line_kw = {'color': 'green'} + if contour_kw is None: + contour_kw = {} + n_cols = min(n_cols, len(features)) n_rows = int(np.ceil(len(features) / float(n_cols))) - axes = [] + axs = [] for i, fx, name, (pd, values) in zip(count(), features, names, pd_result): ax = fig.add_subplot(n_rows, n_cols, i + 1) @@ -567,8 +565,8 @@ def convert_feature(fx): if len(values) == 1: ax.set_ylim(pdp_lim[1]) - axes.append(ax) + axs.append(ax) fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4, hspace=0.3) - return fig, axes + return fig, axs diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index d12b9f3f7acbe..0330c14f44a0e 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -63,6 +63,7 @@ def multioutput_regression(): (GradientBoostingRegressor, 'recursion', regression()), (GradientBoostingRegressor, 'brute', regression()), (LinearRegression, 'brute', regression()), + (LinearRegression, 'brute', multioutput_regression()), (LogisticRegression, 'brute', binary_classification()), (LogisticRegression, 'brute', multiclass_classification()), (MultiTaskLasso, 'brute', multioutput_regression()), @@ -132,6 +133,13 @@ def test_grid_from_X(): assert axes[0].shape == (n_unique_values,) assert axes[1].shape == (grid_resolution,) + assert_raises_regex(ValueError, + 'percentiles are too close', + _grid_from_X, + X, + grid_resolution=10, + percentiles=(0, 0.0001)) + @pytest.mark.parametrize('target_feature', (0, 3)) @pytest.mark.parametrize('est, partial_dependence_fun', @@ -173,7 +181,7 @@ def test_partial_dependence_helpers(est, partial_dependence_fun, sklearn.neighbors.RadiusNeighborsClassifier, sklearn.ensemble.RandomForestClassifier)) def test_multiclass_multioutput(Estimator): - # Make sure multiclass-multioutput classifiers are not supported + # Make error is raised for multiclass-multioutput classifiers # make multiclass-multioutput dataset X, y = make_classification(n_classes=3, n_clusters_per_class=1, @@ -329,34 +337,57 @@ def test_plot_partial_dependence_multiclass(): @if_matplotlib def test_plot_partial_dependence_input(): - 
# Test partial dependence plot function input checks. - clf = GradientBoostingClassifier(n_estimators=10, random_state=1) - - # not fitted yet - assert_raises(ValueError, plot_partial_dependence, - clf, X, [0]) - - clf.fit(X, y) + X, y = make_classification(random_state=0) - assert_raises(ValueError, plot_partial_dependence, - clf, np.array(X)[:, :0], [0]) + lr = LinearRegression() + lr.fit(X, y) + gbc = GradientBoostingClassifier(random_state=0) + gbc.fit(X, y) - # first argument must be an instance of BaseGradientBoosting - assert_raises(ValueError, plot_partial_dependence, - {}, X, [0]) + # check target param for multiclass + (X_m, y_m), _ = multiclass_classification() + lr_m = LogisticRegression() + lr_m.fit(X_m, y_m) + assert_raises_regex(ValueError, + 'target must be specified for multi-class', + plot_partial_dependence, lr_m, X_m, [0], + target=None) + for target in (-1, 100): + assert_raises_regex(ValueError, + 'target not in est.classes_', + plot_partial_dependence, lr_m, X_m, [0], + target=target) + + # check target param for multioutput + (X_m, y_m), _ = multioutput_regression() + lr_m = LinearRegression() + lr_m.fit(X_m, y_m) + assert_raises_regex(ValueError, + 'target must be specified for multi-output', + plot_partial_dependence, lr_m, X_m, [0], + target=None) + for target in (-1, 100): + assert_raises_regex(ValueError, + 'target must be in \[0, n_tasks\]', + plot_partial_dependence, lr_m, X_m, [0], + target=target) - # must be larger than -1 - assert_raises(ValueError, plot_partial_dependence, - clf, X, [-1]) - # too large feature value - assert_raises(ValueError, plot_partial_dependence, - clf, X, [100]) + for feature_names in (None, ['abcd', 'def']): + assert_raises_regex(ValueError, + 'Feature foobar not in feature_names', + plot_partial_dependence, lr, X, + features=['foobar'], + feature_names=feature_names) - # str feature but no feature_names - assert_raises(ValueError, plot_partial_dependence, - clf, X, ['foobar']) + for features in([(1, 2, 3)], [1, {}], [tuple()]): + assert_raises_regex(ValueError, + 'Each entry in features must be either an int, ', + plot_partial_dependence, lr, X, + features=features) - # not valid features value - assert_raises(ValueError, plot_partial_dependence, - clf, X, [{'foo': 'bar'}]) + assert_raises_regex(ValueError, + 'All entries of features must be less than ', + plot_partial_dependence, lr, X, + features=[123], + feature_names=['blah']) \ No newline at end of file From 39dffd7e12b3c72134d2917aae73c4de28285ad9 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 14 Nov 2018 19:56:18 -0500 Subject: [PATCH 038/113] some docs and tests --- sklearn/ensemble/partial_dependence.py | 2 +- .../ensemble/tests/test_partial_dependence.py | 4 +- sklearn/partial_dependence.py | 147 ++++++++---------- sklearn/tests/test_partial_dependence.py | 5 + 4 files changed, 75 insertions(+), 83 deletions(-) diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index 6fe483a0b59eb..03525bbcdb46d 100644 --- a/sklearn/ensemble/partial_dependence.py +++ b/sklearn/ensemble/partial_dependence.py @@ -154,7 +154,7 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, X=X, features=features, feature_names=feature_names, - label=label, + target=label, n_cols=n_cols, grid_resolution=grid_resolution, method='recursion', diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py index e9ea8801b83fb..cef18076f04a4 100644 --- 
a/sklearn/ensemble/tests/test_partial_dependence.py +++ b/sklearn/ensemble/tests/test_partial_dependence.py @@ -45,6 +45,7 @@ def test_partial_dependence_classifier(): # now with our own grid X_ = np.asarray(X) grid = np.unique(X_[:, 0]) + print(grid) pdp_2, axes = partial_dependence(clf, [0], grid=grid) assert axes is None @@ -90,9 +91,6 @@ def test_partial_dependecy_input(): assert_raises(ValueError, partial_dependence, clf, [0], grid=None, X=None) - assert_raises(ValueError, partial_dependence, - clf, [0], grid=[0, 1], X=X) - # first argument must be an instance of BaseGradientBoosting assert_raises(ValueError, partial_dependence, {}, [0], X=X) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index dd19c4b532f12..092edc373f4c8 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -28,29 +28,31 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): """Generate a grid of points based on the ``percentiles of ``X``. - The grid is generated by placing ``grid_resolution`` equally - spaced points between the ``percentiles`` of each column - of ``X``. If ``grid_resolution`` is bigger than the number of unique values - in a column, then those unique values will be used instead. + The grid is a cartesian product between the columns of Z. The ith column of + Z consists in ``grid_resolution`` equally-spaced points between the + percentiles of the ith column of X. + If ``grid_resolution`` is bigger than the number of unique values in the + ith column of X, then those unique values will be used instead. Parameters ---------- X : ndarray The data percentiles : tuple of floats - The percentiles which are used to construct the extreme - values of the grid axes. + The percentiles which are used to construct the extreme values of + the grid. grid_resolution : int The number of equally spaced points to be placed on the grid for a - given feature. + given column. Returns ------- - grid : ndarray - All data points on the grid. This is the cartesian product of ``axes``. - values: list of ndarray + grid : ndarray, shape=(n_points, X.shape[1]) + All data points on the grid. n_points is always ``<= grid_resolution ** + X.shape[1]``. + Z: list of ndarray The values with which the grid has been created. The ndarrays may be of - different shape: either (grid_resolution, ) or (n_unique_values,). + different shape: either (grid_resolution,) or (n_unique_values,). """ try: assert len(percentiles) == 2 @@ -73,7 +75,7 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): if np.allclose(emp_percentiles[0, feature], emp_percentiles[1, feature]): raise ValueError('percentiles are too close to each other, ' - 'not able to build the grid.') + 'unable to build the grid.') axis = np.linspace(emp_percentiles[0, feature], emp_percentiles[1, feature], num=grid_resolution, endpoint=True) @@ -109,10 +111,6 @@ def _partial_dependence_recursion(est, grid, target_variables, X=None): tree = est.estimators_[stage].tree_ _partial_dependence_tree(tree, grid, target_variables, learning_rate, averaged_predictions[k]) - - # _partial_dependence_tree doesn't add the initial estimator to the - # predictions so we do it here - averaged_predictions += est._init_decision_function(X).mean() return averaged_predictions @@ -138,13 +136,6 @@ def _partial_dependence_brute(est, grid, target_variables, X): # (n_points, 2) for binary classifaction # (n_points, n_classes) for multiclass classification - # Commenting this out for now. 
- # if is_classifier(est): - # predictions = np.log(np.clip(predictions, 1e-16, 1)) - # # not sure yet why we need to center probas? - # predictions = predictions - np.mean(predictions, axis=1, - # keepdims=True) - # average over samples averaged_predictions.append(np.mean(predictions, axis=0)) @@ -179,47 +170,47 @@ def partial_dependence(est, target_variables, grid=None, X=None, Parameters ---------- est : BaseEstimator - A fitted classification or regression model. - target_variables : array-like, dtype=int + A fitted classification or regression model. Multioutput-multiclass + classifiers are not supported. + target_variables : list or array-like of int The target features for which the partial dependency should be - computed (size should be smaller than 3 for visual renderings). + computed. grid : array-like, shape=(n_points, len(target_variables)) - The grid of ``target_variables`` values for which the - partial dependency should be evaluated (either ``grid`` or ``X`` - must be specified). + The grid of ``target_variables`` values for which the partial + dependency should be evaluated (either ``grid`` or ``X`` must be + specified). X : array-like, shape=(n_samples, n_features) - The data on which ``est`` was trained. It is used to generate - a ``grid`` for the ``target_variables``. The ``grid`` comprises - ``grid_resolution`` equally spaced points between the two - ``percentiles``. - output : int, optional (default=None) - The output index to use for multi-output estimators. + The data on which ``est`` was trained. It is used both to generate + a ``grid`` for the ``target_variables`` (if ``grid`` wasn't specified), + and then to compute the averaged predictions where the target features + values would have been replaced by those in the grid. percentiles : (low, high), default=(0.05, 0.95) The lower and upper percentile used to create the extreme values - for the ``grid``. Only if ``X`` is not None. + for the ``grid``. Only used if ``grid`` is None. grid_resolution : int, default=100 - The number of equally spaced points on the ``grid``. + The number of equally spaced points on the grid. Only used if ``grid`` + is None. method : {'recursion', 'brute', 'auto'}, default='auto' - The method to use to calculate the partial dependence function: - - - If 'recursion', the underlying trees of ``est`` will be recursed to - calculate the function. Only supported for BaseGradientBoosting. - - If 'brute', the function will be calculated by calling the - ``predict_proba`` method of ``est`` for classification or ``predict`` - for regression on ``X``for every point in the grid. To speed up this - method, you can use a subset of ``X`` or a more coarse grid. - - If 'auto', then 'recursion' will be used if ``est`` is - BaseGradientBoosting, and 'brute' used for other + The method to use to calculate the partial dependence predictions: + + - 'recursion' is only supported for objects inheriting from + `BaseGradientBoosting`, but is optimal in terms of speed. + - 'brute' is supported for any estimator, but is more + computationally intensive. Both methods are equivalent. + - If 'auto', then 'recursion' will be used for + ``BaseGradientBoosting`` estimators, and 'brute' used for other estimators. Returns ------- averaged_predictions : array, shape=(n_classes, n_points) The predictions for all the points in the ``grid``, averaged over - all samples. For regression and binary classification ``n_classes==1``. + all samples in X. For regression and binary classification + ``n_classes==1``. 
values: seq of ndarray or None
-        The values with which the grid has been created or None if
-        the grid has been given.
+        The values with which the grid has been created, or None if
+        the grid has been given. The grid is a cartesian product of the arrays
+        in ``values``.
 
     Examples
     --------
@@ -323,25 +314,25 @@ def plot_partial_dependence(est, X, features, feature_names=None,
     Parameters
     ----------
     est : BaseEstimator
-        A fitted classification or regression model.
+        A fitted classification or regression model. Classifiers must have a
+        ``predict_proba()`` method. Multioutput-multiclass estimators aren't
+        supported.
     X : array-like, shape=(n_samples, n_features)
         The data on which ``est`` was trained.
     features : seq of ints, strings, or tuples of ints or strings
-        If seq[i] is an int , a one-way
-        PDP is created; if seq[i] is a tuple of two ints, a two-way
-        PDP is created.
-        If feature_names is specified and seq[i] is an int, seq[i]
-        must be < len(feature_names).
-        If seq[i] is a string, feature_names must be specified, and
-        seq[i] must be in feature_names.
+        If features[i] is an int or a string, a one-way PDP is created; if
+        features[i] is a tuple, a two-way PDP is created. Each tuple must be of
+        size 2.
+        If any entry is a string, then it must be in ``feature_names``.
     feature_names : seq of str
-        Name of each feature; feature_names[i] holds
-        the name of the feature with index i.
-    label : object
-        The class label for which the PDPs should be computed.
-        Only if est is a multi-class model. Must be in ``est.classes_``.
-    output : int, optional (default=None)
-        The output index to use for multi-output estimators.
+        Name of each feature; feature_names[i] holds the name of the feature
+        with index i.
+    target : int, optional (default=None)
+        - In a multiclass setting, specifies the class for which the PDPs
+          should be computed. Note that for binary classification, the positive
+          class (index 1) is always used.
+        - In a multioutput setting, specifies the task for which the PDPs
+          should be computed.
     n_cols : int
         The number of columns in the grid plot (default: 3).
     grid_resolution : int, default=100
@@ -350,16 +341,15 @@ def plot_partial_dependence(est, X, features, feature_names=None,
         The lower and upper percentile used to create the extreme values
         for the PDP axes.
     method : {'recursion', 'brute', 'auto'}, default='auto'
-        The method to use to calculate the partial dependence function:
-
-        - If 'recursion', the underlying trees of ``est`` will be recursed to
-          calculate the function. Only supported for BaseGradientBoosting.
-        - If 'brute', the function will be calculated by calling the
-          ``predict_proba`` method of ``est`` for classification or ``predict``
-          for regression on ``X``for every point in the grid. To speed up this
-          method, you can use a subset of ``X`` or a more coarse grid.
-        - If 'auto', then 'recursion' will be used if ``est`` is
-          BaseGradientBoosting, and 'brute' used for other estimators.
+        The method to use to calculate the partial dependence predictions:
+
+        - 'recursion' is only supported for objects inheriting from
+          `BaseGradientBoosting`, but is optimal in terms of speed.
+        - 'brute' is supported for any estimator, but is more
+          computationally intensive. Both methods are equivalent.
+        - If 'auto', then 'recursion' will be used for
+          ``BaseGradientBoosting`` estimators, and 'brute' used for other
+          estimators.
     n_jobs : int
         The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
         Defaults to 1.
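Editor's note: to make the reworked ``target`` parameter above concrete, here is a
minimal sketch in the multiclass setting. It is adapted from the user guide added
later in this series; the iris setup and the hyperparameters are illustrative, not
part of the patch::

    from sklearn.datasets import load_iris
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.partial_dependence import plot_partial_dependence

    iris = load_iris()
    mc_clf = GradientBoostingClassifier(n_estimators=10, max_depth=1,
                                        random_state=0).fit(iris.data, iris.target)
    # multiclass: 'target' picks the class whose averaged predicted
    # probability is plotted; two one-way PDPs and one two-way PDP
    fig, axs = plot_partial_dependence(mc_clf, iris.data, [0, 1, (0, 1)],
                                       target=0, grid_resolution=20)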
@@ -382,7 +372,7 @@ def plot_partial_dependence(est, X, features, feature_names=None,
     -------
     fig : figure
         The Matplotlib Figure object.
-    axes : seq of Axis objects
+    axs : seq of Axis objects
         A seq of Axis objects, one for each subplot.
 
     Examples
@@ -415,13 +405,12 @@ def plot_partial_dependence(est, X, features, feature_names=None,
         # regression and binary classification
         target_idx = 0
 
-    #TODO: DYPE?????
-    X = check_array(X, dtype=DTYPE, order='C')
+    X = check_array(X)
     n_features = X.shape[1]
 
     # convert feature_names to list
     if feature_names is None:
-        # if not feature_names use feature indices as name
+        # if feature_names is None, use feature indices as name
         feature_names = [str(i) for i in range(n_features)]
     elif isinstance(feature_names, np.ndarray):
         feature_names = feature_names.tolist()
diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py
index 0330c14f44a0e..5f23cf02a9f56 100644
--- a/sklearn/tests/test_partial_dependence.py
+++ b/sklearn/tests/test_partial_dependence.py
@@ -155,6 +155,10 @@ def test_partial_dependence_helpers(est, partial_dependence_fun,
     # output.
 
     X, y = make_regression(random_state=0)
+    # The 'init' estimator for GBDT (here the average prediction) isn't taken
+    # into account with the recursion method, for technical reasons. We set
+    # the mean to 0 so that this 'bug' doesn't have any effect.
+    y = y - y.mean()
     est.fit(X, y)
 
     # target feature will be set to .5 and then to 123
@@ -264,6 +268,7 @@ def test_partial_dependence_input():
         partial_dependence(est, [0], grid=[1, 2], X=list(X))
         partial_dependence(est, [0], grid=[[1], [2]], X=list(X))
 
+    partial_dependence(gbc, [0], grid=[1, 2])
 
 @if_matplotlib
 def test_plot_partial_dependence():
From f9f7ee79b619f63ab197163dba23b91d73926ba9 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Thu, 15 Nov 2018 13:10:33 -0500
Subject: [PATCH 039/113] Added check for grid_resolution

---
 sklearn/partial_dependence.py            |  8 +++--
 sklearn/tests/test_partial_dependence.py | 37 ++++++++++++------------
 2 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py
index 092edc373f4c8..333cc1609e386 100644
--- a/sklearn/partial_dependence.py
+++ b/sklearn/partial_dependence.py
@@ -60,8 +60,12 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100):
         raise ValueError('percentiles must be a sequence of 2 elements.')
     if not all(0. <= x <= 1.
for x in percentiles): raise ValueError('percentiles values must be in [0, 1].') - if percentiles[0] > percentiles[1]: - raise ValueError('percentiles[0] must be less than percentiles[1].') + if percentiles[0] >= percentiles[1]: + raise ValueError('percentiles[0] must be strictly less ' + 'than percentiles[1].') + + if grid_resolution <= 1: + raise ValueError('grid_resolution must be strictly greater than 1.') values = [] for feature in range(X.shape[1]): diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 5f23cf02a9f56..f9045717d2365 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -133,13 +133,26 @@ def test_grid_from_X(): assert axes[0].shape == (n_unique_values,) assert axes[1].shape == (grid_resolution,) - assert_raises_regex(ValueError, - 'percentiles are too close', - _grid_from_X, - X, - grid_resolution=10, + assert_raises_regex(ValueError, 'percentiles are too close', + _grid_from_X, X, grid_resolution=2, percentiles=(0, 0.0001)) + for percentiles in ((1, 2, 3, 4), 12345): + assert_raises_regex(ValueError, "percentiles must be a sequence", + _grid_from_X, X, percentiles=percentiles) + + for percentiles in ((-1, .95), (.05, 2)): + assert_raises_regex(ValueError, "percentiles values must be in", + _grid_from_X, X, percentiles=percentiles) + + assert_raises_regex(ValueError, + "percentiles\[0\] must be strictly less than", + _grid_from_X, X, percentiles=(.9, .1)) + + assert_raises_regex(ValueError, + 'grid_resolution must be strictly greater than 1.', + _grid_from_X, X, grid_resolution=1) + @pytest.mark.parametrize('target_feature', (0, 3)) @pytest.mark.parametrize('est, partial_dependence_fun', @@ -236,18 +249,6 @@ def test_partial_dependence_input(): assert_raises_regex(ValueError, "Either grid or X must be specified", partial_dependence, gbc, [0], grid=None, X=None) - for percentiles in ((1, 2, 3, 4), 12345): - assert_raises_regex(ValueError, "percentiles must be a sequence", - partial_dependence, lr, [0], grid=None, X=X, - percentiles=percentiles) - for percentiles in ((-1, .95), (.05, 2)): - assert_raises_regex(ValueError, "percentiles values must be in", - partial_dependence, lr, [0], grid=None, X=X, - percentiles=percentiles) - assert_raises_regex(ValueError, "percentiles\[0\] must be less than", - partial_dependence, lr, [0], grid=None, X=X, - percentiles=(.9, .1)) - assert_raises_regex(ValueError, "grid must be 1d or 2d", partial_dependence, lr, [0], grid=[[[1]]], X=X) @@ -395,4 +396,4 @@ def test_plot_partial_dependence_input(): 'All entries of features must be less than ', plot_partial_dependence, lr, X, features=[123], - feature_names=['blah']) \ No newline at end of file + feature_names=['blah']) From 15e824d59a570226c8affe48c77ad305d32d0ec4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 15 Nov 2018 14:51:56 -0500 Subject: [PATCH 040/113] docs --- doc/model_selection.rst | 1 + doc/modules/classes.rst | 18 +++ doc/modules/ensemble.rst | 125 ------------------ doc/modules/partial_dependence.rst | 124 +++++++++++++++++ .../{ensemble => }/plot_partial_dependence.py | 7 +- sklearn/ensemble/partial_dependence.py | 7 +- .../ensemble/tests/test_partial_dependence.py | 10 +- sklearn/partial_dependence.py | 27 ++-- 8 files changed, 170 insertions(+), 149 deletions(-) create mode 100644 doc/modules/partial_dependence.rst rename examples/{ensemble => }/plot_partial_dependence.py (94%) diff --git a/doc/model_selection.rst b/doc/model_selection.rst index 
daec6a6ed83e4..7d559615e069f 100644 --- a/doc/model_selection.rst +++ b/doc/model_selection.rst @@ -12,3 +12,4 @@ Model selection and evaluation modules/model_evaluation modules/model_persistence modules/learning_curve + modules/partial_dependence diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 473ea1c6a3539..e13f83d5eb3ee 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1225,6 +1225,24 @@ Model validation pipeline.make_pipeline pipeline.make_union +.. _partial_dependence_ref: + +:mod:`sklearn.partial_dependence`: Partial dependence plots +=========================================================== + +.. automodule:: sklearn.partial_dependence + :no-members: + :no-inherited-members: + +.. currentmodule:: sklearn + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + partial_dependence.partial_dependence + partial_dependence.plot_partial_dependence + .. _preprocessing_ref: diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index c8895f3fd5ad3..0d72680f2ad09 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -797,131 +797,6 @@ accessed via the ``feature_importances_`` property:: * :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_regression.py` -.. currentmodule:: sklearn.ensemble.partial_dependence - -.. _partial_dependence: - -Partial dependence -.................. - -Partial dependence plots (PDP) show the dependence between the target response -and a set of 'target' features, marginalizing over the -values of all other features (the 'complement' features). -Intuitively, we can interpret the partial dependence as the expected -target response [1]_ as a function of the 'target' features [2]_. - -Due to the limits of human perception the size of the target feature -set must be small (usually, one or two) thus the target features are -usually chosen among the most important features. - -The Figure below shows four one-way and one two-way partial dependence plots -for the California housing dataset: - -.. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_partial_dependence_001.png - :target: ../auto_examples/ensemble/plot_partial_dependence.html - :align: center - :scale: 70 - -One-way PDPs tell us about the interaction between the target -response and the target feature (e.g. linear, non-linear). -The upper left plot in the above Figure shows the effect of the -median income in a district on the median house price; we can -clearly see a linear relationship among them. - -PDPs with two target features show the -interactions among the two features. For example, the two-variable PDP in the -above Figure shows the dependence of median house price on joint -values of house age and avg. occupants per household. We can clearly -see an interaction between the two features: -For an avg. occupancy greater than two, the house price is nearly independent -of the house age, whereas for values less than two there is a strong dependence -on age. - -The module :mod:`partial_dependence` provides a convenience function -:func:`~sklearn.ensemble.partial_dependence.plot_partial_dependence` -to create one-way and two-way partial dependence plots. 
In the below example -we show how to create a grid of partial dependence plots: two one-way -PDPs for the features ``0`` and ``1`` and a two-way PDP between the two -features:: - - >>> from sklearn.datasets import make_hastie_10_2 - >>> from sklearn.ensemble import GradientBoostingClassifier - >>> from sklearn.ensemble.partial_dependence import plot_partial_dependence - - >>> X, y = make_hastie_10_2(random_state=0) - >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, - ... max_depth=1, random_state=0).fit(X, y) - >>> features = [0, 1, (0, 1)] - >>> fig, axs = plot_partial_dependence(clf, X, features) #doctest: +SKIP - -For multi-class models, you need to set the class label for which the -PDPs should be created via the ``label`` argument:: - - >>> from sklearn.datasets import load_iris - >>> iris = load_iris() - >>> mc_clf = GradientBoostingClassifier(n_estimators=10, - ... max_depth=1).fit(iris.data, iris.target) - >>> features = [3, 2, (3, 2)] - >>> fig, axs = plot_partial_dependence(mc_clf, X, features, label=0) #doctest: +SKIP - -If you need the raw values of the partial dependence function rather -than the plots you can use the -:func:`~sklearn.ensemble.partial_dependence.partial_dependence` function:: - - >>> from sklearn.ensemble.partial_dependence import partial_dependence - - >>> pdp, axes = partial_dependence(clf, [0], X=X) - >>> pdp # doctest: +ELLIPSIS - array([[ 2.46643157, 2.46643157, ... - >>> axes # doctest: +ELLIPSIS - [array([-1.62497054, -1.59201391, ... - -The function requires either the argument ``grid`` which specifies the -values of the target features on which the partial dependence function -should be evaluated or the argument ``X`` which is a convenience mode -for automatically creating ``grid`` from the training data. If ``X`` -is given, the ``axes`` value returned by the function gives the axis -for each target feature. - -For each value of the 'target' features in the ``grid`` the partial -dependence function need to marginalize the predictions of a tree over -all possible values of the 'complement' features. In decision trees -this function can be evaluated efficiently without reference to the -training data. For each grid point a weighted tree traversal is -performed: if a split node involves a 'target' feature, the -corresponding left or right branch is followed, otherwise both -branches are followed, each branch is weighted by the fraction of -training samples that entered that branch. Finally, the partial -dependence is given by a weighted average of all visited leaves. For -tree ensembles the results of each individual tree are again -averaged. - -.. rubric:: Footnotes - -.. [1] For classification with ``loss='deviance'`` the target - response is logit(p). - -.. [2] More precisely its the expectation of the target response after - accounting for the initial model; partial dependence plots - do not include the ``init`` model. - -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_ensemble_plot_partial_dependence.py` - - -.. topic:: References - - .. [F2001] J. Friedman, "Greedy Function Approximation: A Gradient Boosting Machine", - The Annals of Statistics, Vol. 29, No. 5, 2001. - - .. [F1999] J. Friedman, "Stochastic Gradient Boosting", 1999 - - .. [HTF2009] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical Learning Ed. 2", Springer, 2009. - - .. [R2007] G. Ridgeway, "Generalized Boosted Models: A guide to the gbm package", 2007 - - .. 
_voting_classifier: Voting Classifier diff --git a/doc/modules/partial_dependence.rst b/doc/modules/partial_dependence.rst new file mode 100644 index 0000000000000..ee89e6d9d33c3 --- /dev/null +++ b/doc/modules/partial_dependence.rst @@ -0,0 +1,124 @@ + +.. _partial_dependence: + +======================== +Partial dependence plots +======================== + +.. currentmodule:: sklearn.partial_dependence + +Partial dependence plots (PDP) show the dependence between the target response +and a set of 'target' features, marginalizing over the values of all other +features (the 'complement' features). Intuitively, we can interpret the +partial dependence as the expected target response [1]_ as a function of the +'target' features. + +Due to the limits of human perception the size of the target feature set +must be small (usually, one or two) thus the target features are usually +chosen among the most important features. + +The Figure below shows four one-way and one two-way partial dependence plots +for the California housing dataset, with a :class:`GradientBoostingRegressor +`: + +.. figure:: ../auto_examples/images/sphx_glr_plot_partial_dependence_001.png + :target: ../auto_examples/plot_partial_dependence.html + :align: center + :scale: 70 + +One-way PDPs tell us about the interaction between the target response and +the target feature (e.g. linear, non-linear). The upper left plot in the +above figure shows the effect of the median income in a district on the +median house price; we can clearly see a linear relationship among them. + +PDPs with two target features show the interactions among the two features. +For example, the two-variable PDP in the above Figure shows the dependence +of median house price on joint values of house age and avg. occupants per +household. We can clearly see an interaction between the two features: for +an avg. occupancy greater than two, the house price is nearly independent of +the house age, whereas for values less than two there is a strong dependence +on age. + +The :mod:`sklearn.partial_dependence` module provides a convenience function +:func:`plot_partial_dependence` to create one-way and two-way partial +dependence plots. In the below example we show how to create a grid of +partial dependence plots: two one-way PDPs for the features ``0`` and ``1`` +and a two-way PDP between the two features:: + + >>> from sklearn.datasets import make_hastie_10_2 + >>> from sklearn.ensemble import GradientBoostingClassifier + >>> from sklearn.partial_dependence import plot_partial_dependence + + >>> X, y = make_hastie_10_2(random_state=0) + >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, + ... max_depth=1, random_state=0).fit(X, y) + >>> features = [0, 1, (0, 1)] + >>> fig, axs = plot_partial_dependence(clf, X, features) #doctest: +SKIP + +For multi-class classification, you need to set the class label for which +the PDPs should be created via the ``target`` argument:: + + >>> from sklearn.datasets import load_iris + >>> iris = load_iris() + >>> mc_clf = GradientBoostingClassifier(n_estimators=10, + ... max_depth=1).fit(iris.data, iris.target) + >>> features = [3, 2, (3, 2)] + >>> fig, axs = plot_partial_dependence(mc_clf, X, features, target=0) #doctest: +SKIP + +The same parameter ``target`` is used to specify the target in multi-output +regression settings. 
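A short sketch of that multi-output case may help; it mirrors the tests added in
this series, and the ``LinearRegression`` setup is purely illustrative::

    from sklearn.datasets import make_regression
    from sklearn.linear_model import LinearRegression
    from sklearn.partial_dependence import plot_partial_dependence

    X, y = make_regression(n_targets=2, random_state=0)
    est = LinearRegression().fit(X, y)
    # multioutput: 'target' selects the regression task (output column)
    fig, axs = plot_partial_dependence(est, X, [0, 1], target=1,
                                       grid_resolution=20)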
+ +If you need the raw values of the partial dependence function rather than +the plots, you can use the :func:`partial_dependence` function:: + + >>> from sklearn.partial_dependence import partial_dependence + + >>> pdp, axes = partial_dependence(clf, [0], X=X) + >>> pdp # doctest: +ELLIPSIS + array([[ 2.46643157, 2.46643157, ... + >>> axes # doctest: +ELLIPSIS + [array([-1.62497054, -1.59201391, ... + +The function requires either the argument ``grid`` which specifies the +values of the target features on which the partial dependence function +should be evaluated, or the argument ``X`` which is a convenience mode for +automatically creating ``grid`` from the training data. If ``grid`` is not +specified, the ``values`` field returned by the function gives the actual +values used in the grid for each target feature. They also correspond to the +axis of the plots. + +For each value of the 'target' features in the ``grid`` the partial +dependence function needs to marginalize the predictions of the estimator +over all possible values of the 'complement' features. With the ``'brute'`` +method, this is done by replacing every target feature value of `X` by those +in the grid, and computing the average prediction. + +In decision trees this can be evaluated efficiently without reference to the +training data (``'recursion'`` method). For each grid point a weighted tree +traversal is performed: if a split node involves a 'target' feature, the +corresponding left or right branch is followed, otherwise both branches are +followed, each branch is weighted by the fraction of training samples that +entered that branch. Finally, the partial dependence is given by a weighted +average of all visited leaves. + +.. rubric:: Footnotes + +.. [1] For classification, the target response is the probability of a class. + In particular for binary classification, this is the probability of the + positive class. + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_plot_partial_dependence.py` + + +.. topic:: References + + .. [F2001] J. Friedman, "Greedy Function Approximation: A Gradient Boosting Machine", + The Annals of Statistics, Vol. 29, No. 5, 2001. + + .. [F1999] J. Friedman, "Stochastic Gradient Boosting", 1999 + + .. [HTF2009] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical Learning Ed. 2", Springer, 2009. + + .. [R2007] G. Ridgeway, "Generalized Boosted Models: A guide to the gbm package", 2007 diff --git a/examples/ensemble/plot_partial_dependence.py b/examples/plot_partial_dependence.py similarity index 94% rename from examples/ensemble/plot_partial_dependence.py rename to examples/plot_partial_dependence.py index f1e5f8607d1b6..c3fd666b72de4 100644 --- a/examples/ensemble/plot_partial_dependence.py +++ b/examples/plot_partial_dependence.py @@ -8,8 +8,7 @@ values of all other features (the complement features). Due to the limits of human perception the size of the target feature set must be small (usually, one or two) thus the target features are usually chosen among the most -important features -(see :attr:`~sklearn.ensemble.GradientBoostingRegressor.feature_importances_`). +important features. 
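Editor's note: one common way to pick those important features is sketched below.
It relies on the impurity-based ``feature_importances_`` attribute of gradient
boosting estimators; the ``make_hastie_10_2`` setup is illustrative only::

    import numpy as np
    from sklearn.datasets import make_hastie_10_2
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.partial_dependence import plot_partial_dependence

    X, y = make_hastie_10_2(random_state=0)
    clf = GradientBoostingClassifier(random_state=0).fit(X, y)
    # rank features by the fitted ensemble's impurity-based importances
    # and plot one-way PDPs for the two most important ones
    top_two = np.argsort(clf.feature_importances_)[-2:].tolist()
    fig, axs = plot_partial_dependence(clf, X, top_two)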
This example shows how to obtain partial dependence plots from a :class:`~sklearn.ensemble.GradientBoostingRegressor` trained on the California @@ -54,8 +53,8 @@ from sklearn.model_selection import train_test_split from sklearn.ensemble import GradientBoostingRegressor -from sklearn.ensemble.partial_dependence import plot_partial_dependence -from sklearn.ensemble.partial_dependence import partial_dependence +from sklearn.partial_dependence import plot_partial_dependence +from sklearn.partial_dependence import partial_dependence from sklearn.datasets.california_housing import fetch_california_housing diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index 03525bbcdb46d..4c95ac637e00e 100644 --- a/sklearn/ensemble/partial_dependence.py +++ b/sklearn/ensemble/partial_dependence.py @@ -60,7 +60,8 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) """ warnings.warn("The function ensemble.partial_dependence has been moved to " - "partial_dependence in 0.20 and will be removed in 0.22.", + "partial_dependence.partial_dependence in 0.21 and will " + "be removed in 0.23.", DeprecationWarning) return new_pd(est=gbrt, target_variables=target_variables, @@ -147,8 +148,8 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, ... """ warnings.warn("The function ensemble.plot_partial_dependence has been " - "moved to partial_dependence in 0.20 and will be removed " - "in 0.22.", + "moved to partial_dependence.plot_partial_dependence in " + " 0.21 and will be removed in 0.23.", DeprecationWarning) return new_ppd(est=gbrt, X=X, diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py index cef18076f04a4..9f2ad310bc905 100644 --- a/sklearn/ensemble/tests/test_partial_dependence.py +++ b/sklearn/ensemble/tests/test_partial_dependence.py @@ -227,9 +227,8 @@ def test_warning_raised_for_partial_dependence(): grid_resolution = 25 assert_warns_message(DeprecationWarning, "The function " - "ensemble.partial_dependence has been moved to " - "partial_dependence in 0.20 and will be removed in " - "0.22.", partial_dependence, clf, [0], X=boston.data, + "ensemble.partial_dependence has been moved to ", + partial_dependence, clf, [0], X=boston.data, grid_resolution=grid_resolution) @@ -241,8 +240,7 @@ def test_warning_raised_for_plot_partial_dependence(): grid_resolution = 25 assert_warns_message(DeprecationWarning, "The function " - "ensemble.plot_partial_dependence has been moved to " - "partial_dependence in 0.20 and will be removed in " - "0.22.", plot_partial_dependence, clf, boston.data, + "ensemble.plot_partial_dependence has been moved to ", + plot_partial_dependence, clf, boston.data, [0, 1, (0, 1)], grid_resolution=grid_resolution, feature_names=boston.feature_names) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 333cc1609e386..fd45f36e024be 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -165,10 +165,6 @@ def partial_dependence(est, target_variables, grid=None, X=None, method='auto'): """Partial dependence of ``target_variables``. - Partial dependence plots show the dependence between the joint values - of the ``target_variables`` and the function represented - by the ``est``. - Read more in the :ref:`User Guide `. 
Parameters @@ -179,14 +175,13 @@ def partial_dependence(est, target_variables, grid=None, X=None, target_variables : list or array-like of int The target features for which the partial dependency should be computed. - grid : array-like, shape=(n_points, len(target_variables)) + grid : array-like, shape=(n_points, len(target_variables)), optional The grid of ``target_variables`` values for which the partial - dependency should be evaluated (either ``grid`` or ``X`` must be - specified). + dependency should be evaluated. X : array-like, shape=(n_samples, n_features) The data on which ``est`` was trained. It is used both to generate a ``grid`` for the ``target_variables`` (if ``grid`` wasn't specified), - and then to compute the averaged predictions where the target features + and to compute the averaged predictions where the target features values would have been replaced by those in the grid. percentiles : (low, high), default=(0.05, 0.95) The lower and upper percentile used to create the extreme values @@ -198,13 +193,18 @@ def partial_dependence(est, target_variables, grid=None, X=None, The method to use to calculate the partial dependence predictions: - 'recursion' is only supported for objects inheriting from - `BaseGradientBoosting`, but is optimal in terms of speed. + `BaseGradientBoosting`, but is optimal in terms of speed. With + this method, ``X`` is optional and is only used to build the grid. - 'brute' is supported for any estimator, but is more computationally intensive. Both methods are equivalent. - If 'auto', then 'recursion' will be used for ``BaseGradientBoosting`` estimators, and 'brute' used for other estimators. + Unlike the 'brute' method, 'recursion' does not account for the + ``init`` predictor of the boosting process. In practice this still + produces the same values, up to a constant offset. + Returns ------- averaged_predictions : array, shape=(n_classes, n_points) @@ -348,12 +348,17 @@ class (index 1) is always used. The method to use to calculate the partial dependence predictions: - 'recursion' is only supported for objects inheriting from - `BaseGradientBoosting`, but is optimal in terms of speed. + `BaseGradientBoosting`, but is optimal in terms of speed. With + this method, ``X`` is optional and is only used to build the grid. - 'brute' is supported for any estimator, but is more - computationally intensive. Both methods are equivalent. + computationally intensive. - If 'auto', then 'recursion' will be used for ``BaseGradientBoosting`` estimators, and 'brute' used for other estimators. + + Unlike the 'brute' method, 'recursion' does not account for the + ``init`` predictor of the boosting process. In practice this still + produces the same plots, up to a constant offset. n_jobs : int The number of CPUs to use to compute the PDs. -1 means 'all CPUs'. Defaults to 1. 
From 2f34cc12784d878d15a7a550ed096166065685e9 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 15 Nov 2018 15:06:27 -0500 Subject: [PATCH 041/113] added deprecation in doc and used decorator --- sklearn/ensemble/partial_dependence.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index 4c95ac637e00e..6fcdc9ba709c4 100644 --- a/sklearn/ensemble/partial_dependence.py +++ b/sklearn/ensemble/partial_dependence.py @@ -6,8 +6,12 @@ import warnings from ..partial_dependence import partial_dependence as new_pd from ..partial_dependence import plot_partial_dependence as new_ppd +from ..utils import deprecated +@deprecated("The function ensemble.partial_dependence has been moved to " + "partial_dependence.partial_dependence in 0.21 and will " + "be removed in 0.23.") def partial_dependence(gbrt, target_variables, grid=None, X=None, percentiles=(0.05, 0.95), grid_resolution=100): """Partial dependence of ``target_variables``. @@ -18,6 +22,11 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, Read more in the :ref:`User Guide `. + .. deprecated:: 0.21 + This function was deprecated in version 0.21 in favor of + :func:`sklearn.partial_dependence.partial_dependence` and will be + removed in 0.23. + Parameters ---------- gbrt : BaseGradientBoosting @@ -59,10 +68,6 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) """ - warnings.warn("The function ensemble.partial_dependence has been moved to " - "partial_dependence.partial_dependence in 0.21 and will " - "be removed in 0.23.", - DeprecationWarning) return new_pd(est=gbrt, target_variables=target_variables, grid=grid, @@ -72,6 +77,9 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, method='recursion') +@deprecated("The function ensemble.plot_partial_dependence has been " + "moved to partial_dependence.plot_partial_dependence in " + " 0.21 and will be removed in 0.23.") def plot_partial_dependence(gbrt, X, features, feature_names=None, label=None, n_cols=3, grid_resolution=100, percentiles=(0.05, 0.95), n_jobs=None, @@ -85,6 +93,11 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, Read more in the :ref:`User Guide `. + .. deprecated:: 0.21 + This function was deprecated in version 0.21 in favor of + :func:`sklearn.partial_dependence.plot_partial_dependence` and will be + removed in 0.23. + Parameters ---------- gbrt : BaseGradientBoosting @@ -147,10 +160,6 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP ... """ - warnings.warn("The function ensemble.plot_partial_dependence has been " - "moved to partial_dependence.plot_partial_dependence in " - " 0.21 and will be removed in 0.23.", - DeprecationWarning) return new_ppd(est=gbrt, X=X, features=features, From 36db4416c2ed595a17c39dbbaf34fa8a89037c7c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 15 Nov 2018 15:17:50 -0500 Subject: [PATCH 042/113] added whatsnew entry --- doc/whats_new/v0.21.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 28c1cc40542e2..7c4d9b43fc238 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -66,6 +66,14 @@ Support for Python 3.4 and below has been officially dropped. 
communication overhead. :issue:`12543` by :user:`Isaac Storch ` and `Olivier Grisel`_. +- |API| :func:`ensemble.partial_dependence` and + :func:`ensemble.plot_partial_dependence` are now deprecated in favor of + :func:`partial_dependence.partial_dependence` + and + :func:`partial_dependence.plot_partial_dependence`. + :issue:TODO by :user:`Trevor Stephens` and :user:`Nicolas + Hug`. + :mod:`sklearn.metrics` ...................... @@ -137,6 +145,13 @@ Support for Python 3.4 and below has been officially dropped. affects all ensemble methods using decision trees. :pr:`12344` by :user:`Adrin Jalali `. +:mod:`sklearn.partial_dependence` +................................. +- |Feature| Partial dependence plots + (:func:`partial_dependence.plot_partial_dependence`) are now supported for + any regressor or classifier (provided that they have a `predict_proba()` + method). :issue:TODO by :user:`Trevor Stephens` and + :user:`Nicolas Hug`. Multiple modules ................ From ef40ede053a6bbbc492e59fb04e656a41429a71d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 15 Nov 2018 15:23:37 -0500 Subject: [PATCH 043/113] pep8 --- sklearn/ensemble/partial_dependence.py | 1 - sklearn/partial_dependence.py | 25 +++++++++++------------- sklearn/tests/test_partial_dependence.py | 18 ++++++----------- 3 files changed, 17 insertions(+), 27 deletions(-) diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index 6fcdc9ba709c4..7073acfa3a8c3 100644 --- a/sklearn/ensemble/partial_dependence.py +++ b/sklearn/ensemble/partial_dependence.py @@ -3,7 +3,6 @@ # Authors: Peter Prettenhofer # License: BSD 3 clause -import warnings from ..partial_dependence import partial_dependence as new_pd from ..partial_dependence import plot_partial_dependence as new_ppd from ..utils import deprecated diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index fd45f36e024be..5c89d0e6e8bd2 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -105,8 +105,8 @@ def _partial_dependence_recursion(est, grid, target_variables, X=None): n_trees_per_stage = 1 n_estimators = len(est.estimators_) learning_rate = 1. - averaged_predictions = np.zeros((n_trees_per_stage, grid.shape[0]), dtype=np.float64, - order='C') + averaged_predictions = np.zeros((n_trees_per_stage, grid.shape[0]), + dtype=np.float64, order='C') for stage in range(n_estimators): for k in range(n_trees_per_stage): if isinstance(est, BaseGradientBoosting): @@ -115,7 +115,7 @@ def _partial_dependence_recursion(est, grid, target_variables, X=None): tree = est.estimators_[stage].tree_ _partial_dependence_tree(tree, grid, target_variables, learning_rate, averaged_predictions[k]) - + return averaged_predictions @@ -160,6 +160,7 @@ def _partial_dependence_brute(est, grid, target_variables, X): return averaged_predictions + def partial_dependence(est, target_variables, grid=None, X=None, percentiles=(0.05, 0.95), grid_resolution=100, method='auto'): @@ -281,7 +282,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, if grid is None: grid, values = _grid_from_X(X[:, target_variables], percentiles, - grid_resolution) + grid_resolution) else: grid = np.asarray(grid) values = None # don't return values if grid is given @@ -355,7 +356,7 @@ class (index 1) is always used. - If 'auto', then 'recursion' will be used for ``BaseGradientBoosting`` estimators, and 'brute' used for other estimators. 
- + Unlike the 'brute' method, 'recursion' does not account for the ``init`` predictor of the boosting process. In practice this still produces the same plots, up to a constant offset. @@ -397,9 +398,6 @@ class (index 1) is always used. from matplotlib import transforms from matplotlib.ticker import MaxNLocator from matplotlib.ticker import ScalarFormatter - # TODO: The pattern below required to avoid a namespace collision. - # TODO: Move below imports to module level import at 0.22 release. - from .ensemble.gradient_boosting import BaseGradientBoosting # set target_idx for multi-class estimators if hasattr(est, 'classes_') and np.size(est.classes_) > 2: @@ -407,7 +405,7 @@ class (index 1) is always used. raise ValueError('target must be specified for multi-class') target_idx = np.searchsorted(est.classes_, target) if (not (0 <= target_idx < len(est.classes_)) or - est.classes_[target_idx] != target): + est.classes_[target_idx] != target): raise ValueError('target not in est.classes_, got {}'.format( target)) else: @@ -453,11 +451,11 @@ def convert_feature(fx): names = [] try: for fxs in features: - l = [] + names_ = [] # explicit loop so "i" is bound for exception below for i in fxs: - l.append(feature_names[i]) - names.append(l) + names_.append(feature_names[i]) + names.append(names_) except IndexError: raise ValueError('All entries of features must be less than ' 'len(feature_names) = {0}, got {1}.' @@ -516,8 +514,7 @@ def convert_feature(fx): n_cols = min(n_cols, len(features)) n_rows = int(np.ceil(len(features) / float(n_cols))) axs = [] - for i, fx, name, (pd, values) in zip(count(), features, names, - pd_result): + for i, fx, name, (pd, values) in zip(count(), features, names, pd_result): ax = fig.add_subplot(n_rows, n_cols, i + 1) if len(values) == 1: diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index f9045717d2365..5eaadbbe4226d 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -3,7 +3,6 @@ """ import numpy as np -from numpy.testing import assert_array_equal from numpy.testing import assert_array_almost_equal import pytest @@ -11,8 +10,6 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import if_matplotlib -from sklearn.utils.testing import all_estimators -from sklearn.utils.testing import ignore_warnings from sklearn.partial_dependence import partial_dependence from sklearn.partial_dependence import plot_partial_dependence from sklearn.partial_dependence import _grid_from_X @@ -20,17 +17,12 @@ from sklearn.partial_dependence import _partial_dependence_recursion from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor -from sklearn.ensemble.gradient_boosting import BaseGradientBoosting -from sklearn.neighbors import KNeighborsClassifier -from sklearn.tree import DecisionTreeClassifier from sklearn.linear_model import LinearRegression from sklearn.linear_model import LogisticRegression from sklearn.linear_model import MultiTaskLasso from sklearn.svm import SVC from sklearn.datasets import load_boston, load_iris from sklearn.datasets import make_classification, make_regression -from sklearn.base import is_classifier, is_regressor -from sklearn.utils.estimator_checks import multioutput_estimator_convert_y_2d from sklearn.cluster import KMeans @@ -55,6 +47,7 @@ def regression(): def multioutput_regression(): return make_regression(n_targets=2, 
random_state=0), 2 + @pytest.mark.parametrize('Estimator, method, data', [ (GradientBoostingClassifier, 'recursion', binary_classification()), (GradientBoostingClassifier, 'recursion', multiclass_classification()), @@ -103,7 +96,7 @@ def test_grid_from_X(): # Make sure that the grid is a cartesian product of the input (it will use # the unique values instead of the percentiles) - X = np.asarray([[1, 2], + X = np.asarray([[1, 2], [3, 4]]) grid, axes = _grid_from_X(X) assert_array_almost_equal(grid, [[1, 2], @@ -157,7 +150,8 @@ def test_grid_from_X(): @pytest.mark.parametrize('target_feature', (0, 3)) @pytest.mark.parametrize('est, partial_dependence_fun', [(LinearRegression(), _partial_dependence_brute), - (GradientBoostingRegressor(random_state=0), _partial_dependence_recursion)]) + (GradientBoostingRegressor(random_state=0), + _partial_dependence_recursion)]) def test_partial_dependence_helpers(est, partial_dependence_fun, target_feature): # Check that what is returned by _partial_dependence_brute or @@ -188,7 +182,7 @@ def test_partial_dependence_helpers(est, partial_dependence_fun, pdp = pdp[0] # (shape is (1, 2) so make it (2,)) assert_array_almost_equal(pdp, mean_predictions, decimal=3) - + @pytest.mark.parametrize('Estimator', (sklearn.tree.DecisionTreeClassifier, @@ -271,6 +265,7 @@ def test_partial_dependence_input(): partial_dependence(gbc, [0], grid=[1, 2]) + @if_matplotlib def test_plot_partial_dependence(): # Test partial dependence plot function. @@ -378,7 +373,6 @@ def test_plot_partial_dependence_input(): plot_partial_dependence, lr_m, X_m, [0], target=target) - for feature_names in (None, ['abcd', 'def']): assert_raises_regex(ValueError, 'Feature foobar not in feature_names', From a766cadde7adc9dddfca31d344ba43f8b3ab9716 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 15 Nov 2018 15:32:12 -0500 Subject: [PATCH 044/113] added PR number to whatsnew --- doc/whats_new/v0.21.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 7c4d9b43fc238..23548dbf9fe0d 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -71,7 +71,7 @@ Support for Python 3.4 and below has been officially dropped. :func:`partial_dependence.partial_dependence` and :func:`partial_dependence.plot_partial_dependence`. - :issue:TODO by :user:`Trevor Stephens` and :user:`Nicolas + :issue:`12599` by :user:`Trevor Stephens` and :user:`Nicolas Hug`. :mod:`sklearn.metrics` @@ -136,8 +136,6 @@ Support for Python 3.4 and below has been officially dropped. :issue:`12300` by :user:`Adrin Jalali `. - |Fix| Fixed an issue with :class:`tree.BaseDecisionTree` - and consequently all estimators based - on it, including :class:`tree.DecisionTreeClassifier`, :class:`tree.DecisionTreeRegressor`, :class:`tree.ExtraTreeClassifier`, and :class:`tree.ExtraTreeRegressor`, where they used to exceed the given ``max_depth`` by 1 while expanding the tree if ``max_leaf_nodes`` and @@ -150,7 +148,7 @@ Support for Python 3.4 and below has been officially dropped. - |Feature| Partial dependence plots (:func:`partial_dependence.plot_partial_dependence`) are now supported for any regressor or classifier (provided that they have a `predict_proba()` - method). :issue:TODO by :user:`Trevor Stephens` and + method). :issue:`12599` by :user:`Trevor Stephens` and :user:`Nicolas Hug`. 
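Editor's note: a minimal sketch of the new capability described in the entry above,
using a non-tree estimator that exposes ``predict_proba``; the dataset and
hyperparameters are arbitrary::

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.partial_dependence import plot_partial_dependence

    X, y = make_classification(random_state=0)
    clf = LogisticRegression().fit(X, y)   # any classifier with predict_proba
    # the 'brute' method is chosen automatically for non-GBDT estimators
    fig, axs = plot_partial_dependence(clf, X, [0, 1], grid_resolution=20)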
Multiple modules From 828cca4a01e7dda2b7df7170299d7b2a72401f0b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 15 Nov 2018 16:31:38 -0500 Subject: [PATCH 045/113] sorted dict keys for python2 --- sklearn/partial_dependence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 5c89d0e6e8bd2..06e2db6e16867 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -252,7 +252,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, if method not in method_to_function: raise ValueError( 'method {} is invalid. Accepted method names are {}, auto.'.format( - method, ', '.join(method_to_function.keys()))) + method, ', '.join(sorted(method_to_function.keys())))) if method == 'recursion': if not isinstance(est, BaseGradientBoosting): From e86bdab5fdea5f5d757ed7768246e13f22d5f1c3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 15 Nov 2018 16:35:57 -0500 Subject: [PATCH 046/113] trying to fix python37 issue --- sklearn/tests/test_partial_dependence.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 5eaadbbe4226d..51c01d84d289b 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -139,7 +139,7 @@ def test_grid_from_X(): _grid_from_X, X, percentiles=percentiles) assert_raises_regex(ValueError, - "percentiles\[0\] must be strictly less than", + r"percentiles\[0\] must be strictly less than", _grid_from_X, X, percentiles=(.9, .1)) assert_raises_regex(ValueError, @@ -248,8 +248,8 @@ def test_partial_dependence_input(): for target_variables in ([0], [0, 1, 0]): assert_raises_regex(ValueError, - 'grid.shape\[1\] \(2\) must be equal to the number' - ' of target variables', + r'grid.shape\[1\] \(2\) must be equal ' + r'to the number of target variables', partial_dependence, lr, target_variables, grid=[[30, -123]], X=X) @@ -369,7 +369,7 @@ def test_plot_partial_dependence_input(): target=None) for target in (-1, 100): assert_raises_regex(ValueError, - 'target must be in \[0, n_tasks\]', + r'target must be in \[0, n_tasks\]', plot_partial_dependence, lr_m, X_m, [0], target=target) From e26c0ac51f82ed334e651853b392198057736ae3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 15 Nov 2018 16:58:11 -0500 Subject: [PATCH 047/113] removed use of dict for function dispatching --- sklearn/partial_dependence.py | 24 +++++++++++++----------- sklearn/tests/test_partial_dependence.py | 20 +++++++++++++------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 06e2db6e16867..4bf2fd5f672c1 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -88,7 +88,7 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): return cartesian(values), values -def _partial_dependence_recursion(est, grid, target_variables, X=None): +def _partial_dependence_recursion(est, grid, target_variables): # TODO: The pattern below required to avoid a namespace collision. # TODO: Move below imports to module level import at 0.22 release. 
from .ensemble._gradient_boosting import _partial_dependence_tree
@@ -240,19 +240,17 @@ def partial_dependence(est, target_variables, grid=None, X=None,
     if X is not None:
         X = check_array(X)
 
+    accepted_methods = ('brute', 'recursion', 'auto')
+    if method not in accepted_methods:
+        raise ValueError(
+            'method {} is invalid. Accepted method names are {}.'.format(
+                method, ', '.join(accepted_methods)))
+
     if method == 'auto':
         if isinstance(est, BaseGradientBoosting):
             method = 'recursion'
         else:
             method = 'brute'
 
-    method_to_function = {
-        'brute': _partial_dependence_brute,
-        'recursion': _partial_dependence_recursion
-    }
-    if method not in method_to_function:
-        raise ValueError(
-            'method {} is invalid. Accepted method names are {}, auto.'.format(
-                method, ', '.join(sorted(method_to_function.keys()))))
 
     if method == 'recursion':
         if not isinstance(est, BaseGradientBoosting):
@@ -297,8 +295,12 @@ def partial_dependence(est, target_variables, grid=None, X=None,
                          'of target variables ({})'.format(
                              grid.shape[1], target_variables.shape[0]))
 
-    averaged_predictions = method_to_function[method](est, grid,
-                                                      target_variables, X)
+    if method == 'brute':
+        averaged_predictions = _partial_dependence_brute(est, grid,
+                                                         target_variables, X)
+    else:
+        averaged_predictions = _partial_dependence_recursion(est, grid,
+                                                             target_variables)
 
     return averaged_predictions, values
 
diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py
index 51c01d84d289b..c608495c0def4 100644
--- a/sklearn/tests/test_partial_dependence.py
+++ b/sklearn/tests/test_partial_dependence.py
@@ -148,12 +148,11 @@ def test_grid_from_X():
 
 @pytest.mark.parametrize('target_feature', (0, 3))
-@pytest.mark.parametrize('est, partial_dependence_fun',
-                         [(LinearRegression(), _partial_dependence_brute),
+@pytest.mark.parametrize('est, method',
+                         [(LinearRegression(), 'brute'),
                           (GradientBoostingRegressor(random_state=0),
-                          _partial_dependence_recursion)])
-def test_partial_dependence_helpers(est, partial_dependence_fun,
-                                    target_feature):
+                           'recursion')])
+def test_partial_dependence_helpers(est, method, target_feature):
     # Check that what is returned by _partial_dependence_brute or
     # _partial_dependence_recursion is equivalent to manually setting a target
     # feature to a given value, and computing the average prediction over all
@@ -172,7 +171,11 @@ def test_partial_dependence_helpers(est, method, target_feature):
 
     target_variables = np.array([target_feature], dtype=np.int32)
     grid = np.array([[.5], [123]])
-    pdp = partial_dependence_fun(est, grid, target_variables, X)
+
+    if method == 'brute':
+        pdp = _partial_dependence_brute(est, grid, target_variables, X)
+    else:
+        pdp = _partial_dependence_recursion(est, grid, target_variables)
 
     mean_predictions = []
     for val in (.5, 123):
@@ -192,7 +195,7 @@ def test_partial_dependence_helpers(est, method, target_feature):
 def test_multiclass_multioutput(Estimator):
-    # Make error is raised for multiclass-multioutput classifiers
+    # Make sure error is raised for multiclass-multioutput classifiers
 
     # make multiclass-multioutput dataset
     X, y = make_classification(n_classes=3,
                                n_clusters_per_class=1,
@@ -231,6 +234,9 @@ def test_partial_dependence_input():
                         'for the "recursion" method',
                         partial_dependence, lr, [0],
                         method='recursion')
 
+    assert_raises_regex(ValueError, "X is required for brute method",
+                        partial_dependence, lr, [0], grid=[[[1]]])
+
     assert_raises_regex(ValueError, "est requires a
predict_proba()", partial_dependence, SVC(), [0], X=X) From 763d15191ba8be4218f06f6a96cb493b9e1c0ca2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 15 Nov 2018 17:26:20 -0500 Subject: [PATCH 048/113] filtered out warnings in tests --- sklearn/partial_dependence.py | 11 ++++++----- sklearn/tests/test_partial_dependence.py | 5 +++++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 4bf2fd5f672c1..a2b5b3594812e 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -183,7 +183,8 @@ def partial_dependence(est, target_variables, grid=None, X=None, The data on which ``est`` was trained. It is used both to generate a ``grid`` for the ``target_variables`` (if ``grid`` wasn't specified), and to compute the averaged predictions where the target features - values would have been replaced by those in the grid. + values have been replaced by those in the grid, for 'brute' method. + Optional if ``grid`` is specified and ``method`` is 'recursion'. percentiles : (low, high), default=(0.05, 0.95) The lower and upper percentile used to create the extreme values for the ``grid``. Only used if ``grid`` is None. @@ -310,7 +311,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, percentiles=(0.05, 0.95), method='auto', n_jobs=1, verbose=0, ax=None, line_kw=None, contour_kw=None, **fig_kw): - """Partial dependence plots for ``features``. + """Partial dependence plots. The ``len(features)`` plots are arranged in a grid with ``n_cols`` columns. Two-way partial dependence plots are plotted as contour @@ -325,7 +326,8 @@ def plot_partial_dependence(est, X, features, feature_names=None, ``predict_proba()`` method. Multioutput-multiclass estimators aren't supported. X : array-like, shape=(n_samples, n_features) - The data on which ``est`` was trained. + The data to use to build the grid of values on which the dependence + will be evaluated. This is usually the training data. features : seq of ints, strings, or tuples of ints or strings If features[i] is an int or a string, a one-way PDP is created; if features[i] is a tuple, a two-way PDP is created. Each tuple must be of @@ -351,8 +353,7 @@ class (index 1) is always used. The method to use to calculate the partial dependence predictions: - 'recursion' is only supported for objects inheriting from - `BaseGradientBoosting`, but is optimal in terms of speed. With - this method, ``X`` is optional and is only used to build the grid. + `BaseGradientBoosting`, but is optimal in terms of speed. - 'brute' is supported for any estimator, but is more computationally intensive. 
- If 'auto', then 'recursion' will be used for diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index c608495c0def4..1e0b881f237ea 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -48,6 +48,8 @@ def multioutput_regression(): return make_regression(n_targets=2, random_state=0), 2 +@pytest.mark.filterwarnings('ignore:Default solver will be changed ') # 0.22 +@pytest.mark.filterwarnings('ignore:Default multi_class will be') # 0.22 @pytest.mark.parametrize('Estimator, method, data', [ (GradientBoostingClassifier, 'recursion', binary_classification()), (GradientBoostingClassifier, 'recursion', multiclass_classification()), @@ -187,6 +189,7 @@ def test_partial_dependence_helpers(est, method, target_feature): assert_array_almost_equal(pdp, mean_predictions, decimal=3) +@pytest.mark.filterwarnings('ignore:The default value of ') # 0.22 @pytest.mark.parametrize('Estimator', (sklearn.tree.DecisionTreeClassifier, sklearn.tree.ExtraTreeClassifier, @@ -343,6 +346,8 @@ def test_plot_partial_dependence_multiclass(): @if_matplotlib +@pytest.mark.filterwarnings('ignore:Default solver will be changed ') # 0.22 +@pytest.mark.filterwarnings('ignore:Default multi_class will be') # 0.22 def test_plot_partial_dependence_input(): X, y = make_classification(random_state=0) From f5ff5191cf11d1a2c3216ea27f5283eaa6f25786 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 15 Nov 2018 17:36:57 -0500 Subject: [PATCH 049/113] added test for multioutput --- sklearn/tests/test_partial_dependence.py | 32 +++++++++++++++++------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 1e0b881f237ea..780501d08568a 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -7,7 +7,6 @@ import pytest import sklearn -from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import if_matplotlib from sklearn.partial_dependence import partial_dependence @@ -32,19 +31,23 @@ def binary_classification(): + # returns (X, y), n_targets <-- as expected in the output of partial_dep() return make_classification(random_state=0), 1 def multiclass_classification(): + # returns (X, y), n_targets <-- as expected in the output of partial_dep() return (make_classification(n_classes=3, n_clusters_per_class=1, random_state=0), 3) def regression(): + # returns (X, y), n_targets <-- as expected in the output of partial_dep() return make_regression(random_state=0), 1 def multioutput_regression(): + # returns (X, y), n_targets <-- as expected in the output of partial_dep() return make_regression(n_targets=2, random_state=0), 2 @@ -334,15 +337,26 @@ def test_plot_partial_dependence_multiclass(): assert len(axs) == 2 assert all(ax.has_data for ax in axs) - # label not in gbrt.classes_ - assert_raises(ValueError, plot_partial_dependence, - clf, iris.data, [0, 1], target='foobar', - grid_resolution=grid_resolution) - # label not provided - assert_raises(ValueError, plot_partial_dependence, - clf, iris.data, [0, 1], - grid_resolution=grid_resolution) +@if_matplotlib +def test_plot_partial_dependence_multioutput(): + # Test partial dependence plot function on multi-output input. 
+ (X, y), _ = multioutput_regression() + clf = LinearRegression() + clf.fit(X, y) + + grid_resolution = 25 + fig, axs = plot_partial_dependence(clf, X, [0, 1], + target=0, + grid_resolution=grid_resolution) + assert len(axs) == 2 + assert all(ax.has_data for ax in axs) + + fig, axs = plot_partial_dependence(clf, X, [0, 1], + target=1, + grid_resolution=grid_resolution) + assert len(axs) == 2 + assert all(ax.has_data for ax in axs) @if_matplotlib From 32cafe82372d4f1af7e8f2815de16fec3d61fa93 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 15 Nov 2018 17:38:34 -0500 Subject: [PATCH 050/113] fixed comment --- sklearn/partial_dependence.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index a2b5b3594812e..8fa6d5510bb18 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -474,8 +474,8 @@ def convert_feature(fx): # For multioutput regression, we can only check the validity of target # now that we have the predictions. # Also note: as multiclass-multioutput classifiers are not supported, - # multiand multioutput setting. So there is no risk of overwriting - # target_idx here. + # multiclass and multioutput scenario are mutually exclusive. So there is + # no risk of overwriting target_idx here. pd, _ = pd_result[0] # checking the first result is enough if is_regressor(est) and pd.shape[0] > 1: if target is None: From 784277d0197bef1372206988c74d69f0bf66cd27 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 15 Nov 2018 21:41:29 -0500 Subject: [PATCH 051/113] Fixed doctest --- doc/modules/partial_dependence.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/partial_dependence.rst b/doc/modules/partial_dependence.rst index ee89e6d9d33c3..bf4c0991ce153 100644 --- a/doc/modules/partial_dependence.rst +++ b/doc/modules/partial_dependence.rst @@ -75,9 +75,9 @@ the plots, you can use the :func:`partial_dependence` function:: >>> pdp, axes = partial_dependence(clf, [0], X=X) >>> pdp # doctest: +ELLIPSIS - array([[ 2.46643157, 2.46643157, ... + array([[ 2.466..., 2.466..., ... >>> axes # doctest: +ELLIPSIS - [array([-1.62497054, -1.59201391, ... + [array([-1.624..., -1.592..., ... The function requires either the argument ``grid`` which specifies the values of the target features on which the partial dependence function From 2a58752d70cd71e86a0e5be799240ccc3bcae4b3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 16 Nov 2018 11:34:48 -0500 Subject: [PATCH 052/113] updated docstrings --- doc/modules/partial_dependence.rst | 25 +++--- examples/plot_partial_dependence.py | 15 ++-- sklearn/partial_dependence.py | 99 +++++++++++++----------- sklearn/tests/test_partial_dependence.py | 2 +- 4 files changed, 77 insertions(+), 64 deletions(-) diff --git a/doc/modules/partial_dependence.rst b/doc/modules/partial_dependence.rst index bf4c0991ce153..fa65d06665725 100644 --- a/doc/modules/partial_dependence.rst +++ b/doc/modules/partial_dependence.rst @@ -17,7 +17,7 @@ Due to the limits of human perception the size of the target feature set must be small (usually, one or two) thus the target features are usually chosen among the most important features. 
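Editor's note: as a concrete reading of the marginalization described in this
guide, here is a hypothetical helper (name and signature purely illustrative)
that mirrors what the 'brute' method computes for a regressor::

    import numpy as np

    def pd_one_feature(est, X, feature, grid_values):
        """Average the model's predictions after clamping one feature."""
        averaged = []
        for value in grid_values:
            X_mod = X.copy()
            X_mod[:, feature] = value   # clamp the target feature everywhere
            averaged.append(est.predict(X_mod).mean())
        return np.asarray(averaged)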
-The Figure below shows four one-way and one two-way partial dependence plots
+The figure below shows four one-way and one two-way partial dependence plots
 for the California housing dataset, with a
 :class:`GradientBoostingRegressor
 `:

@@ -32,11 +32,11 @@ above figure shows the effect of the median income in a district on the
 median house price; we can clearly see a linear relationship among them.

 PDPs with two target features show the interactions among the two features.
-For example, the two-variable PDP in the above Figure shows the dependence
-of median house price on joint values of house age and avg. occupants per
+For example, the two-variable PDP in the above figure shows the dependence
+of median house price on joint values of house age and average occupants per
 household. We can clearly see an interaction between the two features: for
-an avg. occupancy greater than two, the house price is nearly independent of
-the house age, whereas for values less than two there is a strong dependence
+an average occupancy greater than two, the house price is nearly independent of
+the house age, whereas for values less than two there is a strong dependence
 on age.

 The :mod:`sklearn.partial_dependence` module provides a convenience function
diff --git a/examples/plot_partial_dependence.py b/examples/plot_partial_dependence.py
index c3fd666b72de4..203a76bacbd8a 100644
--- a/examples/plot_partial_dependence.py
+++ b/examples/plot_partial_dependence.py
@@ -16,12 +16,12 @@
 The plot shows four one-way and one two-way partial dependence plots.
 The target variables for the one-way PDP are:
-median income (`MedInc`), avg. occupants per household (`AvgOccup`),
-median house age (`HouseAge`), and avg. rooms per household (`AveRooms`).
+median income (`MedInc`), average occupants per household (`AvgOccup`),
+median house age (`HouseAge`), and average rooms per household (`AveRooms`).
We can clearly see that the median house price shows a linear relationship with the median income (top left) and that the house price drops when the -avg. occupants per household increases (top middle). +average occupants per household increases (top middle). The top right plot shows that the house age in a district does not have a strong influence on the (median) house price; so does the average rooms per household. @@ -30,12 +30,11 @@ Partial dependence plots with two target features enable us to visualize interactions among them. The two-way partial dependence plot shows the -dependence of median house price on joint values of house age and avg. +dependence of median house price on joint values of house age and average occupants per household. We can clearly see an interaction between the -two features: -For an avg. occupancy greater than two, the house price is nearly independent -of the house age, whereas for values less than two there is a strong dependence -on age. +two features: for an average occupancy greater than two, the house price is +nearly independent of the house age, whereas for values less than two there +is a strong dependence on age. .. [1] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical Learning Ed. 2", Springer, 2009. diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 8fa6d5510bb18..d2521e2686254 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -177,46 +177,52 @@ def partial_dependence(est, target_variables, grid=None, X=None, The target features for which the partial dependency should be computed. grid : array-like, shape=(n_points, len(target_variables)), optional - The grid of ``target_variables`` values for which the partial - dependency should be evaluated. + The grid of values for which the partial dependence should be + evaluated. If `None`, the grid will be generated from the values in + ``X``. X : array-like, shape=(n_samples, n_features) - The data on which ``est`` was trained. It is used both to generate - a ``grid`` for the ``target_variables`` (if ``grid`` wasn't specified), - and to compute the averaged predictions where the target features - values have been replaced by those in the grid, for 'brute' method. - Optional if ``grid`` is specified and ``method`` is 'recursion'. - percentiles : (low, high), default=(0.05, 0.95) + ``X`` is used both to generate a grid for the ``target_variables`` + (if ``grid`` is None), and to compute the averaged predictions for + the 'brute' method. Optional if ``grid`` is not None and ``method`` + is 'recursion'. + percentiles : tuple of float, optional (default=(0.05, 0.95)) The lower and upper percentile used to create the extreme values for the ``grid``. Only used if ``grid`` is None. - grid_resolution : int, default=100 - The number of equally spaced points on the grid. Only used if ``grid`` - is None. - method : {'recursion', 'brute', 'auto'}, default='auto' - The method to use to calculate the partial dependence predictions: + grid_resolution : int, optional (default=100) + The number of equally spaced points on the grid, for each target + feature. Only used if ``grid`` is None. + method : str, optional (default='auto') + The method used to calculate the averaged predictions: - 'recursion' is only supported for objects inheriting from `BaseGradientBoosting`, but is optimal in terms of speed. With this method, ``X`` is optional and is only used to build the grid. 
+ - 'brute' is supported for any estimator, but is more - computationally intensive. Both methods are equivalent. + computationally intensive. + - If 'auto', then 'recursion' will be used for ``BaseGradientBoosting`` estimators, and 'brute' used for other estimators. Unlike the 'brute' method, 'recursion' does not account for the ``init`` predictor of the boosting process. In practice this still - produces the same values, up to a constant offset. + produces the same values, up to a constant offset in the target + response. Returns ------- - averaged_predictions : array, shape=(n_classes, n_points) + averaged_predictions : array, shape=(n_targets, n_points) The predictions for all the points in the ``grid``, averaged over - all samples in X. For regression and binary classification - ``n_classes==1``. + all samples in X (or over the training data if ``method`` is + `recursion`). ``n_targets`` corresponds to the number of classes in + a multi-class setting, or to the number of tasks for multi-output + regression. For classical regression and binary classification + ``n_targets==1``. values: seq of ndarray or None The values with which the grid has been created, or None if - the grid has been given. The grid is a cartesian product of the arrays - in ``values`` + the grid has been given. The generated grid is a cartesian product + of the arrays in ``values`` Examples -------- @@ -314,8 +320,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, """Partial dependence plots. The ``len(features)`` plots are arranged in a grid with ``n_cols`` - columns. Two-way partial dependence plots are plotted as contour - plots. + columns. Two-way partial dependence plots are plotted as contour plots. Read more in the :ref:`User Guide `. @@ -328,55 +333,61 @@ def plot_partial_dependence(est, X, features, feature_names=None, X : array-like, shape=(n_samples, n_features) The data to use to build the grid of values on which the dependence will be evaluated. This is usually the training data. - features : seq of ints, strings, or tuples of ints or strings + features : list of ints or strings, or tuples of ints or strings + The target features for which to create the PDPs. If features[i] is an int or a string, a one-way PDP is created; if - features[i] is a tuple, a two-way PDP is created. Each tuple must be of - size 2. + features[i] is a tuple, a two-way PDP is created. Each tuple must be + of size 2. if any entry is a string, then it must be in ``feature_names``. - feature_names : seq of str + feature_names : seq of str, shape=(n_features,) Name of each feature; feature_names[i] holds the name of the feature with index i. target : int, optional (default=None) - In a multiclass setting, specifies the class for which the PDPs - should be computed. Note that for binary classification, the positive - class (index 1) is always used. + should be computed. Note that for binary classification, the + positive class (index 1) is always used. - In a multioutput setting, specifies the task for which the PDPs should be computed - n_cols : int - The number of columns in the grid plot (default: 3). - grid_resolution : int, default=100 - The number of equally spaced points on the axes. - percentiles : (low, high), default=(0.05, 0.95) + Ignored in binary classification or classical regression settings. + n_cols : int, optional (default=3) + The number of columns in the grid plot. 
+ grid_resolution : int, optional (default=100) + The number of equally spaced points on the axes of the plots, for each + target feature. + percentiles : tuple of float, optional (default=(0.05, 0.95)) The lower and upper percentile used to create the extreme values for the PDP axes. - method : {'recursion', 'brute', 'auto'}, default='auto' + method : str, optional (default='auto') The method to use to calculate the partial dependence predictions: - 'recursion' is only supported for objects inheriting from `BaseGradientBoosting`, but is optimal in terms of speed. + - 'brute' is supported for any estimator, but is more - computationally intensive. + computationally intensive. + - If 'auto', then 'recursion' will be used for ``BaseGradientBoosting`` estimators, and 'brute' used for other estimators. Unlike the 'brute' method, 'recursion' does not account for the ``init`` predictor of the boosting process. In practice this still - produces the same plots, up to a constant offset. - n_jobs : int + produces the same plots, up to a constant offset in the target + response. + n_jobs : int, optional (default=1) The number of CPUs to use to compute the PDs. -1 means 'all CPUs'. - Defaults to 1. - verbose : int - Verbose output during PD computations. Defaults to 0. - ax : Matplotlib axis object, default None + See :term:`Glossary ` for more details. + verbose : int, optional (default=0) + Verbose output during PD computations. + ax : Matplotlib axis object, optional (default=None) An axis object onto which the plots will be drawn. - line_kw : dict + line_kw : dict, optional Dict with keywords passed to the ``matplotlib.pyplot.plot`` call. For one-way partial dependence plots. - contour_kw : dict + contour_kw : dict, optional Dict with keywords passed to the ``matplotlib.pyplot.plot`` call. For two-way partial dependence plots. - **fig_kw : dict + **fig_kw : dict, optional Dict with keywords passed to the figure() call. Note that all keywords not recognized above will be automatically included here. diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 780501d08568a..fc5944c69fc1e 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -162,7 +162,7 @@ def test_partial_dependence_helpers(est, method, target_feature): # _partial_dependece_recursion is equivalent to manually setting a target # feature to a given value, and computing the average prediction over all # samples. - # This also checks that the brute method and the recursion give the same + # This also checks that the brute and recursion methods give the same # output. 
X, y = make_regression(random_state=0) From a5285e113d0d3c6257e6d464b835dd24139d19bb Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 16 Nov 2018 11:50:07 -0500 Subject: [PATCH 053/113] put lazy imports in deprecated module --- sklearn/ensemble/partial_dependence.py | 4 ++-- sklearn/partial_dependence.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index 7073acfa3a8c3..9ff863b92ea28 100644 --- a/sklearn/ensemble/partial_dependence.py +++ b/sklearn/ensemble/partial_dependence.py @@ -3,8 +3,6 @@ # Authors: Peter Prettenhofer # License: BSD 3 clause -from ..partial_dependence import partial_dependence as new_pd -from ..partial_dependence import plot_partial_dependence as new_ppd from ..utils import deprecated @@ -67,6 +65,7 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) """ + from ..partial_dependence import partial_dependence as new_pd return new_pd(est=gbrt, target_variables=target_variables, grid=grid, @@ -159,6 +158,7 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP ... """ + from ..partial_dependence import plot_partial_dependence as new_ppd return new_ppd(est=gbrt, X=X, features=features, diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index d2521e2686254..c1e740c8ebe08 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -20,6 +20,8 @@ from .utils.validation import check_is_fitted from .tree._tree import DTYPE from .exceptions import NotFittedError +from .ensemble.gradient_boosting import BaseGradientBoosting +from .ensemble._gradient_boosting import _partial_dependence_tree __all__ = ['partial_dependence', 'plot_partial_dependence'] @@ -91,8 +93,6 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): def _partial_dependence_recursion(est, grid, target_variables): # TODO: The pattern below required to avoid a namespace collision. # TODO: Move below imports to module level import at 0.22 release. 
- from .ensemble._gradient_boosting import _partial_dependence_tree - from .ensemble.gradient_boosting import BaseGradientBoosting # grid needs to be DTYPE grid = np.asarray(grid, dtype=DTYPE, order='C') @@ -235,7 +235,6 @@ def partial_dependence(est, target_variables, grid=None, X=None, (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) """ - from .ensemble.gradient_boosting import BaseGradientBoosting if not (is_classifier(est) or is_regressor(est)): raise ValueError('est must be a fitted regressor or classifier.') From 9bcc0ca36c32d326c27ea97709801ed91502bad0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 16 Nov 2018 11:52:09 -0500 Subject: [PATCH 054/113] Finished removing old support for RandomForest --- sklearn/partial_dependence.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index c1e740c8ebe08..e28d7738ccad7 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -97,22 +97,14 @@ def _partial_dependence_recursion(est, grid, target_variables): # grid needs to be DTYPE grid = np.asarray(grid, dtype=DTYPE, order='C') - if isinstance(est, BaseGradientBoosting): - n_trees_per_stage = est.estimators_.shape[1] - n_estimators = est.estimators_.shape[0] - learning_rate = est.learning_rate - else: - n_trees_per_stage = 1 - n_estimators = len(est.estimators_) - learning_rate = 1. + n_trees_per_stage = est.estimators_.shape[1] + n_estimators = est.estimators_.shape[0] + learning_rate = est.learning_rate averaged_predictions = np.zeros((n_trees_per_stage, grid.shape[0]), dtype=np.float64, order='C') for stage in range(n_estimators): for k in range(n_trees_per_stage): - if isinstance(est, BaseGradientBoosting): - tree = est.estimators_[stage, k].tree_ - else: - tree = est.estimators_[stage].tree_ + tree = est.estimators_[stage, k].tree_ _partial_dependence_tree(tree, grid, target_variables, learning_rate, averaged_predictions[k]) @@ -235,7 +227,6 @@ def partial_dependence(est, target_variables, grid=None, X=None, (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) """ - if not (is_classifier(est) or is_regressor(est)): raise ValueError('est must be a fitted regressor or classifier.') From 1c0b11de61bfe5017ff40d483134e76cf9d1e0fd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 16 Nov 2018 12:02:15 -0500 Subject: [PATCH 055/113] fixed whatsnew --- doc/whats_new/v0.21.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 23548dbf9fe0d..c6352eb575eca 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -136,6 +136,8 @@ Support for Python 3.4 and below has been officially dropped. :issue:`12300` by :user:`Adrin Jalali `. 
- |Fix| Fixed an issue with :class:`tree.BaseDecisionTree` + and consequently all estimators based + on it, including :class:`tree.DecisionTreeClassifier`, :class:`tree.DecisionTreeRegressor`, :class:`tree.ExtraTreeClassifier`, and :class:`tree.ExtraTreeRegressor`, where they used to exceed the given ``max_depth`` by 1 while expanding the tree if ``max_leaf_nodes`` and From 8f016c6a26de8c91ea04ba284e99c2c0dee85c5c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 16 Nov 2018 12:03:53 -0500 Subject: [PATCH 056/113] removed unrelated change --- examples/ensemble/plot_random_forest_regression_multioutput.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/ensemble/plot_random_forest_regression_multioutput.py b/examples/ensemble/plot_random_forest_regression_multioutput.py index e79e53f114e8b..8b7803361a60a 100644 --- a/examples/ensemble/plot_random_forest_regression_multioutput.py +++ b/examples/ensemble/plot_random_forest_regression_multioutput.py @@ -55,8 +55,6 @@ # Predict on new data y_multirf = regr_multirf.predict(X_test) y_rf = regr_rf.predict(X_test) -print(y_multirf.shape) -print(y_rf.shape) # Plot the results plt.figure() From 4bf6c9046deda66531c01ffa1b9e8ca44e00a978 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 16 Nov 2018 12:07:57 -0500 Subject: [PATCH 057/113] small test refactoring --- sklearn/ensemble/tests/test_partial_dependence.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py index 9f2ad310bc905..b1deaa93d4426 100644 --- a/sklearn/ensemble/tests/test_partial_dependence.py +++ b/sklearn/ensemble/tests/test_partial_dependence.py @@ -45,7 +45,6 @@ def test_partial_dependence_classifier(): # now with our own grid X_ = np.asarray(X) grid = np.unique(X_[:, 0]) - print(grid) pdp_2, axes = partial_dependence(clf, [0], grid=grid) assert axes is None @@ -220,8 +219,9 @@ def test_plot_partial_dependence_multiclass(): grid_resolution=grid_resolution) -def test_warning_raised_for_partial_dependence(): - # Test that running the old partial_dependence function warns +def test_warning_raised(): + # Test that deprecation warning is raised + clf = GradientBoostingRegressor(n_estimators=10, random_state=1) clf.fit(boston.data, boston.target) grid_resolution = 25 @@ -231,14 +231,6 @@ def test_warning_raised_for_partial_dependence(): partial_dependence, clf, [0], X=boston.data, grid_resolution=grid_resolution) - -@if_matplotlib -def test_warning_raised_for_plot_partial_dependence(): - # Test that running the old partial_dependence function warns - clf = GradientBoostingRegressor(n_estimators=10, random_state=1) - clf.fit(boston.data, boston.target) - grid_resolution = 25 - assert_warns_message(DeprecationWarning, "The function " "ensemble.plot_partial_dependence has been moved to ", plot_partial_dependence, clf, boston.data, From fa6eba75fd1d0221da35ad163de5ae795ae025e3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 16 Nov 2018 13:00:29 -0500 Subject: [PATCH 058/113] pyt back ifmatplotlib dec --- sklearn/ensemble/tests/test_partial_dependence.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py index b1deaa93d4426..7c34fdcccabca 100644 --- a/sklearn/ensemble/tests/test_partial_dependence.py +++ b/sklearn/ensemble/tests/test_partial_dependence.py @@ -219,7 +219,7 @@ def 
test_plot_partial_dependence_multiclass(): grid_resolution=grid_resolution) -def test_warning_raised(): +def test_warning_raised_partial_dependence(): # Test that deprecation warning is raised clf = GradientBoostingRegressor(n_estimators=10, random_state=1) @@ -231,6 +231,14 @@ def test_warning_raised(): partial_dependence, clf, [0], X=boston.data, grid_resolution=grid_resolution) +@if_matplotlib +def test_warning_raised_partial_dependence_plot(): + # Test that deprecation warning is raised + + clf = GradientBoostingRegressor(n_estimators=10, random_state=1) + clf.fit(boston.data, boston.target) + grid_resolution = 25 + assert_warns_message(DeprecationWarning, "The function " "ensemble.plot_partial_dependence has been moved to ", plot_partial_dependence, clf, boston.data, From 634dc335612824b5fa98e6ce53967ac1f366c5a3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 16 Nov 2018 13:39:19 -0500 Subject: [PATCH 059/113] pep8 --- sklearn/ensemble/tests/test_partial_dependence.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py index 7c34fdcccabca..f0b4dab7f5975 100644 --- a/sklearn/ensemble/tests/test_partial_dependence.py +++ b/sklearn/ensemble/tests/test_partial_dependence.py @@ -231,6 +231,7 @@ def test_warning_raised_partial_dependence(): partial_dependence, clf, [0], X=boston.data, grid_resolution=grid_resolution) + @if_matplotlib def test_warning_raised_partial_dependence_plot(): # Test that deprecation warning is raised From 512b3537877bfc05e8bdd92d11b854edd9851c91 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 23 Nov 2018 19:32:27 -0500 Subject: [PATCH 060/113] addressed some comments --- doc/modules/classes.rst | 24 +++++++----------------- doc/modules/partial_dependence.rst | 1 - doc/whats_new/v0.21.rst | 2 +- sklearn/partial_dependence.py | 11 +++++------ 4 files changed, 13 insertions(+), 25 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 5ce8224ecf55a..ff8dc3f6e96dd 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -427,23 +427,6 @@ Samples generator :template: function.rst -partial dependence ------------------- - -.. automodule:: sklearn.ensemble.partial_dependence - :no-members: - :no-inherited-members: - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - ensemble.partial_dependence.partial_dependence - ensemble.partial_dependence.plot_partial_dependence - - .. _exceptions_ref: :mod:`sklearn.exceptions`: Exceptions and warnings @@ -1526,6 +1509,13 @@ To be removed in 0.23 utils.delayed metrics.calinski_harabaz_score +.. autosummary:: + :toctree: generated/ + :template: function.rst + + ensemble.partial_dependence.partial_dependence + ensemble.partial_dependence.plot_partial_dependence + To be removed in 0.22 --------------------- diff --git a/doc/modules/partial_dependence.rst b/doc/modules/partial_dependence.rst index fa65d06665725..71ccdba1ca75d 100644 --- a/doc/modules/partial_dependence.rst +++ b/doc/modules/partial_dependence.rst @@ -114,7 +114,6 @@ which the trees were trained. * :ref:`sphx_glr_auto_examples_plot_partial_dependence.py` - .. topic:: References .. [F2001] J. 
Friedman, "Greedy Function Approximation: A Gradient Boosting Machine", diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 81eaa3dd0e0c1..a583c5a7bfefc 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -151,7 +151,7 @@ Support for Python 3.4 and below has been officially dropped. ................................. - |Feature| Partial dependence plots (:func:`partial_dependence.plot_partial_dependence`) are now supported for - any regressor or classifier (provided that they have a `predict_proba()` + any regressor or classifier (provided that they have a `predict_proba` method). :issue:`12599` by :user:`Trevor Stephens` and :user:`Nicolas Hug`. diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index e28d7738ccad7..2f461d7010d04 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -91,8 +91,6 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): def _partial_dependence_recursion(est, grid, target_variables): - # TODO: The pattern below required to avoid a namespace collision. - # TODO: Move below imports to module level import at 0.22 release. # grid needs to be DTYPE grid = np.asarray(grid, dtype=DTYPE, order='C') @@ -187,8 +185,9 @@ def partial_dependence(est, target_variables, grid=None, X=None, The method used to calculate the averaged predictions: - 'recursion' is only supported for objects inheriting from - `BaseGradientBoosting`, but is optimal in terms of speed. With - this method, ``X`` is optional and is only used to build the grid. + `BaseGradientBoosting`, but is more efficient in terms of speed. + With this method, ``X`` is optional and is only used to build the + grid. - 'brute' is supported for any estimator, but is more computationally intensive. @@ -211,7 +210,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, a multi-class setting, or to the number of tasks for multi-output regression. For classical regression and binary classification ``n_targets==1``. - values: seq of ndarray or None + values : seq of ndarray or None The values with which the grid has been created, or None if the grid has been given. The generated grid is a cartesian product of the arrays in ``values`` @@ -351,7 +350,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, The method to use to calculate the partial dependence predictions: - 'recursion' is only supported for objects inheriting from - `BaseGradientBoosting`, but is optimal in terms of speed. + `BaseGradientBoosting`, but is more efficient in terms of speed. - 'brute' is supported for any estimator, but is more computationally intensive. 
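
The claim these docstrings make — that 'recursion' and 'brute' agree up to a
constant offset — is easy to sanity-check by hand. A minimal sketch, assuming
the ``sklearn.partial_dependence`` module from this branch and the function
signature as it stands at this point in the series::

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.partial_dependence import partial_dependence

    X, y = make_regression(n_features=4, random_state=0)
    est = GradientBoostingRegressor(random_state=0).fit(X, y)

    pd_rec, values = partial_dependence(est, [0], X=X, method='recursion')
    pd_brute, _ = partial_dependence(est, [0], X=X, method='brute')

    # 'recursion' ignores the init predictor, so the two methods should only
    # differ by a (roughly) constant offset in the target response.
    offset = (pd_brute - pd_rec).mean()
    print(np.abs(pd_brute - pd_rec - offset).max())  # expected to be small

``test_partial_dependence_helpers`` already covers this comparison; the next
commit adds a further sanity check, fitting on a target that depends on a
single feature in an obvious way.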
From ef09e802d0354c69ee3063c1b6cb3a755631fb96 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 28 Nov 2018 14:11:35 -0500 Subject: [PATCH 061/113] Added sanity check --- sklearn/tests/test_partial_dependence.py | 35 ++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index fc5944c69fc1e..eb6bb197cc1f5 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -23,6 +23,8 @@ from sklearn.datasets import load_boston, load_iris from sklearn.datasets import make_classification, make_regression from sklearn.cluster import KMeans +from sklearn.metrics import r2_score +from sklearn.preprocessing import PolynomialFeatures # toy sample @@ -192,6 +194,39 @@ def test_partial_dependence_helpers(est, method, target_feature): assert_array_almost_equal(pdp, mean_predictions, decimal=3) +@pytest.mark.parametrize('est', (LinearRegression(), + GradientBoostingRegressor(random_state=0))) +@pytest.mark.parametrize('power', (1, 2)) +def test_partial_dependence_easy_target(est, power): + # If the target y only depends on one feature in an obvious way (linear or + # quadratic) then the partial dependence for that feature should reflect + # it. + # We here fit a linear regression model (with polynomial features if + # needed) and compute r_squared to check that the partial dependence + # correctly reflects the target. + + rng = np.random.RandomState(0) + n_samples = 100 + target_variable = 2 + X = rng.normal(size=(n_samples, 5)) + y = X[:, target_variable]**power + + est.fit(X, y) + + averaged_predictions, values = partial_dependence( + est, target_variables=[target_variable], X=X, grid_resolution=1000) + + new_X = values[0].reshape(-1, 1) + new_y = averaged_predictions[0] + # add polynomial features if needed + new_X = PolynomialFeatures(degree=power).fit_transform(new_X) + + lr = LinearRegression().fit(new_X, new_y) + r2 = r2_score(new_y, lr.predict(new_X)) + + assert r2 > .99 + + @pytest.mark.filterwarnings('ignore:The default value of ') # 0.22 @pytest.mark.parametrize('Estimator', (sklearn.tree.DecisionTreeClassifier, From 60b69b842d4ca8a25b0ba7b6fc439ae8762e8495 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 7 Dec 2018 08:47:45 -0500 Subject: [PATCH 062/113] Added warnings about non constant init estimators --- sklearn/partial_dependence.py | 43 ++++++++++++++++++++---- sklearn/tests/test_partial_dependence.py | 26 ++++++++++++++ 2 files changed, 63 insertions(+), 6 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 2f461d7010d04..03840930dcc28 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -7,6 +7,7 @@ from itertools import count import numbers +import warnings import numpy as np from scipy.stats.mstats import mquantiles @@ -92,6 +93,13 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): def _partial_dependence_recursion(est, grid, target_variables): + if est.init is not None: + warnings.warn( + 'Using recursion method with a non-constant init predictor will ' + 'lead to incorrect partial dependence values.', + UserWarning + ) + # grid needs to be DTYPE grid = np.asarray(grid, dtype=DTYPE, order='C') @@ -156,6 +164,17 @@ def partial_dependence(est, target_variables, grid=None, X=None, method='auto'): """Partial dependence of ``target_variables``. + .. _warning_recursion_init: + + .. 
warning::
+ The 'recursion' method only works for gradient boosting estimators,
+ and unlike the 'brute' method, it does not account for the ``init``
+ predictor of the boosting process. In practice this will produce the
+ same values as 'brute' up to a constant offset in the target
+ response, provided that ``init`` is a constant estimator (which is
+ the default). However, as soon as ``init`` is not a constant
+ estimator, the partial dependence values are incorrect.
+
 Read more in the :ref:`User Guide `.

 Parameters
@@ -187,7 +206,9 @@ def partial_dependence(est, target_variables, grid=None, X=None,
 - 'recursion' is only supported for objects inheriting from
 `BaseGradientBoosting`, but is more efficient in terms of speed.
 With this method, ``X`` is optional and is only used to build the
- grid.
+ grid. This method does not account for the ``init`` predictor of
+ the boosting process, which may lead to incorrect values (see
+ :ref:`this warning`).
 - 'brute' is supported for any estimator, but is more
 computationally intensive.
@@ -196,11 +217,6 @@ def partial_dependence(est, target_variables, grid=None, X=None,
 ``BaseGradientBoosting`` estimators, and 'brute' used for other
 estimators.

- Unlike the 'brute' method, 'recursion' does not account for the
- ``init`` predictor of the boosting process. In practice this still
- produces the same values, up to a constant offset in the target
- response.
-
 Returns
 -------
 averaged_predictions : array, shape=(n_targets, n_points)
@@ -311,6 +327,17 @@ def plot_partial_dependence(est, X, features, feature_names=None,
 The ``len(features)`` plots are arranged in a grid with ``n_cols``
 columns. Two-way partial dependence plots are plotted as contour plots.

+ .. _warning_recursion_init_plot:
+
+ .. warning::
+ The 'recursion' method only works for gradient boosting estimators,
+ and unlike the 'brute' method, it does not account for the ``init``
+ predictor of the boosting process. In practice this will produce the
+ same values as 'brute' up to a constant offset in the target
+ response, provided that ``init`` is a constant estimator (which is
+ the default). However, as soon as ``init`` is not a constant
+ estimator, the partial dependence values are incorrect.
+
 Read more in the :ref:`User Guide `.

 Parameters
@@ -351,6 +378,10 @@ def plot_partial_dependence(est, X, features, feature_names=None,

 - 'recursion' is only supported for objects inheriting from
 `BaseGradientBoosting`, but is more efficient in terms of speed.
+ With this method, ``X`` is optional and is only used to build the
+ grid. This method does not account for the ``init`` predictor of
+ the boosting process, which may lead to incorrect values (see
+ :ref:`this warning`).
 - 'brute' is supported for any estimator, but is more
 computationally intensive.
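
The failure mode the new warning guards against can be sketched as follows.
This is a hypothetical snippet rather than part of the patch: fitting with an
estimator as ``init`` only works once PR #12436 is merged, which is also why
the corresponding test added below is skipped for now::

    from sklearn.datasets import make_classification
    from sklearn.dummy import DummyClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.partial_dependence import partial_dependence

    X, y = make_classification(random_state=0)

    # A non-constant init predictor: 'recursion' cannot account for it, so
    # the averaged predictions carry a bias and the UserWarning is raised.
    gbc = GradientBoostingClassifier(init=DummyClassifier(), random_state=0)
    gbc.fit(X, y)  # still fails on this branch until PR #12436 lands
    pdp, values = partial_dependence(gbc, [0], X=X, method='recursion')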
diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index eb6bb197cc1f5..99526920358b5 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -8,6 +8,7 @@ import sklearn from sklearn.utils.testing import assert_raises_regex +from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import if_matplotlib from sklearn.partial_dependence import partial_dependence from sklearn.partial_dependence import plot_partial_dependence @@ -25,6 +26,7 @@ from sklearn.cluster import KMeans from sklearn.metrics import r2_score from sklearn.preprocessing import PolynomialFeatures +from sklearn.dummy import DummyClassifier # toy sample @@ -451,3 +453,27 @@ def test_plot_partial_dependence_input(): plot_partial_dependence, lr, X, features=[123], feature_names=['blah']) + + +@pytest.mark.skip('Passing non-constant init fails. Wait for PR #12436 ' + 'to be merged to un-skip this test') +def test_warning_recursion_non_constant_init(): + # make sure that passing a non-constant init parameter to a GBDT and using + # recursion method yields a warning. + + gbc = GradientBoostingClassifier(init=DummyClassifier(), random_state=0) + gbc.fit(X, y) + + assert_warns_message( + UserWarning, + 'Using recursion method with a non-constant init predictor', + plot_partial_dependence, + gbc, X, [0], method='recursion' + ) + + assert_warns_message( + UserWarning, + 'Using recursion method with a non-constant init predictor', + partial_dependence, + gbc, [0], X=X, method='recursion' + ) From 34edf8fb533dbd28548805f7a1726512155be30d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 7 Dec 2018 12:06:13 -0500 Subject: [PATCH 063/113] Removed useless train_test_split from example Also renamed clf into est (it's a regressor) --- examples/plot_partial_dependence.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/examples/plot_partial_dependence.py b/examples/plot_partial_dependence.py index 203a76bacbd8a..5188378530851 100644 --- a/examples/plot_partial_dependence.py +++ b/examples/plot_partial_dependence.py @@ -50,7 +50,6 @@ from mpl_toolkits.mplot3d import Axes3D -from sklearn.model_selection import train_test_split from sklearn.ensemble import GradientBoostingRegressor from sklearn.partial_dependence import plot_partial_dependence from sklearn.partial_dependence import partial_dependence @@ -60,25 +59,20 @@ def main(): cal_housing = fetch_california_housing() - # split 80/20 train-test - X_train, X_test, y_train, y_test = train_test_split(cal_housing.data, - cal_housing.target, - test_size=0.2, - random_state=1) + X, y = cal_housing.data, cal_housing.target names = cal_housing.feature_names print("Training GBRT...") - clf = GradientBoostingRegressor(n_estimators=100, max_depth=4, + est = GradientBoostingRegressor(n_estimators=100, max_depth=4, learning_rate=0.1, loss='huber', random_state=1) - clf.fit(X_train, y_train) + est.fit(X, y) print(" done.") print('Convenience plot with ``partial_dependence_plots``') features = [0, 5, 1, 2, (5, 1)] - fig, axs = plot_partial_dependence(clf, X_train, features, - feature_names=names, + fig, axs = plot_partial_dependence(est, X, features, feature_names=names, n_jobs=3, grid_resolution=50) fig.suptitle('Partial dependence of house value on nonlocation features\n' 'for the California housing dataset') @@ -88,8 +82,8 @@ def main(): fig = plt.figure() target_feature = (1, 5) - pdp, axes = partial_dependence(clf, target_feature, - X=X_train, 
grid_resolution=50)
+ pdp, axes = partial_dependence(est, target_feature, X=X,
+ grid_resolution=50)
 XX, YY = np.meshgrid(axes[0], axes[1])
 Z = pdp[0].reshape(list(map(np.size, axes))).T
 ax = Axes3D(fig)

From 4975dc90a1f5d75d89a4c18f219609fb14002079 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Mon, 17 Dec 2018 15:04:51 -0500
Subject: [PATCH 064/113] put back old versions in
 ensemble/partial_dependence.py to remove grid param

---
 sklearn/ensemble/partial_dependence.py | 289 +++++++++++++++++++++++--
 1 file changed, 265 insertions(+), 24 deletions(-)

diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py
index 9ff863b92ea28..a8a904fe00405 100644
--- a/sklearn/ensemble/partial_dependence.py
+++ b/sklearn/ensemble/partial_dependence.py
@@ -3,8 +3,76 @@
 # Authors: Peter Prettenhofer
 # License: BSD 3 clause

+# Note: functions here are deprecated. We don't call the new versions because
+# the API slightly changes (namely partial_dependence does not have the grid
+# parameter anymore).
+
+from itertools import count
+import numbers
+
+import numpy as np
+from scipy.stats.mstats import mquantiles
+
+from ..utils.extmath import cartesian
+from ..utils._joblib import Parallel, delayed
+from ..externals import six
+from ..externals.six.moves import map, range, zip
+from ..utils import check_array
+from ..utils.validation import check_is_fitted
+from ..tree._tree import DTYPE
 from ..utils import deprecated

+from ._gradient_boosting import _partial_dependence_tree
+from .gradient_boosting import BaseGradientBoosting
+
+
+def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100):
+ """Generate a grid of points based on the ``percentiles`` of ``X``.
+
+ The grid is generated by placing ``grid_resolution`` equally
+ spaced points between the ``percentiles`` of each column
+ of ``X``.
+
+ Parameters
+ ----------
+ X : ndarray
+ The data
+ percentiles : tuple of floats
+ The percentiles which are used to construct the extreme
+ values of the grid axes.
+ grid_resolution : int
+ The number of equally spaced points that are placed
+ on the grid.
+
+ Returns
+ -------
+ grid : ndarray
+ All data points on the grid; ``grid.shape[1] == X.shape[1]``
+ and ``grid.shape[0] == grid_resolution * X.shape[1]``.
+ axes : seq of ndarray
+ The axes with which the grid has been created.
+ """
+ if len(percentiles) != 2:
+ raise ValueError('percentile must be tuple of len 2')
+ if not all(0. <= x <= 1. 
for x in percentiles): + raise ValueError('percentile values must be in [0, 1]') + + axes = [] + emp_percentiles = mquantiles(X, prob=percentiles, axis=0) + for col in range(X.shape[1]): + uniques = np.unique(X[:, col]) + if uniques.shape[0] < grid_resolution: + # feature has low resolution use unique vals + axis = uniques + else: + # create axis based on percentiles and grid resolution + axis = np.linspace(emp_percentiles[0, col], + emp_percentiles[1, col], + num=grid_resolution, endpoint=True) + axes.append(axis) + + return cartesian(axes), axes + @deprecated("The function ensemble.partial_dependence has been moved to " "partial_dependence.partial_dependence in 0.21 and will " @@ -65,14 +133,47 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) """ - from ..partial_dependence import partial_dependence as new_pd - return new_pd(est=gbrt, - target_variables=target_variables, - grid=grid, - X=X, - percentiles=percentiles, - grid_resolution=grid_resolution, - method='recursion') + if not isinstance(gbrt, BaseGradientBoosting): + raise ValueError('gbrt has to be an instance of BaseGradientBoosting') + check_is_fitted(gbrt, 'estimators_') + if (grid is None and X is None) or (grid is not None and X is not None): + raise ValueError('Either grid or X must be specified') + + target_variables = np.asarray(target_variables, dtype=np.int32, + order='C').ravel() + + if any([not (0 <= fx < gbrt.n_features_) for fx in target_variables]): + raise ValueError('target_variables must be in [0, %d]' + % (gbrt.n_features_ - 1)) + + if X is not None: + X = check_array(X, dtype=DTYPE, order='C') + grid, axes = _grid_from_X(X[:, target_variables], percentiles, + grid_resolution) + else: + assert grid is not None + # dont return axes if grid is given + axes = None + # grid must be 2d + if grid.ndim == 1: + grid = grid[:, np.newaxis] + if grid.ndim != 2: + raise ValueError('grid must be 2d but is %dd' % grid.ndim) + + grid = np.asarray(grid, dtype=DTYPE, order='C') + assert grid.shape[1] == target_variables.shape[0] + + n_trees_per_stage = gbrt.estimators_.shape[1] + n_estimators = gbrt.estimators_.shape[0] + pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64, + order='C') + for stage in range(n_estimators): + for k in range(n_trees_per_stage): + tree = gbrt.estimators_[stage, k].tree_ + _partial_dependence_tree(tree, grid, target_variables, + gbrt.learning_rate, pdp[k]) + + return pdp, axes @deprecated("The function ensemble.plot_partial_dependence has been " @@ -158,19 +259,159 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP ... 
""" - from ..partial_dependence import plot_partial_dependence as new_ppd - return new_ppd(est=gbrt, - X=X, - features=features, - feature_names=feature_names, - target=label, - n_cols=n_cols, - grid_resolution=grid_resolution, - method='recursion', - percentiles=percentiles, - n_jobs=n_jobs, - verbose=verbose, - ax=ax, - line_kw=line_kw, - contour_kw=contour_kw, - **fig_kw) + import matplotlib.pyplot as plt + from matplotlib import transforms + from matplotlib.ticker import MaxNLocator + from matplotlib.ticker import ScalarFormatter + + if not isinstance(gbrt, BaseGradientBoosting): + raise ValueError('gbrt has to be an instance of BaseGradientBoosting') + check_is_fitted(gbrt, 'estimators_') + + # set label_idx for multi-class GBRT + if hasattr(gbrt, 'classes_') and np.size(gbrt.classes_) > 2: + if label is None: + raise ValueError('label is not given for multi-class PDP') + label_idx = np.searchsorted(gbrt.classes_, label) + if gbrt.classes_[label_idx] != label: + raise ValueError('label %s not in ``gbrt.classes_``' % str(label)) + else: + # regression and binary classification + label_idx = 0 + + X = check_array(X, dtype=DTYPE, order='C') + if gbrt.n_features_ != X.shape[1]: + raise ValueError('X.shape[1] does not match gbrt.n_features_') + + if line_kw is None: + line_kw = {'color': 'green'} + if contour_kw is None: + contour_kw = {} + + # convert feature_names to list + if feature_names is None: + # if not feature_names use fx indices as name + feature_names = [str(i) for i in range(gbrt.n_features_)] + elif isinstance(feature_names, np.ndarray): + feature_names = feature_names.tolist() + + def convert_feature(fx): + if isinstance(fx, six.string_types): + try: + fx = feature_names.index(fx) + except ValueError: + raise ValueError('Feature %s not in feature_names' % fx) + return fx + + # convert features into a seq of int tuples + tmp_features = [] + for fxs in features: + if isinstance(fxs, (numbers.Integral,) + six.string_types): + fxs = (fxs,) + try: + fxs = np.array([convert_feature(fx) for fx in fxs], dtype=np.int32) + except TypeError: + raise ValueError('features must be either int, str, or tuple ' + 'of int/str') + if not (1 <= np.size(fxs) <= 2): + raise ValueError('target features must be either one or two') + + tmp_features.append(fxs) + + features = tmp_features + + names = [] + try: + for fxs in features: + l = [] + # explicit loop so "i" is bound for exception below + for i in fxs: + l.append(feature_names[i]) + names.append(l) + except IndexError: + raise ValueError('All entries of features must be less than ' + 'len(feature_names) = {0}, got {1}.' 
+ .format(len(feature_names), i)) + + # compute PD functions + pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( + delayed(partial_dependence)(gbrt, fxs, X=X, + grid_resolution=grid_resolution, + percentiles=percentiles) + for fxs in features) + + # get global min and max values of PD grouped by plot type + pdp_lim = {} + for pdp, axes in pd_result: + min_pd, max_pd = pdp[label_idx].min(), pdp[label_idx].max() + n_fx = len(axes) + old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) + min_pd = min(min_pd, old_min_pd) + max_pd = max(max_pd, old_max_pd) + pdp_lim[n_fx] = (min_pd, max_pd) + + # create contour levels for two-way plots + if 2 in pdp_lim: + Z_level = np.linspace(*pdp_lim[2], num=8) + + if ax is None: + fig = plt.figure(**fig_kw) + else: + fig = ax.get_figure() + fig.clear() + + n_cols = min(n_cols, len(features)) + n_rows = int(np.ceil(len(features) / float(n_cols))) + axs = [] + for i, fx, name, (pdp, axes) in zip(count(), features, names, + pd_result): + ax = fig.add_subplot(n_rows, n_cols, i + 1) + + if len(axes) == 1: + ax.plot(axes[0], pdp[label_idx].ravel(), **line_kw) + else: + # make contour plot + assert len(axes) == 2 + XX, YY = np.meshgrid(axes[0], axes[1]) + Z = pdp[label_idx].reshape(list(map(np.size, axes))).T + CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, + colors='k') + ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], + vmin=Z_level[0], alpha=0.75, **contour_kw) + ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True) + + # plot data deciles + axes labels + deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1)) + trans = transforms.blended_transform_factory(ax.transData, + ax.transAxes) + ylim = ax.get_ylim() + ax.vlines(deciles, [0], 0.05, transform=trans, color='k') + ax.set_xlabel(name[0]) + ax.set_ylim(ylim) + + # prevent x-axis ticks from overlapping + ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower')) + tick_formatter = ScalarFormatter() + tick_formatter.set_powerlimits((-3, 4)) + ax.xaxis.set_major_formatter(tick_formatter) + + if len(axes) > 1: + # two-way PDP - y-axis deciles + labels + deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1)) + trans = transforms.blended_transform_factory(ax.transAxes, + ax.transData) + xlim = ax.get_xlim() + ax.hlines(deciles, [0], 0.05, transform=trans, color='k') + ax.set_ylabel(name[1]) + # hline erases xlim + ax.set_xlim(xlim) + else: + ax.set_ylabel('Partial dependence') + + if len(axes) == 1: + ax.set_ylim(pdp_lim[1]) + axs.append(ax) + + fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4, + hspace=0.3) + return fig, axs From 995f4e91f62a2a9d513616e944195adbc00a6158 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 17 Dec 2018 15:18:12 -0500 Subject: [PATCH 065/113] Removed grid param from partial_dependence() --- doc/modules/partial_dependence.rst | 12 +++--- sklearn/partial_dependence.py | 52 +++++++----------------- sklearn/tests/test_partial_dependence.py | 29 +++---------- 3 files changed, 24 insertions(+), 69 deletions(-) diff --git a/doc/modules/partial_dependence.rst b/doc/modules/partial_dependence.rst index 71ccdba1ca75d..517d75d9c2390 100644 --- a/doc/modules/partial_dependence.rst +++ b/doc/modules/partial_dependence.rst @@ -79,13 +79,11 @@ the plots, you can use the :func:`partial_dependence` function:: >>> axes # doctest: +ELLIPSIS [array([-1.624..., -1.592..., ... 
-You can specify the argument ``grid`` which defines the values of the target
-features on which the partial dependence function should be evaluated, or
-the argument ``X`` which is a convenience mode for automatically creating
-the grid from the training data. If ``grid`` is not specified, the
-``values`` field returned by :func:`partial_dependence` gives the actual
-values used in the grid for each target feature. They also correspond to the
-axis of the plots.
+The values at which the partial dependence should be evaluated are directly
+generated from ``X``. For 2-way partial dependence, a 2D-grid of values is
+generated. The ``values`` field returned by :func:`partial_dependence` gives
+the actual values used in the grid for each target feature. They also
+correspond to the axis of the plots.

 For each value of the 'target' features in the ``grid`` the partial
 dependence function needs to marginalize the predictions of the estimator
diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py
index 03840930dcc28..76a3d4b917d5e 100644
--- a/sklearn/partial_dependence.py
+++ b/sklearn/partial_dependence.py
@@ -159,9 +159,8 @@ def _partial_dependence_recursion(est, grid, target_variables):
 return averaged_predictions


-def partial_dependence(est, target_variables, grid=None, X=None,
- percentiles=(0.05, 0.95), grid_resolution=100,
- method='auto'):
+def partial_dependence(est, target_variables, X, percentiles=(0.05, 0.95),
+ grid_resolution=100, method='auto'):
 """Partial dependence of ``target_variables``.

 .. _warning_recursion_init:
@@ -185,27 +184,22 @@ def partial_dependence(est, target_variables, grid=None, X=None,
 target_variables : list or array-like of int
 The target features for which the partial dependency should be
 computed.
- grid : array-like, shape=(n_points, len(target_variables)), optional
- The grid of values for which the partial dependence should be
- evaluated. If `None`, the grid will be generated from the values in
- ``X``.
 X : array-like, shape=(n_samples, n_features)
- ``X`` is used both to generate a grid for the ``target_variables``
- (if ``grid`` is None), and to compute the averaged predictions for
- the 'brute' method. Optional if ``grid`` is not None and ``method``
- is 'recursion'.
+ ``X`` is used both to generate a grid of values for the
+ ``target_variables``, and to compute the averaged predictions when
+ method is 'brute'.
 percentiles : tuple of float, optional (default=(0.05, 0.95))
 The lower and upper percentile used to create the extreme values
- for the ``grid``. Only used if ``grid`` is None.
+ for the ``grid``.
 grid_resolution : int, optional (default=100)
 The number of equally spaced points on the grid, for each target
- feature. Only used if ``grid`` is None.
+ feature.
 method : str, optional (default='auto')
 The method used to calculate the averaged predictions:

 - 'recursion' is only supported for objects inheriting from
 `BaseGradientBoosting`, but is more efficient in terms of speed.
- With this method, ``X`` is optional and is only used to build the
+ With this method, ``X`` is only used to build the
 grid. This method does not account for the ``init`` predictor of
 the boosting process, which may lead to incorrect values (see
 :ref:`this warning`).
@@ -226,10 +220,10 @@ def partial_dependence(est, target_variables, grid=None, X=None,
 a multi-class setting, or to the number of tasks for multi-output
 regression. For classical regression and binary classification
 ``n_targets==1``. 
- values : seq of ndarray or None - The values with which the grid has been created, or None if - the grid has been given. The generated grid is a cartesian product - of the arrays in ``values`` + values : seq of ndarray + The values with which the grid has been created. The generated grid + is a cartesian product of the arrays in ``values``. ``len(values) == + len(target_variables)``. Examples -------- @@ -287,26 +281,8 @@ def partial_dependence(est, target_variables, grid=None, X=None, raise ValueError('all target_variables must be in [0, %d]' % (n_features - 1)) - if (grid is None and X is None): - raise ValueError('Either grid or X must be specified.') - - if grid is None: - grid, values = _grid_from_X(X[:, target_variables], percentiles, - grid_resolution) - else: - grid = np.asarray(grid) - values = None # don't return values if grid is given - # grid must be 2d - if grid.ndim == 1: - grid = grid[:, np.newaxis] - if grid.ndim != 2: - raise ValueError('grid must be 1d or 2d, got %dd dimensions' % - grid.ndim) - if grid.shape[1] != target_variables.shape[0]: - raise ValueError('grid.shape[1] ({}) must be equal to the number ' - 'of target variables ({})'.format( - grid.shape[1], target_variables.shape[0])) - + grid, values = _grid_from_X(X[:, target_variables], percentiles, + grid_resolution) if method == 'brute': averaged_predictions = _partial_dependence_brute(est, grid, target_variables, X) diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 99526920358b5..961763c74e6ee 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -265,23 +265,20 @@ def test_partial_dependence_input(): assert_raises_regex(ValueError, "est must be a fitted regressor or classifier", - partial_dependence, KMeans(), [0]) + partial_dependence, KMeans(), [0], X) assert_raises_regex(ValueError, "method blahblah is invalid. 
Accepted method names "
 "are brute, recursion, auto.",
- partial_dependence, lr, [0], method='blahblah')
+ partial_dependence, lr, [0], X, method='blahblah')

 assert_raises_regex(ValueError,
 'est must be an instance of BaseGradientBoosting '
 'for the "recursion" method',
- partial_dependence, lr, [0], method='recursion')
-
- assert_raises_regex(ValueError, "X is required for brute method",
- partial_dependence, lr, [0], grid=[[[1]]])
+ partial_dependence, lr, [0], X, method='recursion')

 assert_raises_regex(ValueError, "est requires a predict_proba()",
- partial_dependence, SVC(), [0], X=X)
+ partial_dependence, SVC(), [0], X)

 for feature in (-1, 1000000):
 for est in (lr, gbc):
@@ -289,19 +286,6 @@ def test_partial_dependence_input():
 "all target_variables must be in",
 partial_dependence, est, [feature], X=X)

- assert_raises_regex(ValueError, "Either grid or X must be specified",
- partial_dependence, gbc, [0], grid=None, X=None)
-
- assert_raises_regex(ValueError, "grid must be 1d or 2d",
- partial_dependence, lr, [0], grid=[[[1]]], X=X)
-
- for target_variables in ([0], [0, 1, 0]):
- assert_raises_regex(ValueError,
- r'grid.shape\[1\] \(2\) must be equal '
- r'to the number of target variables',
- partial_dependence, lr, target_variables,
- grid=[[30, -123]], X=X)
-
 for unfitted_est in (LinearRegression(), GradientBoostingRegressor()):
 assert_raises_regex(ValueError,
 'est parameter must be a fitted estimator',
@@ -309,10 +293,7 @@ def test_partial_dependence_input():

 # check that array-like objects are accepted
 for est in (lr, gbc):
- partial_dependence(est, [0], grid=[1, 2], X=list(X))
- partial_dependence(est, [0], grid=[[1], [2]], X=list(X))
-
- partial_dependence(gbc, [0], grid=[1, 2])
+ partial_dependence(est, [0], X=list(X))

From 7a8fb447dde19991371fb665e08e278162564060 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Wed, 19 Dec 2018 16:22:57 -0500
Subject: [PATCH 066/113] Added MLPRegressor to example

---
 examples/plot_partial_dependence.py | 53 ++++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 13 deletions(-)

diff --git a/examples/plot_partial_dependence.py b/examples/plot_partial_dependence.py
index 5188378530851..4470620dfd65e 100644
--- a/examples/plot_partial_dependence.py
+++ b/examples/plot_partial_dependence.py
@@ -11,13 +11,15 @@
 important features.

 This example shows how to obtain partial dependence plots from a
-:class:`~sklearn.ensemble.GradientBoostingRegressor` trained on the California
-housing dataset. The example is taken from [1]_.
+:class:`~sklearn.neural_network.MLPRegressor` and a
+:class:`~sklearn.ensemble.GradientBoostingRegressor` trained on the
+California housing dataset. The example is taken from [1]_.

-The plot shows four one-way and one two-way partial dependence plots.
-The target variables for the one-way PDP are:
-median income (`MedInc`), average occupants per household (`AvgOccup`),
-median house age (`HouseAge`), and average rooms per household (`AveRooms`).
+The plots show four 1-way and one 2-way partial dependence plots (omitted for
+:class:`~sklearn.neural_network.MLPRegressor` due to computation time).
+The target variables for the one-way PDP are: median income (`MedInc`),
+average occupants per household (`AvgOccup`), median house age (`HouseAge`),
+and average rooms per household (`AveRooms`). 
We can clearly see that the median house price shows a linear relationship with the median income (top left) and that the house price drops when the @@ -28,6 +30,15 @@ The tick marks on the x-axis represent the deciles of the feature values in the training data. +We also observe that :class:`~sklearn.neural_network.MLPRegressor` has much +smoother predictions than +:class:`~sklearn.ensemble.GradientBoostingRegressor`. For the plots to be +comparable, it is necessary to subtract the average value of the target +``y``: The 'recursion' method, used by default for +:class:`~sklearn.ensemble.GradientBoostingRegressor`, does not account for +the initial predictor (in our case the average target). Setting the target +average to 0 avoids this bias. + Partial dependence plots with two target features enable us to visualize interactions among them. The two-way partial dependence plot shows the dependence of median house price on joint values of house age and average @@ -36,6 +47,9 @@ nearly independent of the house age, whereas for values less than two there is a strong dependence on age. +On a third figure, we have plotted the same partial dependence plot, this time +in 3 dimensions. + .. [1] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical Learning Ed. 2", Springer, 2009. @@ -51,6 +65,7 @@ from mpl_toolkits.mplot3d import Axes3D from sklearn.ensemble import GradientBoostingRegressor +from sklearn.neural_network import MLPRegressor from sklearn.partial_dependence import plot_partial_dependence from sklearn.partial_dependence import partial_dependence from sklearn.datasets.california_housing import fetch_california_housing @@ -62,21 +77,33 @@ def main(): X, y = cal_housing.data, cal_housing.target names = cal_housing.feature_names + # Center target to avoid GBDT init bias: GBDT with 'recursion' method does + # not account for the initial estimator (here the average target) + y -= y.mean() + + print("Training MLPRegressor...") + est = MLPRegressor(activation='logistic') + est.fit(X, y) + print('Computing partial dependence plots...') + features = [0, 5, 1, 2] + fig, axs = plot_partial_dependence(est, X, features, feature_names=names, + n_jobs=3, grid_resolution=50) + fig.suptitle('Partial dependence of house value on nonlocation features\n' + 'for the California housing dataset, with MLPRegressor') + plt.subplots_adjust(top=0.9) # tight_layout causes overlap with suptitle + print("Training GBRT...") est = GradientBoostingRegressor(n_estimators=100, max_depth=4, learning_rate=0.1, loss='huber', random_state=1) est.fit(X, y) - print(" done.") - - print('Convenience plot with ``partial_dependence_plots``') - + print('Computing partial dependence plots...') features = [0, 5, 1, 2, (5, 1)] fig, axs = plot_partial_dependence(est, X, features, feature_names=names, n_jobs=3, grid_resolution=50) fig.suptitle('Partial dependence of house value on nonlocation features\n' - 'for the California housing dataset') - plt.subplots_adjust(top=0.9) # tight_layout causes overlap with suptitle + 'for the California housing dataset, with Gradient Boosting') + plt.subplots_adjust(top=0.9) print('Custom 3d plot via ``partial_dependence``') fig = plt.figure() @@ -96,7 +123,7 @@ def main(): ax.view_init(elev=22, azim=122) plt.colorbar(surf) plt.suptitle('Partial dependence of house value on median\n' - 'age and average occupancy') + 'age and average occupancy, with Gradient Boosting') plt.subplots_adjust(top=0.9) plt.show() From 2f67a35e7314f08967868d5197fc320333407aab Mon Sep 17 00:00:00 2001 From: Nicolas 
Hug
Date: Wed, 19 Dec 2018 16:31:51 -0500
Subject: [PATCH 067/113] Removed ax param and used fig instead

---
 sklearn/partial_dependence.py            | 10 +++++-----
 sklearn/tests/test_partial_dependence.py | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py
index 76a3d4b917d5e..bde312adff7f5 100644
--- a/sklearn/partial_dependence.py
+++ b/sklearn/partial_dependence.py
@@ -296,7 +296,7 @@ def partial_dependence(est, target_variables, X, percentiles=(0.05, 0.95),
 def plot_partial_dependence(est, X, features, feature_names=None,
                             target=None, n_cols=3, grid_resolution=100,
                             percentiles=(0.05, 0.95), method='auto',
-                            n_jobs=1, verbose=0, ax=None, line_kw=None,
+                            n_jobs=1, verbose=0, fig=None, line_kw=None,
                             contour_kw=None, **fig_kw):
     """Partial dependence plots.
 
@@ -375,8 +375,9 @@ def plot_partial_dependence(est, X, features, feature_names=None,
         See :term:`Glossary ` for more details.
     verbose : int, optional (default=0)
         Verbose output during PD computations.
-    ax : Matplotlib axis object, optional (default=None)
-        An axis object onto which the plots will be drawn.
+    fig : Matplotlib figure object, optional (default=None)
+        A figure object onto which the plots will be drawn, after the figure
+        has been cleared.
     line_kw : dict, optional
         Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.
         For one-way partial dependence plots.
@@ -510,10 +511,9 @@ def convert_feature(fx):
     if 2 in pdp_lim:
         Z_level = np.linspace(*pdp_lim[2], num=8)
 
-    if ax is None:
+    if fig is None:
         fig = plt.figure(**fig_kw)
     else:
-        fig = ax.get_figure()
         fig.clear()
 
     if line_kw is None:
diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py
index 961763c74e6ee..d266b8696cd1e 100644
--- a/sklearn/tests/test_partial_dependence.py
+++ b/sklearn/tests/test_partial_dependence.py
@@ -458,3 +458,21 @@ def test_warning_recursion_non_constant_init():
         partial_dependence,
         gbc, [0], X=X, method='recursion'
     )
+
+
+@if_matplotlib
+def test_plot_partial_dependence_fig():
+    # Make sure fig object is correctly used if not None
+
+    import matplotlib.pyplot as plt
+
+    (X, y), _ = regression()
+    clf = LinearRegression()
+    clf.fit(X, y)
+
+    fig = plt.figure()
+    grid_resolution = 25
+    returned_fig, axs = plot_partial_dependence(
+        clf, X, [0, 1], target=0, grid_resolution=grid_resolution, fig=fig)
+
+    assert returned_fig is fig

From 2e1f926d79d3155801c4d672cfcefe269ec31b6c Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Wed, 9 Jan 2019 16:19:12 -0500
Subject: [PATCH 068/113] Addressed comments

---
 doc/whats_new/v0.21.rst                  |   4 +-
 examples/plot_partial_dependence.py      |   2 +-
 sklearn/partial_dependence.py            | 154 ++++++++++++-----------
 sklearn/tests/test_partial_dependence.py |  21 ++--
 4 files changed, 96 insertions(+), 85 deletions(-)

diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 4c8fe03746ff3..ac552c7d0e0d7 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -211,8 +211,8 @@ Support for Python 3.4 and below has been officially dropped.
 - |Feature| Partial dependence plots
   (:func:`partial_dependence.plot_partial_dependence`) are now supported for
   any regressor or classifier (provided that they have a `predict_proba`
-  method). :issue:`12599` by :user:`Trevor Stephens` and
-  :user:`Nicolas Hug`.
+  method). :issue:`12599` by :user:`Trevor Stephens ` and
+  :user:`Nicolas Hug `.
 
 Multiple modules
 ................
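[Editor's note] The changelog entry above advertises partial dependence plots
for any fitted regressor or classifier, and patch 067 just made
``plot_partial_dependence`` reuse a caller-supplied figure. A minimal usage
sketch of both behaviours, assuming the in-progress
``sklearn.partial_dependence`` module exactly as it stands at this point in
the series; this is an illustration only, not part of any patch::

    import matplotlib.pyplot as plt
    from sklearn.datasets import make_regression
    from sklearn.linear_model import LinearRegression
    from sklearn.partial_dependence import plot_partial_dependence

    X, y = make_regression(n_features=3, random_state=0)
    est = LinearRegression().fit(X, y)  # any fitted estimator, not only GBDTs

    # Draw onto an existing figure: it is cleared first, then returned
    # alongside the axes (mirrors test_plot_partial_dependence_fig above).
    fig = plt.figure(figsize=(8, 3))
    returned_fig, axs = plot_partial_dependence(est, X, features=[0, 1],
                                                fig=fig, grid_resolution=25)
    assert returned_fig is fig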
diff --git a/examples/plot_partial_dependence.py b/examples/plot_partial_dependence.py
index 4470620dfd65e..b2f8780552475 100644
--- a/examples/plot_partial_dependence.py
+++ b/examples/plot_partial_dependence.py
@@ -112,7 +112,7 @@ def main():
         pdp, axes = partial_dependence(est, target_feature,
                                        X=X, grid_resolution=50)
         XX, YY = np.meshgrid(axes[0], axes[1])
-        Z = pdp[0].reshape(list(map(np.size, axes))).T
+        Z = pdp[0].T
         ax = Axes3D(fig)
         surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                                cmap=plt.cm.BuPu, edgecolor='k')
diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py
index bde312adff7f5..5aa140267726b 100644
--- a/sklearn/partial_dependence.py
+++ b/sklearn/partial_dependence.py
@@ -16,7 +16,6 @@
 from .utils.extmath import cartesian
 from .externals.joblib import Parallel, delayed
 from .externals import six
-from .externals.six.moves import map, range, zip
 from .utils import check_array
 from .utils.validation import check_is_fitted
 from .tree._tree import DTYPE
@@ -29,33 +28,34 @@
 
 
 def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100):
-    """Generate a grid of points based on the ``percentiles of ``X``.
+    """Generate a grid of points based on the percentiles of X.
 
-    The grid is a cartesian product between the columns of Z. The ith column of
-    Z consists in ``grid_resolution`` equally-spaced points between the
-    percentiles of the ith column of X.
+    The grid is a cartesian product between the columns of ``values``. The
+    jth column of ``values`` consists of ``grid_resolution`` equally-spaced
+    points between the percentiles of the jth column of X.
     If ``grid_resolution`` is bigger than the number of unique values in the
-    ith column of X, then those unique values will be used instead.
+    jth column of X, then those unique values will be used instead.
 
     Parameters
     ----------
-    X : ndarray
+    X : ndarray, shape=(n_samples, n_target_features)
        The data
    percentiles : tuple of floats
        The percentiles which are used to construct the extreme values of
-        the grid.
+        the grid. Must be in [0, 1].
    grid_resolution : int
-        The number of equally spaced points to be placed on the grid for a
-        given column.
+        The number of equally spaced points to be placed on the grid for each
+        feature.

    Returns
    -------
    grid : ndarray, shape=(n_points, X.shape[1])
-        All data points on the grid. n_points is always ``<= grid_resolution **
-        X.shape[1]``.
-    Z: list of ndarray
-        The values with which the grid has been created. The ndarrays may be of
-        different shape: either (grid_resolution,) or (n_unique_values,).
+        A value for each feature at each point in the grid. ``n_points`` is
+        always ``<= grid_resolution ** X.shape[1]``.
+    values : list of 1d ndarrays
+        The values with which the grid has been created. The size of each
+        array ``values[j]`` is either ``grid_resolution``, or the number of
+        unique values in ``X[:, j]``, whichever is smaller.
""" try: assert len(percentiles) == 2 @@ -91,8 +91,7 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): return cartesian(values), values -def _partial_dependence_recursion(est, grid, target_variables): - +def _partial_dependence_recursion(est, grid, features): if est.init is not None: warnings.warn( 'Using recursion method with a non-constant init predictor will ' @@ -111,18 +110,17 @@ def _partial_dependence_recursion(est, grid, target_variables): for stage in range(n_estimators): for k in range(n_trees_per_stage): tree = est.estimators_[stage, k].tree_ - _partial_dependence_tree(tree, grid, target_variables, + _partial_dependence_tree(tree, grid, features, learning_rate, averaged_predictions[k]) return averaged_predictions -def _partial_dependence_brute(est, grid, target_variables, X): - +def _partial_dependence_brute(est, grid, features, X): averaged_predictions = [] for new_values in grid: X_eval = X.copy() - for i, variable in enumerate(target_variables): + for i, variable in enumerate(features): X_eval[:, variable] = new_values[i] try: @@ -159,20 +157,13 @@ def _partial_dependence_brute(est, grid, target_variables, X): return averaged_predictions -def partial_dependence(est, target_variables, X, percentiles=(0.05, 0.95), +def partial_dependence(est, features, X, percentiles=(0.05, 0.95), grid_resolution=100, method='auto'): - """Partial dependence of ``target_variables``. - - .. _warning_recursion_init: + """Partial dependence of ``features``. - .. warning:: - The 'recursion' method only works for gradient boosting estimators, - and unlike the 'brute' method, it does not account for the ``init`` - predictor of the boosting process. In practice this will produce the - same values as 'brute' up to a constant offset in the target - response, provided that ``init`` is a consant estimator (which is - the default). However, as soon as ``init`` is not a constant - estimator, the partial dependence values are incorrect. + Partial dependence of a feature (or a set of features) corresponds to + the average response of an estimator for each possible value of the + feature. Read more in the :ref:`User Guide `. @@ -181,16 +172,16 @@ def partial_dependence(est, target_variables, X, percentiles=(0.05, 0.95), est : BaseEstimator A fitted classification or regression model. Multioutput-multiclass classifiers are not supported. - target_variables : list or array-like of int + features : list or array-like of int The target features for which the partial dependency should be computed. X : array-like, shape=(n_samples, n_features) ``X`` is used both to generate a grid of values for the - ``target_variables``, and to compute the averaged predictions when + ``features``, and to compute the averaged predictions when method is 'brute'. percentiles : tuple of float, optional (default=(0.05, 0.95)) The lower and upper percentile used to create the extreme values - for the ``grid``. + for the grid. Must be in [0, 1]. grid_resolution : int, optional (default=100) The number of equally spaced points on the grid, for each target feature. @@ -213,27 +204,44 @@ def partial_dependence(est, target_variables, X, percentiles=(0.05, 0.95), Returns ------- - averaged_predictions : array, shape=(n_targets, n_points) - The predictions for all the points in the ``grid``, averaged over - all samples in X (or over the training data if ``method`` is - `recursion`). 
``n_targets`` corresponds to the number of classes in
-        a multi-class setting, or to the number of tasks for multi-output
-        regression. For classical regression and binary classification
-        ``n_targets==1``.
-    values : seq of ndarray
+    averaged_predictions : array, \
+            shape=(n_outputs, n_values_feature_0, n_values_feature_1, ...)
+        The predictions for all the points in the grid, averaged over all
+        samples in X (or over the training data if ``method`` is
+        'recursion'). ``n_outputs`` corresponds to the number of classes in
+        a multi-class setting, or to the number of tasks for multi-output
+        regression. For classical regression and binary classification
+        ``n_outputs==1``. ``n_values_feature_j`` corresponds to the size of
+        ``values[j]``.
+    values : seq of 1d ndarrays
         The values with which the grid has been created. The generated grid
         is a cartesian product of the arrays in ``values``. ``len(values) ==
-        len(target_variables)``.
+        len(features)``. The size of each array ``values[j]`` is either
+        ``grid_resolution``, or the number of unique values in ``X[:, j]``,
+        whichever is smaller.
 
     Examples
     --------
-    >>> samples = [[0, 0, 2], [1, 0, 0]]
-    >>> labels = [0, 1]
+    >>> X = [[0, 0, 2], [1, 0, 0]]
+    >>> y = [0, 1]
     >>> from sklearn.ensemble import GradientBoostingClassifier
-    >>> gb = GradientBoostingClassifier(random_state=0).fit(samples, labels)
-    >>> kwargs = dict(X=samples, percentiles=(0, 1), grid_resolution=2)
-    >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP
+    >>> gb = GradientBoostingClassifier(random_state=0).fit(X, y)
+    >>> partial_dependence(gb, features=[0], X=X, percentiles=(0, 1),
+    ...                    grid_resolution=2) # doctest: +SKIP
     (array([[-4.52...,  4.52...]]), [array([ 0.,  1.])])
+
+    .. _warning_recursion_init:
+
+    Warnings
+    --------
+    The 'recursion' method only works for gradient boosting estimators, and
+    unlike the 'brute' method, it does not account for the ``init``
+    predictor of the boosting process. In practice this will produce the
+    same values as 'brute' up to a constant offset in the target response,
+    provided that ``init`` is a constant estimator (which is the default).
+    However, as soon as ``init`` is not a constant estimator, the partial
+    dependence values are incorrect.
+
     """
 
     if not (is_classifier(est) or is_regressor(est)):
@@ -243,8 +251,7 @@ def partial_dependence(est, target_variables, X, percentiles=(0.05, 0.95),
         isinstance(est.classes_[0], np.ndarray)):
         raise ValueError('Multiclass-multioutput estimators are not supported')
 
-    if X is not None:
-        X = check_array(X)
+    X = check_array(X)
 
     accepted_methods = ('brute', 'recursion', 'auto')
     if method not in accepted_methods:
@@ -267,28 +274,30 @@ def partial_dependence(est, target_variables, X, percentiles=(0.05, 0.95),
         msg='est parameter must be a fitted estimator')
         # Note: if method is brute, this check is done at prediction time
         n_features = est.n_features_
-    elif X is None:
-        raise ValueError('X is required for brute method')
     else:
         if is_classifier(est) and not hasattr(est, 'predict_proba'):
             raise ValueError('est requires a predict_proba() method for '
                              'method="brute" for classification.')
         n_features = X.shape[1]
 
-    target_variables = np.asarray(target_variables, dtype=np.int32,
-                                  order='C').ravel()
-    if any(not (0 <= fx < n_features) for fx in target_variables):
-        raise ValueError('all target_variables must be in [0, %d]'
+    features = np.asarray(features, dtype=np.int32, order='C').ravel()
+    if any(not (0 <= f < n_features) for f in features):
+        raise ValueError('all features must be in [0, %d]'
                          % (n_features - 1))
 
-    grid, values = _grid_from_X(X[:, target_variables], percentiles,
+    grid, values = _grid_from_X(X[:, features], percentiles,
                                 grid_resolution)
     if method == 'brute':
         averaged_predictions = _partial_dependence_brute(est, grid,
-                                                         target_variables, X)
+                                                         features, X)
     else:
         averaged_predictions = _partial_dependence_recursion(est, grid,
-                                                             target_variables)
+                                                             features)
+
+    # reshape averaged_predictions to
+    # (n_outputs, n_values_feature_0, n_values_feature_1, ...)
+    averaged_predictions = averaged_predictions.reshape(
+        -1, *[val.shape[0] for val in values])
 
     return averaged_predictions, values
 
@@ -303,17 +312,6 @@ def plot_partial_dependence(est, X, features, feature_names=None,
     The ``len(features)`` plots are arranged in a grid with ``n_cols``
     columns. Two-way partial dependence plots are plotted as contour plots.
 
-    .. _warning_recursion_init_plot:
-
-    .. warning::
-        The 'recursion' method only works for gradient boosting estimators,
-        and unlike the 'brute' method, it does not account for the ``init``
-        predictor of the boosting process. In practice this will produce the
-        same values as 'brute' up to a constant offset in the target
-        response, provided that ``init`` is a consant estimator (which is
-        the default). However, as soon as ``init`` is not a constant
-        estimator, the partial dependence values are incorrect.
-
     Read more in the :ref:`User Guide `.
 
     Parameters
@@ -348,7 +346,7 @@ def plot_partial_dependence(est, X, features, feature_names=None,
     target feature.
     percentiles : tuple of float, optional (default=(0.05, 0.95))
         The lower and upper percentile used to create the extreme values
-        for the PDP axes.
+        for the PDP axes. Must be in [0, 1].
     method : str, optional (default='auto')
         The method to use to calculate the partial dependence predictions:
 
@@ -404,6 +402,18 @@ def plot_partial_dependence(est, X, features, feature_names=None,
     >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)
     >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP
     ...
+
+    .. _warning_recursion_init_plot:
+
+    Warnings
+    --------
+    The 'recursion' method only works for gradient boosting estimators, and
+    unlike the 'brute' method, it does not account for the ``init``
+    predictor of the boosting process. In practice this will produce the
+    same values as 'brute' up to a constant offset in the target response,
+    provided that ``init`` is a constant estimator (which is the default).
+    However, as soon as ``init`` is not a constant estimator, the partial
+    dependence values are incorrect.
     """
     import matplotlib.pyplot as plt
     from matplotlib import transforms
@@ -533,7 +543,7 @@ def convert_feature(fx):
         # make contour plot
         assert len(values) == 2
         XX, YY = np.meshgrid(values[0], values[1])
-        Z = pd[target_idx].reshape(list(map(np.size, values))).T
+        Z = pd[target_idx].T
         CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5,
                         colors='k')
         ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1],
diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py
index d266b8696cd1e..d4f67368242a2 100644
--- a/sklearn/tests/test_partial_dependence.py
+++ b/sklearn/tests/test_partial_dependence.py
@@ -71,9 +71,9 @@ def multioutput_regression():
     (MultiTaskLasso, 'brute', multioutput_regression()),
 ])
 @pytest.mark.parametrize('grid_resolution', (5, 10))
-@pytest.mark.parametrize('target_variables', ([1], [1, 2]))
+@pytest.mark.parametrize('features', ([1], [1, 2]))
 def test_output_shape(Estimator, method, data, grid_resolution,
-                      target_variables):
+                      features):
     # Check that partial_dependence has consistent output shape for different
     # kinds of estimators:
     # - classifiers with binary and multiclass settings
@@ -88,12 +88,13 @@ def test_output_shape(Estimator, method, data, grid_resolution,
     (X, y), n_targets = data
     est.fit(X, y)
 
-    pdp, axes = partial_dependence(est, target_variables=target_variables,
+    pdp, axes = partial_dependence(est, features=features,
                                    X=X, method=method,
                                    grid_resolution=grid_resolution)
 
-    expected_pdp_shape = (n_targets, grid_resolution ** len(target_variables))
-    expected_axes_shape = (len(target_variables), grid_resolution)
+    expected_pdp_shape = (n_targets, *[grid_resolution
+                                       for _ in range(len(features))])
+    expected_axes_shape = (len(features), grid_resolution)
 
     assert pdp.shape == expected_pdp_shape
     assert axes is not None
@@ -177,14 +178,14 @@ def test_partial_dependence_helpers(est, method, target_feature):
     est.fit(X, y)
 
     # target feature will be set to .5 and then to 123
-    target_variables = np.array([target_feature], dtype=np.int32)
+    features = np.array([target_feature], dtype=np.int32)
 
     grid = np.array([[.5],
                      [123]])
 
     if method == 'brute':
-        pdp = _partial_dependence_brute(est, grid, target_variables, X)
+        pdp = _partial_dependence_brute(est, grid, features, X)
     else:
-        pdp = _partial_dependence_recursion(est, grid, target_variables)
+        pdp = _partial_dependence_recursion(est, grid, features)
 
     mean_predictions = []
     for val in (.5, 123):
@@ -216,7 +217,7 @@ def test_partial_dependence_easy_target(est, power):
     est.fit(X, y)
 
     averaged_predictions, values = partial_dependence(
-        est, target_variables=[target_variable], X=X, grid_resolution=1000)
+        est, features=[target_variable], X=X, grid_resolution=1000)
 
     new_X = values[0].reshape(-1, 1)
     new_y = averaged_predictions[0]
@@ -283,7 +284,7 @@ def test_partial_dependence_input():
     for feature in (-1, 1000000):
         for est in (lr, gbc):
             assert_raises_regex(ValueError,
-                                "all target_variables must be in",
+                                "all features must be in",
                                 partial_dependence, est, [feature], X=X)
 
     for
unfitted_est in (LinearRegression(), GradientBoostingRegressor()): From 56ac79ece0e53da03ca30f16c30b6fd4ed49fc8e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 10 Jan 2019 09:11:01 -0500 Subject: [PATCH 069/113] minor docstring change --- sklearn/partial_dependence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 5aa140267726b..fc950f3508919 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -205,7 +205,7 @@ def partial_dependence(est, features, X, percentiles=(0.05, 0.95), Returns ------- averaged_predictions : array, \ - shape=(n_outputs, n_values_feature_0, n_values_feature_1, ...) + shape=(n_outputs, len(values[0]), len(values[1]), ...) The predictions for all the points in the grid, averaged over all samples in X (or over the training data if ``method`` is 'recursion'). ``n_outputs`` corresponds to the number of classes in From 4149a9c08aadce45fa746f99d96fcd5ce2c9d23b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 14 Jan 2019 09:16:25 -0500 Subject: [PATCH 070/113] Addressed comments --- sklearn/ensemble/partial_dependence.py | 9 +-- .../ensemble/tests/test_partial_dependence.py | 5 +- sklearn/partial_dependence.py | 16 +++-- sklearn/tests/test_partial_dependence.py | 66 +++++++++---------- 4 files changed, 47 insertions(+), 49 deletions(-) diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index a8a904fe00405..2efc46bb2b821 100644 --- a/sklearn/ensemble/partial_dependence.py +++ b/sklearn/ensemble/partial_dependence.py @@ -74,9 +74,9 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): return cartesian(axes), axes -@deprecated("The function ensemble.partial_dependence has been moved to " - "partial_dependence.partial_dependence in 0.21 and will " - "be removed in 0.23.") +@deprecated("The function ensemble.partial_dependence has been deprecated " + "in favour of partial_dependence.partial_dependence in 0.21 " + "and will be removed in 0.23.") def partial_dependence(gbrt, target_variables, grid=None, X=None, percentiles=(0.05, 0.95), grid_resolution=100): """Partial dependence of ``target_variables``. 
@@ -177,7 +177,8 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None,
 
 
 @deprecated("The function ensemble.plot_partial_dependence has been "
-            "moved to partial_dependence.plot_partial_dependence in "
+            "deprecated in favour of "
+            "partial_dependence.plot_partial_dependence in "
             " 0.21 and will be removed in 0.23.")
 def plot_partial_dependence(gbrt, X, features, feature_names=None,
                             label=None, n_cols=3, grid_resolution=100,
diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py
index f0b4dab7f5975..8661f1777b562 100644
--- a/sklearn/ensemble/tests/test_partial_dependence.py
+++ b/sklearn/ensemble/tests/test_partial_dependence.py
@@ -227,7 +227,7 @@ def test_warning_raised_partial_dependence():
     grid_resolution = 25
 
     assert_warns_message(DeprecationWarning, "The function "
-                         "ensemble.partial_dependence has been moved to ",
+                         "ensemble.partial_dependence has been deprecated ",
                          partial_dependence, clf, [0], X=boston.data,
                          grid_resolution=grid_resolution)
 
@@ -241,7 +241,8 @@ def test_warning_raised_partial_dependence_plot():
     grid_resolution = 25
 
     assert_warns_message(DeprecationWarning, "The function "
-                         "ensemble.plot_partial_dependence has been moved to ",
+                         "ensemble.plot_partial_dependence has been "
+                         "deprecated",
                          plot_partial_dependence, clf, boston.data,
                          [0, 1, (0, 1)], grid_resolution=grid_resolution,
                          feature_names=boston.feature_names)
diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py
index fc950f3508919..5400b56f7db76 100644
--- a/sklearn/partial_dependence.py
+++ b/sklearn/partial_dependence.py
@@ -199,8 +199,8 @@ def partial_dependence(est, features, X, percentiles=(0.05, 0.95),
         computationally intensive.
 
         - If 'auto', then 'recursion' will be used for
-          ``BaseGradientBoosting`` estimators, and 'brute' used for other
-          estimators.
+          ``BaseGradientBoosting`` estimators with ``init=None``, and 'brute'
+          used for other estimators.
 
     Returns
     -------
@@ -240,7 +240,7 @@ def partial_dependence(est, features, X, percentiles=(0.05, 0.95),
         same values as 'brute' up to a constant offset in the target response,
         provided that ``init`` is a constant estimator (which is the default).
         However, as soon as ``init`` is not a constant estimator, the partial
-        dependence values are incorrect.
+        dependence values are incorrect for 'recursion'.
 
 
     """
@@ -260,7 +260,7 @@ def partial_dependence(est, features, X, percentiles=(0.05, 0.95),
     if method == 'auto':
-        if isinstance(est, BaseGradientBoosting):
+        if isinstance(est, BaseGradientBoosting) and est.init is None:
             method = 'recursion'
         else:
             method = 'brute'
@@ -361,8 +361,8 @@ def plot_partial_dependence(est, X, features, feature_names=None,
         computationally intensive.
 
         - If 'auto', then 'recursion' will be used for
-          ``BaseGradientBoosting`` estimators, and 'brute' used for other
-          estimators.
+          ``BaseGradientBoosting`` estimators with ``init=None``, and
+          'brute' used for other estimators.
 
         Unlike the 'brute' method, 'recursion' does not account for the
         ``init`` predictor of the boosting process. In practice this still
@@ -413,7 +413,7 @@ def plot_partial_dependence(est, X, features, feature_names=None,
         same values as 'brute' up to a constant offset in the target response,
         provided that ``init`` is a constant estimator (which is the default).
         However, as soon as ``init`` is not a constant estimator, the partial
-        dependence values are incorrect.
+        dependence values are incorrect for 'recursion'.
""" import matplotlib.pyplot as plt from matplotlib import transforms @@ -442,6 +442,8 @@ def plot_partial_dependence(est, X, features, feature_names=None, feature_names = [str(i) for i in range(n_features)] elif isinstance(feature_names, np.ndarray): feature_names = feature_names.tolist() + if len(set(feature_names)) != len(feature_names): + raise ValueError('feature_names should not contain duplicates.') def convert_feature(fx): if isinstance(fx, six.string_types): diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index d4f67368242a2..f53c3c8c3cb76 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -34,41 +34,29 @@ y = [-1, -1, -1, 1, 1, 1] -def binary_classification(): - # returns (X, y), n_targets <-- as expected in the output of partial_dep() - return make_classification(random_state=0), 1 - - -def multiclass_classification(): - # returns (X, y), n_targets <-- as expected in the output of partial_dep() - return (make_classification(n_classes=3, n_clusters_per_class=1, - random_state=0), 3) - - -def regression(): - # returns (X, y), n_targets <-- as expected in the output of partial_dep() - return make_regression(random_state=0), 1 - - -def multioutput_regression(): - # returns (X, y), n_targets <-- as expected in the output of partial_dep() - return make_regression(n_targets=2, random_state=0), 2 +# (X, y), n_targets <-- as expected in the output of partial_dep() +binary_classification_data = (make_classification(random_state=0), 1) +multiclass_classification_data = (make_classification(n_classes=3, + n_clusters_per_class=1, + random_state=0), 3) +regression_data = (make_regression(random_state=0), 1) +multioutput_regression_data = (make_regression(n_targets=2, random_state=0), 2) @pytest.mark.filterwarnings('ignore:Default solver will be changed ') # 0.22 @pytest.mark.filterwarnings('ignore:Default multi_class will be') # 0.22 @pytest.mark.parametrize('Estimator, method, data', [ - (GradientBoostingClassifier, 'recursion', binary_classification()), - (GradientBoostingClassifier, 'recursion', multiclass_classification()), - (GradientBoostingClassifier, 'brute', binary_classification()), - (GradientBoostingClassifier, 'brute', multiclass_classification()), - (GradientBoostingRegressor, 'recursion', regression()), - (GradientBoostingRegressor, 'brute', regression()), - (LinearRegression, 'brute', regression()), - (LinearRegression, 'brute', multioutput_regression()), - (LogisticRegression, 'brute', binary_classification()), - (LogisticRegression, 'brute', multiclass_classification()), - (MultiTaskLasso, 'brute', multioutput_regression()), + (GradientBoostingClassifier, 'recursion', binary_classification_data), + (GradientBoostingClassifier, 'recursion', multiclass_classification_data), + (GradientBoostingClassifier, 'brute', binary_classification_data), + (GradientBoostingClassifier, 'brute', multiclass_classification_data), + (GradientBoostingRegressor, 'recursion', regression_data), + (GradientBoostingRegressor, 'brute', regression_data), + (LinearRegression, 'brute', regression_data), + (LinearRegression, 'brute', multioutput_regression_data), + (LogisticRegression, 'brute', binary_classification_data), + (LogisticRegression, 'brute', multiclass_classification_data), + (MultiTaskLasso, 'brute', multioutput_regression_data), ]) @pytest.mark.parametrize('grid_resolution', (5, 10)) @pytest.mark.parametrize('features', ([1], [1, 2])) @@ -84,7 +72,7 @@ def test_output_shape(Estimator, 
method, data, grid_resolution, # n_target corresponds to the number of classes (1 for binary classif) or # the number of tasks / outputs in multi task settings. It's equal to 1 for - # classical regression. + # classical regression_data. (X, y), n_targets = data est.fit(X, y) @@ -204,7 +192,7 @@ def test_partial_dependence_easy_target(est, power): # If the target y only depends on one feature in an obvious way (linear or # quadratic) then the partial dependence for that feature should reflect # it. - # We here fit a linear regression model (with polynomial features if + # We here fit a linear regression_data model (with polynomial features if # needed) and compute r_squared to check that the partial dependence # correctly reflects the target. @@ -360,7 +348,7 @@ def test_plot_partial_dependence_multiclass(): @if_matplotlib def test_plot_partial_dependence_multioutput(): # Test partial dependence plot function on multi-output input. - (X, y), _ = multioutput_regression() + (X, y), _ = multioutput_regression_data clf = LinearRegression() clf.fit(X, y) @@ -390,7 +378,7 @@ def test_plot_partial_dependence_input(): gbc.fit(X, y) # check target param for multiclass - (X_m, y_m), _ = multiclass_classification() + (X_m, y_m), _ = multiclass_classification_data lr_m = LogisticRegression() lr_m.fit(X_m, y_m) assert_raises_regex(ValueError, @@ -404,7 +392,7 @@ def test_plot_partial_dependence_input(): target=target) # check target param for multioutput - (X_m, y_m), _ = multioutput_regression() + (X_m, y_m), _ = multioutput_regression_data lr_m = LinearRegression() lr_m.fit(X_m, y_m) assert_raises_regex(ValueError, @@ -436,6 +424,12 @@ def test_plot_partial_dependence_input(): features=[123], feature_names=['blah']) + assert_raises_regex(ValueError, + 'feature_names should not contain duplicates', + plot_partial_dependence, lr, X, + features=[0, 1, 2], + feature_names=['a', 'b', 'a']) + @pytest.mark.skip('Passing non-constant init fails. Wait for PR #12436 ' 'to be merged to un-skip this test') @@ -467,7 +461,7 @@ def test_plot_partial_dependence_fig(): import matplotlib.pyplot as plt - (X, y), _ = regression() + (X, y), _ = regression_data clf = LinearRegression() clf.fit(X, y) From 69b95e925425e49c09d0f65e3aa8a31e415c9001 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 15 Jan 2019 08:58:46 -0500 Subject: [PATCH 071/113] Addressed comments from Joel --- examples/plot_partial_dependence.py | 18 ++++++++++-------- sklearn/partial_dependence.py | 6 +++--- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/examples/plot_partial_dependence.py b/examples/plot_partial_dependence.py index b2f8780552475..a88cc00e87920 100644 --- a/examples/plot_partial_dependence.py +++ b/examples/plot_partial_dependence.py @@ -56,7 +56,6 @@ .. [2] For classification you can think of it as the regression score before the link function. 
""" -from __future__ import print_function print(__doc__) import numpy as np @@ -64,10 +63,10 @@ from mpl_toolkits.mplot3d import Axes3D +from sklearn.partial_dependence import partial_dependence +from sklearn.partial_dependence import plot_partial_dependence from sklearn.ensemble import GradientBoostingRegressor from sklearn.neural_network import MLPRegressor -from sklearn.partial_dependence import plot_partial_dependence -from sklearn.partial_dependence import partial_dependence from sklearn.datasets.california_housing import fetch_california_housing @@ -77,22 +76,25 @@ def main(): X, y = cal_housing.data, cal_housing.target names = cal_housing.feature_names - # Center target to avoid GBDT init bias: GBDT with 'recursion' method does - # not account for the initial estimator (here the average target) + # Center target to avoid gradient boosting init bias: gradient boosting + # with the 'recursion' method does not account for the initial estimator + # (here the average target, by default) y -= y.mean() print("Training MLPRegressor...") est = MLPRegressor(activation='logistic') est.fit(X, y) print('Computing partial dependence plots...') + # We don't compute the 2-way PDP (5, 1) here, because it is a lot slower + # with the brute method. features = [0, 5, 1, 2] fig, axs = plot_partial_dependence(est, X, features, feature_names=names, n_jobs=3, grid_resolution=50) - fig.suptitle('Partial dependence of house value on nonlocation features\n' + fig.suptitle('Partial dependence of house value on non-location features\n' 'for the California housing dataset, with MLPRegressor') plt.subplots_adjust(top=0.9) # tight_layout causes overlap with suptitle - print("Training GBRT...") + print("Training GradientBoostingRegressor...") est = GradientBoostingRegressor(n_estimators=100, max_depth=4, learning_rate=0.1, loss='huber', random_state=1) @@ -101,7 +103,7 @@ def main(): features = [0, 5, 1, 2, (5, 1)] fig, axs = plot_partial_dependence(est, X, features, feature_names=names, n_jobs=3, grid_resolution=50) - fig.suptitle('Partial dependence of house value on nonlocation features\n' + fig.suptitle('Partial dependence of house value on non-location features\n' 'for the California housing dataset, with Gradient Boosting') plt.subplots_adjust(top=0.9) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 5400b56f7db76..2b21f2bafe8d7 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -323,13 +323,13 @@ def plot_partial_dependence(est, X, features, feature_names=None, X : array-like, shape=(n_samples, n_features) The data to use to build the grid of values on which the dependence will be evaluated. This is usually the training data. - features : list of ints or strings, or tuples of ints or strings + features : list of {int, str, pair of int, pair of str} The target features for which to create the PDPs. If features[i] is an int or a string, a one-way PDP is created; if features[i] is a tuple, a two-way PDP is created. Each tuple must be of size 2. if any entry is a string, then it must be in ``feature_names``. - feature_names : seq of str, shape=(n_features,) + feature_names : seq of str, shape=(n_features,), optional Name of each feature; feature_names[i] holds the name of the feature with index i. target : int, optional (default=None) @@ -340,7 +340,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, should be computed Ignored in binary classification or classical regression settings. 
n_cols : int, optional (default=3) - The number of columns in the grid plot. + The maximum number of columns in the grid plot. grid_resolution : int, optional (default=100) The number of equally spaced points on the axes of the plots, for each target feature. From 01ab87ce298892e9e98076c84308cbc6c71b12cf Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 3 Feb 2019 11:26:33 -0500 Subject: [PATCH 072/113] rm blank line --- doc/whats_new/v0.21.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 33eedd1d997c1..68e26dca3c8b1 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -285,7 +285,6 @@ Support for Python 3.4 and below has been officially dropped. method). :issue:`12599` by :user:`Trevor Stephens ` and :user:`Nicolas Hug `. - :mod:`sklearn.linear_model` ........................... From ca8c0fd9e43a17c8d4c6c37da2acfe8d3edff75a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 4 Feb 2019 10:56:32 -0500 Subject: [PATCH 073/113] moved into inspect module --- doc/model_inspection.rst | 10 ++++++++++ doc/model_selection.rst | 1 - doc/modules/classes.rst | 10 +++++----- doc/modules/partial_dependence.rst | 10 +++++----- doc/user_guide.rst | 1 + doc/whats_new/v0.21.rst | 15 +++++++------- .../{ => inspect}/plot_partial_dependence.py | 4 ++-- sklearn/__init__.py | 10 +++++----- sklearn/ensemble/partial_dependence.py | 8 ++++---- sklearn/inspect/__init__.py | 8 ++++++++ sklearn/{ => inspect}/partial_dependence.py | 20 +++++++++---------- .../tests/test_partial_dependence.py | 10 +++++----- 12 files changed, 63 insertions(+), 44 deletions(-) create mode 100644 doc/model_inspection.rst rename examples/{ => inspect}/plot_partial_dependence.py (97%) create mode 100644 sklearn/inspect/__init__.py rename sklearn/{ => inspect}/partial_dependence.py (98%) rename sklearn/{ => inspect}/tests/test_partial_dependence.py (98%) diff --git a/doc/model_inspection.rst b/doc/model_inspection.rst new file mode 100644 index 0000000000000..0bc468517fb45 --- /dev/null +++ b/doc/model_inspection.rst @@ -0,0 +1,10 @@ +.. include:: includes/big_toc_css.rst + +.. _model_inspection: + +Model inspection +---------------- + +.. toctree:: + + modules/partial_dependence diff --git a/doc/model_selection.rst b/doc/model_selection.rst index 7d559615e069f..daec6a6ed83e4 100644 --- a/doc/model_selection.rst +++ b/doc/model_selection.rst @@ -12,4 +12,3 @@ Model selection and evaluation modules/model_evaluation modules/model_persistence modules/learning_curve - modules/partial_dependence diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 90e37a1ee3fe8..e9a1630e0c9c0 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1208,10 +1208,10 @@ Model validation .. _partial_dependence_ref: -:mod:`sklearn.partial_dependence`: Partial dependence plots -=========================================================== +:mod:`sklearn.inspect`: Model inspection +======================================== -.. automodule:: sklearn.partial_dependence +.. automodule:: sklearn.inspect :no-members: :no-inherited-members: @@ -1221,8 +1221,8 @@ Model validation :toctree: generated/ :template: function.rst - partial_dependence.partial_dependence - partial_dependence.plot_partial_dependence + inspect.partial_dependence + inspect.plot_partial_dependence .. 
_preprocessing_ref: diff --git a/doc/modules/partial_dependence.rst b/doc/modules/partial_dependence.rst index 517d75d9c2390..dfd6e5a82d986 100644 --- a/doc/modules/partial_dependence.rst +++ b/doc/modules/partial_dependence.rst @@ -5,7 +5,7 @@ Partial dependence plots ======================== -.. currentmodule:: sklearn.partial_dependence +.. currentmodule:: sklearn.inspect Partial dependence plots (PDP) show the dependence between the target response and a set of 'target' features, marginalizing over the values of all other @@ -39,7 +39,7 @@ an average occupancy greater than two, the house price is nearly independent of the house age, whereas for values less than 2 there is a strong dependence on age. -The :mod:`sklearn.partial_dependence` module provides a convenience function +The :mod:`sklearn.inspect` module provides a convenience function :func:`plot_partial_dependence` to create one-way and two-way partial dependence plots. In the below example we show how to create a grid of partial dependence plots: two one-way PDPs for the features ``0`` and ``1`` @@ -47,7 +47,7 @@ and a two-way PDP between the two features:: >>> from sklearn.datasets import make_hastie_10_2 >>> from sklearn.ensemble import GradientBoostingClassifier - >>> from sklearn.partial_dependence import plot_partial_dependence + >>> from sklearn.inspect import plot_partial_dependence >>> X, y = make_hastie_10_2(random_state=0) >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, @@ -71,7 +71,7 @@ regression settings. If you need the raw values of the partial dependence function rather than the plots, you can use the :func:`partial_dependence` function:: - >>> from sklearn.partial_dependence import partial_dependence + >>> from sklearn.inspect import partial_dependence >>> pdp, axes = partial_dependence(clf, [0], X=X) >>> pdp # doctest: +ELLIPSIS @@ -110,7 +110,7 @@ which the trees were trained. .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_plot_partial_dependence.py` + * :ref:`sphx_glr_auto_examples_inspect_plot_partial_dependence.py` .. topic:: References diff --git a/doc/user_guide.rst b/doc/user_guide.rst index aae88134cd1a0..004feb0824e7d 100644 --- a/doc/user_guide.rst +++ b/doc/user_guide.rst @@ -18,6 +18,7 @@ User Guide supervised_learning.rst unsupervised_learning.rst model_selection.rst + model_inspection.rst data_transforms.rst Dataset loading utilities modules/computing.rst diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 68e26dca3c8b1..6f2af654686fb 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -89,11 +89,11 @@ Support for Python 3.4 and below has been officially dropped. - |API| :func:`ensemble.partial_dependence` and :func:`ensemble.plot_partial_dependence` are now deprecated in favor of - :func:`partial_dependence.partial_dependence` + :func:`inspect.partial_dependence` and - :func:`partial_dependence.plot_partial_dependence`. - :issue:`12599` by :user:`Trevor Stephens` and :user:`Nicolas - Hug`. + :func:`inspect.plot_partial_dependence`. + :issue:`12599` by :user:`Trevor Stephens` and + :user:`Nicolas Hug`. - |Fix| Fixed a bug in :class:`ensemble.GradientBoostingClassifier` where the gradients would be incorrectly computed in multiclass classification @@ -277,10 +277,11 @@ Support for Python 3.4 and below has been officially dropped. affects all ensemble methods using decision trees. :issue:`12344` by :user:`Adrin Jalali `. -:mod:`sklearn.partial_dependence` -................................. 
+:mod:`sklearn.inspect.partial_dependence` +......................................... + - |Feature| Partial dependence plots - (:func:`partial_dependence.plot_partial_dependence`) are now supported for + (:func:`inspect.plot_partial_dependence`) are now supported for any regressor or classifier (provided that they have a `predict_proba` method). :issue:`12599` by :user:`Trevor Stephens ` and :user:`Nicolas Hug `. diff --git a/examples/plot_partial_dependence.py b/examples/inspect/plot_partial_dependence.py similarity index 97% rename from examples/plot_partial_dependence.py rename to examples/inspect/plot_partial_dependence.py index a88cc00e87920..bee9fe21b8714 100644 --- a/examples/plot_partial_dependence.py +++ b/examples/inspect/plot_partial_dependence.py @@ -63,8 +63,8 @@ from mpl_toolkits.mplot3d import Axes3D -from sklearn.partial_dependence import partial_dependence -from sklearn.partial_dependence import plot_partial_dependence +from sklearn.inspect import partial_dependence +from sklearn.inspect import plot_partial_dependence from sklearn.ensemble import GradientBoostingRegressor from sklearn.neural_network import MLPRegressor from sklearn.datasets.california_housing import fetch_california_housing diff --git a/sklearn/__init__.py b/sklearn/__init__.py index efa73e95bc01c..76313f971703d 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -69,11 +69,11 @@ __all__ = ['calibration', 'cluster', 'covariance', 'cross_decomposition', 'datasets', 'decomposition', 'dummy', 'ensemble', 'exceptions', 'externals', 'feature_extraction', 'feature_selection', - 'gaussian_process', 'isotonic', 'kernel_approximation', - 'kernel_ridge', 'linear_model', 'manifold', 'metrics', - 'mixture', 'model_selection', 'multiclass', 'multioutput', - 'naive_bayes', 'neighbors', 'neural_network', - 'partial_dependence', 'pipeline', 'preprocessing', + 'gaussian_process', 'inspect', 'isotonic', + 'kernel_approximation', 'kernel_ridge', 'linear_model', + 'manifold', 'metrics', 'mixture', 'model_selection', + 'multiclass', 'multioutput', 'naive_bayes', 'neighbors', + 'neural_network', 'pipeline', 'preprocessing', 'random_projection', 'semi_supervised', 'svm', 'tree', 'discriminant_analysis', 'impute', 'compose', # Non-modules: diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index 91c2ff602d4d2..aafb7baedfb07 100644 --- a/sklearn/ensemble/partial_dependence.py +++ b/sklearn/ensemble/partial_dependence.py @@ -73,7 +73,7 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): @deprecated("The function ensemble.partial_dependence has been deprecated " - "in favour of partial_dependence.partial_dependence in 0.21 " + "in favour of inspect.partial_dependence in 0.21 " "and will be removed in 0.23.") def partial_dependence(gbrt, target_variables, grid=None, X=None, percentiles=(0.05, 0.95), grid_resolution=100): @@ -87,7 +87,7 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, .. deprecated:: 0.21 This function was deprecated in version 0.21 in favor of - :func:`sklearn.partial_dependence.partial_dependence` and will be + :func:`sklearn.inspect.partial_dependence` and will be removed in 0.23. 
Parameters @@ -176,7 +176,7 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, @deprecated("The function ensemble.plot_partial_dependence has been " "deprecated in favour of " - "partial_dependence.plot_partial_dependence in " + "inspect.plot_partial_dependence in " " 0.21 and will be removed in 0.23.") def plot_partial_dependence(gbrt, X, features, feature_names=None, label=None, n_cols=3, grid_resolution=100, @@ -193,7 +193,7 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, .. deprecated:: 0.21 This function was deprecated in version 0.21 in favor of - :func:`sklearn.partial_dependence.plot_partial_dependence` and will be + :func:`sklearn.inspect.plot_partial_dependence` and will be removed in 0.23. Parameters diff --git a/sklearn/inspect/__init__.py b/sklearn/inspect/__init__.py new file mode 100644 index 0000000000000..e0814894a4aa2 --- /dev/null +++ b/sklearn/inspect/__init__.py @@ -0,0 +1,8 @@ +from .partial_dependence import partial_dependence +from .partial_dependence import plot_partial_dependence + + +__all__ = [ + 'partial_dependence', + 'plot_partial_dependence', +] \ No newline at end of file diff --git a/sklearn/partial_dependence.py b/sklearn/inspect/partial_dependence.py similarity index 98% rename from sklearn/partial_dependence.py rename to sklearn/inspect/partial_dependence.py index 2b21f2bafe8d7..cd7841476ba08 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/inspect/partial_dependence.py @@ -12,16 +12,16 @@ import numpy as np from scipy.stats.mstats import mquantiles -from .base import is_classifier, is_regressor -from .utils.extmath import cartesian -from .externals.joblib import Parallel, delayed -from .externals import six -from .utils import check_array -from .utils.validation import check_is_fitted -from .tree._tree import DTYPE -from .exceptions import NotFittedError -from .ensemble.gradient_boosting import BaseGradientBoosting -from .ensemble._gradient_boosting import _partial_dependence_tree +from ..base import is_classifier, is_regressor +from ..utils.extmath import cartesian +from ..externals.joblib import Parallel, delayed +from ..externals import six +from ..utils import check_array +from ..utils.validation import check_is_fitted +from ..tree._tree import DTYPE +from ..exceptions import NotFittedError +from ..ensemble.gradient_boosting import BaseGradientBoosting +from ..ensemble._gradient_boosting import _partial_dependence_tree __all__ = ['partial_dependence', 'plot_partial_dependence'] diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/inspect/tests/test_partial_dependence.py similarity index 98% rename from sklearn/tests/test_partial_dependence.py rename to sklearn/inspect/tests/test_partial_dependence.py index f53c3c8c3cb76..a4e3a23f5b154 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/inspect/tests/test_partial_dependence.py @@ -10,11 +10,11 @@ from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import if_matplotlib -from sklearn.partial_dependence import partial_dependence -from sklearn.partial_dependence import plot_partial_dependence -from sklearn.partial_dependence import _grid_from_X -from sklearn.partial_dependence import _partial_dependence_brute -from sklearn.partial_dependence import _partial_dependence_recursion +from sklearn.inspect import partial_dependence +from sklearn.inspect import plot_partial_dependence +from sklearn.inspect.partial_dependence import _grid_from_X +from 
sklearn.inspect.partial_dependence import _partial_dependence_brute +from sklearn.inspect.partial_dependence import _partial_dependence_recursion from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor from sklearn.linear_model import LinearRegression From d68ebc9b87c1caabd856896eb9dc9ec4066c05f7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 4 Feb 2019 11:21:12 -0500 Subject: [PATCH 074/113] added sklearn/inspect/tests/__init__.py --- sklearn/inspect/tests/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 sklearn/inspect/tests/__init__.py diff --git a/sklearn/inspect/tests/__init__.py b/sklearn/inspect/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d From 5d095845f3cd9c4d7e5fff71898baa3701c5d65b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 12 Feb 2019 13:25:56 -0500 Subject: [PATCH 075/113] Hopefully fixes windows issue? --- sklearn/ensemble/partial_dependence.py | 6 ++++++ sklearn/inspect/__init__.py | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index aafb7baedfb07..d4194981b7ec3 100644 --- a/sklearn/ensemble/partial_dependence.py +++ b/sklearn/ensemble/partial_dependence.py @@ -24,6 +24,12 @@ from .gradient_boosting import BaseGradientBoosting +__all__ = [ + 'partial_dependence', + 'plot_partial_dependence', +] + + def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): """Generate a grid of points based on the ``percentiles of ``X``. diff --git a/sklearn/inspect/__init__.py b/sklearn/inspect/__init__.py index e0814894a4aa2..b5f0f2691cece 100644 --- a/sklearn/inspect/__init__.py +++ b/sklearn/inspect/__init__.py @@ -1,3 +1,4 @@ +"""The :mod:`sklearn.inspect` module includes tools for model inspection.""" from .partial_dependence import partial_dependence from .partial_dependence import plot_partial_dependence @@ -5,4 +6,4 @@ __all__ = [ 'partial_dependence', 'plot_partial_dependence', -] \ No newline at end of file +] From 592a5895c01f4ae63a4a3ae6779a68b82b8b7b7a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 13 Feb 2019 11:47:11 -0500 Subject: [PATCH 076/113] Using add_subpackage --- sklearn/setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/setup.py b/sklearn/setup.py index a20d7e4e3fe22..5f5e50233a6e8 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -32,6 +32,8 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('feature_selection/tests') config.add_subpackage('gaussian_process') config.add_subpackage('gaussian_process/tests') + config.add_subpackage('inspect') + config.add_subpackage('inspect/tests') config.add_subpackage('mixture') config.add_subpackage('mixture/tests') config.add_subpackage('model_selection') From 38c1c549ab36aa4c045a6e36571adff37b0c7250 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 26 Feb 2019 09:46:46 -0500 Subject: [PATCH 077/113] wording --- sklearn/inspect/partial_dependence.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/inspect/partial_dependence.py b/sklearn/inspect/partial_dependence.py index cd7841476ba08..577563a16426c 100644 --- a/sklearn/inspect/partial_dependence.py +++ b/sklearn/inspect/partial_dependence.py @@ -200,7 +200,7 @@ def partial_dependence(est, features, X, percentiles=(0.05, 0.95), - If 'auto', then 'recursion' will be used for ``BaseGradientBoosting`` estimators with ``init=None``, and 'brute' - used 
for other estimators.
+          for all others.
@@ -362,7 +362,7 @@ def plot_partial_dependence(est, X, features, feature_names=None,
 
         - If 'auto', then 'recursion' will be used for
           ``BaseGradientBoosting`` estimators with ``init=None``, and
-          'brute' used for other estimators.
+          'brute' for all others.
 
         Unlike the 'brute' method, 'recursion' does not account for the
         ``init`` predictor of the boosting process. In practice this still

From 54ece6b2a41e4b0b0e04bfca6f0469bf3bdba6a4 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Wed, 27 Feb 2019 11:00:34 -0500
Subject: [PATCH 078/113] Added response parameter

---
 doc/modules/partial_dependence.rst           | 16 +--
 sklearn/inspect/partial_dependence.py        | 98 +++++++++++++++----
 .../inspect/tests/test_partial_dependence.py | 65 ++++++++++--
 3 files changed, 146 insertions(+), 33 deletions(-)

diff --git a/doc/modules/partial_dependence.rst b/doc/modules/partial_dependence.rst
index dfd6e5a82d986..dde1cd04d85a3 100644
--- a/doc/modules/partial_dependence.rst
+++ b/doc/modules/partial_dependence.rst
@@ -7,11 +7,11 @@ Partial dependence plots
 
 .. currentmodule:: sklearn.inspect
 
-Partial dependence plots (PDP) show the dependence between the target response
-and a set of 'target' features, marginalizing over the values of all other
-features (the 'complement' features). Intuitively, we can interpret the
-partial dependence as the expected target response [1]_ as a function of the
-'target' features.
+Partial dependence plots (PDP) show the dependence between the target
+response [1]_ and a set of 'target' features, marginalizing over the values
+of all other features (the 'complement' features). Intuitively, we can
+interpret the partial dependence as the expected target response as a
+function of the 'target' features.
 
 Due to the limits of human perception the size of the target feature set
 must be small (usually, one or two) thus the target features are usually
@@ -104,9 +104,9 @@ which the trees were trained.
 
 .. rubric:: Footnotes
 
-.. [1] For classification, the target response is the probability of a class.
-       In particular for binary classification, this is the probability of the
-       positive class.
+.. [1] For classification, the target response may be the probability of a
+       class (the positive class for binary classification), or the decision
+       function.
 
 .. topic:: Examples:
 
diff --git a/sklearn/inspect/partial_dependence.py b/sklearn/inspect/partial_dependence.py
index 577563a16426c..f4d34f81319b5 100644
--- a/sklearn/inspect/partial_dependence.py
+++ b/sklearn/inspect/partial_dependence.py
@@ -116,16 +116,40 @@ def _partial_dependence_recursion(est, grid, features):
     return averaged_predictions
 
 
-def _partial_dependence_brute(est, grid, features, X):
+def _partial_dependence_brute(est, grid, features, X, response):
     averaged_predictions = []
+
+    # define the prediction_method (predict, predict_proba, decision_function).
+    if is_regressor(est):
+        prediction_method = est.predict
+    else:
+        predict_proba = getattr(est, 'predict_proba', None)
+        decision_function = getattr(est, 'decision_function', None)
+        if response == 'auto':
+            # try predict_proba, then decision_function if it doesn't exist
+            prediction_method = predict_proba or decision_function
+        else:
+            prediction_method = (predict_proba if response == 'proba'
+                                 else decision_function)
+        if prediction_method is None:
+            if response == 'auto':
+                raise ValueError(
+                    'The estimator has no predict_proba and no '
+                    'decision_function method.'
+ ) + elif response == 'proba': + raise ValueError('The estimator has no predict_proba method.') + else: + raise ValueError( + 'The estimator has no decision_function method.') + for new_values in grid: X_eval = X.copy() for i, variable in enumerate(features): X_eval[:, variable] = new_values[i] try: - predictions = (est.predict(X_eval) if is_regressor(est) - else est.predict_proba(X_eval)) + predictions = prediction_method(X_eval) except NotFittedError: raise ValueError('est parameter must be a fitted estimator') @@ -157,8 +181,9 @@ def _partial_dependence_brute(est, grid, features, X): return averaged_predictions -def partial_dependence(est, features, X, percentiles=(0.05, 0.95), - grid_resolution=100, method='auto'): +def partial_dependence(est, features, X, response='auto', + percentiles=(0.05, 0.95), grid_resolution=100, + method='auto'): """Partial dependence of ``features``. Partial dependence of a feature (or a set of features) corresponds to @@ -179,6 +204,14 @@ def partial_dependence(est, features, X, percentiles=(0.05, 0.95), ``X`` is used both to generate a grid of values for the ``features``, and to compute the averaged predictions when method is 'brute'. + response : 'auto', 'proba' or 'decision', optional (default='auto') : + Specifies whether to use ``est.predict_proba()`` or + ``est.decision_function()`` as the target response. For regressors + this parameter is ignored and the response is always the output of + ``est.predict()``. By default, ``predict_proba()`` is tried first + and we revert to ``decision_function()`` if it doesn't exist. If + ``method`` is 'recursion', the response is always the output of + ``decision_function()`. percentiles : tuple of float, optional (default=(0.05, 0.95)) The lower and upper percentile used to create the extreme values for the grid. Must be in [0, 1]. @@ -193,7 +226,9 @@ def partial_dependence(est, features, X, percentiles=(0.05, 0.95), With this method, ``X`` is only used to build the grid. This method does not account for the ``init`` predicor of the boosting process, which may lead to incorrect values (see - :ref:`this warning`). + :ref:`this warning`). With this + method, the target response of a classifier is always the decision + function, not the predicted probabilities. - 'brute' is supported for any estimator, but is more computationally intensive. @@ -253,10 +288,19 @@ def partial_dependence(est, features, X, percentiles=(0.05, 0.95), X = check_array(X) + accepted_responses = ('auto', 'proba', 'decision') + if response not in accepted_responses: + raise ValueError( + 'response {} is invalid. Accepted response names are {}.'.format( + response, ', '.join(accepted_responses))) + + if is_regressor(est) and response != 'auto': + warnings.warn("The response parameter is ignored for regressors.", + UserWarning) accepted_methods = ('brute', 'recursion', 'auto') if method not in accepted_methods: raise ValueError( - 'method {} is invalid. Accepted method names are {}, auto.'.format( + 'method {} is invalid. Accepted method names are {}.'.format( method, ', '.join(accepted_methods))) if method == 'auto': @@ -270,14 +314,19 @@ def partial_dependence(est, features, X, percentiles=(0.05, 0.95), raise ValueError( 'est must be an instance of BaseGradientBoosting ' 'for the "recursion" method. Try using method="brute".') + if response == 'auto': + response = 'decision' + + if response != 'decision': + raise ValueError( + "With the 'recursion' method, the response must be 'decision'." 
+ "Got {}.".format(response) + ) check_is_fitted(est, 'estimators_', msg='est parameter must be a fitted estimator') # Note: if method is brute, this check is done at prediction time n_features = est.n_features_ else: - if is_classifier(est) and not hasattr(est, 'predict_proba'): - raise ValueError('est requires a predict_proba() method for ' - 'method="brute" for classification.') n_features = X.shape[1] features = np.asarray(features, dtype=np.int32, order='C').ravel() @@ -289,7 +338,7 @@ def partial_dependence(est, features, X, percentiles=(0.05, 0.95), grid_resolution) if method == 'brute': averaged_predictions = _partial_dependence_brute(est, grid, - features, X) + features, X, response) else: averaged_predictions = _partial_dependence_recursion(est, grid, features) @@ -303,10 +352,10 @@ def partial_dependence(est, features, X, percentiles=(0.05, 0.95), def plot_partial_dependence(est, X, features, feature_names=None, - target=None, n_cols=3, grid_resolution=100, - percentiles=(0.05, 0.95), method='auto', - n_jobs=1, verbose=0, fig=None, line_kw=None, - contour_kw=None, **fig_kw): + target=None, response='auto', n_cols=3, + grid_resolution=100, percentiles=(0.05, 0.95), + method='auto', n_jobs=1, verbose=0, fig=None, + line_kw=None, contour_kw=None, **fig_kw): """Partial dependence plots. The ``len(features)`` plots are arranged in a grid with ``n_cols`` @@ -318,8 +367,8 @@ def plot_partial_dependence(est, X, features, feature_names=None, ---------- est : BaseEstimator A fitted classification or regression model. Classifiers must have a - ``predict_proba()`` method. Multioutput-multiclass estimators aren't - supported. + ``predict_proba()`` or ``decision_function`` method. + Multioutput-multiclass estimators aren't supported. X : array-like, shape=(n_samples, n_features) The data to use to build the grid of values on which the dependence will be evaluated. This is usually the training data. @@ -339,6 +388,14 @@ def plot_partial_dependence(est, X, features, feature_names=None, - In a multioutput setting, specifies the task for which the PDPs should be computed Ignored in binary classification or classical regression settings. + response : 'auto', 'proba' or 'decision', optional (default='auto') : + Specifies whether to use ``est.predict_proba()`` or + ``est.decision_function()`` as the target response. For regressors + this parameter is ignored and the response is always the output of + ``est.predict()``. By default, ``predict_proba()`` is tried first + and we revert to ``decision_function()`` if it doesn't exist. If + ``method`` is 'recursion', the response is always the output of + ``decision_function()`. n_cols : int, optional (default=3) The maximum number of columns in the grid plot. grid_resolution : int, optional (default=100) @@ -355,7 +412,9 @@ def plot_partial_dependence(est, X, features, feature_names=None, With this method, ``X`` is optional and is only used to build the grid. This method does not account for the ``init`` predicor of the boosting process, which may lead to incorrect values (see - :ref:`this warning`). + :ref:`this warning`). With this + method, the target response of a classifier is always the decision + function, not the predicted probabilities. - 'brute' is supported for any estimator, but is more computationally intensive. 
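
The 'brute' strategy wired up in this patch boils down to: clamp the target
feature at each grid value, predict with the chosen response method, and
average over ``X``. A minimal self-contained sketch of the idea (the toy data
and all names below are illustrative only, not part of the patch)::

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(random_state=0)
    clf = LogisticRegression().fit(X, y)

    feature = 0
    grid = np.linspace(X[:, feature].min(), X[:, feature].max(), num=5)
    averaged = []
    for value in grid:
        X_eval = X.copy()
        X_eval[:, feature] = value  # clamp the target feature at this value
        # average the predicted probability of the positive class, i.e. what
        # response='proba' would select for a classifier with predict_proba
        averaged.append(clf.predict_proba(X_eval)[:, 1].mean())
    # 'averaged' now approximates the partial dependence of the positive
    # class on feature 0
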
@@ -486,7 +545,8 @@ def convert_feature(fx): # compute averaged predictions pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(partial_dependence)(est, fxs, X=X, method=method, + delayed(partial_dependence)(est, fxs, X=X, response=response, + method=method, grid_resolution=grid_resolution, percentiles=percentiles) for fxs in features) diff --git a/sklearn/inspect/tests/test_partial_dependence.py b/sklearn/inspect/tests/test_partial_dependence.py index a4e3a23f5b154..f24370be72230 100644 --- a/sklearn/inspect/tests/test_partial_dependence.py +++ b/sklearn/inspect/tests/test_partial_dependence.py @@ -20,13 +20,13 @@ from sklearn.linear_model import LinearRegression from sklearn.linear_model import LogisticRegression from sklearn.linear_model import MultiTaskLasso -from sklearn.svm import SVC from sklearn.datasets import load_boston, load_iris from sklearn.datasets import make_classification, make_regression from sklearn.cluster import KMeans from sklearn.metrics import r2_score from sklearn.preprocessing import PolynomialFeatures from sklearn.dummy import DummyClassifier +from sklearn.base import BaseEstimator, ClassifierMixin # toy sample @@ -149,7 +149,9 @@ def test_grid_from_X(): @pytest.mark.parametrize('est, method', [(LinearRegression(), 'brute'), (GradientBoostingRegressor(random_state=0), - 'recursion')]) + 'recursion'), + (GradientBoostingRegressor(random_state=0), + 'brute')]) def test_partial_dependence_helpers(est, method, target_feature): # Check that what is returned by _partial_dependence_brute or # _partial_dependece_recursion is equivalent to manually setting a target @@ -171,7 +173,8 @@ def test_partial_dependence_helpers(est, method, target_feature): [123]]) if method == 'brute': - pdp = _partial_dependence_brute(est, grid, features, X) + pdp = _partial_dependence_brute(est, grid, features, X, + response='auto') else: pdp = _partial_dependence_recursion(est, grid, features) @@ -185,6 +188,26 @@ def test_partial_dependence_helpers(est, method, target_feature): assert_array_almost_equal(pdp, mean_predictions, decimal=3) +@pytest.mark.parametrize('target_feature', (0, 1, 2, 3, 4, 5)) +def test_recursion_decision_function(target_feature): + # Make sure the recursion method (implicitely uses decision_function) has + # the same result as using brute method with response=decision + + X, y = make_classification(n_classes=2, n_clusters_per_class=1, + random_state=1) + assert np.mean(y) == .5 # make sure the init estimator predicts 0 anyway + + est = GradientBoostingClassifier(random_state=0, loss='deviance') + est.fit(X, y) + + preds_1, _ = partial_dependence(est, target_feature, X, + response='decision', method='recursion') + preds_2, _ = partial_dependence(est, target_feature, X, + response='decision', method='brute') + + assert_array_almost_equal(preds_1, preds_2, decimal=5) + + @pytest.mark.parametrize('est', (LinearRegression(), GradientBoostingRegressor(random_state=0))) @pytest.mark.parametrize('power', (1, 2)) @@ -256,6 +279,39 @@ def test_partial_dependence_input(): "est must be a fitted regressor or classifier", partial_dependence, KMeans(), [0], X) + assert_warns_message( + UserWarning, + 'The response parameter is ignored for regressors', + partial_dependence, lr, [0], X, response='proba') + + assert_raises_regex( + ValueError, + "With the 'recursion' method, the response must be 'decision'.", + partial_dependence, gbc, [0], X, response='proba', method='recursion') + + assert_raises_regex(ValueError, + "response blahblah is invalid. 
Accepted response", + partial_dependence, gbc, [0], X, response='blahblah') + + class NoPredictProbaNoDecisionFunction(BaseEstimator, ClassifierMixin): + pass + bad_clf = NoPredictProbaNoDecisionFunction() + + assert_raises_regex( + ValueError, + 'The estimator has no predict_proba and no decision_function method.', + partial_dependence, bad_clf, [0], X, response='auto') + + assert_raises_regex( + ValueError, + 'The estimator has no predict_proba method.', + partial_dependence, bad_clf, [0], X, response='proba') + + assert_raises_regex( + ValueError, + 'The estimator has no decision_function method.', + partial_dependence, bad_clf, [0], X, response='decision') + assert_raises_regex(ValueError, "method blahblah is invalid. Accepted method names " "are brute, recursion, auto.", @@ -266,9 +322,6 @@ def test_partial_dependence_input(): 'for the "recursion" method', partial_dependence, lr, [0], X, method='recursion') - assert_raises_regex(ValueError, "est requires a predict_proba()", - partial_dependence, SVC(), [0], X) - for feature in (-1, 1000000): for est in (lr, gbc): assert_raises_regex(ValueError, From aaeab443f8a05435ba101bbf44277d77a2d45ac2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 28 Feb 2019 03:39:01 -0500 Subject: [PATCH 079/113] indent --- sklearn/inspect/partial_dependence.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/inspect/partial_dependence.py b/sklearn/inspect/partial_dependence.py index f4d34f81319b5..0ca8e3caa7f1d 100644 --- a/sklearn/inspect/partial_dependence.py +++ b/sklearn/inspect/partial_dependence.py @@ -562,9 +562,8 @@ def convert_feature(fx): raise ValueError( 'target must be specified for multi-output regressors') if not 0 <= target <= pd.shape[0]: - raise ValueError( - 'target must be in [0, n_tasks], got {}.'.format( - target)) + raise ValueError( + 'target must be in [0, n_tasks], got {}.'.format( target)) target_idx = target else: target_idx = 0 From 28b936a3069d8f78e374fb76af84e6795356599f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 28 Feb 2019 05:34:23 -0500 Subject: [PATCH 080/113] pep8 --- sklearn/inspect/partial_dependence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/inspect/partial_dependence.py b/sklearn/inspect/partial_dependence.py index 0ca8e3caa7f1d..67ba9c192550f 100644 --- a/sklearn/inspect/partial_dependence.py +++ b/sklearn/inspect/partial_dependence.py @@ -563,7 +563,7 @@ def convert_feature(fx): 'target must be specified for multi-output regressors') if not 0 <= target <= pd.shape[0]: raise ValueError( - 'target must be in [0, n_tasks], got {}.'.format( target)) + 'target must be in [0, n_tasks], got {}.'.format(target)) target_idx = target else: target_idx = 0 From 2c65b03e33c4aef41f15f235260ffa09fa831a35 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 1 Mar 2019 04:14:00 -0500 Subject: [PATCH 081/113] changed proba into predict_proba, decision into decision_function and response into response_method --- sklearn/inspect/partial_dependence.py | 54 ++++++++++--------- .../inspect/tests/test_partial_dependence.py | 33 +++++++----- 2 files changed, 50 insertions(+), 37 deletions(-) diff --git a/sklearn/inspect/partial_dependence.py b/sklearn/inspect/partial_dependence.py index 67ba9c192550f..f5fcbd763d716 100644 --- a/sklearn/inspect/partial_dependence.py +++ b/sklearn/inspect/partial_dependence.py @@ -116,7 +116,7 @@ def _partial_dependence_recursion(est, grid, features): return averaged_predictions -def _partial_dependence_brute(est, grid, features, 
X, response): +def _partial_dependence_brute(est, grid, features, X, response_method): averaged_predictions = [] # define the prediction_method (predict, predict_proba, decision_function). @@ -125,19 +125,19 @@ def _partial_dependence_brute(est, grid, features, X, response): else: predict_proba = getattr(est, 'predict_proba', None) decision_function = getattr(est, 'decision_function', None) - if response == 'auto': + if response_method == 'auto': # try predict_proba, then decision_function if it doesn't exist prediction_method = predict_proba or decision_function else: - prediction_method = (predict_proba if response == 'predict' - else decision_function) + prediction_method = (predict_proba if response_method == + 'predict_proba' else decision_function) if prediction_method is None: - if response == 'auto': + if response_method == 'auto': raise ValueError( 'The estimator has no predict_proba and no ' 'decision_function method.' ) - elif response == 'proba': + elif response_method == 'predict_proba': raise ValueError('The estimator has no predict_proba method.') else: raise ValueError( @@ -181,7 +181,7 @@ def _partial_dependence_brute(est, grid, features, X, response): return averaged_predictions -def partial_dependence(est, features, X, response='auto', +def partial_dependence(est, features, X, response_method='auto', percentiles=(0.05, 0.95), grid_resolution=100, method='auto'): """Partial dependence of ``features``. @@ -204,7 +204,8 @@ def partial_dependence(est, features, X, response='auto', ``X`` is used both to generate a grid of values for the ``features``, and to compute the averaged predictions when method is 'brute'. - response : 'auto', 'proba' or 'decision', optional (default='auto') : + response_method : 'auto', 'predict_proba' or 'decision_function', \ + optional (default='auto') : Specifies whether to use ``est.predict_proba()`` or ``est.decision_function()`` as the target response. For regressors this parameter is ignored and the response is always the output of @@ -288,15 +289,17 @@ def partial_dependence(est, features, X, response='auto', X = check_array(X) - accepted_responses = ('auto', 'proba', 'decision') - if response not in accepted_responses: + accepted_responses = ('auto', 'predict_proba', 'decision_function') + if response_method not in accepted_responses: raise ValueError( - 'response {} is invalid. Accepted response names are {}.'.format( - response, ', '.join(accepted_responses))) + 'response_method {} is invalid. Accepted response_method names ' + 'are {}.'.format(response_method, ', '.join(accepted_responses))) - if is_regressor(est) and response != 'auto': - warnings.warn("The response parameter is ignored for regressors.", - UserWarning) + if is_regressor(est) and response_method != 'auto': + warnings.warn( + "The response_method parameter is ignored for regressors.", + UserWarning + ) accepted_methods = ('brute', 'recursion', 'auto') if method not in accepted_methods: raise ValueError( @@ -314,13 +317,13 @@ def partial_dependence(est, features, X, response='auto', raise ValueError( 'est must be an instance of BaseGradientBoosting ' 'for the "recursion" method. Try using method="brute".') - if response == 'auto': - response = 'decision' + if response_method == 'auto': + response_method = 'decision_function' - if response != 'decision': + if response_method != 'decision_function': raise ValueError( - "With the 'recursion' method, the response must be 'decision'." 
- "Got {}.".format(response) + "With the 'recursion' method, the response_method must be " + "'decision_function'. Got {}.".format(response_method) ) check_is_fitted(est, 'estimators_', msg='est parameter must be a fitted estimator') @@ -338,7 +341,8 @@ def partial_dependence(est, features, X, response='auto', grid_resolution) if method == 'brute': averaged_predictions = _partial_dependence_brute(est, grid, - features, X, response) + features, X, + response_method) else: averaged_predictions = _partial_dependence_recursion(est, grid, features) @@ -352,7 +356,7 @@ def partial_dependence(est, features, X, response='auto', def plot_partial_dependence(est, X, features, feature_names=None, - target=None, response='auto', n_cols=3, + target=None, response_method='auto', n_cols=3, grid_resolution=100, percentiles=(0.05, 0.95), method='auto', n_jobs=1, verbose=0, fig=None, line_kw=None, contour_kw=None, **fig_kw): @@ -388,7 +392,8 @@ def plot_partial_dependence(est, X, features, feature_names=None, - In a multioutput setting, specifies the task for which the PDPs should be computed Ignored in binary classification or classical regression settings. - response : 'auto', 'proba' or 'decision', optional (default='auto') : + response_method : 'auto', 'predict_proba' or 'decision_function', \ + optional (default='auto') : Specifies whether to use ``est.predict_proba()`` or ``est.decision_function()`` as the target response. For regressors this parameter is ignored and the response is always the output of @@ -545,7 +550,8 @@ def convert_feature(fx): # compute averaged predictions pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(partial_dependence)(est, fxs, X=X, response=response, + delayed(partial_dependence)(est, fxs, X=X, + response_method=response_method, method=method, grid_resolution=grid_resolution, percentiles=percentiles) diff --git a/sklearn/inspect/tests/test_partial_dependence.py b/sklearn/inspect/tests/test_partial_dependence.py index f24370be72230..3568249514d2c 100644 --- a/sklearn/inspect/tests/test_partial_dependence.py +++ b/sklearn/inspect/tests/test_partial_dependence.py @@ -174,7 +174,7 @@ def test_partial_dependence_helpers(est, method, target_feature): if method == 'brute': pdp = _partial_dependence_brute(est, grid, features, X, - response='auto') + response_method='auto') else: pdp = _partial_dependence_recursion(est, grid, features) @@ -191,7 +191,7 @@ def test_partial_dependence_helpers(est, method, target_feature): @pytest.mark.parametrize('target_feature', (0, 1, 2, 3, 4, 5)) def test_recursion_decision_function(target_feature): # Make sure the recursion method (implicitely uses decision_function) has - # the same result as using brute method with response=decision + # the same result as using brute method with response_method=decision X, y = make_classification(n_classes=2, n_clusters_per_class=1, random_state=1) @@ -201,9 +201,11 @@ def test_recursion_decision_function(target_feature): est.fit(X, y) preds_1, _ = partial_dependence(est, target_feature, X, - response='decision', method='recursion') + response_method='decision_function', + method='recursion') preds_2, _ = partial_dependence(est, target_feature, X, - response='decision', method='brute') + response_method='decision_function', + method='brute') assert_array_almost_equal(preds_1, preds_2, decimal=5) @@ -281,17 +283,21 @@ def test_partial_dependence_input(): assert_warns_message( UserWarning, - 'The response parameter is ignored for regressors', - partial_dependence, lr, [0], X, response='proba') + 
'The response_method parameter is ignored for regressors', + partial_dependence, lr, [0], X, response_method='predict_proba') assert_raises_regex( ValueError, - "With the 'recursion' method, the response must be 'decision'.", - partial_dependence, gbc, [0], X, response='proba', method='recursion') + "With the 'recursion' method, the response_method must be " + "'decision_function'.", + partial_dependence, gbc, [0], X, response_method='predict_proba', + method='recursion') assert_raises_regex(ValueError, - "response blahblah is invalid. Accepted response", - partial_dependence, gbc, [0], X, response='blahblah') + "response_method blahblah is invalid. " + "Accepted response", + partial_dependence, gbc, [0], X, + response_method='blahblah') class NoPredictProbaNoDecisionFunction(BaseEstimator, ClassifierMixin): pass @@ -300,17 +306,18 @@ class NoPredictProbaNoDecisionFunction(BaseEstimator, ClassifierMixin): assert_raises_regex( ValueError, 'The estimator has no predict_proba and no decision_function method.', - partial_dependence, bad_clf, [0], X, response='auto') + partial_dependence, bad_clf, [0], X, response_method='auto') assert_raises_regex( ValueError, 'The estimator has no predict_proba method.', - partial_dependence, bad_clf, [0], X, response='proba') + partial_dependence, bad_clf, [0], X, response_method='predict_proba') assert_raises_regex( ValueError, 'The estimator has no decision_function method.', - partial_dependence, bad_clf, [0], X, response='decision') + partial_dependence, bad_clf, [0], X, + response_method='decision_function') assert_raises_regex(ValueError, "method blahblah is invalid. Accepted method names " From 2bc0f0fed453cd8e5b95416d103a4120e9adcb15 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 12 Mar 2019 06:34:01 -0400 Subject: [PATCH 082/113] Updated references --- doc/modules/partial_dependence.rst | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/doc/modules/partial_dependence.rst b/doc/modules/partial_dependence.rst index dde1cd04d85a3..eb48a14c79b99 100644 --- a/doc/modules/partial_dependence.rst +++ b/doc/modules/partial_dependence.rst @@ -114,11 +114,9 @@ which the trees were trained. .. topic:: References - .. [F2001] J. Friedman, "Greedy Function Approximation: A Gradient Boosting Machine", - The Annals of Statistics, Vol. 29, No. 5, 2001. + .. [HTF2009] T. Hastie, R. Tibshirani and J. Friedman, `The Elements of + Statistical Learning `_, + Second Edition, Section 10.13.2, Springer, 2009. - .. [F1999] J. Friedman, "Stochastic Gradient Boosting", 1999 - - .. [HTF2009] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical Learning Ed. 2", Springer, 2009. - - .. [R2007] G. Ridgeway, "Generalized Boosted Models: A guide to the gbm package", 2007 + .. [Mol2019] C. Molnar, `Interpretable Machine Learning + `_, Section 5.1, 2019. From 600cf7ed7bbb9ff96f35a79b7f9ee525acccf371 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 12 Mar 2019 06:49:26 -0400 Subject: [PATCH 083/113] link to glossary terms --- sklearn/inspect/partial_dependence.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/sklearn/inspect/partial_dependence.py b/sklearn/inspect/partial_dependence.py index f5fcbd763d716..13f85a560b46d 100644 --- a/sklearn/inspect/partial_dependence.py +++ b/sklearn/inspect/partial_dependence.py @@ -206,13 +206,13 @@ def partial_dependence(est, features, X, response_method='auto', method is 'brute'. 
response_method : 'auto', 'predict_proba' or 'decision_function', \
             optional (default='auto') :
-        Specifies whether to use ``est.predict_proba()`` or
-        ``est.decision_function()`` as the target response. For regressors
+        Specifies whether to use :term:`predict_proba` or
+        :term:`decision_function` as the target response. For regressors
         this parameter is ignored and the response is always the output of
-        ``est.predict()``. By default, ``predict_proba()`` is tried first
-        and we revert to ``decision_function()`` if it doesn't exist. If
+        :term:`predict`. By default, :term:`predict_proba` is tried first
+        and we revert to :term:`decision_function` if it doesn't exist. If
         ``method`` is 'recursion', the response is always the output of
-        ``decision_function()`.
+        :term:`decision_function`.
     percentiles : tuple of float, optional (default=(0.05, 0.95))
         The lower and upper percentile used to create the extreme values
         for the grid. Must be in [0, 1].
@@ -225,7 +225,8 @@ def partial_dependence(est, features, X, response_method='auto',
 
         - 'recursion' is only supported for objects inheriting from
           `BaseGradientBoosting`, but is more efficient in terms of speed.
           With this method, ``X`` is only used to build the
-          grid. This method does not account for the ``init`` predicor of
+          grid and the partial dependences are computed using the training
+          data. This method does not account for the ``init`` predictor of
           the boosting process, which may lead to incorrect values (see
           :ref:`this warning`). With this
           method, the target response of a classifier is always the decision
@@ -394,13 +395,13 @@ def plot_partial_dependence(est, X, features, feature_names=None,
         Ignored in binary classification or classical regression settings.
     response_method : 'auto', 'predict_proba' or 'decision_function', \
             optional (default='auto') :
-        Specifies whether to use ``est.predict_proba()`` or
-        ``est.decision_function()`` as the target response. For regressors
+        Specifies whether to use :term:`predict_proba` or
+        :term:`decision_function` as the target response. For regressors
         this parameter is ignored and the response is always the output of
-        ``est.predict()``. By default, ``predict_proba()`` is tried first
-        and we revert to ``decision_function()`` if it doesn't exist. If
+        :term:`predict`. By default, :term:`predict_proba` is tried first
+        and we revert to :term:`decision_function` if it doesn't exist. If
         ``method`` is 'recursion', the response is always the output of
-        ``decision_function()`.
+        :term:`decision_function`.
     n_cols : int, optional (default=3)
         The maximum number of columns in the grid plot.
     grid_resolution : int, optional (default=100)
@@ -415,7 +416,8 @@ def plot_partial_dependence(est, X, features, feature_names=None,
 
         - 'recursion' is only supported for objects inheriting from
           `BaseGradientBoosting`, but is more efficient in terms of speed.
           With this method, ``X`` is optional and is only used to build the
-          grid. This method does not account for the ``init`` predicor of
+          grid and the partial dependences are computed using the training
+          data. This method does not account for the ``init`` predictor of
           the boosting process, which may lead to incorrect values (see
           :ref:`this warning`). 
With this method, the target response of a classifier is always the decision From abb15b65073409c965dd9477bd823f1b012cc377 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 28 Mar 2019 07:31:47 -0400 Subject: [PATCH 084/113] Addressed Joels comments --- sklearn/inspect/partial_dependence.py | 6 +++--- .../inspect/tests/test_partial_dependence.py | 17 +++++++++++++---- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/sklearn/inspect/partial_dependence.py b/sklearn/inspect/partial_dependence.py index 13f85a560b46d..6b56d7a1a6f3c 100644 --- a/sklearn/inspect/partial_dependence.py +++ b/sklearn/inspect/partial_dependence.py @@ -297,9 +297,9 @@ def partial_dependence(est, features, X, response_method='auto', 'are {}.'.format(response_method, ', '.join(accepted_responses))) if is_regressor(est) and response_method != 'auto': - warnings.warn( - "The response_method parameter is ignored for regressors.", - UserWarning + raise ValueError( + "The response_method parameter is ignored for regressors and " + "must be 'auto'." ) accepted_methods = ('brute', 'recursion', 'auto') if method not in accepted_methods: diff --git a/sklearn/inspect/tests/test_partial_dependence.py b/sklearn/inspect/tests/test_partial_dependence.py index 3568249514d2c..bdb07ccf1857d 100644 --- a/sklearn/inspect/tests/test_partial_dependence.py +++ b/sklearn/inspect/tests/test_partial_dependence.py @@ -281,10 +281,10 @@ def test_partial_dependence_input(): "est must be a fitted regressor or classifier", partial_dependence, KMeans(), [0], X) - assert_warns_message( - UserWarning, - 'The response_method parameter is ignored for regressors', - partial_dependence, lr, [0], X, response_method='predict_proba') + with pytest.raises( + ValueError, + match='The response_method parameter is ignored for regressors'): + partial_dependence(lr, [0], X, response_method='predict_proba') assert_raises_regex( ValueError, @@ -293,6 +293,15 @@ def test_partial_dependence_input(): partial_dependence, gbc, [0], X, response_method='predict_proba', method='recursion') + # for GBDTs, if users want to use predict_proba then they're forced to set + # 'method' to brute. + with pytest.raises( + ValueError, + match="With the 'recursion' method, the response_method must be " + "'decision_function"): + partial_dependence(gbc, [0], X, response_method='predict_proba', + method='auto') + assert_raises_regex(ValueError, "response_method blahblah is invalid. 
" "Accepted response", From de2ebe578e8fead9ceef131722bd2ac1479c38c3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 28 Mar 2019 07:32:49 -0400 Subject: [PATCH 085/113] Addressed Joels comments --- .../inspect/tests/test_partial_dependence.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/sklearn/inspect/tests/test_partial_dependence.py b/sklearn/inspect/tests/test_partial_dependence.py index bdb07ccf1857d..f0657f004fae9 100644 --- a/sklearn/inspect/tests/test_partial_dependence.py +++ b/sklearn/inspect/tests/test_partial_dependence.py @@ -277,21 +277,22 @@ def test_partial_dependence_input(): gbc = GradientBoostingClassifier(random_state=0) gbc.fit(X, y) - assert_raises_regex(ValueError, - "est must be a fitted regressor or classifier", - partial_dependence, KMeans(), [0], X) + with pytest.raises( + ValueError, + match="est must be a fitted regressor or classifier"): + partial_dependence(KMeans(), [0], X) with pytest.raises( ValueError, match='The response_method parameter is ignored for regressors'): partial_dependence(lr, [0], X, response_method='predict_proba') - assert_raises_regex( - ValueError, - "With the 'recursion' method, the response_method must be " - "'decision_function'.", - partial_dependence, gbc, [0], X, response_method='predict_proba', - method='recursion') + with pytest.raises( + ValueError, + match="With the 'recursion' method, the response_method must be " + "'decision_function'."): + partial_dependence(gbc, [0], X, response_method='predict_proba', + method='recursion') # for GBDTs, if users want to use predict_proba then they're forced to set # 'method' to brute. From fe8a026461375689ffb6acfccf43cc2b9020249c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 28 Mar 2019 07:45:05 -0400 Subject: [PATCH 086/113] Use pytest for exceptions and warnings --- .../inspect/tests/test_partial_dependence.py | 202 +++++++++--------- 1 file changed, 104 insertions(+), 98 deletions(-) diff --git a/sklearn/inspect/tests/test_partial_dependence.py b/sklearn/inspect/tests/test_partial_dependence.py index f0657f004fae9..61f666c187845 100644 --- a/sklearn/inspect/tests/test_partial_dependence.py +++ b/sklearn/inspect/tests/test_partial_dependence.py @@ -7,8 +7,6 @@ import pytest import sklearn -from sklearn.utils.testing import assert_raises_regex -from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import if_matplotlib from sklearn.inspect import partial_dependence from sklearn.inspect import plot_partial_dependence @@ -124,25 +122,32 @@ def test_grid_from_X(): assert axes[0].shape == (n_unique_values,) assert axes[1].shape == (grid_resolution,) - assert_raises_regex(ValueError, 'percentiles are too close', - _grid_from_X, X, grid_resolution=2, - percentiles=(0, 0.0001)) + with pytest.raises( + ValueError, + match='percentiles are too close'): + _grid_from_X(X, grid_resolution=2, percentiles=(0, 0.0001)) for percentiles in ((1, 2, 3, 4), 12345): - assert_raises_regex(ValueError, "percentiles must be a sequence", - _grid_from_X, X, percentiles=percentiles) + with pytest.raises( + ValueError, + match="percentiles must be a sequence"): + _grid_from_X(X, percentiles=percentiles) for percentiles in ((-1, .95), (.05, 2)): - assert_raises_regex(ValueError, "percentiles values must be in", - _grid_from_X, X, percentiles=percentiles) + with pytest.raises( + ValueError, + match="percentiles values must be in"): + _grid_from_X(X, percentiles=percentiles) - assert_raises_regex(ValueError, - r"percentiles\[0\] must be 
strictly less than", - _grid_from_X, X, percentiles=(.9, .1)) + with pytest.raises( + ValueError, + match=r"percentiles\[0\] must be strictly less than"): + _grid_from_X(X, percentiles=(.9, .1)) - assert_raises_regex(ValueError, - 'grid_resolution must be strictly greater than 1.', - _grid_from_X, X, grid_resolution=1) + with pytest.raises( + ValueError, + match='grid_resolution must be strictly greater than 1.'): + _grid_from_X(X, grid_resolution=1) @pytest.mark.parametrize('target_feature', (0, 3)) @@ -262,9 +267,10 @@ def test_multiclass_multioutput(Estimator): est = Estimator() est.fit(X, y) - assert_raises_regex(ValueError, - "Multiclass-multioutput estimators are not supported", - partial_dependence, est, [0], X=X) + with pytest.raises( + ValueError, + match="Multiclass-multioutput estimators are not supported"): + partial_dependence(est, [0], X=X) def test_partial_dependence_input(): @@ -303,52 +309,56 @@ def test_partial_dependence_input(): partial_dependence(gbc, [0], X, response_method='predict_proba', method='auto') - assert_raises_regex(ValueError, - "response_method blahblah is invalid. " - "Accepted response", - partial_dependence, gbc, [0], X, - response_method='blahblah') + with pytest.raises( + ValueError, + match="response_method blahblah is invalid. Accepted response"): + partial_dependence(gbc, [0], X, response_method='blahblah') class NoPredictProbaNoDecisionFunction(BaseEstimator, ClassifierMixin): pass bad_clf = NoPredictProbaNoDecisionFunction() - assert_raises_regex( - ValueError, - 'The estimator has no predict_proba and no decision_function method.', - partial_dependence, bad_clf, [0], X, response_method='auto') + with pytest.raises( + ValueError, + match='The estimator has no predict_proba and no ' + 'decision_function method.'): + partial_dependence(bad_clf, [0], X, response_method='auto') - assert_raises_regex( - ValueError, - 'The estimator has no predict_proba method.', - partial_dependence, bad_clf, [0], X, response_method='predict_proba') + with pytest.raises( + ValueError, + match='The estimator has no predict_proba method.'): + partial_dependence(bad_clf, [0], X, response_method='predict_proba') - assert_raises_regex( - ValueError, - 'The estimator has no decision_function method.', - partial_dependence, bad_clf, [0], X, - response_method='decision_function') + with pytest.raises( + ValueError, + match='The estimator has no decision_function method.'): + partial_dependence(bad_clf, [0], X, + response_method='decision_function') - assert_raises_regex(ValueError, - "method blahblah is invalid. Accepted method names " - "are brute, recursion, auto.", - partial_dependence, lr, [0], X, method='blahblah') + with pytest.raises( + ValueError, + match="method blahblah is invalid. 
Accepted method names " + "are brute, recursion, auto."): + partial_dependence(lr, [0], X, method='blahblah') - assert_raises_regex(ValueError, - 'est must be an instance of BaseGradientBoosting ' - 'for the "recursion" method', - partial_dependence, lr, [0], X, method='recursion') + with pytest.raises( + ValueError, + match='est must be an instance of BaseGradientBoosting ' + 'for the "recursion" method'): + partial_dependence(lr, [0], X, method='recursion') for feature in (-1, 1000000): for est in (lr, gbc): - assert_raises_regex(ValueError, - "all features must be in", - partial_dependence, est, [feature], X=X) + with pytest.raises( + ValueError, + match="all features must be in"): + partial_dependence(est, [feature], X=X) for unfitted_est in (LinearRegression(), GradientBoostingRegressor()): - assert_raises_regex(ValueError, - 'est parameter must be a fitted estimator', - partial_dependence, unfitted_est, [0], X=X) + with pytest.raises( + ValueError, + match='est parameter must be a fitted estimator'): + partial_dependence(unfitted_est, [0], X=X) # check that array-like objects are accepted for est in (lr, gbc): @@ -451,54 +461,54 @@ def test_plot_partial_dependence_input(): (X_m, y_m), _ = multiclass_classification_data lr_m = LogisticRegression() lr_m.fit(X_m, y_m) - assert_raises_regex(ValueError, - 'target must be specified for multi-class', - plot_partial_dependence, lr_m, X_m, [0], - target=None) + with pytest.raises( + ValueError, + match='target must be specified for multi-class'): + plot_partial_dependence(lr_m, X_m, [0], target=None) for target in (-1, 100): - assert_raises_regex(ValueError, - 'target not in est.classes_', - plot_partial_dependence, lr_m, X_m, [0], - target=target) + with pytest.raises( + ValueError, + match='target not in est.classes_'): + plot_partial_dependence(lr_m, X_m, [0], target=target) # check target param for multioutput (X_m, y_m), _ = multioutput_regression_data lr_m = LinearRegression() lr_m.fit(X_m, y_m) - assert_raises_regex(ValueError, - 'target must be specified for multi-output', - plot_partial_dependence, lr_m, X_m, [0], - target=None) + with pytest.raises( + ValueError, + match='target must be specified for multi-output'): + plot_partial_dependence(lr_m, X_m, [0], target=None) for target in (-1, 100): - assert_raises_regex(ValueError, - r'target must be in \[0, n_tasks\]', - plot_partial_dependence, lr_m, X_m, [0], - target=target) + with pytest.raises( + ValueError, + match=r'target must be in \[0, n_tasks\]'): + plot_partial_dependence(lr_m, X_m, [0], target=target) for feature_names in (None, ['abcd', 'def']): - assert_raises_regex(ValueError, - 'Feature foobar not in feature_names', - plot_partial_dependence, lr, X, - features=['foobar'], - feature_names=feature_names) + with pytest.raises( + ValueError, + match='Feature foobar not in feature_names'): + plot_partial_dependence(lr, X, features=['foobar'], + feature_names=feature_names) for features in([(1, 2, 3)], [1, {}], [tuple()]): - assert_raises_regex(ValueError, - 'Each entry in features must be either an int, ', - plot_partial_dependence, lr, X, - features=features) + with pytest.raises( + ValueError, + match='Each entry in features must be either an int, '): + plot_partial_dependence(lr, X, features=features) - assert_raises_regex(ValueError, - 'All entries of features must be less than ', - plot_partial_dependence, lr, X, - features=[123], - feature_names=['blah']) + with pytest.raises( + ValueError, + match='All entries of features must be less than '): + 
plot_partial_dependence(lr, X, features=[123],
+                                feature_names=['blah'])
 
-    assert_raises_regex(ValueError,
-                        'feature_names should not contain duplicates',
-                        plot_partial_dependence, lr, X,
-                        features=[0, 1, 2],
-                        feature_names=['a', 'b', 'a'])
+    with pytest.raises(
+            ValueError,
+            match='feature_names should not contain duplicates'):
+        plot_partial_dependence(lr, X, features=[0, 1, 2],
+                                feature_names=['a', 'b', 'a'])
 
 
 @pytest.mark.skip('Passing non-constant init fails. Wait for PR #12436 '
@@ -510,19 +520,15 @@ def test_warning_recursion_non_constant_init():
     gbc = GradientBoostingClassifier(init=DummyClassifier(), random_state=0)
     gbc.fit(X, y)
 
-    assert_warns_message(
-        UserWarning,
-        'Using recursion method with a non-constant init predictor',
-        plot_partial_dependence,
-        gbc, X, [0], method='recursion'
-    )
-
-    assert_warns_message(
-        UserWarning,
-        'Using recursion method with a non-constant init predictor',
-        partial_dependence,
-        gbc, [0], X=X, method='recursion'
-    )
+    with pytest.warns(
+            UserWarning,
+            match='Using recursion method with a non-constant init predictor'):
+        plot_partial_dependence(gbc, X, [0], method='recursion')
+
+    with pytest.warns(
+            UserWarning,
+            match='Using recursion method with a non-constant init predictor'):
+        partial_dependence(gbc, [0], X=X, method='recursion')
 
 
 @if_matplotlib
From 07e8de4742216368fbd3e311e3bc29c104545556 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Thu, 28 Mar 2019 07:59:14 -0400
Subject: [PATCH 087/113] Renamed inspect into model_inspection

---
 doc/modules/classes.rst                             | 10 +++++-----
 sklearn/__init__.py                                 |  6 +++---
 sklearn/ensemble/partial_dependence.py              |  4 ++--
 sklearn/{inspect => model_inspection}/__init__.py   |  0
 .../partial_dependence.py                           |  0
 .../{inspect => model_inspection}/tests/__init__.py |  0
 .../tests/test_partial_dependence.py                | 12 +++++-------
 sklearn/setup.py                                    |  4 ++--
 8 files changed, 17 insertions(+), 19 deletions(-)
 rename sklearn/{inspect => model_inspection}/__init__.py (100%)
 rename sklearn/{inspect => model_inspection}/partial_dependence.py (100%)
 rename sklearn/{inspect => model_inspection}/tests/__init__.py (100%)
 rename sklearn/{inspect => model_inspection}/tests/test_partial_dependence.py (97%)

diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 72bd97eff7921..6ea674a94feff 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -1212,10 +1212,10 @@ Model validation
 
 .. _partial_dependence_ref:
 
-:mod:`sklearn.inspect`: Model inspection
-========================================
+:mod:`sklearn.model_inspection`: Model inspection
+=================================================
 
-.. automodule:: sklearn.inspect
+.. automodule:: sklearn.model_inspection
    :no-members:
    :no-inherited-members:
 
@@ -1225,8 +1225,8 @@ Model validation
    :toctree: generated/
    :template: function.rst
 
-    inspect.partial_dependence
-    inspect.plot_partial_dependence
+    model_inspection.partial_dependence
+    model_inspection.plot_partial_dependence
 
 .. 
_preprocessing_ref: diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 6112e4c513774..c6527a119b230 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -81,9 +81,9 @@ __all__ = ['calibration', 'cluster', 'covariance', 'cross_decomposition', 'datasets', 'decomposition', 'dummy', 'ensemble', 'exceptions', 'externals', 'feature_extraction', 'feature_selection', - 'gaussian_process', 'inspect', 'isotonic', - 'kernel_approximation', 'kernel_ridge', 'linear_model', - 'manifold', 'metrics', 'mixture', 'model_selection', + 'gaussian_process', 'isotonic', 'kernel_approximation', + 'kernel_ridge', 'linear_model', 'manifold', 'metrics', + 'mixture', 'model_selection', 'model_inspection', 'multiclass', 'multioutput', 'naive_bayes', 'neighbors', 'neural_network', 'pipeline', 'preprocessing', 'random_projection', 'semi_supervised', 'svm', 'tree', diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index d4194981b7ec3..deb56374d9c44 100644 --- a/sklearn/ensemble/partial_dependence.py +++ b/sklearn/ensemble/partial_dependence.py @@ -79,7 +79,7 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): @deprecated("The function ensemble.partial_dependence has been deprecated " - "in favour of inspect.partial_dependence in 0.21 " + "in favour of model_inspection.partial_dependence in 0.21 " "and will be removed in 0.23.") def partial_dependence(gbrt, target_variables, grid=None, X=None, percentiles=(0.05, 0.95), grid_resolution=100): @@ -93,7 +93,7 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, .. deprecated:: 0.21 This function was deprecated in version 0.21 in favor of - :func:`sklearn.inspect.partial_dependence` and will be + :func:`sklearn.model_inspection.partial_dependence` and will be removed in 0.23. 
Parameters diff --git a/sklearn/inspect/__init__.py b/sklearn/model_inspection/__init__.py similarity index 100% rename from sklearn/inspect/__init__.py rename to sklearn/model_inspection/__init__.py diff --git a/sklearn/inspect/partial_dependence.py b/sklearn/model_inspection/partial_dependence.py similarity index 100% rename from sklearn/inspect/partial_dependence.py rename to sklearn/model_inspection/partial_dependence.py diff --git a/sklearn/inspect/tests/__init__.py b/sklearn/model_inspection/tests/__init__.py similarity index 100% rename from sklearn/inspect/tests/__init__.py rename to sklearn/model_inspection/tests/__init__.py diff --git a/sklearn/inspect/tests/test_partial_dependence.py b/sklearn/model_inspection/tests/test_partial_dependence.py similarity index 97% rename from sklearn/inspect/tests/test_partial_dependence.py rename to sklearn/model_inspection/tests/test_partial_dependence.py index 61f666c187845..c60c119d24df3 100644 --- a/sklearn/inspect/tests/test_partial_dependence.py +++ b/sklearn/model_inspection/tests/test_partial_dependence.py @@ -8,11 +8,11 @@ import sklearn from sklearn.utils.testing import if_matplotlib -from sklearn.inspect import partial_dependence -from sklearn.inspect import plot_partial_dependence -from sklearn.inspect.partial_dependence import _grid_from_X -from sklearn.inspect.partial_dependence import _partial_dependence_brute -from sklearn.inspect.partial_dependence import _partial_dependence_recursion +from sklearn.model_inspection import partial_dependence +from sklearn.model_inspection import plot_partial_dependence +from sklearn.model_inspection.partial_dependence import _grid_from_X +from sklearn.model_inspection.partial_dependence import _partial_dependence_brute +from sklearn.model_inspection.partial_dependence import _partial_dependence_recursion from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor from sklearn.linear_model import LinearRegression @@ -511,8 +511,6 @@ def test_plot_partial_dependence_input(): feature_names=['a', 'b', 'a']) -@pytest.mark.skip('Passing non-constant init fails. Wait for PR #12436 ' - 'to be merged to un-skip this test') def test_warning_recursion_non_constant_init(): # make sure that passing a non-constant init parameter to a GBDT and using # recursion method yields a warning. 
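
After this rename the public import path moves with the package. A quick
sanity check of the new layout (a sketch that assumes this branch is built;
nothing below is added by the patch itself)::

    from sklearn.datasets import make_classification
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_inspection import partial_dependence

    X, y = make_classification(random_state=0)
    clf = GradientBoostingClassifier(random_state=0).fit(X, y)

    # method='auto' resolves to 'recursion' for gradient boosting with
    # init=None, so the response here is the decision function
    averaged_predictions, values = partial_dependence(clf, [0], X=X)
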
diff --git a/sklearn/setup.py b/sklearn/setup.py index fbaa67930c228..d6f8cd4694871 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -33,12 +33,12 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('feature_selection/tests') config.add_subpackage('gaussian_process') config.add_subpackage('gaussian_process/tests') - config.add_subpackage('inspect') - config.add_subpackage('inspect/tests') config.add_subpackage('mixture') config.add_subpackage('mixture/tests') config.add_subpackage('model_selection') config.add_subpackage('model_selection/tests') + config.add_subpackage('model_inspection') + config.add_subpackage('model_inspection/tests') config.add_subpackage('neural_network') config.add_subpackage('neural_network/tests') config.add_subpackage('preprocessing') From d33a732e68c78b1ae71cf5209281d8d149f1e795 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 28 Mar 2019 08:32:20 -0400 Subject: [PATCH 088/113] created plot module and put plot_partial_dependence() there --- doc/modules/classes.rst | 19 +- doc/modules/partial_dependence.rst | 20 +- doc/{model_inspection.rst => plot.rst} | 6 +- doc/user_guide.rst | 2 +- .../plot_partial_dependence.py | 4 +- sklearn/__init__.py | 2 +- sklearn/ensemble/partial_dependence.py | 4 +- sklearn/model_inspection/__init__.py | 5 +- .../model_inspection/partial_dependence.py | 310 +---------------- .../tests/test_partial_dependence.py | 177 +--------- sklearn/plot/__init__.py | 7 + sklearn/plot/partial_dependence.py | 325 ++++++++++++++++++ sklearn/plot/tests/__init__.py | 0 sklearn/plot/tests/test_partial_dependence.py | 187 ++++++++++ sklearn/setup.py | 2 + 15 files changed, 571 insertions(+), 499 deletions(-) rename doc/{model_inspection.rst => plot.rst} (60%) rename examples/{inspect => plot}/plot_partial_dependence.py (98%) create mode 100644 sklearn/plot/__init__.py create mode 100644 sklearn/plot/partial_dependence.py create mode 100644 sklearn/plot/tests/__init__.py create mode 100644 sklearn/plot/tests/test_partial_dependence.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 6ea674a94feff..6832e4471a36c 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1210,7 +1210,7 @@ Model validation pipeline.make_pipeline pipeline.make_union -.. _partial_dependence_ref: +.. _model_inspection_ref: :mod:`sklearn.model_inspection`: Model inspection ================================================= @@ -1226,8 +1226,23 @@ Model validation :template: function.rst model_inspection.partial_dependence - model_inspection.plot_partial_dependence +.. _plot_ref: + +:mod:`sklearn.plot`: Plot +========================= + +.. automodule:: sklearn.plot + :no-members: + :no-inherited-members: + +.. currentmodule:: sklearn + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + plot.plot_partial_dependence .. _preprocessing_ref: diff --git a/doc/modules/partial_dependence.rst b/doc/modules/partial_dependence.rst index eb48a14c79b99..93e25df7e940f 100644 --- a/doc/modules/partial_dependence.rst +++ b/doc/modules/partial_dependence.rst @@ -5,7 +5,7 @@ Partial dependence plots ======================== -.. currentmodule:: sklearn.inspect +.. 
currentmodule:: sklearn.plot Partial dependence plots (PDP) show the dependence between the target response [1]_ and a set of 'target' features, marginalizing over the values @@ -39,7 +39,7 @@ an average occupancy greater than two, the house price is nearly independent of the house age, whereas for values less than 2 there is a strong dependence on age. -The :mod:`sklearn.inspect` module provides a convenience function +The :mod:`sklearn.plot` module provides a convenience function :func:`plot_partial_dependence` to create one-way and two-way partial dependence plots. In the below example we show how to create a grid of partial dependence plots: two one-way PDPs for the features ``0`` and ``1`` @@ -47,7 +47,7 @@ and a two-way PDP between the two features:: >>> from sklearn.datasets import make_hastie_10_2 >>> from sklearn.ensemble import GradientBoostingClassifier - >>> from sklearn.inspect import plot_partial_dependence + >>> from sklearn.plot import plot_partial_dependence >>> X, y = make_hastie_10_2(random_state=0) >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, @@ -69,9 +69,10 @@ The same parameter ``target`` is used to specify the target in multi-output regression settings. If you need the raw values of the partial dependence function rather than -the plots, you can use the :func:`partial_dependence` function:: +the plots, you can use the +:func:`sklearn.model_inspection.partial_dependence` function:: - >>> from sklearn.inspect import partial_dependence + >>> from sklearn.model_inspection import partial_dependence >>> pdp, axes = partial_dependence(clf, [0], X=X) >>> pdp # doctest: +ELLIPSIS @@ -81,9 +82,10 @@ the plots, you can use the :func:`partial_dependence` function:: The values at which the partial dependence should be evaluated are directly generated from ``X``. For 2-way partial dependence, a 2D-grid of values is -generated. The ``values`` field returned by :func:`partial_dependence` gives -the actual values used in the grid for each target feature. They also -correspond to the axis of the plots. +generated. The ``values`` field returned by +:func:`sklearn.model_inspection.partial_dependence` gives the actual values +used in the grid for each target feature. They also correspond to the axis +of the plots. For each value of the 'target' features in the ``grid`` the partial dependence function needs to marginalize the predictions of the estimator @@ -110,7 +112,7 @@ which the trees were trained. .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_inspect_plot_partial_dependence.py` + * :ref:`sphx_glr_auto_examples_plot_plot_partial_dependence.py` .. topic:: References diff --git a/doc/model_inspection.rst b/doc/plot.rst similarity index 60% rename from doc/model_inspection.rst rename to doc/plot.rst index 0bc468517fb45..38e3bdcf72648 100644 --- a/doc/model_inspection.rst +++ b/doc/plot.rst @@ -1,9 +1,9 @@ .. include:: includes/big_toc_css.rst -.. _model_inspection: +.. _plot: -Model inspection ----------------- +Plotting +-------- .. 
toctree:: diff --git a/doc/user_guide.rst b/doc/user_guide.rst index 004feb0824e7d..327dc07e057ae 100644 --- a/doc/user_guide.rst +++ b/doc/user_guide.rst @@ -18,7 +18,7 @@ User Guide supervised_learning.rst unsupervised_learning.rst model_selection.rst - model_inspection.rst + plot.rst data_transforms.rst Dataset loading utilities modules/computing.rst diff --git a/examples/inspect/plot_partial_dependence.py b/examples/plot/plot_partial_dependence.py similarity index 98% rename from examples/inspect/plot_partial_dependence.py rename to examples/plot/plot_partial_dependence.py index bee9fe21b8714..465bf14f499cf 100644 --- a/examples/inspect/plot_partial_dependence.py +++ b/examples/plot/plot_partial_dependence.py @@ -63,8 +63,8 @@ from mpl_toolkits.mplot3d import Axes3D -from sklearn.inspect import partial_dependence -from sklearn.inspect import plot_partial_dependence +from sklearn.model_inspection import partial_dependence +from sklearn.plot import plot_partial_dependence from sklearn.ensemble import GradientBoostingRegressor from sklearn.neural_network import MLPRegressor from sklearn.datasets.california_housing import fetch_california_housing diff --git a/sklearn/__init__.py b/sklearn/__init__.py index c6527a119b230..a3f8116961a95 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -85,7 +85,7 @@ 'kernel_ridge', 'linear_model', 'manifold', 'metrics', 'mixture', 'model_selection', 'model_inspection', 'multiclass', 'multioutput', 'naive_bayes', 'neighbors', - 'neural_network', 'pipeline', 'preprocessing', + 'neural_network', 'pipeline', 'plot', 'preprocessing', 'random_projection', 'semi_supervised', 'svm', 'tree', 'discriminant_analysis', 'impute', 'compose', # Non-modules: diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index deb56374d9c44..908e6c1a08d0f 100644 --- a/sklearn/ensemble/partial_dependence.py +++ b/sklearn/ensemble/partial_dependence.py @@ -182,7 +182,7 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, @deprecated("The function ensemble.plot_partial_dependence has been " "deprecated in favour of " - "inspect.plot_partial_dependence in " + "plot.plot_partial_dependence in " " 0.21 and will be removed in 0.23.") def plot_partial_dependence(gbrt, X, features, feature_names=None, label=None, n_cols=3, grid_resolution=100, @@ -199,7 +199,7 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, .. deprecated:: 0.21 This function was deprecated in version 0.21 in favor of - :func:`sklearn.inspect.plot_partial_dependence` and will be + :func:`sklearn.plot.plot_partial_dependence` and will be removed in 0.23. 
Parameters diff --git a/sklearn/model_inspection/__init__.py b/sklearn/model_inspection/__init__.py index b5f0f2691cece..4fcd6197dc508 100644 --- a/sklearn/model_inspection/__init__.py +++ b/sklearn/model_inspection/__init__.py @@ -1,9 +1,8 @@ -"""The :mod:`sklearn.inspect` module includes tools for model inspection.""" +"""The :mod:`sklearn.model_inspection` module includes tools for model +inspection.""" from .partial_dependence import partial_dependence -from .partial_dependence import plot_partial_dependence __all__ = [ 'partial_dependence', - 'plot_partial_dependence', ] diff --git a/sklearn/model_inspection/partial_dependence.py b/sklearn/model_inspection/partial_dependence.py index 6b56d7a1a6f3c..d6cb180674ac6 100644 --- a/sklearn/model_inspection/partial_dependence.py +++ b/sklearn/model_inspection/partial_dependence.py @@ -5,8 +5,6 @@ # Nicolas Hug # License: BSD 3 clause -from itertools import count -import numbers import warnings import numpy as np @@ -14,8 +12,6 @@ from ..base import is_classifier, is_regressor from ..utils.extmath import cartesian -from ..externals.joblib import Parallel, delayed -from ..externals import six from ..utils import check_array from ..utils.validation import check_is_fitted from ..tree._tree import DTYPE @@ -24,7 +20,7 @@ from ..ensemble._gradient_boosting import _partial_dependence_tree -__all__ = ['partial_dependence', 'plot_partial_dependence'] +__all__ = ['partial_dependence'] def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): @@ -267,6 +263,10 @@ def partial_dependence(est, features, X, response_method='auto', ... grid_resolution=2) # doctest: +SKIP (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) + See also + -------- + sklearn.plot.plot_partial_dependence: Plot partial dependence + .. _warning_recursion_init: Warnings @@ -354,303 +354,3 @@ def partial_dependence(est, features, X, response_method='auto', -1, *[val.shape[0] for val in values]) return averaged_predictions, values - - -def plot_partial_dependence(est, X, features, feature_names=None, - target=None, response_method='auto', n_cols=3, - grid_resolution=100, percentiles=(0.05, 0.95), - method='auto', n_jobs=1, verbose=0, fig=None, - line_kw=None, contour_kw=None, **fig_kw): - """Partial dependence plots. - - The ``len(features)`` plots are arranged in a grid with ``n_cols`` - columns. Two-way partial dependence plots are plotted as contour plots. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - est : BaseEstimator - A fitted classification or regression model. Classifiers must have a - ``predict_proba()`` or ``decision_function`` method. - Multioutput-multiclass estimators aren't supported. - X : array-like, shape=(n_samples, n_features) - The data to use to build the grid of values on which the dependence - will be evaluated. This is usually the training data. - features : list of {int, str, pair of int, pair of str} - The target features for which to create the PDPs. - If features[i] is an int or a string, a one-way PDP is created; if - features[i] is a tuple, a two-way PDP is created. Each tuple must be - of size 2. - if any entry is a string, then it must be in ``feature_names``. - feature_names : seq of str, shape=(n_features,), optional - Name of each feature; feature_names[i] holds the name of the feature - with index i. - target : int, optional (default=None) - - In a multiclass setting, specifies the class for which the PDPs - should be computed. Note that for binary classification, the - positive class (index 1) is always used. 
- - In a multioutput setting, specifies the task for which the PDPs - should be computed - Ignored in binary classification or classical regression settings. - response_method : 'auto', 'predict_proba' or 'decision_function', \ - optional (default='auto') : - Specifies whether to use :term:`predict_proba` or - :term:`decision_function` as the target response. For regressors - this parameter is ignored and the response is always the output of - :term:`predict`. By default, :term:`predict_proba` is tried first - and we revert to :term:`decision_function` if it doesn't exist. If - ``method`` is 'recursion', the response is always the output of - :term:`decision_function`. - n_cols : int, optional (default=3) - The maximum number of columns in the grid plot. - grid_resolution : int, optional (default=100) - The number of equally spaced points on the axes of the plots, for each - target feature. - percentiles : tuple of float, optional (default=(0.05, 0.95)) - The lower and upper percentile used to create the extreme values - for the PDP axes. Must be in [0, 1]. - method : str, optional (default='auto') - The method to use to calculate the partial dependence predictions: - - - 'recursion' is only supported for objects inheriting from - `BaseGradientBoosting`, but is more efficient in terms of speed. - With this method, ``X`` is optional and is only used to build the - grid and the partial dependences are computed using the training - data. This method does not account for the ``init`` predicor of - the boosting process, which may lead to incorrect values (see - :ref:`this warning`). With this - method, the target response of a classifier is always the decision - function, not the predicted probabilities. - - - 'brute' is supported for any estimator, but is more - computationally intensive. - - - If 'auto', then 'recursion' will be used for - ``BaseGradientBoosting`` estimators with ``init=None``, and - 'brute' for all other. - - Unlike the 'brute' method, 'recursion' does not account for the - ``init`` predictor of the boosting process. In practice this still - produces the same plots, up to a constant offset in the target - response. - n_jobs : int, optional (default=1) - The number of CPUs to use to compute the PDs. -1 means 'all CPUs'. - See :term:`Glossary ` for more details. - verbose : int, optional (default=0) - Verbose output during PD computations. - fig : Matplotlib figure object, optional (default=None) - A figure object onto which the plots will be drawn, after the figure - has been cleared. - line_kw : dict, optional - Dict with keywords passed to the ``matplotlib.pyplot.plot`` call. - For one-way partial dependence plots. - contour_kw : dict, optional - Dict with keywords passed to the ``matplotlib.pyplot.plot`` call. - For two-way partial dependence plots. - **fig_kw : dict, optional - Dict with keywords passed to the figure() call. - Note that all keywords not recognized above will be automatically - included here. - - Returns - ------- - fig : figure - The Matplotlib Figure object. - axs : seq of Axis objects - A seq of Axis objects, one for each subplot. - - Examples - -------- - >>> from sklearn.datasets import make_friedman1 - >>> from sklearn.ensemble import GradientBoostingRegressor - >>> X, y = make_friedman1() - >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y) - >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP - ... - - .. 
_warning_recursion_init_plot: - - Warnings - -------- - The 'recursion' method only works for gradient boosting estimators, and - unlike the 'brute' method, it does not account for the ``init`` - predictor of the boosting process. In practice this will produce the - same values as 'brute' up to a constant offset in the target response, - provided that ``init`` is a consant estimator (which is the default). - However, as soon as ``init`` is not a constant estimator, the partial - dependence values are incorrect for 'recursion'. - """ - import matplotlib.pyplot as plt - from matplotlib import transforms - from matplotlib.ticker import MaxNLocator - from matplotlib.ticker import ScalarFormatter - - # set target_idx for multi-class estimators - if hasattr(est, 'classes_') and np.size(est.classes_) > 2: - if target is None: - raise ValueError('target must be specified for multi-class') - target_idx = np.searchsorted(est.classes_, target) - if (not (0 <= target_idx < len(est.classes_)) or - est.classes_[target_idx] != target): - raise ValueError('target not in est.classes_, got {}'.format( - target)) - else: - # regression and binary classification - target_idx = 0 - - X = check_array(X) - n_features = X.shape[1] - - # convert feature_names to list - if feature_names is None: - # if feature_names is None, use feature indices as name - feature_names = [str(i) for i in range(n_features)] - elif isinstance(feature_names, np.ndarray): - feature_names = feature_names.tolist() - if len(set(feature_names)) != len(feature_names): - raise ValueError('feature_names should not contain duplicates.') - - def convert_feature(fx): - if isinstance(fx, six.string_types): - try: - fx = feature_names.index(fx) - except ValueError: - raise ValueError('Feature %s not in feature_names' % fx) - return int(fx) - - # convert features into a seq of int tuples - tmp_features = [] - for fxs in features: - if isinstance(fxs, (numbers.Integral, six.string_types)): - fxs = (fxs,) - try: - fxs = [convert_feature(fx) for fx in fxs] - except TypeError: - raise ValueError('Each entry in features must be either an int, ' - 'a string, or an iterable of size at most 2.') - if not (1 <= np.size(fxs) <= 2): - raise ValueError('Each entry in features must be either an int, ' - 'a string, or an iterable of size at most 2.') - - tmp_features.append(fxs) - - features = tmp_features - - names = [] - try: - for fxs in features: - names_ = [] - # explicit loop so "i" is bound for exception below - for i in fxs: - names_.append(feature_names[i]) - names.append(names_) - except IndexError: - raise ValueError('All entries of features must be less than ' - 'len(feature_names) = {0}, got {1}.' - .format(len(feature_names), i)) - - # compute averaged predictions - pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(partial_dependence)(est, fxs, X=X, - response_method=response_method, - method=method, - grid_resolution=grid_resolution, - percentiles=percentiles) - for fxs in features) - - # For multioutput regression, we can only check the validity of target - # now that we have the predictions. - # Also note: as multiclass-multioutput classifiers are not supported, - # multiclass and multioutput scenario are mutually exclusive. So there is - # no risk of overwriting target_idx here. 
- pd, _ = pd_result[0] # checking the first result is enough - if is_regressor(est) and pd.shape[0] > 1: - if target is None: - raise ValueError( - 'target must be specified for multi-output regressors') - if not 0 <= target <= pd.shape[0]: - raise ValueError( - 'target must be in [0, n_tasks], got {}.'.format(target)) - target_idx = target - else: - target_idx = 0 - - # get global min and max values of PD grouped by plot type - pdp_lim = {} - for pd, values in pd_result: - min_pd, max_pd = pd[target_idx].min(), pd[target_idx].max() - n_fx = len(values) - old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) - min_pd = min(min_pd, old_min_pd) - max_pd = max(max_pd, old_max_pd) - pdp_lim[n_fx] = (min_pd, max_pd) - - # create contour levels for two-way plots - if 2 in pdp_lim: - Z_level = np.linspace(*pdp_lim[2], num=8) - - if fig is None: - fig = plt.figure(**fig_kw) - else: - fig.clear() - - if line_kw is None: - line_kw = {'color': 'green'} - if contour_kw is None: - contour_kw = {} - - n_cols = min(n_cols, len(features)) - n_rows = int(np.ceil(len(features) / float(n_cols))) - axs = [] - for i, fx, name, (pd, values) in zip(count(), features, names, pd_result): - ax = fig.add_subplot(n_rows, n_cols, i + 1) - - if len(values) == 1: - ax.plot(values[0], pd[target_idx].ravel(), **line_kw) - else: - # make contour plot - assert len(values) == 2 - XX, YY = np.meshgrid(values[0], values[1]) - Z = pd[target_idx].T - CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, - colors='k') - ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], - vmin=Z_level[0], alpha=0.75, **contour_kw) - ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True) - - # plot data deciles + axes labels - deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1)) - trans = transforms.blended_transform_factory(ax.transData, - ax.transAxes) - ylim = ax.get_ylim() - ax.vlines(deciles, [0], 0.05, transform=trans, color='k') - ax.set_xlabel(name[0]) - ax.set_ylim(ylim) - - # prevent x-axis ticks from overlapping - ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower')) - tick_formatter = ScalarFormatter() - tick_formatter.set_powerlimits((-3, 4)) - ax.xaxis.set_major_formatter(tick_formatter) - - if len(values) > 1: - # two-way PDP - y-axis deciles + labels - deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1)) - trans = transforms.blended_transform_factory(ax.transAxes, - ax.transData) - xlim = ax.get_xlim() - ax.hlines(deciles, [0], 0.05, transform=trans, color='k') - ax.set_ylabel(name[1]) - # hline erases xlim - ax.set_xlim(xlim) - else: - ax.set_ylabel('Partial dependence') - - if len(values) == 1: - ax.set_ylim(pdp_lim[1]) - axs.append(ax) - - fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4, - hspace=0.3) - return fig, axs diff --git a/sklearn/model_inspection/tests/test_partial_dependence.py b/sklearn/model_inspection/tests/test_partial_dependence.py index c60c119d24df3..627d617fde539 100644 --- a/sklearn/model_inspection/tests/test_partial_dependence.py +++ b/sklearn/model_inspection/tests/test_partial_dependence.py @@ -7,18 +7,17 @@ import pytest import sklearn -from sklearn.utils.testing import if_matplotlib from sklearn.model_inspection import partial_dependence -from sklearn.model_inspection import plot_partial_dependence -from sklearn.model_inspection.partial_dependence import _grid_from_X -from sklearn.model_inspection.partial_dependence import _partial_dependence_brute -from sklearn.model_inspection.partial_dependence import 
_partial_dependence_recursion +from sklearn.model_inspection.partial_dependence import ( + _grid_from_X, + _partial_dependence_brute, + _partial_dependence_recursion +) from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor from sklearn.linear_model import LinearRegression from sklearn.linear_model import LogisticRegression from sklearn.linear_model import MultiTaskLasso -from sklearn.datasets import load_boston, load_iris from sklearn.datasets import make_classification, make_regression from sklearn.cluster import KMeans from sklearn.metrics import r2_score @@ -365,152 +364,6 @@ class NoPredictProbaNoDecisionFunction(BaseEstimator, ClassifierMixin): partial_dependence(est, [0], X=list(X)) -@if_matplotlib -def test_plot_partial_dependence(): - # Test partial dependence plot function. - boston = load_boston() - clf = GradientBoostingRegressor(n_estimators=10, random_state=1) - clf.fit(boston.data, boston.target) - - grid_resolution = 25 - fig, axs = plot_partial_dependence(clf, boston.data, [0, 1, (0, 1)], - grid_resolution=grid_resolution, - feature_names=boston.feature_names) - assert len(axs) == 3 - assert all(ax.has_data for ax in axs) - - # check with str features and array feature names - fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', - ('CRIM', 'ZN')], - grid_resolution=grid_resolution, - feature_names=boston.feature_names) - - assert len(axs) == 3 - assert all(ax.has_data for ax in axs) - - # check with list feature_names - feature_names = boston.feature_names.tolist() - fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', - ('CRIM', 'ZN')], - grid_resolution=grid_resolution, - feature_names=feature_names) - assert len(axs) == 3 - assert all(ax.has_data for ax in axs) - - -@if_matplotlib -def test_plot_partial_dependence_multiclass(): - # Test partial dependence plot function on multi-class input. - iris = load_iris() - clf = GradientBoostingClassifier(n_estimators=10, random_state=1) - clf.fit(iris.data, iris.target) - - grid_resolution = 25 - fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], - target=0, - grid_resolution=grid_resolution) - assert len(axs) == 2 - assert all(ax.has_data for ax in axs) - - # now with symbol labels - target = iris.target_names[iris.target] - clf = GradientBoostingClassifier(n_estimators=10, random_state=1) - clf.fit(iris.data, target) - - grid_resolution = 25 - fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], - target='setosa', - grid_resolution=grid_resolution) - assert len(axs) == 2 - assert all(ax.has_data for ax in axs) - - -@if_matplotlib -def test_plot_partial_dependence_multioutput(): - # Test partial dependence plot function on multi-output input. 
- (X, y), _ = multioutput_regression_data - clf = LinearRegression() - clf.fit(X, y) - - grid_resolution = 25 - fig, axs = plot_partial_dependence(clf, X, [0, 1], - target=0, - grid_resolution=grid_resolution) - assert len(axs) == 2 - assert all(ax.has_data for ax in axs) - - fig, axs = plot_partial_dependence(clf, X, [0, 1], - target=1, - grid_resolution=grid_resolution) - assert len(axs) == 2 - assert all(ax.has_data for ax in axs) - - -@if_matplotlib -@pytest.mark.filterwarnings('ignore:Default solver will be changed ') # 0.22 -@pytest.mark.filterwarnings('ignore:Default multi_class will be') # 0.22 -def test_plot_partial_dependence_input(): - X, y = make_classification(random_state=0) - - lr = LinearRegression() - lr.fit(X, y) - gbc = GradientBoostingClassifier(random_state=0) - gbc.fit(X, y) - - # check target param for multiclass - (X_m, y_m), _ = multiclass_classification_data - lr_m = LogisticRegression() - lr_m.fit(X_m, y_m) - with pytest.raises( - ValueError, - match='target must be specified for multi-class'): - plot_partial_dependence(lr_m, X_m, [0], target=None) - for target in (-1, 100): - with pytest.raises( - ValueError, - match='target not in est.classes_'): - plot_partial_dependence(lr_m, X_m, [0], target=target) - - # check target param for multioutput - (X_m, y_m), _ = multioutput_regression_data - lr_m = LinearRegression() - lr_m.fit(X_m, y_m) - with pytest.raises( - ValueError, - match='target must be specified for multi-output'): - plot_partial_dependence(lr_m, X_m, [0], target=None) - for target in (-1, 100): - with pytest.raises( - ValueError, - match=r'target must be in \[0, n_tasks\]'): - plot_partial_dependence(lr_m, X_m, [0], target=target) - - for feature_names in (None, ['abcd', 'def']): - with pytest.raises( - ValueError, - match='Feature foobar not in feature_names'): - plot_partial_dependence(lr, X, features=['foobar'], - feature_names=feature_names) - - for features in([(1, 2, 3)], [1, {}], [tuple()]): - with pytest.raises( - ValueError, - match='Each entry in features must be either an int, '): - plot_partial_dependence(lr, X, features=features) - - with pytest.raises( - ValueError, - match='All entries of features must be less than '): - plot_partial_dependence(lr, X, features=[123], - feature_names=['blah']) - - with pytest.raises( - ValueError, - match='feature_names should not contain duplicates'): - plot_partial_dependence(lr, X, features=[0, 1, 2], - feature_names=['a', 'b', 'a']) - - def test_warning_recursion_non_constant_init(): # make sure that passing a non-constant init parameter to a GBDT and using # recursion method yields a warning. 
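A quick aside before the next hunk: the brute/recursion agreement that the warnings in these files describe can be checked directly. This is a hedged sketch, not part of the patch; it assumes the ``sklearn.model_inspection`` path used at this stage of the series (a later commit renames it to ``sklearn.inspection``) and passes ``X`` and ``features`` as keywords so the call survives the later argument reordering:

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_inspection import partial_dependence

    X, y = make_classification(random_state=0)
    gbc = GradientBoostingClassifier(random_state=0).fit(X, y)

    # 'recursion' implicitly uses the decision function ...
    preds_rec, _ = partial_dependence(gbc, features=[0], X=X,
                                      method='recursion')
    # ... so compare against 'brute' on the same response
    preds_brute, _ = partial_dependence(gbc, features=[0], X=X,
                                        response_method='decision_function',
                                        method='brute')
    # 'recursion' ignores the (constant) init predictor, hence the centering
    np.testing.assert_allclose(preds_rec - preds_rec.mean(),
                               preds_brute - preds_brute.mean(), atol=1e-7)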
@@ -521,27 +374,9 @@ def test_warning_recursion_non_constant_init(): with pytest.warns( UserWarning, match='Using recursion method with a non-constant init predictor'): - plot_partial_dependence(gbc, X, [0], method='recursion') + partial_dependence(gbc, [0], X=X, method='recursion') with pytest.warns( UserWarning, match='Using recursion method with a non-constant init predictor'): partial_dependence(gbc, [0], X=X, method='recursion') - - -@if_matplotlib -def test_plot_partial_dependence_fig(): - # Make sure fig object is correctly used if not None - - import matplotlib.pyplot as plt - - (X, y), _ = regression_data - clf = LinearRegression() - clf.fit(X, y) - - fig = plt.figure() - grid_resolution = 25 - returned_fig, axs = plot_partial_dependence( - clf, X, [0, 1], target=0, grid_resolution=grid_resolution, fig=fig) - - assert returned_fig is fig diff --git a/sklearn/plot/__init__.py b/sklearn/plot/__init__.py new file mode 100644 index 0000000000000..c659e4721f0ae --- /dev/null +++ b/sklearn/plot/__init__.py @@ -0,0 +1,7 @@ +"""The :mod:`sklearn.plot` module includes tools for plotting.""" +from .partial_dependence import plot_partial_dependence + + +__all__ = [ + 'plot_partial_dependence', +] diff --git a/sklearn/plot/partial_dependence.py b/sklearn/plot/partial_dependence.py new file mode 100644 index 0000000000000..d76be3b711fc3 --- /dev/null +++ b/sklearn/plot/partial_dependence.py @@ -0,0 +1,325 @@ +"""Partial dependence plots for regression and classification models.""" + +# Authors: Peter Prettenhofer +# Trevor Stephens +# Nicolas Hug +# License: BSD 3 clause + +from itertools import count +import numbers + +import numpy as np +from scipy.stats.mstats import mquantiles + +from ..base import is_regressor +from ..externals.joblib import Parallel, delayed +from ..utils import check_array +from ..model_inspection import partial_dependence + + +__all__ = ['plot_partial_dependence'] + + +def plot_partial_dependence(est, X, features, feature_names=None, + target=None, response_method='auto', n_cols=3, + grid_resolution=100, percentiles=(0.05, 0.95), + method='auto', n_jobs=1, verbose=0, fig=None, + line_kw=None, contour_kw=None, **fig_kw): + """Partial dependence plots. + + The ``len(features)`` plots are arranged in a grid with ``n_cols`` + columns. Two-way partial dependence plots are plotted as contour plots. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + est : BaseEstimator + A fitted classification or regression model. Classifiers must have a + ``predict_proba()`` or ``decision_function`` method. + Multioutput-multiclass estimators aren't supported. + X : array-like, shape=(n_samples, n_features) + The data to use to build the grid of values on which the dependence + will be evaluated. This is usually the training data. + features : list of {int, str, pair of int, pair of str} + The target features for which to create the PDPs. + If features[i] is an int or a string, a one-way PDP is created; if + features[i] is a tuple, a two-way PDP is created. Each tuple must be + of size 2. + if any entry is a string, then it must be in ``feature_names``. + feature_names : seq of str, shape=(n_features,), optional + Name of each feature; feature_names[i] holds the name of the feature + with index i. + target : int, optional (default=None) + - In a multiclass setting, specifies the class for which the PDPs + should be computed. Note that for binary classification, the + positive class (index 1) is always used. 
+        - In a multioutput setting, specifies the task for which the PDPs
+          should be computed.
+        Ignored in binary classification or classical regression settings.
+    response_method : 'auto', 'predict_proba' or 'decision_function', \
+            optional (default='auto')
+        Specifies whether to use :term:`predict_proba` or
+        :term:`decision_function` as the target response. For regressors
+        this parameter is ignored and the response is always the output of
+        :term:`predict`. By default, :term:`predict_proba` is tried first
+        and we revert to :term:`decision_function` if it doesn't exist. If
+        ``method`` is 'recursion', the response is always the output of
+        :term:`decision_function`.
+    n_cols : int, optional (default=3)
+        The maximum number of columns in the grid plot.
+    grid_resolution : int, optional (default=100)
+        The number of equally spaced points on the axes of the plots, for each
+        target feature.
+    percentiles : tuple of float, optional (default=(0.05, 0.95))
+        The lower and upper percentile used to create the extreme values
+        for the PDP axes. Must be in [0, 1].
+    method : str, optional (default='auto')
+        The method to use to calculate the partial dependence predictions:
+
+        - 'recursion' is only supported for objects inheriting from
+          `BaseGradientBoosting`, but is more efficient in terms of speed.
+          With this method, ``X`` is optional and is only used to build the
+          grid and the partial dependences are computed using the training
+          data. This method does not account for the ``init`` predictor of
+          the boosting process, which may lead to incorrect values (see
+          :ref:`this warning<warning_recursion_init_plot>`). With this
+          method, the target response of a classifier is always the decision
+          function, not the predicted probabilities.
+
+        - 'brute' is supported for any estimator, but is more
+          computationally intensive.
+
+        - If 'auto', then 'recursion' will be used for
+          ``BaseGradientBoosting`` estimators with ``init=None``, and
+          'brute' for all others.
+
+        Unlike the 'brute' method, 'recursion' does not account for the
+        ``init`` predictor of the boosting process. In practice this still
+        produces the same plots, up to a constant offset in the target
+        response.
+    n_jobs : int, optional (default=1)
+        The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
+        See :term:`Glossary <n_jobs>` for more details.
+    verbose : int, optional (default=0)
+        Verbose output during PD computations.
+    fig : Matplotlib figure object, optional (default=None)
+        A figure object onto which the plots will be drawn, after the figure
+        has been cleared.
+    line_kw : dict, optional
+        Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.
+        For one-way partial dependence plots.
+    contour_kw : dict, optional
+        Dict with keywords passed to the ``matplotlib.pyplot.contourf`` call.
+        For two-way partial dependence plots.
+    **fig_kw : dict, optional
+        Dict with keywords passed to the figure() call.
+        Note that all keywords not recognized above will be automatically
+        included here.
+
+    Returns
+    -------
+    fig : figure
+        The Matplotlib Figure object.
+    axs : seq of Axis objects
+        A seq of Axis objects, one for each subplot.
+
+    Examples
+    --------
+    >>> from sklearn.datasets import make_friedman1
+    >>> from sklearn.ensemble import GradientBoostingRegressor
+    >>> X, y = make_friedman1()
+    >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)
+    >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP
+    ...
+
+    See also
+    --------
+    sklearn.model_inspection.partial_dependence: Return raw partial
+        dependence values
+
+    .. _warning_recursion_init_plot:
+
+    Warnings
+    --------
+    The 'recursion' method only works for gradient boosting estimators, and
+    unlike the 'brute' method, it does not account for the ``init``
+    predictor of the boosting process. In practice this will produce the
+    same values as 'brute' up to a constant offset in the target response,
+    provided that ``init`` is a constant estimator (which is the default).
+    However, as soon as ``init`` is not a constant estimator, the partial
+    dependence values are incorrect for 'recursion'.
+    """
+    import matplotlib.pyplot as plt
+    from matplotlib import transforms
+    from matplotlib.ticker import MaxNLocator
+    from matplotlib.ticker import ScalarFormatter
+
+    # set target_idx for multi-class estimators
+    if hasattr(est, 'classes_') and np.size(est.classes_) > 2:
+        if target is None:
+            raise ValueError('target must be specified for multi-class')
+        target_idx = np.searchsorted(est.classes_, target)
+        if (not (0 <= target_idx < len(est.classes_)) or
+                est.classes_[target_idx] != target):
+            raise ValueError('target not in est.classes_, got {}'.format(
+                target))
+    else:
+        # regression and binary classification
+        target_idx = 0
+
+    X = check_array(X)
+    n_features = X.shape[1]
+
+    # convert feature_names to list
+    if feature_names is None:
+        # if feature_names is None, use feature indices as name
+        feature_names = [str(i) for i in range(n_features)]
+    elif isinstance(feature_names, np.ndarray):
+        feature_names = feature_names.tolist()
+    if len(set(feature_names)) != len(feature_names):
+        raise ValueError('feature_names should not contain duplicates.')
+
+    def convert_feature(fx):
+        if isinstance(fx, str):
+            try:
+                fx = feature_names.index(fx)
+            except ValueError:
+                raise ValueError('Feature %s not in feature_names' % fx)
+        return int(fx)
+
+    # convert features into a seq of int tuples
+    tmp_features = []
+    for fxs in features:
+        if isinstance(fxs, (numbers.Integral, str)):
+            fxs = (fxs,)
+        try:
+            fxs = [convert_feature(fx) for fx in fxs]
+        except TypeError:
+            raise ValueError('Each entry in features must be either an int, '
+                             'a string, or an iterable of size at most 2.')
+        if not (1 <= np.size(fxs) <= 2):
+            raise ValueError('Each entry in features must be either an int, '
+                             'a string, or an iterable of size at most 2.')
+
+        tmp_features.append(fxs)
+
+    features = tmp_features
+
+    names = []
+    try:
+        for fxs in features:
+            names_ = []
+            # explicit loop so "i" is bound for exception below
+            for i in fxs:
+                names_.append(feature_names[i])
+            names.append(names_)
+    except IndexError:
+        raise ValueError('All entries of features must be less than '
+                         'len(feature_names) = {0}, got {1}.'
+                         .format(len(feature_names), i))
+
+    # compute averaged predictions
+    pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)(
+        delayed(partial_dependence)(est, fxs, X=X,
+                                    response_method=response_method,
+                                    method=method,
+                                    grid_resolution=grid_resolution,
+                                    percentiles=percentiles)
+        for fxs in features)
+
+    # For multioutput regression, we can only check the validity of target
+    # now that we have the predictions.
+    # Also note: as multiclass-multioutput classifiers are not supported,
+    # multiclass and multioutput scenarios are mutually exclusive. So there
+    # is no risk of overwriting target_idx here.
+ pd, _ = pd_result[0] # checking the first result is enough + if is_regressor(est) and pd.shape[0] > 1: + if target is None: + raise ValueError( + 'target must be specified for multi-output regressors') + if not 0 <= target <= pd.shape[0]: + raise ValueError( + 'target must be in [0, n_tasks], got {}.'.format(target)) + target_idx = target + else: + target_idx = 0 + + # get global min and max values of PD grouped by plot type + pdp_lim = {} + for pd, values in pd_result: + min_pd, max_pd = pd[target_idx].min(), pd[target_idx].max() + n_fx = len(values) + old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) + min_pd = min(min_pd, old_min_pd) + max_pd = max(max_pd, old_max_pd) + pdp_lim[n_fx] = (min_pd, max_pd) + + # create contour levels for two-way plots + if 2 in pdp_lim: + Z_level = np.linspace(*pdp_lim[2], num=8) + + if fig is None: + fig = plt.figure(**fig_kw) + else: + fig.clear() + + if line_kw is None: + line_kw = {'color': 'green'} + if contour_kw is None: + contour_kw = {} + + n_cols = min(n_cols, len(features)) + n_rows = int(np.ceil(len(features) / float(n_cols))) + axs = [] + for i, fx, name, (pd, values) in zip(count(), features, names, pd_result): + ax = fig.add_subplot(n_rows, n_cols, i + 1) + + if len(values) == 1: + ax.plot(values[0], pd[target_idx].ravel(), **line_kw) + else: + # make contour plot + assert len(values) == 2 + XX, YY = np.meshgrid(values[0], values[1]) + Z = pd[target_idx].T + CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, + colors='k') + ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], + vmin=Z_level[0], alpha=0.75, **contour_kw) + ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True) + + # plot data deciles + axes labels + deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1)) + trans = transforms.blended_transform_factory(ax.transData, + ax.transAxes) + ylim = ax.get_ylim() + ax.vlines(deciles, [0], 0.05, transform=trans, color='k') + ax.set_xlabel(name[0]) + ax.set_ylim(ylim) + + # prevent x-axis ticks from overlapping + ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower')) + tick_formatter = ScalarFormatter() + tick_formatter.set_powerlimits((-3, 4)) + ax.xaxis.set_major_formatter(tick_formatter) + + if len(values) > 1: + # two-way PDP - y-axis deciles + labels + deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1)) + trans = transforms.blended_transform_factory(ax.transAxes, + ax.transData) + xlim = ax.get_xlim() + ax.hlines(deciles, [0], 0.05, transform=trans, color='k') + ax.set_ylabel(name[1]) + # hline erases xlim + ax.set_xlim(xlim) + else: + ax.set_ylabel('Partial dependence') + + if len(values) == 1: + ax.set_ylim(pdp_lim[1]) + axs.append(ax) + + fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4, + hspace=0.3) + return fig, axs diff --git a/sklearn/plot/tests/__init__.py b/sklearn/plot/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/plot/tests/test_partial_dependence.py b/sklearn/plot/tests/test_partial_dependence.py new file mode 100644 index 0000000000000..24e24fcd019f7 --- /dev/null +++ b/sklearn/plot/tests/test_partial_dependence.py @@ -0,0 +1,187 @@ +""" +Testing for the partial dependence module. 
+""" + +import pytest + +from sklearn.utils.testing import if_matplotlib +from sklearn.plot import plot_partial_dependence +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.ensemble import GradientBoostingRegressor +from sklearn.linear_model import LinearRegression +from sklearn.linear_model import LogisticRegression +from sklearn.datasets import load_boston, load_iris +from sklearn.datasets import make_classification, make_regression + + +# (X, y), n_targets <-- as expected in the output of partial_dep() +binary_classification_data = (make_classification(random_state=0), 1) +multiclass_classification_data = (make_classification(n_classes=3, + n_clusters_per_class=1, + random_state=0), 3) +regression_data = (make_regression(random_state=0), 1) +multioutput_regression_data = (make_regression(n_targets=2, random_state=0), 2) + + +@if_matplotlib +def test_plot_partial_dependence(): + # Test partial dependence plot function. + boston = load_boston() + clf = GradientBoostingRegressor(n_estimators=10, random_state=1) + clf.fit(boston.data, boston.target) + + grid_resolution = 25 + fig, axs = plot_partial_dependence(clf, boston.data, [0, 1, (0, 1)], + grid_resolution=grid_resolution, + feature_names=boston.feature_names) + assert len(axs) == 3 + assert all(ax.has_data for ax in axs) + + # check with str features and array feature names + fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', + ('CRIM', 'ZN')], + grid_resolution=grid_resolution, + feature_names=boston.feature_names) + + assert len(axs) == 3 + assert all(ax.has_data for ax in axs) + + # check with list feature_names + feature_names = boston.feature_names.tolist() + fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', + ('CRIM', 'ZN')], + grid_resolution=grid_resolution, + feature_names=feature_names) + assert len(axs) == 3 + assert all(ax.has_data for ax in axs) + + +@if_matplotlib +def test_plot_partial_dependence_multiclass(): + # Test partial dependence plot function on multi-class input. + iris = load_iris() + clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + clf.fit(iris.data, iris.target) + + grid_resolution = 25 + fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], + target=0, + grid_resolution=grid_resolution) + assert len(axs) == 2 + assert all(ax.has_data for ax in axs) + + # now with symbol labels + target = iris.target_names[iris.target] + clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + clf.fit(iris.data, target) + + grid_resolution = 25 + fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], + target='setosa', + grid_resolution=grid_resolution) + assert len(axs) == 2 + assert all(ax.has_data for ax in axs) + + +@if_matplotlib +def test_plot_partial_dependence_multioutput(): + # Test partial dependence plot function on multi-output input. 
+ (X, y), _ = multioutput_regression_data + clf = LinearRegression() + clf.fit(X, y) + + grid_resolution = 25 + fig, axs = plot_partial_dependence(clf, X, [0, 1], + target=0, + grid_resolution=grid_resolution) + assert len(axs) == 2 + assert all(ax.has_data for ax in axs) + + fig, axs = plot_partial_dependence(clf, X, [0, 1], + target=1, + grid_resolution=grid_resolution) + assert len(axs) == 2 + assert all(ax.has_data for ax in axs) + + +@if_matplotlib +@pytest.mark.filterwarnings('ignore:Default solver will be changed ') # 0.22 +@pytest.mark.filterwarnings('ignore:Default multi_class will be') # 0.22 +def test_plot_partial_dependence_input(): + X, y = make_classification(random_state=0) + + lr = LinearRegression() + lr.fit(X, y) + gbc = GradientBoostingClassifier(random_state=0) + gbc.fit(X, y) + + # check target param for multiclass + (X_m, y_m), _ = multiclass_classification_data + lr_m = LogisticRegression() + lr_m.fit(X_m, y_m) + with pytest.raises( + ValueError, + match='target must be specified for multi-class'): + plot_partial_dependence(lr_m, X_m, [0], target=None) + for target in (-1, 100): + with pytest.raises( + ValueError, + match='target not in est.classes_'): + plot_partial_dependence(lr_m, X_m, [0], target=target) + + # check target param for multioutput + (X_m, y_m), _ = multioutput_regression_data + lr_m = LinearRegression() + lr_m.fit(X_m, y_m) + with pytest.raises( + ValueError, + match='target must be specified for multi-output'): + plot_partial_dependence(lr_m, X_m, [0], target=None) + for target in (-1, 100): + with pytest.raises( + ValueError, + match=r'target must be in \[0, n_tasks\]'): + plot_partial_dependence(lr_m, X_m, [0], target=target) + + for feature_names in (None, ['abcd', 'def']): + with pytest.raises( + ValueError, + match='Feature foobar not in feature_names'): + plot_partial_dependence(lr, X, features=['foobar'], + feature_names=feature_names) + + for features in([(1, 2, 3)], [1, {}], [tuple()]): + with pytest.raises( + ValueError, + match='Each entry in features must be either an int, '): + plot_partial_dependence(lr, X, features=features) + + with pytest.raises( + ValueError, + match='All entries of features must be less than '): + plot_partial_dependence(lr, X, features=[123], + feature_names=['blah']) + + with pytest.raises( + ValueError, + match='feature_names should not contain duplicates'): + plot_partial_dependence(lr, X, features=[0, 1, 2], + feature_names=['a', 'b', 'a']) + + +@if_matplotlib +def test_plot_partial_dependence_fig(): + # Make sure fig object is correctly used if not None + + import matplotlib.pyplot as plt + + (X, y), _ = regression_data + clf = LinearRegression() + clf.fit(X, y) + + fig = plt.figure() + grid_resolution = 25 + returned_fig, axs = plot_partial_dependence( + clf, X, [0, 1], target=0, grid_resolution=grid_resolution, fig=fig) + + assert returned_fig is fig diff --git a/sklearn/setup.py b/sklearn/setup.py index d6f8cd4694871..4618fd46b369a 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -41,6 +41,8 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('model_inspection/tests') config.add_subpackage('neural_network') config.add_subpackage('neural_network/tests') + config.add_subpackage('plot') + config.add_subpackage('plot/tests') config.add_subpackage('preprocessing') config.add_subpackage('preprocessing/tests') config.add_subpackage('semi_supervised') From 8262419af9bb3d95bcdbe98f06e0eb59d028088e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 18 Apr 2019 07:52:21 
-0400 Subject: [PATCH 089/113] Apply suggestions from code review Co-Authored-By: NicolasHug --- doc/modules/partial_dependence.rst | 2 +- sklearn/model_inspection/partial_dependence.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/modules/partial_dependence.rst b/doc/modules/partial_dependence.rst index 93e25df7e940f..1e70e1c411dcf 100644 --- a/doc/modules/partial_dependence.rst +++ b/doc/modules/partial_dependence.rst @@ -90,7 +90,7 @@ of the plots. For each value of the 'target' features in the ``grid`` the partial dependence function needs to marginalize the predictions of the estimator over all possible values of the 'complement' features. With the ``'brute'`` -method, this is done by replacing every target feature value of `X` by those +method, this is done by replacing every target feature value of ``X`` by those in the grid, and computing the average prediction. In decision trees this can be evaluated efficiently without reference to the diff --git a/sklearn/model_inspection/partial_dependence.py b/sklearn/model_inspection/partial_dependence.py index d6cb180674ac6..bd1aa6fe22557 100644 --- a/sklearn/model_inspection/partial_dependence.py +++ b/sklearn/model_inspection/partial_dependence.py @@ -196,12 +196,12 @@ def partial_dependence(est, features, X, response_method='auto', features : list or array-like of int The target features for which the partial dependency should be computed. - X : array-like, shape=(n_samples, n_features) + X : array-like, shape (n_samples, n_features) ``X`` is used both to generate a grid of values for the ``features``, and to compute the averaged predictions when method is 'brute'. response_method : 'auto', 'predict_proba' or 'decision_function', \ - optional (default='auto') : + optional (default='auto') Specifies whether to use :term:`predict_proba` or :term:`decision_function` as the target response. For regressors this parameter is ignored and the response is always the output of @@ -237,8 +237,8 @@ def partial_dependence(est, features, X, response_method='auto', Returns ------- - averaged_predictions : array, \ - shape=(n_outputs, len(values[0]), len(values[1]), ...) + averaged_predictions : ndarray, \ + shape (n_outputs, len(values[0]), len(values[1]), ...) The predictions for all the points in the grid, averaged over all samples in X (or over the training data if ``method`` is 'recursion'). ``n_outputs`` corresponds to the number of classes in From 3464b7e16f2c9319d87964e4ef549bdb695f8c68 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 18 Apr 2019 08:17:05 -0400 Subject: [PATCH 090/113] Addressed comments from Guillaume --- doc/modules/partial_dependence.rst | 4 +- examples/plot/plot_partial_dependence.py | 2 +- .../model_inspection/partial_dependence.py | 35 ++++++++-------- .../tests/test_partial_dependence.py | 40 +++++++++---------- sklearn/plot/partial_dependence.py | 22 +++++----- 5 files changed, 53 insertions(+), 50 deletions(-) diff --git a/doc/modules/partial_dependence.rst b/doc/modules/partial_dependence.rst index 93e25df7e940f..8da80722d45aa 100644 --- a/doc/modules/partial_dependence.rst +++ b/doc/modules/partial_dependence.rst @@ -29,7 +29,9 @@ for the California housing dataset, with a :class:`GradientBoostingRegressor One-way PDPs tell us about the interaction between the target response and the target feature (e.g. linear, non-linear). 
The upper left plot in the above figure shows the effect of the median income in a district on the -median house price; we can clearly see a linear relationship among them. +median house price; we can clearly see a linear relationship among them. Note +that PDPs assume that the target features are independent from the complement +features, and this assumption is often violated in practice. PDPs with two target features show the interactions among the two features. For example, the two-variable PDP in the above figure shows the dependence diff --git a/examples/plot/plot_partial_dependence.py b/examples/plot/plot_partial_dependence.py index 465bf14f499cf..a432128df2aeb 100644 --- a/examples/plot/plot_partial_dependence.py +++ b/examples/plot/plot_partial_dependence.py @@ -111,7 +111,7 @@ def main(): fig = plt.figure() target_feature = (1, 5) - pdp, axes = partial_dependence(est, target_feature, X=X, + pdp, axes = partial_dependence(est, X, target_feature, grid_resolution=50) XX, YY = np.meshgrid(axes[0], axes[1]) Z = pdp[0].T diff --git a/sklearn/model_inspection/partial_dependence.py b/sklearn/model_inspection/partial_dependence.py index d6cb180674ac6..52e1887c0951b 100644 --- a/sklearn/model_inspection/partial_dependence.py +++ b/sklearn/model_inspection/partial_dependence.py @@ -177,7 +177,7 @@ def _partial_dependence_brute(est, grid, features, X, response_method): return averaged_predictions -def partial_dependence(est, features, X, response_method='auto', +def partial_dependence(estimator, X, features, response_method='auto', percentiles=(0.05, 0.95), grid_resolution=100, method='auto'): """Partial dependence of ``features``. @@ -190,16 +190,17 @@ def partial_dependence(est, features, X, response_method='auto', Parameters ---------- - est : BaseEstimator - A fitted classification or regression model. Multioutput-multiclass - classifiers are not supported. - features : list or array-like of int - The target features for which the partial dependency should be - computed. + estimator : BaseEstimator + A fitted estimator object implementing `predict`, `predict_proba`, + or `decision_function`. Multioutput-multiclass classifiers are not + supported. X : array-like, shape=(n_samples, n_features) ``X`` is used both to generate a grid of values for the ``features``, and to compute the averaged predictions when method is 'brute'. + features : list or array-like of int + The target features for which the partial dependency should be + computed. response_method : 'auto', 'predict_proba' or 'decision_function', \ optional (default='auto') : Specifies whether to use :term:`predict_proba` or @@ -281,11 +282,11 @@ def partial_dependence(est, features, X, response_method='auto', """ - if not (is_classifier(est) or is_regressor(est)): + if not (is_classifier(estimator) or is_regressor(estimator)): raise ValueError('est must be a fitted regressor or classifier.') - if (hasattr(est, 'classes_') and - isinstance(est.classes_[0], np.ndarray)): + if (hasattr(estimator, 'classes_') and + isinstance(estimator.classes_[0], np.ndarray)): raise ValueError('Multiclass-multioutput estimators are not supported') X = check_array(X) @@ -296,7 +297,7 @@ def partial_dependence(est, features, X, response_method='auto', 'response_method {} is invalid. 
Accepted response_method names ' 'are {}.'.format(response_method, ', '.join(accepted_responses))) - if is_regressor(est) and response_method != 'auto': + if is_regressor(estimator) and response_method != 'auto': raise ValueError( "The response_method parameter is ignored for regressors and " "must be 'auto'." @@ -308,13 +309,13 @@ def partial_dependence(est, features, X, response_method='auto', method, ', '.join(accepted_methods))) if method == 'auto': - if isinstance(est, BaseGradientBoosting) and est.init is None: + if isinstance(estimator, BaseGradientBoosting) and estimator.init is None: method = 'recursion' else: method = 'brute' if method == 'recursion': - if not isinstance(est, BaseGradientBoosting): + if not isinstance(estimator, BaseGradientBoosting): raise ValueError( 'est must be an instance of BaseGradientBoosting ' 'for the "recursion" method. Try using method="brute".') @@ -326,10 +327,10 @@ def partial_dependence(est, features, X, response_method='auto', "With the 'recursion' method, the response_method must be " "'decision_function'. Got {}.".format(response_method) ) - check_is_fitted(est, 'estimators_', + check_is_fitted(estimator, 'estimators_', msg='est parameter must be a fitted estimator') # Note: if method is brute, this check is done at prediction time - n_features = est.n_features_ + n_features = estimator.n_features_ else: n_features = X.shape[1] @@ -341,11 +342,11 @@ def partial_dependence(est, features, X, response_method='auto', grid, values = _grid_from_X(X[:, features], percentiles, grid_resolution) if method == 'brute': - averaged_predictions = _partial_dependence_brute(est, grid, + averaged_predictions = _partial_dependence_brute(estimator, grid, features, X, response_method) else: - averaged_predictions = _partial_dependence_recursion(est, grid, + averaged_predictions = _partial_dependence_recursion(estimator, grid, features) # reshape averaged_predictions to diff --git a/sklearn/model_inspection/tests/test_partial_dependence.py b/sklearn/model_inspection/tests/test_partial_dependence.py index 627d617fde539..a6cebbe97374b 100644 --- a/sklearn/model_inspection/tests/test_partial_dependence.py +++ b/sklearn/model_inspection/tests/test_partial_dependence.py @@ -73,8 +73,8 @@ def test_output_shape(Estimator, method, data, grid_resolution, (X, y), n_targets = data est.fit(X, y) - pdp, axes = partial_dependence(est, features=features, - X=X, method=method, + pdp, axes = partial_dependence(est, X=X, features=features, + method=method, grid_resolution=grid_resolution) expected_pdp_shape = (n_targets, *[grid_resolution @@ -204,10 +204,10 @@ def test_recursion_decision_function(target_feature): est = GradientBoostingClassifier(random_state=0, loss='deviance') est.fit(X, y) - preds_1, _ = partial_dependence(est, target_feature, X, + preds_1, _ = partial_dependence(est, X, [target_feature], response_method='decision_function', method='recursion') - preds_2, _ = partial_dependence(est, target_feature, X, + preds_2, _ = partial_dependence(est, X, [target_feature], response_method='decision_function', method='brute') @@ -269,7 +269,7 @@ def test_multiclass_multioutput(Estimator): with pytest.raises( ValueError, match="Multiclass-multioutput estimators are not supported"): - partial_dependence(est, [0], X=X) + partial_dependence(est, X, [0]) def test_partial_dependence_input(): @@ -285,18 +285,18 @@ def test_partial_dependence_input(): with pytest.raises( ValueError, match="est must be a fitted regressor or classifier"): - partial_dependence(KMeans(), [0], X) + 
partial_dependence(KMeans(), X, [0]) with pytest.raises( ValueError, match='The response_method parameter is ignored for regressors'): - partial_dependence(lr, [0], X, response_method='predict_proba') + partial_dependence(lr, X, [0], response_method='predict_proba') with pytest.raises( ValueError, match="With the 'recursion' method, the response_method must be " "'decision_function'."): - partial_dependence(gbc, [0], X, response_method='predict_proba', + partial_dependence(gbc, X, [0], response_method='predict_proba', method='recursion') # for GBDTs, if users want to use predict_proba then they're forced to set @@ -305,13 +305,13 @@ def test_partial_dependence_input(): ValueError, match="With the 'recursion' method, the response_method must be " "'decision_function"): - partial_dependence(gbc, [0], X, response_method='predict_proba', + partial_dependence(gbc, X, [0], response_method='predict_proba', method='auto') with pytest.raises( ValueError, match="response_method blahblah is invalid. Accepted response"): - partial_dependence(gbc, [0], X, response_method='blahblah') + partial_dependence(gbc, X, [0], response_method='blahblah') class NoPredictProbaNoDecisionFunction(BaseEstimator, ClassifierMixin): pass @@ -321,47 +321,47 @@ class NoPredictProbaNoDecisionFunction(BaseEstimator, ClassifierMixin): ValueError, match='The estimator has no predict_proba and no ' 'decision_function method.'): - partial_dependence(bad_clf, [0], X, response_method='auto') + partial_dependence(bad_clf, X, [0], response_method='auto') with pytest.raises( ValueError, match='The estimator has no predict_proba method.'): - partial_dependence(bad_clf, [0], X, response_method='predict_proba') + partial_dependence(bad_clf, X, [0], response_method='predict_proba') with pytest.raises( ValueError, match='The estimator has no decision_function method.'): - partial_dependence(bad_clf, [0], X, + partial_dependence(bad_clf, X, [0], response_method='decision_function') with pytest.raises( ValueError, match="method blahblah is invalid. 
Accepted method names " "are brute, recursion, auto."): - partial_dependence(lr, [0], X, method='blahblah') + partial_dependence(lr, X, [0], method='blahblah') with pytest.raises( ValueError, match='est must be an instance of BaseGradientBoosting ' 'for the "recursion" method'): - partial_dependence(lr, [0], X, method='recursion') + partial_dependence(lr, X, [0], method='recursion') for feature in (-1, 1000000): for est in (lr, gbc): with pytest.raises( ValueError, match="all features must be in"): - partial_dependence(est, [feature], X=X) + partial_dependence(est, X, [feature]) for unfitted_est in (LinearRegression(), GradientBoostingRegressor()): with pytest.raises( ValueError, match='est parameter must be a fitted estimator'): - partial_dependence(unfitted_est, [0], X=X) + partial_dependence(unfitted_est, X, [0]) # check that array-like objects are accepted for est in (lr, gbc): - partial_dependence(est, [0], X=list(X)) + partial_dependence(est, list(X), [0]) def test_warning_recursion_non_constant_init(): @@ -374,9 +374,9 @@ def test_warning_recursion_non_constant_init(): with pytest.warns( UserWarning, match='Using recursion method with a non-constant init predictor'): - partial_dependence(gbc, [0], X=X, method='recursion') + partial_dependence(gbc, X, [0], method='recursion') with pytest.warns( UserWarning, match='Using recursion method with a non-constant init predictor'): - partial_dependence(gbc, [0], X=X, method='recursion') + partial_dependence(gbc, X, [0], method='recursion') diff --git a/sklearn/plot/partial_dependence.py b/sklearn/plot/partial_dependence.py index d76be3b711fc3..178bdf2152c97 100644 --- a/sklearn/plot/partial_dependence.py +++ b/sklearn/plot/partial_dependence.py @@ -20,7 +20,7 @@ __all__ = ['plot_partial_dependence'] -def plot_partial_dependence(est, X, features, feature_names=None, +def plot_partial_dependence(estimator, X, features, feature_names=None, target=None, response_method='auto', n_cols=3, grid_resolution=100, percentiles=(0.05, 0.95), method='auto', n_jobs=1, verbose=0, fig=None, @@ -34,10 +34,10 @@ def plot_partial_dependence(est, X, features, feature_names=None, Parameters ---------- - est : BaseEstimator - A fitted classification or regression model. Classifiers must have a - ``predict_proba()`` or ``decision_function`` method. - Multioutput-multiclass estimators aren't supported. + estimator : BaseEstimator + A fitted estimator object implementing `predict`, `predict_proba`, + or `decision_function`. Multioutput-multiclass classifiers are not + supported. X : array-like, shape=(n_samples, n_features) The data to use to build the grid of values on which the dependence will be evaluated. This is usually the training data. 
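To make the change in calling convention concrete before the next hunk: this commit renames ``est`` to ``estimator`` and moves ``X`` ahead of ``features`` everywhere. A hedged before/after sketch (``sklearn.model_inspection`` is the module path at this point in the series; the next commit renames it to ``sklearn.inspection``):

    from sklearn.datasets import make_regression
    from sklearn.linear_model import LinearRegression
    from sklearn.model_inspection import partial_dependence

    X, y = make_regression(random_state=0)
    lr = LinearRegression().fit(X, y)

    # before this commit: partial_dependence(lr, [0], X=X)
    # after this commit, X comes second and features third:
    pdp, values = partial_dependence(lr, X, [0])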
@@ -156,12 +156,12 @@ def plot_partial_dependence(est, X, features, feature_names=None, from matplotlib.ticker import ScalarFormatter # set target_idx for multi-class estimators - if hasattr(est, 'classes_') and np.size(est.classes_) > 2: + if hasattr(estimator, 'classes_') and np.size(estimator.classes_) > 2: if target is None: raise ValueError('target must be specified for multi-class') - target_idx = np.searchsorted(est.classes_, target) - if (not (0 <= target_idx < len(est.classes_)) or - est.classes_[target_idx] != target): + target_idx = np.searchsorted(estimator.classes_, target) + if (not (0 <= target_idx < len(estimator.classes_)) or + estimator.classes_[target_idx] != target): raise ValueError('target not in est.classes_, got {}'.format( target)) else: @@ -221,7 +221,7 @@ def convert_feature(fx): # compute averaged predictions pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(partial_dependence)(est, fxs, X=X, + delayed(partial_dependence)(estimator, X, fxs, response_method=response_method, method=method, grid_resolution=grid_resolution, @@ -234,7 +234,7 @@ def convert_feature(fx): # multiclass and multioutput scenario are mutually exclusive. So there is # no risk of overwriting target_idx here. pd, _ = pd_result[0] # checking the first result is enough - if is_regressor(est) and pd.shape[0] > 1: + if is_regressor(estimator) and pd.shape[0] > 1: if target is None: raise ValueError( 'target must be specified for multi-output regressors') From 591471ff67a0624472d74204895780993df560ab Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 18 Apr 2019 09:09:55 -0400 Subject: [PATCH 091/113] put everything in sklearn.inspection --- doc/{plot.rst => inspection.rst} | 6 +- doc/modules/classes.rst | 28 +- doc/modules/partial_dependence.rst | 20 +- doc/user_guide.rst | 2 +- doc/whats_new/v0.21.rst | 18 +- .../plot_partial_dependence.py | 5 +- sklearn/__init__.py | 8 +- sklearn/ensemble/partial_dependence.py | 8 +- sklearn/inspection/__init__.py | 9 + .../partial_dependence.py | 321 ++++++++++++++++- .../tests/__init__.py | 0 .../tests/test_partial_dependence.py | 171 ++++++++- sklearn/model_inspection/__init__.py | 8 - sklearn/plot/__init__.py | 7 - sklearn/plot/partial_dependence.py | 325 ------------------ sklearn/plot/tests/__init__.py | 0 sklearn/plot/tests/test_partial_dependence.py | 187 ---------- sklearn/setup.py | 6 +- sklearn/utils/__init__.py | 12 +- sklearn/utils/tests/test_utils.py | 1 + 20 files changed, 550 insertions(+), 592 deletions(-) rename doc/{plot.rst => inspection.rst} (69%) rename examples/{plot => inspection}/plot_partial_dependence.py (98%) create mode 100644 sklearn/inspection/__init__.py rename sklearn/{model_inspection => inspection}/partial_dependence.py (52%) rename sklearn/{model_inspection => inspection}/tests/__init__.py (100%) rename sklearn/{model_inspection => inspection}/tests/test_partial_dependence.py (70%) delete mode 100644 sklearn/model_inspection/__init__.py delete mode 100644 sklearn/plot/__init__.py delete mode 100644 sklearn/plot/partial_dependence.py delete mode 100644 sklearn/plot/tests/__init__.py delete mode 100644 sklearn/plot/tests/test_partial_dependence.py diff --git a/doc/plot.rst b/doc/inspection.rst similarity index 69% rename from doc/plot.rst rename to doc/inspection.rst index 38e3bdcf72648..745539d51bf77 100644 --- a/doc/plot.rst +++ b/doc/inspection.rst @@ -1,9 +1,9 @@ .. include:: includes/big_toc_css.rst -.. _plot: +.. _inspection: -Plotting --------- +Inspection +---------- .. 
toctree:: diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index b6a6e1bbd4883..bcca50fb40307 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1211,29 +1211,13 @@ Model validation pipeline.make_pipeline pipeline.make_union -.. _model_inspection_ref: -:mod:`sklearn.model_inspection`: Model inspection -================================================= +.. _inspection_ref: -.. automodule:: sklearn.model_inspection - :no-members: - :no-inherited-members: - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - model_inspection.partial_dependence +:mod:`sklearn.inspection`: inspection +===================================== -.. _plot_ref: - -:mod:`sklearn.plot`: Plot -========================= - -.. automodule:: sklearn.plot +.. automodule:: sklearn.inspection :no-members: :no-inherited-members: @@ -1243,7 +1227,9 @@ Model validation :toctree: generated/ :template: function.rst - plot.plot_partial_dependence + inspection.partial_dependence + inspection.plot_partial_dependence + .. _preprocessing_ref: diff --git a/doc/modules/partial_dependence.rst b/doc/modules/partial_dependence.rst index 8da80722d45aa..61e4e624d3887 100644 --- a/doc/modules/partial_dependence.rst +++ b/doc/modules/partial_dependence.rst @@ -5,7 +5,7 @@ Partial dependence plots ======================== -.. currentmodule:: sklearn.plot +.. currentmodule:: sklearn.inspection Partial dependence plots (PDP) show the dependence between the target response [1]_ and a set of 'target' features, marginalizing over the values @@ -21,8 +21,8 @@ The figure below shows four one-way and one two-way partial dependence plots for the California housing dataset, with a :class:`GradientBoostingRegressor `: -.. figure:: ../auto_examples/images/sphx_glr_plot_partial_dependence_001.png - :target: ../auto_examples/plot_partial_dependence.html +.. figure:: ../auto_examples/inspection/images/sphx_glr_plot_partial_dependence_002.png + :target: ../auto_examples/inspection/plot_partial_dependence.html :align: center :scale: 70 @@ -41,7 +41,7 @@ an average occupancy greater than two, the house price is nearly independent of the house age, whereas for values less than 2 there is a strong dependence on age. -The :mod:`sklearn.plot` module provides a convenience function +The :mod:`sklearn.inspection` module provides a convenience function :func:`plot_partial_dependence` to create one-way and two-way partial dependence plots. In the below example we show how to create a grid of partial dependence plots: two one-way PDPs for the features ``0`` and ``1`` @@ -49,7 +49,7 @@ and a two-way PDP between the two features:: >>> from sklearn.datasets import make_hastie_10_2 >>> from sklearn.ensemble import GradientBoostingClassifier - >>> from sklearn.plot import plot_partial_dependence + >>> from sklearn.inspection import plot_partial_dependence >>> X, y = make_hastie_10_2(random_state=0) >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, @@ -72,11 +72,11 @@ regression settings. 
If you need the raw values of the partial dependence function rather than the plots, you can use the -:func:`sklearn.model_inspection.partial_dependence` function:: +:func:`sklearn.inspection.partial_dependence` function:: - >>> from sklearn.model_inspection import partial_dependence + >>> from sklearn.inspection import partial_dependence - >>> pdp, axes = partial_dependence(clf, [0], X=X) + >>> pdp, axes = partial_dependence(clf, X, [0]) >>> pdp # doctest: +ELLIPSIS array([[ 2.466..., 2.466..., ... >>> axes # doctest: +ELLIPSIS @@ -85,7 +85,7 @@ the plots, you can use the The values at which the partial dependence should be evaluated are directly generated from ``X``. For 2-way partial dependence, a 2D-grid of values is generated. The ``values`` field returned by -:func:`sklearn.model_inspection.partial_dependence` gives the actual values +:func:`sklearn.inspection.partial_dependence` gives the actual values used in the grid for each target feature. They also correspond to the axis of the plots. @@ -114,7 +114,7 @@ which the trees were trained. .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_plot_plot_partial_dependence.py` + * :ref:`sphx_glr_auto_examples_inspection_plot_partial_dependence.py` .. topic:: References diff --git a/doc/user_guide.rst b/doc/user_guide.rst index 327dc07e057ae..70c8a20bdf036 100644 --- a/doc/user_guide.rst +++ b/doc/user_guide.rst @@ -18,7 +18,7 @@ User Guide supervised_learning.rst unsupervised_learning.rst model_selection.rst - plot.rst + inspection.rst data_transforms.rst Dataset loading utilities modules/computing.rst diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 4177fb911a16c..134318d788180 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -183,9 +183,9 @@ Support for Python 3.4 and below has been officially dropped. - |API| :func:`ensemble.partial_dependence` and :func:`ensemble.plot_partial_dependence` are now deprecated in favor of - :func:`inspect.partial_dependence` + :func:`inspection.partial_dependence` and - :func:`inspect.plot_partial_dependence`. + :func:`inspection.plot_partial_dependence`. :issue:`12599` by :user:`Trevor Stephens` and :user:`Nicolas Hug`. @@ -256,8 +256,9 @@ Support for Python 3.4 and below has been officially dropped. with the document and the caller functions. :issue:`6463` by :user:`movelikeriver `. -- |Fix| :func:`ensemble.partial_dependence` now takes sample weights into - account for the partial dependence computation when the +- |Fix| :func:`ensemble.partial_dependence` (and consequently the new + version :func:`sklearn.inspection.partial_dependence`) now takes sample + weights into account for the partial dependence computation when the gradient boosting model has been trained with sample weights. :issue:`13193` by :user:`Samuel O. Ronsin `. @@ -635,11 +636,14 @@ Support for Python 3.4 and below has been officially dropped. affects all ensemble methods using decision trees. :issue:`12344` by :user:`Adrin Jalali `. -:mod:`sklearn.inspect.partial_dependence` -......................................... +:mod:`sklearn.inspection` +......................... + + +- |Feature| A new module ``sklearn.inspection`` is created. - |Feature| Partial dependence plots - (:func:`inspect.plot_partial_dependence`) are now supported for + (:func:`inspection.plot_partial_dependence`) are now supported for any regressor or classifier (provided that they have a `predict_proba` method). :issue:`12599` by :user:`Trevor Stephens ` and :user:`Nicolas Hug `. 
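To make the relocated API concrete, here is a minimal sketch of the post-patch call sequence; the estimator, dataset, and the expected output shape shown here are illustrative assumptions, not part of the patch itself:

    >>> from sklearn.datasets import make_hastie_10_2
    >>> from sklearn.ensemble import GradientBoostingClassifier
    >>> from sklearn.inspection import partial_dependence, plot_partial_dependence
    >>> X, y = make_hastie_10_2(random_state=0)
    >>> clf = GradientBoostingClassifier(n_estimators=10,
    ...                                  random_state=0).fit(X, y)
    >>> # note the new argument order: (estimator, X, features)
    >>> averaged_predictions, values = partial_dependence(clf, X, [0])
    >>> averaged_predictions.shape  # (n_outputs, grid_resolution)
    (1, 100)
    >>> fig, axs = plot_partial_dependence(clf, X, [0, 1, (0, 1)])  # doctest: +SKIP
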
diff --git a/examples/plot/plot_partial_dependence.py b/examples/inspection/plot_partial_dependence.py similarity index 98% rename from examples/plot/plot_partial_dependence.py rename to examples/inspection/plot_partial_dependence.py index a432128df2aeb..dd2b7e0c4a220 100644 --- a/examples/plot/plot_partial_dependence.py +++ b/examples/inspection/plot_partial_dependence.py @@ -60,11 +60,10 @@ import numpy as np import matplotlib.pyplot as plt - from mpl_toolkits.mplot3d import Axes3D -from sklearn.model_inspection import partial_dependence -from sklearn.plot import plot_partial_dependence +from sklearn.inspection import partial_dependence +from sklearn.inspection import plot_partial_dependence from sklearn.ensemble import GradientBoostingRegressor from sklearn.neural_network import MLPRegressor from sklearn.datasets.california_housing import fetch_california_housing diff --git a/sklearn/__init__.py b/sklearn/__init__.py index dff4ddb96bf9b..9a6f4e4e29deb 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -81,11 +81,11 @@ __all__ = ['calibration', 'cluster', 'covariance', 'cross_decomposition', 'datasets', 'decomposition', 'dummy', 'ensemble', 'exceptions', 'externals', 'feature_extraction', 'feature_selection', - 'gaussian_process', 'isotonic', 'kernel_approximation', - 'kernel_ridge', 'linear_model', 'manifold', 'metrics', - 'mixture', 'model_selection', 'model_inspection', + 'gaussian_process', 'inspection', 'isotonic', + 'kernel_approximation', 'kernel_ridge', 'linear_model', + 'manifold', 'metrics', 'mixture', 'model_selection', 'multiclass', 'multioutput', 'naive_bayes', 'neighbors', - 'neural_network', 'pipeline', 'plot', 'preprocessing', + 'neural_network', 'pipeline', 'preprocessing', 'random_projection', 'semi_supervised', 'svm', 'tree', 'discriminant_analysis', 'impute', 'compose', # Non-modules: diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index 908e6c1a08d0f..11d5208d2d179 100644 --- a/sklearn/ensemble/partial_dependence.py +++ b/sklearn/ensemble/partial_dependence.py @@ -79,7 +79,7 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): @deprecated("The function ensemble.partial_dependence has been deprecated " - "in favour of model_inspection.partial_dependence in 0.21 " + "in favour of inspection.partial_dependence in 0.21 " "and will be removed in 0.23.") def partial_dependence(gbrt, target_variables, grid=None, X=None, percentiles=(0.05, 0.95), grid_resolution=100): @@ -93,7 +93,7 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, .. deprecated:: 0.21 This function was deprecated in version 0.21 in favor of - :func:`sklearn.model_inspection.partial_dependence` and will be + :func:`sklearn.inspection.partial_dependence` and will be removed in 0.23. Parameters @@ -182,7 +182,7 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, @deprecated("The function ensemble.plot_partial_dependence has been " "deprecated in favour of " - "plot.plot_partial_dependence in " + "sklearn.inspection.plot_partial_dependence in " " 0.21 and will be removed in 0.23.") def plot_partial_dependence(gbrt, X, features, feature_names=None, label=None, n_cols=3, grid_resolution=100, @@ -199,7 +199,7 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, .. deprecated:: 0.21 This function was deprecated in version 0.21 in favor of - :func:`sklearn.plot.plot_partial_dependence` and will be + :func:`sklearn.inspection.plot_partial_dependence` and will be removed in 0.23. 
Parameters diff --git a/sklearn/inspection/__init__.py b/sklearn/inspection/__init__.py new file mode 100644 index 0000000000000..2bf3fe14c0023 --- /dev/null +++ b/sklearn/inspection/__init__.py @@ -0,0 +1,9 @@ +"""The :mod:`sklearn.inspection` module includes tools for model inspection.""" +from .partial_dependence import partial_dependence +from .partial_dependence import plot_partial_dependence + + +__all__ = [ + 'partial_dependence', + 'plot_partial_dependence', +] diff --git a/sklearn/model_inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py similarity index 52% rename from sklearn/model_inspection/partial_dependence.py rename to sklearn/inspection/partial_dependence.py index 52e1887c0951b..8c32d0ed92535 100644 --- a/sklearn/model_inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -6,13 +6,17 @@ # License: BSD 3 clause import warnings +from itertools import count +import numbers import numpy as np from scipy.stats.mstats import mquantiles +from joblib import Parallel, delayed from ..base import is_classifier, is_regressor from ..utils.extmath import cartesian from ..utils import check_array +from ..utils import check_matplotlib_support from ..utils.validation import check_is_fitted from ..tree._tree import DTYPE from ..exceptions import NotFittedError @@ -20,7 +24,7 @@ from ..ensemble._gradient_boosting import _partial_dependence_tree -__all__ = ['partial_dependence'] +__all__ = ['partial_dependence', 'plot_partial_dependence'] def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): @@ -225,9 +229,9 @@ def partial_dependence(estimator, X, features, response_method='auto', grid and the partial dependences are computed using the training data. This method does not account for the ``init`` predicor of the boosting process, which may lead to incorrect values (see - :ref:`this warning`). With this - method, the target response of a classifier is always the decision - function, not the predicted probabilities. + warning below). With this method, the target response of a + classifier is always the decision function, not the predicted + probabilities. - 'brute' is supported for any estimator, but is more computationally intensive. @@ -266,7 +270,8 @@ def partial_dependence(estimator, X, features, response_method='auto', See also -------- - sklearn.plot.plot_partial_dependence: Plot partial dependence + sklearn.inspection.plot_partial_dependence: Plot partial dependence + .. _warning_recursion_init: @@ -355,3 +360,309 @@ def partial_dependence(estimator, X, features, response_method='auto', -1, *[val.shape[0] for val in values]) return averaged_predictions, values + + +def plot_partial_dependence(estimator, X, features, feature_names=None, + target=None, response_method='auto', n_cols=3, + grid_resolution=100, percentiles=(0.05, 0.95), + method='auto', n_jobs=1, verbose=0, fig=None, + line_kw=None, contour_kw=None, **fig_kw): + """Partial dependence plots. + + The ``len(features)`` plots are arranged in a grid with ``n_cols`` + columns. Two-way partial dependence plots are plotted as contour plots. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : BaseEstimator + A fitted estimator object implementing `predict`, `predict_proba`, + or `decision_function`. Multioutput-multiclass classifiers are not + supported. + X : array-like, shape=(n_samples, n_features) + The data to use to build the grid of values on which the dependence + will be evaluated. This is usually the training data. 
+    features : list of {int, str, pair of int, pair of str}
+        The target features for which to create the PDPs.
+        If features[i] is an int or a string, a one-way PDP is created; if
+        features[i] is a tuple, a two-way PDP is created. Each tuple must be
+        of size 2.
+        If any entry is a string, then it must be in ``feature_names``.
+    feature_names : seq of str, shape=(n_features,), optional
+        Name of each feature; feature_names[i] holds the name of the feature
+        with index i.
+    target : int, optional (default=None)
+        - In a multiclass setting, specifies the class for which the PDPs
+          should be computed. Note that for binary classification, the
+          positive class (index 1) is always used.
+        - In a multioutput setting, specifies the task for which the PDPs
+          should be computed.
+        Ignored in binary classification or classical regression settings.
+    response_method : 'auto', 'predict_proba' or 'decision_function', \
+            optional (default='auto')
+        Specifies whether to use :term:`predict_proba` or
+        :term:`decision_function` as the target response. For regressors
+        this parameter is ignored and the response is always the output of
+        :term:`predict`. By default, :term:`predict_proba` is tried first
+        and we revert to :term:`decision_function` if it doesn't exist. If
+        ``method`` is 'recursion', the response is always the output of
+        :term:`decision_function`.
+    n_cols : int, optional (default=3)
+        The maximum number of columns in the grid plot.
+    grid_resolution : int, optional (default=100)
+        The number of equally spaced points on the axes of the plots, for each
+        target feature.
+    percentiles : tuple of float, optional (default=(0.05, 0.95))
+        The lower and upper percentile used to create the extreme values
+        for the PDP axes. Must be in [0, 1].
+    method : str, optional (default='auto')
+        The method to use to calculate the partial dependence predictions:
+
+        - 'recursion' is only supported for objects inheriting from
+          `BaseGradientBoosting`, but is more efficient in terms of speed.
+          With this method, ``X`` is optional and is only used to build the
+          grid and the partial dependences are computed using the training
+          data. This method does not account for the ``init`` predictor of
+          the boosting process, which may lead to incorrect values (see
+          warning below). With this method, the target response of a
+          classifier is always the decision function, not the predicted
+          probabilities.
+
+        - 'brute' is supported for any estimator, but is more
+          computationally intensive.
+
+        - If 'auto', then 'recursion' will be used for
+          ``BaseGradientBoosting`` estimators with ``init=None``, and
+          'brute' for all others.
+
+        Unlike the 'brute' method, 'recursion' does not account for the
+        ``init`` predictor of the boosting process. In practice this still
+        produces the same plots, up to a constant offset in the target
+        response.
+    n_jobs : int, optional (default=1)
+        The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
+        See :term:`Glossary <n_jobs>` for more details.
+    verbose : int, optional (default=0)
+        Verbose output during PD computations.
+    fig : Matplotlib figure object, optional (default=None)
+        A figure object onto which the plots will be drawn, after the figure
+        has been cleared.
+    line_kw : dict, optional
+        Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.
+        For one-way partial dependence plots.
+    contour_kw : dict, optional
+        Dict with keywords passed to the ``matplotlib.pyplot.contourf`` call.
+        For two-way partial dependence plots.
+    **fig_kw : dict, optional
+        Dict with keywords passed to the figure() call.
+        Note that all keywords not recognized above will be automatically
+        included here.
+
+    Returns
+    -------
+    fig : figure
+        The Matplotlib Figure object.
+    axs : seq of Axis objects
+        A seq of Axis objects, one for each subplot.
+
+    Examples
+    --------
+    >>> from sklearn.datasets import make_friedman1
+    >>> from sklearn.ensemble import GradientBoostingRegressor
+    >>> X, y = make_friedman1()
+    >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)
+    >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP
+    ...
+
+    See also
+    --------
+    sklearn.inspection.partial_dependence: Return raw partial
+      dependence values
+
+    .. _warning_recursion_init_plot:
+
+    Warnings
+    --------
+    The 'recursion' method only works for gradient boosting estimators, and
+    unlike the 'brute' method, it does not account for the ``init``
+    predictor of the boosting process. In practice this will produce the
+    same values as 'brute' up to a constant offset in the target response,
+    provided that ``init`` is a constant estimator (which is the default).
+    However, as soon as ``init`` is not a constant estimator, the partial
+    dependence values are incorrect for 'recursion'.
+    """
+    check_matplotlib_support('plot_partial_dependence')
+    import matplotlib.pyplot as plt
+    from matplotlib import transforms
+    from matplotlib.ticker import MaxNLocator
+    from matplotlib.ticker import ScalarFormatter
+
+    # set target_idx for multi-class estimators
+    if hasattr(estimator, 'classes_') and np.size(estimator.classes_) > 2:
+        if target is None:
+            raise ValueError('target must be specified for multi-class')
+        target_idx = np.searchsorted(estimator.classes_, target)
+        if (not (0 <= target_idx < len(estimator.classes_)) or
+                estimator.classes_[target_idx] != target):
+            raise ValueError('target not in est.classes_, got {}'.format(
+                target))
+    else:
+        # regression and binary classification
+        target_idx = 0
+
+    X = check_array(X)
+    n_features = X.shape[1]
+
+    # convert feature_names to list
+    if feature_names is None:
+        # if feature_names is None, use feature indices as name
+        feature_names = [str(i) for i in range(n_features)]
+    elif isinstance(feature_names, np.ndarray):
+        feature_names = feature_names.tolist()
+    if len(set(feature_names)) != len(feature_names):
+        raise ValueError('feature_names should not contain duplicates.')
+
+    def convert_feature(fx):
+        if isinstance(fx, str):
+            try:
+                fx = feature_names.index(fx)
+            except ValueError:
+                raise ValueError('Feature %s not in feature_names' % fx)
+        return int(fx)
+
+    # convert features into a seq of int tuples
+    tmp_features = []
+    for fxs in features:
+        if isinstance(fxs, (numbers.Integral, str)):
+            fxs = (fxs,)
+        try:
+            fxs = [convert_feature(fx) for fx in fxs]
+        except TypeError:
+            raise ValueError('Each entry in features must be either an int, '
+                             'a string, or an iterable of size at most 2.')
+        if not (1 <= np.size(fxs) <= 2):
+            raise ValueError('Each entry in features must be either an int, '
+                             'a string, or an iterable of size at most 2.')
+
+        tmp_features.append(fxs)
+
+    features = tmp_features
+
+    names = []
+    try:
+        for fxs in features:
+            names_ = []
+            # explicit loop so "i" is bound for exception below
+            for i in fxs:
+                names_.append(feature_names[i])
+            names.append(names_)
+    except IndexError:
+        raise ValueError('All entries of features must be less than '
+                         'len(feature_names) = {0}, got {1}.'
+ .format(len(feature_names), i)) + + # compute averaged predictions + pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( + delayed(partial_dependence)(estimator, X, fxs, + response_method=response_method, + method=method, + grid_resolution=grid_resolution, + percentiles=percentiles) + for fxs in features) + + # For multioutput regression, we can only check the validity of target + # now that we have the predictions. + # Also note: as multiclass-multioutput classifiers are not supported, + # multiclass and multioutput scenario are mutually exclusive. So there is + # no risk of overwriting target_idx here. + pd, _ = pd_result[0] # checking the first result is enough + if is_regressor(estimator) and pd.shape[0] > 1: + if target is None: + raise ValueError( + 'target must be specified for multi-output regressors') + if not 0 <= target <= pd.shape[0]: + raise ValueError( + 'target must be in [0, n_tasks], got {}.'.format(target)) + target_idx = target + else: + target_idx = 0 + + # get global min and max values of PD grouped by plot type + pdp_lim = {} + for pd, values in pd_result: + min_pd, max_pd = pd[target_idx].min(), pd[target_idx].max() + n_fx = len(values) + old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) + min_pd = min(min_pd, old_min_pd) + max_pd = max(max_pd, old_max_pd) + pdp_lim[n_fx] = (min_pd, max_pd) + + # create contour levels for two-way plots + if 2 in pdp_lim: + Z_level = np.linspace(*pdp_lim[2], num=8) + + if fig is None: + fig = plt.figure(**fig_kw) + else: + fig.clear() + + if line_kw is None: + line_kw = {'color': 'green'} + if contour_kw is None: + contour_kw = {} + + n_cols = min(n_cols, len(features)) + n_rows = int(np.ceil(len(features) / float(n_cols))) + axs = [] + for i, fx, name, (pd, values) in zip(count(), features, names, pd_result): + ax = fig.add_subplot(n_rows, n_cols, i + 1) + + if len(values) == 1: + ax.plot(values[0], pd[target_idx].ravel(), **line_kw) + else: + # make contour plot + assert len(values) == 2 + XX, YY = np.meshgrid(values[0], values[1]) + Z = pd[target_idx].T + CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, + colors='k') + ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], + vmin=Z_level[0], alpha=0.75, **contour_kw) + ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True) + + # plot data deciles + axes labels + deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1)) + trans = transforms.blended_transform_factory(ax.transData, + ax.transAxes) + ylim = ax.get_ylim() + ax.vlines(deciles, [0], 0.05, transform=trans, color='k') + ax.set_xlabel(name[0]) + ax.set_ylim(ylim) + + # prevent x-axis ticks from overlapping + ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower')) + tick_formatter = ScalarFormatter() + tick_formatter.set_powerlimits((-3, 4)) + ax.xaxis.set_major_formatter(tick_formatter) + + if len(values) > 1: + # two-way PDP - y-axis deciles + labels + deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1)) + trans = transforms.blended_transform_factory(ax.transAxes, + ax.transData) + xlim = ax.get_xlim() + ax.hlines(deciles, [0], 0.05, transform=trans, color='k') + ax.set_ylabel(name[1]) + # hline erases xlim + ax.set_xlim(xlim) + else: + ax.set_ylabel('Partial dependence') + + if len(values) == 1: + ax.set_ylim(pdp_lim[1]) + axs.append(ax) + + fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4, + hspace=0.3) + return fig, axs \ No newline at end of file diff --git a/sklearn/model_inspection/tests/__init__.py 
b/sklearn/inspection/tests/__init__.py similarity index 100% rename from sklearn/model_inspection/tests/__init__.py rename to sklearn/inspection/tests/__init__.py diff --git a/sklearn/model_inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py similarity index 70% rename from sklearn/model_inspection/tests/test_partial_dependence.py rename to sklearn/inspection/tests/test_partial_dependence.py index a6cebbe97374b..e53b4dcb385e3 100644 --- a/sklearn/model_inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -7,8 +7,9 @@ import pytest import sklearn -from sklearn.model_inspection import partial_dependence -from sklearn.model_inspection.partial_dependence import ( +from sklearn.inspection import partial_dependence +from sklearn.inspection import plot_partial_dependence +from sklearn.inspection.partial_dependence import ( _grid_from_X, _partial_dependence_brute, _partial_dependence_recursion @@ -18,12 +19,14 @@ from sklearn.linear_model import LinearRegression from sklearn.linear_model import LogisticRegression from sklearn.linear_model import MultiTaskLasso +from sklearn.datasets import load_boston, load_iris from sklearn.datasets import make_classification, make_regression from sklearn.cluster import KMeans from sklearn.metrics import r2_score from sklearn.preprocessing import PolynomialFeatures from sklearn.dummy import DummyClassifier from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.utils.testing import if_matplotlib # toy sample @@ -380,3 +383,167 @@ def test_warning_recursion_non_constant_init(): UserWarning, match='Using recursion method with a non-constant init predictor'): partial_dependence(gbc, X, [0], method='recursion') + + +@if_matplotlib +def test_plot_partial_dependence(): + # Test partial dependence plot function. + boston = load_boston() + clf = GradientBoostingRegressor(n_estimators=10, random_state=1) + clf.fit(boston.data, boston.target) + + grid_resolution = 25 + fig, axs = plot_partial_dependence(clf, boston.data, [0, 1, (0, 1)], + grid_resolution=grid_resolution, + feature_names=boston.feature_names) + assert len(axs) == 3 + assert all(ax.has_data for ax in axs) + + # check with str features and array feature names + fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', + ('CRIM', 'ZN')], + grid_resolution=grid_resolution, + feature_names=boston.feature_names) + + assert len(axs) == 3 + assert all(ax.has_data for ax in axs) + + # check with list feature_names + feature_names = boston.feature_names.tolist() + fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', + ('CRIM', 'ZN')], + grid_resolution=grid_resolution, + feature_names=feature_names) + assert len(axs) == 3 + assert all(ax.has_data for ax in axs) + + +@if_matplotlib +def test_plot_partial_dependence_multiclass(): + # Test partial dependence plot function on multi-class input. 
+ iris = load_iris() + clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + clf.fit(iris.data, iris.target) + + grid_resolution = 25 + fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], + target=0, + grid_resolution=grid_resolution) + assert len(axs) == 2 + assert all(ax.has_data for ax in axs) + + # now with symbol labels + target = iris.target_names[iris.target] + clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + clf.fit(iris.data, target) + + grid_resolution = 25 + fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], + target='setosa', + grid_resolution=grid_resolution) + assert len(axs) == 2 + assert all(ax.has_data for ax in axs) + + +@if_matplotlib +def test_plot_partial_dependence_multioutput(): + # Test partial dependence plot function on multi-output input. + (X, y), _ = multioutput_regression_data + clf = LinearRegression() + clf.fit(X, y) + + grid_resolution = 25 + fig, axs = plot_partial_dependence(clf, X, [0, 1], + target=0, + grid_resolution=grid_resolution) + assert len(axs) == 2 + assert all(ax.has_data for ax in axs) + + fig, axs = plot_partial_dependence(clf, X, [0, 1], + target=1, + grid_resolution=grid_resolution) + assert len(axs) == 2 + assert all(ax.has_data for ax in axs) + + +@if_matplotlib +@pytest.mark.filterwarnings('ignore:Default solver will be changed ') # 0.22 +@pytest.mark.filterwarnings('ignore:Default multi_class will be') # 0.22 +def test_plot_partial_dependence_input(): + X, y = make_classification(random_state=0) + + lr = LinearRegression() + lr.fit(X, y) + gbc = GradientBoostingClassifier(random_state=0) + gbc.fit(X, y) + + # check target param for multiclass + (X_m, y_m), _ = multiclass_classification_data + lr_m = LogisticRegression() + lr_m.fit(X_m, y_m) + with pytest.raises( + ValueError, + match='target must be specified for multi-class'): + plot_partial_dependence(lr_m, X_m, [0], target=None) + for target in (-1, 100): + with pytest.raises( + ValueError, + match='target not in est.classes_'): + plot_partial_dependence(lr_m, X_m, [0], target=target) + + # check target param for multioutput + (X_m, y_m), _ = multioutput_regression_data + lr_m = LinearRegression() + lr_m.fit(X_m, y_m) + with pytest.raises( + ValueError, + match='target must be specified for multi-output'): + plot_partial_dependence(lr_m, X_m, [0], target=None) + for target in (-1, 100): + with pytest.raises( + ValueError, + match=r'target must be in \[0, n_tasks\]'): + plot_partial_dependence(lr_m, X_m, [0], target=target) + + for feature_names in (None, ['abcd', 'def']): + with pytest.raises( + ValueError, + match='Feature foobar not in feature_names'): + plot_partial_dependence(lr, X, features=['foobar'], + feature_names=feature_names) + + for features in([(1, 2, 3)], [1, {}], [tuple()]): + with pytest.raises( + ValueError, + match='Each entry in features must be either an int, '): + plot_partial_dependence(lr, X, features=features) + + with pytest.raises( + ValueError, + match='All entries of features must be less than '): + plot_partial_dependence(lr, X, features=[123], + feature_names=['blah']) + + with pytest.raises( + ValueError, + match='feature_names should not contain duplicates'): + plot_partial_dependence(lr, X, features=[0, 1, 2], + feature_names=['a', 'b', 'a']) + + +@if_matplotlib +def test_plot_partial_dependence_fig(): + # Make sure fig object is correctly used if not None + + import matplotlib.pyplot as plt + + (X, y), _ = regression_data + clf = LinearRegression() + clf.fit(X, y) + + fig = plt.figure() + 
grid_resolution = 25 + returned_fig, axs = plot_partial_dependence( + clf, X, [0, 1], target=0, grid_resolution=grid_resolution, fig=fig) + + assert returned_fig is fig diff --git a/sklearn/model_inspection/__init__.py b/sklearn/model_inspection/__init__.py deleted file mode 100644 index 4fcd6197dc508..0000000000000 --- a/sklearn/model_inspection/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -"""The :mod:`sklearn.model_inspection` module includes tools for model -inspection.""" -from .partial_dependence import partial_dependence - - -__all__ = [ - 'partial_dependence', -] diff --git a/sklearn/plot/__init__.py b/sklearn/plot/__init__.py deleted file mode 100644 index c659e4721f0ae..0000000000000 --- a/sklearn/plot/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -"""The :mod:`sklearn.plot` module includes tools for plotting.""" -from .partial_dependence import plot_partial_dependence - - -__all__ = [ - 'plot_partial_dependence', -] diff --git a/sklearn/plot/partial_dependence.py b/sklearn/plot/partial_dependence.py deleted file mode 100644 index 178bdf2152c97..0000000000000 --- a/sklearn/plot/partial_dependence.py +++ /dev/null @@ -1,325 +0,0 @@ -"""Partial dependence plots for regression and classification models.""" - -# Authors: Peter Prettenhofer -# Trevor Stephens -# Nicolas Hug -# License: BSD 3 clause - -from itertools import count -import numbers - -import numpy as np -from scipy.stats.mstats import mquantiles - -from ..base import is_regressor -from ..externals.joblib import Parallel, delayed -from ..utils import check_array -from ..model_inspection import partial_dependence - - -__all__ = ['plot_partial_dependence'] - - -def plot_partial_dependence(estimator, X, features, feature_names=None, - target=None, response_method='auto', n_cols=3, - grid_resolution=100, percentiles=(0.05, 0.95), - method='auto', n_jobs=1, verbose=0, fig=None, - line_kw=None, contour_kw=None, **fig_kw): - """Partial dependence plots. - - The ``len(features)`` plots are arranged in a grid with ``n_cols`` - columns. Two-way partial dependence plots are plotted as contour plots. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : BaseEstimator - A fitted estimator object implementing `predict`, `predict_proba`, - or `decision_function`. Multioutput-multiclass classifiers are not - supported. - X : array-like, shape=(n_samples, n_features) - The data to use to build the grid of values on which the dependence - will be evaluated. This is usually the training data. - features : list of {int, str, pair of int, pair of str} - The target features for which to create the PDPs. - If features[i] is an int or a string, a one-way PDP is created; if - features[i] is a tuple, a two-way PDP is created. Each tuple must be - of size 2. - if any entry is a string, then it must be in ``feature_names``. - feature_names : seq of str, shape=(n_features,), optional - Name of each feature; feature_names[i] holds the name of the feature - with index i. - target : int, optional (default=None) - - In a multiclass setting, specifies the class for which the PDPs - should be computed. Note that for binary classification, the - positive class (index 1) is always used. - - In a multioutput setting, specifies the task for which the PDPs - should be computed - Ignored in binary classification or classical regression settings. 
- response_method : 'auto', 'predict_proba' or 'decision_function', \ - optional (default='auto') : - Specifies whether to use :term:`predict_proba` or - :term:`decision_function` as the target response. For regressors - this parameter is ignored and the response is always the output of - :term:`predict`. By default, :term:`predict_proba` is tried first - and we revert to :term:`decision_function` if it doesn't exist. If - ``method`` is 'recursion', the response is always the output of - :term:`decision_function`. - n_cols : int, optional (default=3) - The maximum number of columns in the grid plot. - grid_resolution : int, optional (default=100) - The number of equally spaced points on the axes of the plots, for each - target feature. - percentiles : tuple of float, optional (default=(0.05, 0.95)) - The lower and upper percentile used to create the extreme values - for the PDP axes. Must be in [0, 1]. - method : str, optional (default='auto') - The method to use to calculate the partial dependence predictions: - - - 'recursion' is only supported for objects inheriting from - `BaseGradientBoosting`, but is more efficient in terms of speed. - With this method, ``X`` is optional and is only used to build the - grid and the partial dependences are computed using the training - data. This method does not account for the ``init`` predicor of - the boosting process, which may lead to incorrect values (see - :ref:`this warning`). With this - method, the target response of a classifier is always the decision - function, not the predicted probabilities. - - - 'brute' is supported for any estimator, but is more - computationally intensive. - - - If 'auto', then 'recursion' will be used for - ``BaseGradientBoosting`` estimators with ``init=None``, and - 'brute' for all other. - - Unlike the 'brute' method, 'recursion' does not account for the - ``init`` predictor of the boosting process. In practice this still - produces the same plots, up to a constant offset in the target - response. - n_jobs : int, optional (default=1) - The number of CPUs to use to compute the PDs. -1 means 'all CPUs'. - See :term:`Glossary ` for more details. - verbose : int, optional (default=0) - Verbose output during PD computations. - fig : Matplotlib figure object, optional (default=None) - A figure object onto which the plots will be drawn, after the figure - has been cleared. - line_kw : dict, optional - Dict with keywords passed to the ``matplotlib.pyplot.plot`` call. - For one-way partial dependence plots. - contour_kw : dict, optional - Dict with keywords passed to the ``matplotlib.pyplot.plot`` call. - For two-way partial dependence plots. - **fig_kw : dict, optional - Dict with keywords passed to the figure() call. - Note that all keywords not recognized above will be automatically - included here. - - Returns - ------- - fig : figure - The Matplotlib Figure object. - axs : seq of Axis objects - A seq of Axis objects, one for each subplot. - - Examples - -------- - >>> from sklearn.datasets import make_friedman1 - >>> from sklearn.ensemble import GradientBoostingRegressor - >>> X, y = make_friedman1() - >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y) - >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP - ... - - See also - -------- - sklearn.model_inspection.partial_dependence: Return raw partial - dependence values - - .. 
_warning_recursion_init_plot: - - Warnings - -------- - The 'recursion' method only works for gradient boosting estimators, and - unlike the 'brute' method, it does not account for the ``init`` - predictor of the boosting process. In practice this will produce the - same values as 'brute' up to a constant offset in the target response, - provided that ``init`` is a consant estimator (which is the default). - However, as soon as ``init`` is not a constant estimator, the partial - dependence values are incorrect for 'recursion'. - """ - import matplotlib.pyplot as plt - from matplotlib import transforms - from matplotlib.ticker import MaxNLocator - from matplotlib.ticker import ScalarFormatter - - # set target_idx for multi-class estimators - if hasattr(estimator, 'classes_') and np.size(estimator.classes_) > 2: - if target is None: - raise ValueError('target must be specified for multi-class') - target_idx = np.searchsorted(estimator.classes_, target) - if (not (0 <= target_idx < len(estimator.classes_)) or - estimator.classes_[target_idx] != target): - raise ValueError('target not in est.classes_, got {}'.format( - target)) - else: - # regression and binary classification - target_idx = 0 - - X = check_array(X) - n_features = X.shape[1] - - # convert feature_names to list - if feature_names is None: - # if feature_names is None, use feature indices as name - feature_names = [str(i) for i in range(n_features)] - elif isinstance(feature_names, np.ndarray): - feature_names = feature_names.tolist() - if len(set(feature_names)) != len(feature_names): - raise ValueError('feature_names should not contain duplicates.') - - def convert_feature(fx): - if isinstance(fx, str): - try: - fx = feature_names.index(fx) - except ValueError: - raise ValueError('Feature %s not in feature_names' % fx) - return int(fx) - - # convert features into a seq of int tuples - tmp_features = [] - for fxs in features: - if isinstance(fxs, (numbers.Integral, str)): - fxs = (fxs,) - try: - fxs = [convert_feature(fx) for fx in fxs] - except TypeError: - raise ValueError('Each entry in features must be either an int, ' - 'a string, or an iterable of size at most 2.') - if not (1 <= np.size(fxs) <= 2): - raise ValueError('Each entry in features must be either an int, ' - 'a string, or an iterable of size at most 2.') - - tmp_features.append(fxs) - - features = tmp_features - - names = [] - try: - for fxs in features: - names_ = [] - # explicit loop so "i" is bound for exception below - for i in fxs: - names_.append(feature_names[i]) - names.append(names_) - except IndexError: - raise ValueError('All entries of features must be less than ' - 'len(feature_names) = {0}, got {1}.' - .format(len(feature_names), i)) - - # compute averaged predictions - pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(partial_dependence)(estimator, X, fxs, - response_method=response_method, - method=method, - grid_resolution=grid_resolution, - percentiles=percentiles) - for fxs in features) - - # For multioutput regression, we can only check the validity of target - # now that we have the predictions. - # Also note: as multiclass-multioutput classifiers are not supported, - # multiclass and multioutput scenario are mutually exclusive. So there is - # no risk of overwriting target_idx here. 
- pd, _ = pd_result[0] # checking the first result is enough - if is_regressor(estimator) and pd.shape[0] > 1: - if target is None: - raise ValueError( - 'target must be specified for multi-output regressors') - if not 0 <= target <= pd.shape[0]: - raise ValueError( - 'target must be in [0, n_tasks], got {}.'.format(target)) - target_idx = target - else: - target_idx = 0 - - # get global min and max values of PD grouped by plot type - pdp_lim = {} - for pd, values in pd_result: - min_pd, max_pd = pd[target_idx].min(), pd[target_idx].max() - n_fx = len(values) - old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) - min_pd = min(min_pd, old_min_pd) - max_pd = max(max_pd, old_max_pd) - pdp_lim[n_fx] = (min_pd, max_pd) - - # create contour levels for two-way plots - if 2 in pdp_lim: - Z_level = np.linspace(*pdp_lim[2], num=8) - - if fig is None: - fig = plt.figure(**fig_kw) - else: - fig.clear() - - if line_kw is None: - line_kw = {'color': 'green'} - if contour_kw is None: - contour_kw = {} - - n_cols = min(n_cols, len(features)) - n_rows = int(np.ceil(len(features) / float(n_cols))) - axs = [] - for i, fx, name, (pd, values) in zip(count(), features, names, pd_result): - ax = fig.add_subplot(n_rows, n_cols, i + 1) - - if len(values) == 1: - ax.plot(values[0], pd[target_idx].ravel(), **line_kw) - else: - # make contour plot - assert len(values) == 2 - XX, YY = np.meshgrid(values[0], values[1]) - Z = pd[target_idx].T - CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, - colors='k') - ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], - vmin=Z_level[0], alpha=0.75, **contour_kw) - ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True) - - # plot data deciles + axes labels - deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1)) - trans = transforms.blended_transform_factory(ax.transData, - ax.transAxes) - ylim = ax.get_ylim() - ax.vlines(deciles, [0], 0.05, transform=trans, color='k') - ax.set_xlabel(name[0]) - ax.set_ylim(ylim) - - # prevent x-axis ticks from overlapping - ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower')) - tick_formatter = ScalarFormatter() - tick_formatter.set_powerlimits((-3, 4)) - ax.xaxis.set_major_formatter(tick_formatter) - - if len(values) > 1: - # two-way PDP - y-axis deciles + labels - deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1)) - trans = transforms.blended_transform_factory(ax.transAxes, - ax.transData) - xlim = ax.get_xlim() - ax.hlines(deciles, [0], 0.05, transform=trans, color='k') - ax.set_ylabel(name[1]) - # hline erases xlim - ax.set_xlim(xlim) - else: - ax.set_ylabel('Partial dependence') - - if len(values) == 1: - ax.set_ylim(pdp_lim[1]) - axs.append(ax) - - fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4, - hspace=0.3) - return fig, axs diff --git a/sklearn/plot/tests/__init__.py b/sklearn/plot/tests/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/sklearn/plot/tests/test_partial_dependence.py b/sklearn/plot/tests/test_partial_dependence.py deleted file mode 100644 index 24e24fcd019f7..0000000000000 --- a/sklearn/plot/tests/test_partial_dependence.py +++ /dev/null @@ -1,187 +0,0 @@ -""" -Testing for the partial dependence module. 
-""" - -import pytest - -from sklearn.utils.testing import if_matplotlib -from sklearn.plot import plot_partial_dependence -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.ensemble import GradientBoostingRegressor -from sklearn.linear_model import LinearRegression -from sklearn.linear_model import LogisticRegression -from sklearn.datasets import load_boston, load_iris -from sklearn.datasets import make_classification, make_regression - - -# (X, y), n_targets <-- as expected in the output of partial_dep() -binary_classification_data = (make_classification(random_state=0), 1) -multiclass_classification_data = (make_classification(n_classes=3, - n_clusters_per_class=1, - random_state=0), 3) -regression_data = (make_regression(random_state=0), 1) -multioutput_regression_data = (make_regression(n_targets=2, random_state=0), 2) - - -@if_matplotlib -def test_plot_partial_dependence(): - # Test partial dependence plot function. - boston = load_boston() - clf = GradientBoostingRegressor(n_estimators=10, random_state=1) - clf.fit(boston.data, boston.target) - - grid_resolution = 25 - fig, axs = plot_partial_dependence(clf, boston.data, [0, 1, (0, 1)], - grid_resolution=grid_resolution, - feature_names=boston.feature_names) - assert len(axs) == 3 - assert all(ax.has_data for ax in axs) - - # check with str features and array feature names - fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', - ('CRIM', 'ZN')], - grid_resolution=grid_resolution, - feature_names=boston.feature_names) - - assert len(axs) == 3 - assert all(ax.has_data for ax in axs) - - # check with list feature_names - feature_names = boston.feature_names.tolist() - fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', - ('CRIM', 'ZN')], - grid_resolution=grid_resolution, - feature_names=feature_names) - assert len(axs) == 3 - assert all(ax.has_data for ax in axs) - - -@if_matplotlib -def test_plot_partial_dependence_multiclass(): - # Test partial dependence plot function on multi-class input. - iris = load_iris() - clf = GradientBoostingClassifier(n_estimators=10, random_state=1) - clf.fit(iris.data, iris.target) - - grid_resolution = 25 - fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], - target=0, - grid_resolution=grid_resolution) - assert len(axs) == 2 - assert all(ax.has_data for ax in axs) - - # now with symbol labels - target = iris.target_names[iris.target] - clf = GradientBoostingClassifier(n_estimators=10, random_state=1) - clf.fit(iris.data, target) - - grid_resolution = 25 - fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], - target='setosa', - grid_resolution=grid_resolution) - assert len(axs) == 2 - assert all(ax.has_data for ax in axs) - - -@if_matplotlib -def test_plot_partial_dependence_multioutput(): - # Test partial dependence plot function on multi-output input. 
- (X, y), _ = multioutput_regression_data - clf = LinearRegression() - clf.fit(X, y) - - grid_resolution = 25 - fig, axs = plot_partial_dependence(clf, X, [0, 1], - target=0, - grid_resolution=grid_resolution) - assert len(axs) == 2 - assert all(ax.has_data for ax in axs) - - fig, axs = plot_partial_dependence(clf, X, [0, 1], - target=1, - grid_resolution=grid_resolution) - assert len(axs) == 2 - assert all(ax.has_data for ax in axs) - - -@if_matplotlib -@pytest.mark.filterwarnings('ignore:Default solver will be changed ') # 0.22 -@pytest.mark.filterwarnings('ignore:Default multi_class will be') # 0.22 -def test_plot_partial_dependence_input(): - X, y = make_classification(random_state=0) - - lr = LinearRegression() - lr.fit(X, y) - gbc = GradientBoostingClassifier(random_state=0) - gbc.fit(X, y) - - # check target param for multiclass - (X_m, y_m), _ = multiclass_classification_data - lr_m = LogisticRegression() - lr_m.fit(X_m, y_m) - with pytest.raises( - ValueError, - match='target must be specified for multi-class'): - plot_partial_dependence(lr_m, X_m, [0], target=None) - for target in (-1, 100): - with pytest.raises( - ValueError, - match='target not in est.classes_'): - plot_partial_dependence(lr_m, X_m, [0], target=target) - - # check target param for multioutput - (X_m, y_m), _ = multioutput_regression_data - lr_m = LinearRegression() - lr_m.fit(X_m, y_m) - with pytest.raises( - ValueError, - match='target must be specified for multi-output'): - plot_partial_dependence(lr_m, X_m, [0], target=None) - for target in (-1, 100): - with pytest.raises( - ValueError, - match=r'target must be in \[0, n_tasks\]'): - plot_partial_dependence(lr_m, X_m, [0], target=target) - - for feature_names in (None, ['abcd', 'def']): - with pytest.raises( - ValueError, - match='Feature foobar not in feature_names'): - plot_partial_dependence(lr, X, features=['foobar'], - feature_names=feature_names) - - for features in([(1, 2, 3)], [1, {}], [tuple()]): - with pytest.raises( - ValueError, - match='Each entry in features must be either an int, '): - plot_partial_dependence(lr, X, features=features) - - with pytest.raises( - ValueError, - match='All entries of features must be less than '): - plot_partial_dependence(lr, X, features=[123], - feature_names=['blah']) - - with pytest.raises( - ValueError, - match='feature_names should not contain duplicates'): - plot_partial_dependence(lr, X, features=[0, 1, 2], - feature_names=['a', 'b', 'a']) - - -@if_matplotlib -def test_plot_partial_dependence_fig(): - # Make sure fig object is correctly used if not None - - import matplotlib.pyplot as plt - - (X, y), _ = regression_data - clf = LinearRegression() - clf.fit(X, y) - - fig = plt.figure() - grid_resolution = 25 - returned_fig, axs = plot_partial_dependence( - clf, X, [0, 1], target=0, grid_resolution=grid_resolution, fig=fig) - - assert returned_fig is fig diff --git a/sklearn/setup.py b/sklearn/setup.py index 4618fd46b369a..6ea9fecf83a76 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -33,16 +33,14 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('feature_selection/tests') config.add_subpackage('gaussian_process') config.add_subpackage('gaussian_process/tests') + config.add_subpackage('inspection') + config.add_subpackage('inspection/tests') config.add_subpackage('mixture') config.add_subpackage('mixture/tests') config.add_subpackage('model_selection') config.add_subpackage('model_selection/tests') - config.add_subpackage('model_inspection') - 
config.add_subpackage('model_inspection/tests') config.add_subpackage('neural_network') config.add_subpackage('neural_network/tests') - config.add_subpackage('plot') - config.add_subpackage('plot/tests') config.add_subpackage('preprocessing') config.add_subpackage('preprocessing/tests') config.add_subpackage('semi_supervised') diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 6150e017e3e28..e449ddfa16c80 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -64,7 +64,7 @@ class Parallel(_joblib.Parallel): "check_symmetric", "indices_to_mask", "deprecated", "cpu_count", "Parallel", "Memory", "delayed", "parallel_backend", "register_parallel_backend", "hash", "effective_n_jobs", - "resample", "shuffle"] + "resample", "shuffle", "check_matplotlib_support"] IS_PYPY = platform.python_implementation() == 'PyPy' _IS_32BIT = 8 * struct.calcsize("P") == 32 @@ -636,3 +636,13 @@ def is_scalar_nan(x): # convert from numpy.bool_ to python bool to ensure that testing # is_scalar_nan(x) is True does not fail. return bool(isinstance(x, numbers.Real) and np.isnan(x)) + + +def check_matplotlib_support(caller_name): + try: + import matplotlib + except ImportError as e: + raise ImportError( + "{} requires matplotlib. You can install matplotlib with " + "`pip install matplotlib`".format(caller_name) + ) from e diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 88138452d6ab6..747eb3012c0de 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -20,6 +20,7 @@ from sklearn.utils import get_chunk_n_rows from sklearn.utils import is_scalar_nan from sklearn.utils.mocking import MockDataFrame +from sklearn.utils import check_matplotlib_support from sklearn import config_context From b89d5c435f85eece7b6536d4cc990b9a8a8d6914 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 18 Apr 2019 09:11:33 -0400 Subject: [PATCH 092/113] removed model_inspection --- .../model_inspection/partial_dependence.py | 356 ------------------ 1 file changed, 356 deletions(-) delete mode 100644 sklearn/model_inspection/partial_dependence.py diff --git a/sklearn/model_inspection/partial_dependence.py b/sklearn/model_inspection/partial_dependence.py deleted file mode 100644 index bd1aa6fe22557..0000000000000 --- a/sklearn/model_inspection/partial_dependence.py +++ /dev/null @@ -1,356 +0,0 @@ -"""Partial dependence plots for regression and classification models.""" - -# Authors: Peter Prettenhofer -# Trevor Stephens -# Nicolas Hug -# License: BSD 3 clause - -import warnings - -import numpy as np -from scipy.stats.mstats import mquantiles - -from ..base import is_classifier, is_regressor -from ..utils.extmath import cartesian -from ..utils import check_array -from ..utils.validation import check_is_fitted -from ..tree._tree import DTYPE -from ..exceptions import NotFittedError -from ..ensemble.gradient_boosting import BaseGradientBoosting -from ..ensemble._gradient_boosting import _partial_dependence_tree - - -__all__ = ['partial_dependence'] - - -def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): - """Generate a grid of points based on the percentiles of X. - - The grid is a cartesian product between the columns of ``values``. The - ith column of ``values`` consists in ``grid_resolution`` equally-spaced - points between the percentiles of the jth column of X. - If ``grid_resolution`` is bigger than the number of unique values in the - jth column of X, then those unique values will be used instead. 
- - Parameters - ---------- - X : ndarray, shape=(n_samples, n_target_features) - The data - percentiles : tuple of floats - The percentiles which are used to construct the extreme values of - the grid. Must be in [0, 1]. - grid_resolution : int - The number of equally spaced points to be placed on the grid for each - feature. - - Returns - ------- - grid : ndarray, shape=(n_points, X.shape[1]) - A value for each feature at each point in the grid. ``n_points`` is - always ``<= grid_resolution ** X.shape[1]``. - values : list of 1d ndarrays - The values with which the grid has been created. The size of each - array ``values[j]`` is either ``grid_resolution``, or the number of - unique values in ``X[:, j]``, whichever is smaller. - """ - try: - assert len(percentiles) == 2 - except (AssertionError, TypeError): - raise ValueError('percentiles must be a sequence of 2 elements.') - if not all(0. <= x <= 1. for x in percentiles): - raise ValueError('percentiles values must be in [0, 1].') - if percentiles[0] >= percentiles[1]: - raise ValueError('percentiles[0] must be strictly less ' - 'than percentiles[1].') - - if grid_resolution <= 1: - raise ValueError('grid_resolution must be strictly greater than 1.') - - values = [] - for feature in range(X.shape[1]): - uniques = np.unique(X[:, feature]) - if uniques.shape[0] < grid_resolution: - # feature has low resolution use unique vals - axis = uniques - else: - # create axis based on percentiles and grid resolution - emp_percentiles = mquantiles(X, prob=percentiles, axis=0) - if np.allclose(emp_percentiles[0, feature], - emp_percentiles[1, feature]): - raise ValueError('percentiles are too close to each other, ' - 'unable to build the grid.') - axis = np.linspace(emp_percentiles[0, feature], - emp_percentiles[1, feature], - num=grid_resolution, endpoint=True) - values.append(axis) - - return cartesian(values), values - - -def _partial_dependence_recursion(est, grid, features): - if est.init is not None: - warnings.warn( - 'Using recursion method with a non-constant init predictor will ' - 'lead to incorrect partial dependence values.', - UserWarning - ) - - # grid needs to be DTYPE - grid = np.asarray(grid, dtype=DTYPE, order='C') - - n_trees_per_stage = est.estimators_.shape[1] - n_estimators = est.estimators_.shape[0] - learning_rate = est.learning_rate - averaged_predictions = np.zeros((n_trees_per_stage, grid.shape[0]), - dtype=np.float64, order='C') - for stage in range(n_estimators): - for k in range(n_trees_per_stage): - tree = est.estimators_[stage, k].tree_ - _partial_dependence_tree(tree, grid, features, - learning_rate, averaged_predictions[k]) - - return averaged_predictions - - -def _partial_dependence_brute(est, grid, features, X, response_method): - averaged_predictions = [] - - # define the prediction_method (predict, predict_proba, decision_function). - if is_regressor(est): - prediction_method = est.predict - else: - predict_proba = getattr(est, 'predict_proba', None) - decision_function = getattr(est, 'decision_function', None) - if response_method == 'auto': - # try predict_proba, then decision_function if it doesn't exist - prediction_method = predict_proba or decision_function - else: - prediction_method = (predict_proba if response_method == - 'predict_proba' else decision_function) - if prediction_method is None: - if response_method == 'auto': - raise ValueError( - 'The estimator has no predict_proba and no ' - 'decision_function method.' 
-                )
-            elif response_method == 'predict_proba':
-                raise ValueError('The estimator has no predict_proba method.')
-            else:
-                raise ValueError(
-                    'The estimator has no decision_function method.')
-
-    for new_values in grid:
-        X_eval = X.copy()
-        for i, variable in enumerate(features):
-            X_eval[:, variable] = new_values[i]
-
-        try:
-            predictions = prediction_method(X_eval)
-        except NotFittedError:
-            raise ValueError('est parameter must be a fitted estimator')
-
-        # Note: predictions is of shape
-        # (n_points,) for non-multioutput regressors
-        # (n_points, n_tasks) for multioutput regressors
-        # (n_points, 1) for the regressors in cross_decomposition (I think)
-        # (n_points, 2) for binary classification
-        # (n_points, n_classes) for multiclass classification
-
-        # average over samples
-        averaged_predictions.append(np.mean(predictions, axis=0))
-
-    # reshape to (n_targets, n_points) where n_targets is:
-    # - 1 for non-multioutput regression and binary classification (shape is
-    #   already correct in those cases)
-    # - n_tasks for multi-output regression
-    # - n_classes for multiclass classification.
-    averaged_predictions = np.array(averaged_predictions).T
-    if is_regressor(est) and averaged_predictions.ndim == 1:
-        # non-multioutput regression, shape is (n_points,)
-        averaged_predictions = averaged_predictions.reshape(1, -1)
-    elif is_classifier(est) and averaged_predictions.shape[0] == 2:
-        # Binary classification, shape is (2, n_points).
-        # we output the effect of **positive** class
-        averaged_predictions = averaged_predictions[1]
-        averaged_predictions = averaged_predictions.reshape(1, -1)
-
-    return averaged_predictions
-
-
-def partial_dependence(est, features, X, response_method='auto',
-                       percentiles=(0.05, 0.95), grid_resolution=100,
-                       method='auto'):
-    """Partial dependence of ``features``.
-
-    Partial dependence of a feature (or a set of features) corresponds to
-    the average response of an estimator for each possible value of the
-    feature.
-
-    Read more in the :ref:`User Guide `.
-
-    Parameters
-    ----------
-    est : BaseEstimator
-        A fitted classification or regression model. Multioutput-multiclass
-        classifiers are not supported.
-    features : list or array-like of int
-        The target features for which the partial dependency should be
-        computed.
-    X : array-like, shape (n_samples, n_features)
-        ``X`` is used both to generate a grid of values for the
-        ``features``, and to compute the averaged predictions when
-        method is 'brute'.
-    response_method : 'auto', 'predict_proba' or 'decision_function', \
-            optional (default='auto')
-        Specifies whether to use :term:`predict_proba` or
-        :term:`decision_function` as the target response. For regressors
-        this parameter is ignored and the response is always the output of
-        :term:`predict`. By default, :term:`predict_proba` is tried first
-        and we revert to :term:`decision_function` if it doesn't exist. If
-        ``method`` is 'recursion', the response is always the output of
-        :term:`decision_function`.
-    percentiles : tuple of float, optional (default=(0.05, 0.95))
-        The lower and upper percentile used to create the extreme values
-        for the grid. Must be in [0, 1].
-    grid_resolution : int, optional (default=100)
-        The number of equally spaced points on the grid, for each target
-        feature.
-    method : str, optional (default='auto')
-        The method used to calculate the averaged predictions:
-
-        - 'recursion' is only supported for objects inheriting from
-          `BaseGradientBoosting`, but is more efficient in terms of speed.
-          With this method, ``X`` is only used to build the
-          grid and the partial dependences are computed using the training
-          data. This method does not account for the ``init`` predictor of
-          the boosting process, which may lead to incorrect values (see
-          :ref:`this warning`). With this
-          method, the target response of a classifier is always the decision
-          function, not the predicted probabilities.
-
-        - 'brute' is supported for any estimator, but is more
-          computationally intensive.
-
-        - If 'auto', then 'recursion' will be used for
-          ``BaseGradientBoosting`` estimators with ``init=None``, and 'brute'
-          for all others.
-
-    Returns
-    -------
-    averaged_predictions : ndarray, \
-            shape (n_outputs, len(values[0]), len(values[1]), ...)
-        The predictions for all the points in the grid, averaged over all
-        samples in X (or over the training data if ``method`` is
-        'recursion'). ``n_outputs`` corresponds to the number of classes in
-        a multi-class setting, or to the number of tasks for multi-output
-        regression. For classical regression and binary classification
-        ``n_outputs==1``. ``n_values_feature_j`` corresponds to the size
-        of ``values[j]``.
-    values : seq of 1d ndarrays
-        The values with which the grid has been created. The generated grid
-        is a cartesian product of the arrays in ``values``. ``len(values) ==
-        len(features)``. The size of each array ``values[j]`` is either
-        ``grid_resolution``, or the number of unique values in ``X[:, j]``,
-        whichever is smaller.
-
-    Examples
-    --------
-    >>> X = [[0, 0, 2], [1, 0, 0]]
-    >>> y = [0, 1]
-    >>> from sklearn.ensemble import GradientBoostingClassifier
-    >>> gb = GradientBoostingClassifier(random_state=0).fit(X, y)
-    >>> partial_dependence(gb, features=[0], X=X, percentiles=(0, 1),
-    ...                    grid_resolution=2) # doctest: +SKIP
-    (array([[-4.52..., 4.52...]]), [array([ 0., 1.])])
-
-    See also
-    --------
-    sklearn.plot.plot_partial_dependence: Plot partial dependence
-
-    .. _warning_recursion_init:
-
-    Warnings
-    --------
-    The 'recursion' method only works for gradient boosting estimators, and
-    unlike the 'brute' method, it does not account for the ``init``
-    predictor of the boosting process. In practice this will produce the
-    same values as 'brute' up to a constant offset in the target response,
-    provided that ``init`` is a constant estimator (which is the default).
-    However, as soon as ``init`` is not a constant estimator, the partial
-    dependence values are incorrect for 'recursion'.
-
-    """
-
-    if not (is_classifier(est) or is_regressor(est)):
-        raise ValueError('est must be a fitted regressor or classifier.')
-
-    if (hasattr(est, 'classes_') and
-            isinstance(est.classes_[0], np.ndarray)):
-        raise ValueError('Multiclass-multioutput estimators are not supported')
-
-    X = check_array(X)
-
-    accepted_responses = ('auto', 'predict_proba', 'decision_function')
-    if response_method not in accepted_responses:
-        raise ValueError(
-            'response_method {} is invalid. Accepted response_method names '
-            'are {}.'.format(response_method, ', '.join(accepted_responses)))
-
-    if is_regressor(est) and response_method != 'auto':
-        raise ValueError(
-            "The response_method parameter is ignored for regressors and "
-            "must be 'auto'."
-        )
-    accepted_methods = ('brute', 'recursion', 'auto')
-    if method not in accepted_methods:
-        raise ValueError(
-            'method {} is invalid. Accepted method names are {}.'.format(
-                method, ', '.join(accepted_methods)))
-
-    if method == 'auto':
-        if isinstance(est, BaseGradientBoosting) and est.init is None:
-            method = 'recursion'
-        else:
-            method = 'brute'
-
-    if method == 'recursion':
-        if not isinstance(est, BaseGradientBoosting):
-            raise ValueError(
-                'est must be an instance of BaseGradientBoosting '
-                'for the "recursion" method. Try using method="brute".')
-        if response_method == 'auto':
-            response_method = 'decision_function'
-
-        if response_method != 'decision_function':
-            raise ValueError(
-                "With the 'recursion' method, the response_method must be "
-                "'decision_function'. Got {}.".format(response_method)
-            )
-        check_is_fitted(est, 'estimators_',
-                        msg='est parameter must be a fitted estimator')
-        # Note: if method is brute, this check is done at prediction time
-        n_features = est.n_features_
-    else:
-        n_features = X.shape[1]
-
-    features = np.asarray(features, dtype=np.int32, order='C').ravel()
-    if any(not (0 <= f < n_features) for f in features):
-        raise ValueError('all features must be in [0, %d]'
-                         % (n_features - 1))
-
-    grid, values = _grid_from_X(X[:, features], percentiles,
-                                grid_resolution)
-    if method == 'brute':
-        averaged_predictions = _partial_dependence_brute(est, grid,
-                                                         features, X,
-                                                         response_method)
-    else:
-        averaged_predictions = _partial_dependence_recursion(est, grid,
-                                                             features)
-
-    # reshape averaged_predictions to
-    # (n_outputs, n_values_feature_0, n_values_feature_1, ...)
-    averaged_predictions = averaged_predictions.reshape(
-        -1, *[val.shape[0] for val in values])
-
-    return averaged_predictions, values

From f27f54230e30eb0aca8c2ae98c3849ca7c86f5e8 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Thu, 18 Apr 2019 09:13:38 -0400
Subject: [PATCH 093/113] pep8

---
 sklearn/inspection/partial_dependence.py | 17 +++++++++--------
 sklearn/utils/__init__.py                |  2 +-
 sklearn/utils/tests/test_utils.py        |  1 -
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py
index 8c32d0ed92535..5bbb157e54858 100644
--- a/sklearn/inspection/partial_dependence.py
+++ b/sklearn/inspection/partial_dependence.py
@@ -16,7 +16,7 @@
 from ..base import is_classifier, is_regressor
 from ..utils.extmath import cartesian
 from ..utils import check_array
-from ..utils import check_matplotlib_support
+from ..utils import check_matplotlib_support  # noqa
 from ..utils.validation import check_is_fitted
 from ..tree._tree import DTYPE
 from ..exceptions import NotFittedError
@@ -314,7 +314,8 @@ def partial_dependence(estimator, X, features, response_method='auto',
                 method, ', '.join(accepted_methods)))
 
     if method == 'auto':
-        if isinstance(estimator, BaseGradientBoosting) and estimator.init is None:
+        if (isinstance(estimator, BaseGradientBoosting) and
+                estimator.init is None):
             method = 'recursion'
         else:
             method = 'brute'
@@ -492,11 +493,11 @@ def plot_partial_dependence(estimator, X, features, feature_names=None,
     However, as soon as ``init`` is not a constant estimator, the partial
     dependence values are incorrect for 'recursion'.
""" - check_matplotlib_support('plot_partial_dependence') - import matplotlib.pyplot as plt - from matplotlib import transforms - from matplotlib.ticker import MaxNLocator - from matplotlib.ticker import ScalarFormatter + check_matplotlib_support('plot_partial_dependence') # noqa + import matplotlib.pyplot as plt # noqa + from matplotlib import transforms # noqa + from matplotlib.ticker import MaxNLocator # noqa + from matplotlib.ticker import ScalarFormatter # noqa # set target_idx for multi-class estimators if hasattr(estimator, 'classes_') and np.size(estimator.classes_) > 2: @@ -665,4 +666,4 @@ def convert_feature(fx): fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4, hspace=0.3) - return fig, axs \ No newline at end of file + return fig, axs diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index e449ddfa16c80..4b46a339d365e 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -640,7 +640,7 @@ def is_scalar_nan(x): def check_matplotlib_support(caller_name): try: - import matplotlib + import matplotlib # noqa except ImportError as e: raise ImportError( "{} requires matplotlib. You can install matplotlib with " diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 747eb3012c0de..88138452d6ab6 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -20,7 +20,6 @@ from sklearn.utils import get_chunk_n_rows from sklearn.utils import is_scalar_nan from sklearn.utils.mocking import MockDataFrame -from sklearn.utils import check_matplotlib_support from sklearn import config_context From 9bca47c1f6a1601d81b4305863d8b56513a1a6c8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 18 Apr 2019 09:25:49 -0400 Subject: [PATCH 094/113] plot_partial_dep doesnt return anything --- sklearn/inspection/partial_dependence.py | 8 --- .../tests/test_partial_dependence.py | 68 ++++++++++++------- 2 files changed, 43 insertions(+), 33 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 5bbb157e54858..ffadf05754d35 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -460,13 +460,6 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, Note that all keywords not recognized above will be automatically included here. - Returns - ------- - fig : figure - The Matplotlib Figure object. - axs : seq of Axis objects - A seq of Axis objects, one for each subplot. - Examples -------- >>> from sklearn.datasets import make_friedman1 @@ -666,4 +659,3 @@ def convert_feature(fx): fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4, hspace=0.3) - return fig, axs diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index e53b4dcb385e3..eb3b149197fa8 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -388,32 +388,40 @@ def test_warning_recursion_non_constant_init(): @if_matplotlib def test_plot_partial_dependence(): # Test partial dependence plot function. 
+ import matplotlib.pyplot as plt # noqa + boston = load_boston() clf = GradientBoostingRegressor(n_estimators=10, random_state=1) clf.fit(boston.data, boston.target) grid_resolution = 25 - fig, axs = plot_partial_dependence(clf, boston.data, [0, 1, (0, 1)], - grid_resolution=grid_resolution, - feature_names=boston.feature_names) + plot_partial_dependence(clf, boston.data, [0, 1, (0, 1)], + grid_resolution=grid_resolution, + feature_names=boston.feature_names) + fig = plt.gcf() + axs = fig.get_axes() assert len(axs) == 3 assert all(ax.has_data for ax in axs) # check with str features and array feature names - fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', - ('CRIM', 'ZN')], - grid_resolution=grid_resolution, - feature_names=boston.feature_names) + plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', + ('CRIM', 'ZN')], + grid_resolution=grid_resolution, + feature_names=boston.feature_names) + fig = plt.gcf() + axs = fig.get_axes() assert len(axs) == 3 assert all(ax.has_data for ax in axs) # check with list feature_names feature_names = boston.feature_names.tolist() - fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', - ('CRIM', 'ZN')], - grid_resolution=grid_resolution, - feature_names=feature_names) + plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', + ('CRIM', 'ZN')], + grid_resolution=grid_resolution, + feature_names=feature_names) + fig = plt.gcf() + axs = fig.get_axes() assert len(axs) == 3 assert all(ax.has_data for ax in axs) @@ -421,14 +429,17 @@ def test_plot_partial_dependence(): @if_matplotlib def test_plot_partial_dependence_multiclass(): # Test partial dependence plot function on multi-class input. + import matplotlib.pyplot as plt # noqa iris = load_iris() clf = GradientBoostingClassifier(n_estimators=10, random_state=1) clf.fit(iris.data, iris.target) grid_resolution = 25 - fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], - target=0, - grid_resolution=grid_resolution) + plot_partial_dependence(clf, iris.data, [0, 1], + target=0, + grid_resolution=grid_resolution) + fig = plt.gcf() + axs = fig.get_axes() assert len(axs) == 2 assert all(ax.has_data for ax in axs) @@ -438,9 +449,11 @@ def test_plot_partial_dependence_multiclass(): clf.fit(iris.data, target) grid_resolution = 25 - fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], - target='setosa', - grid_resolution=grid_resolution) + plot_partial_dependence(clf, iris.data, [0, 1], + target='setosa', + grid_resolution=grid_resolution) + fig = plt.gcf() + axs = fig.get_axes() assert len(axs) == 2 assert all(ax.has_data for ax in axs) @@ -448,20 +461,25 @@ def test_plot_partial_dependence_multiclass(): @if_matplotlib def test_plot_partial_dependence_multioutput(): # Test partial dependence plot function on multi-output input. 
+ import matplotlib.pyplot as plt # noqa (X, y), _ = multioutput_regression_data clf = LinearRegression() clf.fit(X, y) grid_resolution = 25 - fig, axs = plot_partial_dependence(clf, X, [0, 1], - target=0, - grid_resolution=grid_resolution) + plot_partial_dependence(clf, X, [0, 1], + target=0, + grid_resolution=grid_resolution) + fig = plt.gcf() + axs = fig.get_axes() assert len(axs) == 2 assert all(ax.has_data for ax in axs) - fig, axs = plot_partial_dependence(clf, X, [0, 1], - target=1, - grid_resolution=grid_resolution) + plot_partial_dependence(clf, X, [0, 1], + target=1, + grid_resolution=grid_resolution) + fig = plt.gcf() + axs = fig.get_axes() assert len(axs) == 2 assert all(ax.has_data for ax in axs) @@ -543,7 +561,7 @@ def test_plot_partial_dependence_fig(): fig = plt.figure() grid_resolution = 25 - returned_fig, axs = plot_partial_dependence( + plot_partial_dependence( clf, X, [0, 1], target=0, grid_resolution=grid_resolution, fig=fig) - assert returned_fig is fig + assert plt.gcf() is fig From ad326afa765de678375292461eec565bfdb0bb0d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 19 Apr 2019 10:52:01 -0400 Subject: [PATCH 095/113] Ignored dep warning for new tests in ensemble --- sklearn/ensemble/tests/test_partial_dependence.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py index 31b2d63f78884..2aff6b9d2df77 100644 --- a/sklearn/ensemble/tests/test_partial_dependence.py +++ b/sklearn/ensemble/tests/test_partial_dependence.py @@ -98,6 +98,7 @@ def test_partial_dependence_regressor(): assert axes[0].shape[0] == grid_resolution +@ignore_warnings(category=DeprecationWarning) def test_partial_dependence_sample_weight(): # Test near perfect correlation between partial dependence and diagonal # when sample weights emphasize y = x predictions @@ -123,6 +124,7 @@ def test_partial_dependence_sample_weight(): assert np.corrcoef(np.ravel(pdp[0]), grid)[0, 1] > 0.99 +@ignore_warnings(category=DeprecationWarning) def test_partial_dependecy_input(): # Test input validation of partial dependence. clf = GradientBoostingClassifier(n_estimators=10, random_state=1) From 3823fd2830c15bbd286acf13cd97fa4f7ece381d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 19 Apr 2019 10:59:48 -0400 Subject: [PATCH 096/113] ported sample_weight tests --- .../tests/test_partial_dependence.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index eb3b149197fa8..d6bfcf2c0fd88 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -385,6 +385,31 @@ def test_warning_recursion_non_constant_init(): partial_dependence(gbc, X, [0], method='recursion') +def test_partial_dependence_sample_weight(): + # Test near perfect correlation between partial dependence and diagonal + # when sample weights emphasize y = x predictions + # non-regression test for #13193 + N = 1000 + rng = np.random.RandomState(123456) + mask = rng.randint(2, size=N, dtype=bool) + + x = rng.rand(N) + # set y = x on mask and y = -x outside + y = x.copy() + y[~mask] = -y[~mask] + X = np.c_[mask, x] + # sample weights to emphasize data points where y = x + sample_weight = np.ones(N) + sample_weight[mask] = 1000. 
+ + clf = GradientBoostingRegressor(n_estimators=10, random_state=1) + clf.fit(X, y, sample_weight=sample_weight) + + pdp, values = partial_dependence(clf, X, features=[1]) + + assert np.corrcoef(pdp, values)[0, 1] > 0.99 + + @if_matplotlib def test_plot_partial_dependence(): # Test partial dependence plot function. From 2be41ce1244c977d42992e983c62b03d5f699335 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 21 Apr 2019 10:40:04 -0400 Subject: [PATCH 097/113] docstring --- sklearn/utils/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 4b46a339d365e..640e326b05e5a 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -639,6 +639,16 @@ def is_scalar_nan(x): def check_matplotlib_support(caller_name): + """Raise ImportError with detailed error message if mpl is not installed. + + Plot utilities like plot_partial_dependence should lazily import + matplotlib and call this helper before any computation. + + Parameters + ---------- + caller_name : str + The name of the caller that requires matplotlib. + """ try: import matplotlib # noqa except ImportError as e: From 743c838976bc42b5039c825859c911162dd77597 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 23 Apr 2019 07:43:45 -0400 Subject: [PATCH 098/113] Addressed comments --- examples/inspection/README.txt | 7 +++++++ sklearn/inspection/partial_dependence.py | 13 ++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) create mode 100644 examples/inspection/README.txt diff --git a/examples/inspection/README.txt b/examples/inspection/README.txt new file mode 100644 index 0000000000000..e64900d978e59 --- /dev/null +++ b/examples/inspection/README.txt @@ -0,0 +1,7 @@ +.. _inspection_examples: + +Inspection +---------- + +Examples related to the :mod:`sklearn.inspection` module. + diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index ffadf05754d35..eddae0543156b 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -366,7 +366,7 @@ def partial_dependence(estimator, X, features, response_method='auto', def plot_partial_dependence(estimator, X, features, feature_names=None, target=None, response_method='auto', n_cols=3, grid_resolution=100, percentiles=(0.05, 0.95), - method='auto', n_jobs=1, verbose=0, fig=None, + method='auto', n_jobs=None, verbose=0, fig=None, line_kw=None, contour_kw=None, **fig_kw): """Partial dependence plots. @@ -392,7 +392,8 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, if any entry is a string, then it must be in ``feature_names``. feature_names : seq of str, shape=(n_features,), optional Name of each feature; feature_names[i] holds the name of the feature - with index i. + with index i. By default, the name of the feature corresponds to + their numerical index. target : int, optional (default=None) - In a multiclass setting, specifies the class for which the PDPs should be computed. Note that for binary classification, the @@ -441,9 +442,11 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, ``init`` predictor of the boosting process. In practice this still produces the same plots, up to a constant offset in the target response. - n_jobs : int, optional (default=1) - The number of CPUs to use to compute the PDs. -1 means 'all CPUs'. - See :term:`Glossary ` for more details. 
+ n_jobs : int, optional (default=None) + The number of CPUs to use to compute the partial dependences. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. verbose : int, optional (default=0) Verbose output during PD computations. fig : Matplotlib figure object, optional (default=None) From cb5166a31a446e5bc74466bc99d581fbb9a7c537 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 23 Apr 2019 07:44:31 -0400 Subject: [PATCH 099/113] Apply suggestions from code review Co-Authored-By: NicolasHug --- sklearn/inspection/partial_dependence.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index ffadf05754d35..14ee319c357a7 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -38,7 +38,7 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): Parameters ---------- - X : ndarray, shape=(n_samples, n_target_features) + X : ndarray, shape (n_samples, n_target_features) The data percentiles : tuple of floats The percentiles which are used to construct the extreme values of @@ -49,7 +49,7 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): Returns ------- - grid : ndarray, shape=(n_points, X.shape[1]) + grid : ndarray, shape (n_points, n_target_features) A value for each feature at each point in the grid. ``n_points`` is always ``<= grid_resolution ** X.shape[1]``. values : list of 1d ndarrays @@ -198,7 +198,7 @@ def partial_dependence(estimator, X, features, response_method='auto', A fitted estimator object implementing `predict`, `predict_proba`, or `decision_function`. Multioutput-multiclass classifiers are not supported. - X : array-like, shape=(n_samples, n_features) + X : array-like, shape (n_samples, n_features) ``X`` is used both to generate a grid of values for the ``features``, and to compute the averaged predictions when method is 'brute'. @@ -206,7 +206,7 @@ def partial_dependence(estimator, X, features, response_method='auto', The target features for which the partial dependency should be computed. response_method : 'auto', 'predict_proba' or 'decision_function', \ - optional (default='auto') : + optional (default='auto') Specifies whether to use :term:`predict_proba` or :term:`decision_function` as the target response. For regressors this parameter is ignored and the response is always the output of @@ -242,8 +242,8 @@ def partial_dependence(estimator, X, features, response_method='auto', Returns ------- - averaged_predictions : array, \ - shape=(n_outputs, len(values[0]), len(values[1]), ...) + averaged_predictions : ndarray, \ + shape (n_outputs, len(values[0]), len(values[1]), ...) The predictions for all the points in the grid, averaged over all samples in X (or over the training data if ``method`` is 'recursion'). ``n_outputs`` corresponds to the number of classes in @@ -366,7 +366,7 @@ def partial_dependence(estimator, X, features, response_method='auto', def plot_partial_dependence(estimator, X, features, feature_names=None, target=None, response_method='auto', n_cols=3, grid_resolution=100, percentiles=(0.05, 0.95), - method='auto', n_jobs=1, verbose=0, fig=None, + method='auto', n_jobs=None, verbose=0, fig=None, line_kw=None, contour_kw=None, **fig_kw): """Partial dependence plots. 
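As a quick illustration of the return-shape convention documented in the hunks
above (a sketch only, not part of the patch: it assumes the
``sklearn.inspection.partial_dependence`` API as it stands at this point in
the series; the dataset and estimator choices are illustrative):

    from sklearn.datasets import make_friedman1
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.inspection import partial_dependence

    X, y = make_friedman1(random_state=0)  # 100 samples, 10 features
    est = GradientBoostingRegressor(n_estimators=10, random_state=0).fit(X, y)

    # Two target features: averaged_predictions has shape
    # (n_outputs, len(values[0]), len(values[1])), i.e. (1, 20, 20) for a
    # single-output regressor, and `values` holds one grid axis per feature.
    avg_preds, values = partial_dependence(est, X, features=[0, 1],
                                           grid_resolution=20)
    print(avg_preds.shape)             # (1, 20, 20)
    print([v.shape for v in values])   # [(20,), (20,)]
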
@@ -381,7 +381,7 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, A fitted estimator object implementing `predict`, `predict_proba`, or `decision_function`. Multioutput-multiclass classifiers are not supported. - X : array-like, shape=(n_samples, n_features) + X : array-like, shape (n_samples, n_features) The data to use to build the grid of values on which the dependence will be evaluated. This is usually the training data. features : list of {int, str, pair of int, pair of str} @@ -390,9 +390,10 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, features[i] is a tuple, a two-way PDP is created. Each tuple must be of size 2. if any entry is a string, then it must be in ``feature_names``. - feature_names : seq of str, shape=(n_features,), optional + feature_names : seq of str, shape (n_features,), optional Name of each feature; feature_names[i] holds the name of the feature - with index i. + with index i. By default, the name of the feature corresponds to + their numerical index. target : int, optional (default=None) - In a multiclass setting, specifies the class for which the PDPs should be computed. Note that for binary classification, the @@ -441,7 +442,7 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, ``init`` predictor of the boosting process. In practice this still produces the same plots, up to a constant offset in the target response. - n_jobs : int, optional (default=1) + n_jobs : int, optional (default=None) The number of CPUs to use to compute the PDs. -1 means 'all CPUs'. See :term:`Glossary ` for more details. verbose : int, optional (default=0) From f9cb12746fc6e426d1154c4128e0a7279aec34b2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 23 Apr 2019 07:44:45 -0400 Subject: [PATCH 100/113] forgot some merging conflicts --- sklearn/inspection/partial_dependence.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 70386c376a2f1..986ba2e2af86f 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -443,15 +443,10 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, produces the same plots, up to a constant offset in the target response. n_jobs : int, optional (default=None) -<<<<<<< HEAD The number of CPUs to use to compute the partial dependences. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. -======= - The number of CPUs to use to compute the PDs. -1 means 'all CPUs'. - See :term:`Glossary ` for more details. ->>>>>>> cb5166a31a446e5bc74466bc99d581fbb9a7c537 verbose : int, optional (default=0) Verbose output during PD computations. 
fig : Matplotlib figure object, optional (default=None) From a8d7991a353fd2cb7145c419f3622015b54fcb11 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 23 Apr 2019 08:50:39 -0400 Subject: [PATCH 101/113] put back old test --- sklearn/ensemble/tests/test_partial_dependence.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py index 2aff6b9d2df77..a40fea2ff0099 100644 --- a/sklearn/ensemble/tests/test_partial_dependence.py +++ b/sklearn/ensemble/tests/test_partial_dependence.py @@ -133,6 +133,9 @@ def test_partial_dependecy_input(): assert_raises(ValueError, partial_dependence, clf, [0], grid=None, X=None) + assert_raises(ValueError, partial_dependence, + clf, [0], grid=[0, 1], X=X) + # first argument must be an instance of BaseGradientBoosting assert_raises(ValueError, partial_dependence, {}, [0], X=X) From 02e74ec1e26b6dd62793cd7cc91acfec868ff5a8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 23 Apr 2019 08:51:50 -0400 Subject: [PATCH 102/113] Update sklearn/utils/__init__.py Co-Authored-By: NicolasHug --- sklearn/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index e1d46ab17aa98..24d62640376d7 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -697,7 +697,7 @@ def is_scalar_nan(x): def check_matplotlib_support(caller_name): """Raise ImportError with detailed error message if mpl is not installed. - Plot utilities like plot_partial_dependence should lazily import + Plot utilities like :func:`plot_partial_dependence` should lazily import matplotlib and call this helper before any computation. Parameters From 2b5205135352e9f51a3bb891effdf1c48f36bdfa Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 23 Apr 2019 11:37:59 -0400 Subject: [PATCH 103/113] Apply suggestions from code review Co-Authored-By: NicolasHug --- sklearn/inspection/partial_dependence.py | 10 +++++----- .../inspection/tests/test_partial_dependence.py | 17 +++++++++-------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 986ba2e2af86f..1ffdac4de329c 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -61,14 +61,14 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): assert len(percentiles) == 2 except (AssertionError, TypeError): raise ValueError('percentiles must be a sequence of 2 elements.') - if not all(0. <= x <= 1. 
for x in percentiles): - raise ValueError('percentiles values must be in [0, 1].') + if not all(0 <= x <= 1 for x in percentiles): + raise ValueError("'percentiles' values must be in [0, 1].") if percentiles[0] >= percentiles[1]: raise ValueError('percentiles[0] must be strictly less ' 'than percentiles[1].') if grid_resolution <= 1: - raise ValueError('grid_resolution must be strictly greater than 1.') + raise ValueError("'grid_resolution' must be strictly greater than 1.") values = [] for feature in range(X.shape[1]): @@ -151,7 +151,7 @@ def _partial_dependence_brute(est, grid, features, X, response_method): try: predictions = prediction_method(X_eval) except NotFittedError: - raise ValueError('est parameter must be a fitted estimator') + raise ValueError("'estimator' parameter must be a fitted estimator") # Note: predictions is of shape # (n_points,) for non-multioutput regressors @@ -288,7 +288,7 @@ def partial_dependence(estimator, X, features, response_method='auto', """ if not (is_classifier(estimator) or is_regressor(estimator)): - raise ValueError('est must be a fitted regressor or classifier.') + raise ValueError("'estimator' must be a fitted regressor or classifier.") if (hasattr(estimator, 'classes_') and isinstance(estimator.classes_[0], np.ndarray)): diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index d6bfcf2c0fd88..d272042d38c58 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -3,7 +3,6 @@ """ import numpy as np -from numpy.testing import assert_array_almost_equal import pytest import sklearn @@ -26,6 +25,8 @@ from sklearn.preprocessing import PolynomialFeatures from sklearn.dummy import DummyClassifier from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.utils.testing import assert_allclose +from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import if_matplotlib @@ -97,11 +98,11 @@ def test_grid_from_X(): X = np.asarray([[1, 2], [3, 4]]) grid, axes = _grid_from_X(X) - assert_array_almost_equal(grid, [[1, 2], + assert_array_equal(grid, [[1, 2], [1, 4], [3, 2], [3, 4]]) - assert_array_almost_equal(axes, X.T) + assert_array_equal(axes, X.T) # test shapes of returned objects depending on the number of unique values # for a feature. @@ -161,7 +162,7 @@ def test_grid_from_X(): 'brute')]) def test_partial_dependence_helpers(est, method, target_feature): # Check that what is returned by _partial_dependence_brute or - # _partial_dependece_recursion is equivalent to manually setting a target + # _partial_dependence_recursion is equivalent to manually setting a target # feature to a given value, and computing the average prediction over all # samples. 
# This also checks that the brute and recursion methods give the same @@ -192,13 +193,13 @@ def test_partial_dependence_helpers(est, method, target_feature): mean_predictions.append(est.predict(X_).mean()) pdp = pdp[0] # (shape is (1, 2) so make it (2,)) - assert_array_almost_equal(pdp, mean_predictions, decimal=3) + assert_allclose(pdp, mean_predictions, atol=1e-3) @pytest.mark.parametrize('target_feature', (0, 1, 2, 3, 4, 5)) def test_recursion_decision_function(target_feature): - # Make sure the recursion method (implicitely uses decision_function) has - # the same result as using brute method with response_method=decision + # Make sure the recursion method (implicitly uses decision_function) has + # the same result as using brute method with response_method=decision_function X, y = make_classification(n_classes=2, n_clusters_per_class=1, random_state=1) @@ -214,7 +215,7 @@ def test_recursion_decision_function(target_feature): response_method='decision_function', method='brute') - assert_array_almost_equal(preds_1, preds_2, decimal=5) + assert_allclose(preds_1, preds_2, atol=1e-7) @pytest.mark.parametrize('est', (LinearRegression(), From 0a13f1b53aa2ee1b882d14ff3fe1083d311bf809 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 23 Apr 2019 11:41:38 -0400 Subject: [PATCH 104/113] comments --- sklearn/inspection/partial_dependence.py | 6 +++--- sklearn/inspection/tests/test_partial_dependence.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 1ffdac4de329c..b5b26aa633630 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -323,8 +323,8 @@ def partial_dependence(estimator, X, features, response_method='auto', if method == 'recursion': if not isinstance(estimator, BaseGradientBoosting): raise ValueError( - 'est must be an instance of BaseGradientBoosting ' - 'for the "recursion" method. Try using method="brute".') + "'estimator' must be an instance of BaseGradientBoosting " + "for the 'recursion' method. Try using method='brute'.") if response_method == 'auto': response_method = 'decision_function' @@ -334,7 +334,7 @@ def partial_dependence(estimator, X, features, response_method='auto', "'decision_function'. 
Got {}.".format(response_method) ) check_is_fitted(estimator, 'estimators_', - msg='est parameter must be a fitted estimator') + msg="'estimator' parameter must be a fitted estimator") # Note: if method is brute, this check is done at prediction time n_features = estimator.n_features_ else: diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index d272042d38c58..7eb5f3ded3e7f 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -139,7 +139,7 @@ def test_grid_from_X(): for percentiles in ((-1, .95), (.05, 2)): with pytest.raises( ValueError, - match="percentiles values must be in"): + match="'percentiles' values must be in"): _grid_from_X(X, percentiles=percentiles) with pytest.raises( @@ -149,7 +149,7 @@ def test_grid_from_X(): with pytest.raises( ValueError, - match='grid_resolution must be strictly greater than 1.'): + match="'grid_resolution' must be strictly greater than 1."): _grid_from_X(X, grid_resolution=1) @@ -288,7 +288,7 @@ def test_partial_dependence_input(): with pytest.raises( ValueError, - match="est must be a fitted regressor or classifier"): + match="'estimator' must be a fitted regressor or classifier"): partial_dependence(KMeans(), X, [0]) with pytest.raises( @@ -346,8 +346,8 @@ class NoPredictProbaNoDecisionFunction(BaseEstimator, ClassifierMixin): with pytest.raises( ValueError, - match='est must be an instance of BaseGradientBoosting ' - 'for the "recursion" method'): + match="'estimator' must be an instance of BaseGradientBoosting " + "for the 'recursion' method"): partial_dependence(lr, X, [0], method='recursion') for feature in (-1, 1000000): @@ -360,7 +360,7 @@ class NoPredictProbaNoDecisionFunction(BaseEstimator, ClassifierMixin): for unfitted_est in (LinearRegression(), GradientBoostingRegressor()): with pytest.raises( ValueError, - match='est parameter must be a fitted estimator'): + match="'estimator' parameter must be a fitted estimator"): partial_dependence(unfitted_est, X, [0]) # check that array-like objects are accepted From 14dbd2ba5617bcd1d2e6b67e0e041ac4babd8428 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 23 Apr 2019 12:01:50 -0400 Subject: [PATCH 105/113] comments --- sklearn/inspection/partial_dependence.py | 36 +-- .../tests/test_partial_dependence.py | 221 ++++++++---------- 2 files changed, 122 insertions(+), 135 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index b5b26aa633630..9d372ff392f7d 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -8,6 +8,7 @@ import warnings from itertools import count import numbers +from collections.abc import Iterable import numpy as np from scipy.stats.mstats import mquantiles @@ -27,7 +28,7 @@ __all__ = ['partial_dependence', 'plot_partial_dependence'] -def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): +def _grid_from_X(X, percentiles, grid_resolution): """Generate a grid of points based on the percentiles of X. The grid is a cartesian product between the columns of ``values``. The @@ -57,10 +58,8 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): array ``values[j]`` is either ``grid_resolution``, or the number of unique values in ``X[:, j]``, whichever is smaller. 
""" - try: - assert len(percentiles) == 2 - except (AssertionError, TypeError): - raise ValueError('percentiles must be a sequence of 2 elements.') + if not isinstance(percentiles, Iterable) or len(percentiles) != 2: + raise ValueError("'percentiles' must be a sequence of 2 elements.") if not all(0 <= x <= 1 for x in percentiles): raise ValueError("'percentiles' values must be in [0, 1].") if percentiles[0] >= percentiles[1]: @@ -102,8 +101,7 @@ def _partial_dependence_recursion(est, grid, features): # grid needs to be DTYPE grid = np.asarray(grid, dtype=DTYPE, order='C') - n_trees_per_stage = est.estimators_.shape[1] - n_estimators = est.estimators_.shape[0] + n_estimators, n_trees_per_stage = est.estimators_.shape learning_rate = est.learning_rate averaged_predictions = np.zeros((n_trees_per_stage, grid.shape[0]), dtype=np.float64, order='C') @@ -151,7 +149,8 @@ def _partial_dependence_brute(est, grid, features, X, response_method): try: predictions = prediction_method(X_eval) except NotFittedError: - raise ValueError("'estimator' parameter must be a fitted estimator") + raise ValueError( + "'estimator' parameter must be a fitted estimator") # Note: predictions is of shape # (n_points,) for non-multioutput regressors @@ -288,7 +287,8 @@ def partial_dependence(estimator, X, features, response_method='auto', """ if not (is_classifier(estimator) or is_regressor(estimator)): - raise ValueError("'estimator' must be a fitted regressor or classifier.") + raise ValueError( + "'estimator' must be a fitted regressor or classifier.") if (hasattr(estimator, 'classes_') and isinstance(estimator.classes_[0], np.ndarray)): @@ -573,12 +573,12 @@ def convert_feature(fx): # Also note: as multiclass-multioutput classifiers are not supported, # multiclass and multioutput scenario are mutually exclusive. So there is # no risk of overwriting target_idx here. 
- pd, _ = pd_result[0] # checking the first result is enough - if is_regressor(estimator) and pd.shape[0] > 1: + avg_preds, _ = pd_result[0] # checking the first result is enough + if is_regressor(estimator) and avg_preds.shape[0] > 1: if target is None: raise ValueError( 'target must be specified for multi-output regressors') - if not 0 <= target <= pd.shape[0]: + if not 0 <= target <= avg_preds.shape[0]: raise ValueError( 'target must be in [0, n_tasks], got {}.'.format(target)) target_idx = target @@ -587,8 +587,9 @@ def convert_feature(fx): # get global min and max values of PD grouped by plot type pdp_lim = {} - for pd, values in pd_result: - min_pd, max_pd = pd[target_idx].min(), pd[target_idx].max() + for avg_preds, values in pd_result: + min_pd = avg_preds[target_idx].min() + max_pd = avg_preds[target_idx].max() n_fx = len(values) old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) min_pd = min(min_pd, old_min_pd) @@ -612,16 +613,17 @@ def convert_feature(fx): n_cols = min(n_cols, len(features)) n_rows = int(np.ceil(len(features) / float(n_cols))) axs = [] - for i, fx, name, (pd, values) in zip(count(), features, names, pd_result): + for i, fx, name, (avg_preds, values) in zip( + count(), features, names, pd_result): ax = fig.add_subplot(n_rows, n_cols, i + 1) if len(values) == 1: - ax.plot(values[0], pd[target_idx].ravel(), **line_kw) + ax.plot(values[0], avg_preds[target_idx].ravel(), **line_kw) else: # make contour plot assert len(values) == 2 XX, YY = np.meshgrid(values[0], values[1]) - Z = pd[target_idx].T + Z = avg_preds[target_idx].T CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, colors='k') ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 7eb5f3ded3e7f..ce230552e8c5c 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -95,13 +95,15 @@ def test_grid_from_X(): # Make sure that the grid is a cartesian product of the input (it will use # the unique values instead of the percentiles) + percentiles = (.05, .95) + grid_resolution = 100 X = np.asarray([[1, 2], [3, 4]]) - grid, axes = _grid_from_X(X) + grid, axes = _grid_from_X(X, percentiles, grid_resolution) assert_array_equal(grid, [[1, 2], - [1, 4], - [3, 2], - [3, 4]]) + [1, 4], + [3, 2], + [3, 4]]) assert_array_equal(axes, X.T) # test shapes of returned objects depending on the number of unique values @@ -111,7 +113,7 @@ def test_grid_from_X(): # n_unique_values > grid_resolution X = rng.normal(size=(20, 2)) - grid, axes = _grid_from_X(X, grid_resolution=grid_resolution) + grid, axes = _grid_from_X(X, percentiles, grid_resolution=grid_resolution) assert grid.shape == (grid_resolution * grid_resolution, X.shape[1]) assert np.asarray(axes).shape == (2, grid_resolution) @@ -119,47 +121,38 @@ def test_grid_from_X(): n_unique_values = 12 X[n_unique_values - 1:, 0] = 12345 rng.shuffle(X) # just to make sure the order is irrelevant - grid, axes = _grid_from_X(X, grid_resolution=grid_resolution) + grid, axes = _grid_from_X(X, percentiles, grid_resolution=grid_resolution) assert grid.shape == (n_unique_values * grid_resolution, X.shape[1]) # axes is a list of arrays of different shapes assert axes[0].shape == (n_unique_values,) assert axes[1].shape == (grid_resolution,) - with pytest.raises( - ValueError, - match='percentiles are too close'): - _grid_from_X(X, grid_resolution=2, percentiles=(0, 0.0001)) - - for 
percentiles in ((1, 2, 3, 4), 12345): - with pytest.raises( - ValueError, - match="percentiles must be a sequence"): - _grid_from_X(X, percentiles=percentiles) - - for percentiles in ((-1, .95), (.05, 2)): - with pytest.raises( - ValueError, - match="'percentiles' values must be in"): - _grid_from_X(X, percentiles=percentiles) - - with pytest.raises( - ValueError, - match=r"percentiles\[0\] must be strictly less than"): - _grid_from_X(X, percentiles=(.9, .1)) - with pytest.raises( - ValueError, - match="'grid_resolution' must be strictly greater than 1."): - _grid_from_X(X, grid_resolution=1) +@pytest.mark.parametrize( + "grid_resolution, percentiles, err_msg", + [(2, (0, 0.0001), "percentiles are too close"), + (100, (1, 2, 3, 4), "'percentiles' must be a sequence of 2 elements"), + (100, 12345, "'percentiles' must be a sequence of 2 elements"), + (100, (-1, .95), r"'percentiles' values must be in \[0, 1\]"), + (100, (.05, 2), r"'percentiles' values must be in \[0, 1\]"), + (100, (.9, .1), r"percentiles\[0\] must be strictly less than"), + (1, (0.05, 0.95), "'grid_resolution' must be strictly greater than 1")] +) +def test_grid_from_X_error(grid_resolution, percentiles, err_msg): + X = np.asarray([[1, 2], [3, 4]]) + with pytest.raises(ValueError, match=err_msg): + _grid_from_X( + X, grid_resolution=grid_resolution, percentiles=percentiles + ) @pytest.mark.parametrize('target_feature', (0, 3)) -@pytest.mark.parametrize('est, method', - [(LinearRegression(), 'brute'), - (GradientBoostingRegressor(random_state=0), - 'recursion'), - (GradientBoostingRegressor(random_state=0), - 'brute')]) +@pytest.mark.parametrize( + 'est, method', + [(LinearRegression(), 'brute'), + (GradientBoostingRegressor(random_state=0), 'recursion'), + (GradientBoostingRegressor(random_state=0), 'brute')] +) def test_partial_dependence_helpers(est, method, target_feature): # Check that what is returned by _partial_dependence_brute or # _partial_dependence_recursion is equivalent to manually setting a target @@ -199,7 +192,8 @@ def test_partial_dependence_helpers(est, method, target_feature): @pytest.mark.parametrize('target_feature', (0, 1, 2, 3, 4, 5)) def test_recursion_decision_function(target_feature): # Make sure the recursion method (implicitly uses decision_function) has - # the same result as using brute method with response_method=decision_function + # the same result as using brute method with + # response_method=decision_function X, y = make_classification(n_classes=2, n_clusters_per_class=1, random_state=1) @@ -276,96 +270,87 @@ def test_multiclass_multioutput(Estimator): partial_dependence(est, X, [0]) -def test_partial_dependence_input(): - # Test input validation of partial_dependence. 
- +class NoPredictProbaNoDecisionFunction(BaseEstimator, ClassifierMixin): + def fit(self, X, y): + return self + + +@pytest.mark.parametrize( + "estimator, params, err_msg", + [(KMeans(), + {'features': [0]}, + "'estimator' must be a fitted regressor or classifier"), + (LinearRegression(), + {'features': [0], 'response_method': 'predict_proba'}, + 'The response_method parameter is ignored for regressors'), + (GradientBoostingClassifier(random_state=0), + {'features': [0], 'response_method': 'predict_proba', + 'method': 'recursion'}, + "'recursion' method, the response_method must be 'decision_function'"), + (GradientBoostingClassifier(random_state=0), + {'features': [0], 'response_method': 'predict_proba', 'method': 'auto'}, + "'recursion' method, the response_method must be 'decision_function'"), + (GradientBoostingClassifier(random_state=0), + {'features': [0], 'response_method': 'blahblah'}, + 'response_method blahblah is invalid. Accepted response_method'), + (NoPredictProbaNoDecisionFunction(), + {'features': [0], 'response_method': 'auto'}, + 'The estimator has no predict_proba and no decision_function method'), + (NoPredictProbaNoDecisionFunction(), + {'features': [0], 'response_method': 'predict_proba'}, + 'The estimator has no predict_proba method.'), + (NoPredictProbaNoDecisionFunction(), + {'features': [0], 'response_method': 'decision_function'}, + 'The estimator has no decision_function method.'), + (LinearRegression(), + {'features': [0], 'method': 'blahblah'}, + 'blahblah is invalid. Accepted method names are brute, recursion, auto'), + (LinearRegression(), + {'features': [0], 'method': 'recursion'}, + "'estimator' must be an instance of BaseGradientBoosting for the" + " 'recursion'")] +) +def test_partial_dependence_error(estimator, params, err_msg): X, y = make_classification(random_state=0) + estimator.fit(X, y) - lr = LinearRegression() - lr.fit(X, y) - gbc = GradientBoostingClassifier(random_state=0) - gbc.fit(X, y) + with pytest.raises(ValueError, match=err_msg): + partial_dependence(estimator, X, **params) - with pytest.raises( - ValueError, - match="'estimator' must be a fitted regressor or classifier"): - partial_dependence(KMeans(), X, [0]) - - with pytest.raises( - ValueError, - match='The response_method parameter is ignored for regressors'): - partial_dependence(lr, X, [0], response_method='predict_proba') - with pytest.raises( - ValueError, - match="With the 'recursion' method, the response_method must be " - "'decision_function'."): - partial_dependence(gbc, X, [0], response_method='predict_proba', - method='recursion') - - # for GBDTs, if users want to use predict_proba then they're forced to set - # 'method' to brute. - with pytest.raises( - ValueError, - match="With the 'recursion' method, the response_method must be " - "'decision_function"): - partial_dependence(gbc, X, [0], response_method='predict_proba', - method='auto') - - with pytest.raises( - ValueError, - match="response_method blahblah is invalid. 
Accepted response"): - partial_dependence(gbc, X, [0], response_method='blahblah') - - class NoPredictProbaNoDecisionFunction(BaseEstimator, ClassifierMixin): - pass - bad_clf = NoPredictProbaNoDecisionFunction() - - with pytest.raises( - ValueError, - match='The estimator has no predict_proba and no ' - 'decision_function method.'): - partial_dependence(bad_clf, X, [0], response_method='auto') +@pytest.mark.parametrize( + 'estimator', + [LinearRegression(), GradientBoostingClassifier(random_state=0)] +) +@pytest.mark.parametrize('features', [-1, 1000000]) +def test_partial_dependence_unknown_feature(estimator, features): + X, y = make_classification(random_state=0) + estimator.fit(X, y) - with pytest.raises( - ValueError, - match='The estimator has no predict_proba method.'): - partial_dependence(bad_clf, X, [0], response_method='predict_proba') + err_msg = 'all features must be in' + with pytest.raises(ValueError, match=err_msg): + partial_dependence(estimator, X, [features]) - with pytest.raises( - ValueError, - match='The estimator has no decision_function method.'): - partial_dependence(bad_clf, X, [0], - response_method='decision_function') - with pytest.raises( - ValueError, - match="method blahblah is invalid. Accepted method names " - "are brute, recursion, auto."): - partial_dependence(lr, X, [0], method='blahblah') +@pytest.mark.parametrize( + 'estimator', + [LinearRegression(), GradientBoostingClassifier(random_state=0)] +) +def test_partial_dependence_unfitted_estimator(estimator): + err_msg = "'estimator' parameter must be a fitted estimator" + with pytest.raises(ValueError, match=err_msg): + partial_dependence(estimator, X, [0]) - with pytest.raises( - ValueError, - match="'estimator' must be an instance of BaseGradientBoosting " - "for the 'recursion' method"): - partial_dependence(lr, X, [0], method='recursion') - - for feature in (-1, 1000000): - for est in (lr, gbc): - with pytest.raises( - ValueError, - match="all features must be in"): - partial_dependence(est, X, [feature]) - - for unfitted_est in (LinearRegression(), GradientBoostingRegressor()): - with pytest.raises( - ValueError, - match="'estimator' parameter must be a fitted estimator"): - partial_dependence(unfitted_est, X, [0]) +@pytest.mark.parametrize( + 'estimator', + [LinearRegression(), GradientBoostingClassifier(random_state=0)] +) +def test_partial_dependence_X_list(estimator): # check that array-like objects are accepted - for est in (lr, gbc): - partial_dependence(est, list(X), [0]) + X, y = make_classification(random_state=0) + estimator.fit(X, y) + partial_dependence(estimator, list(X), [0]) def test_warning_recursion_non_constant_init(): From 4a1b11dcff9823c7ce968153ea23d65247b58fa0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 23 Apr 2019 12:16:29 -0400 Subject: [PATCH 106/113] Addressed comments --- sklearn/inspection/partial_dependence.py | 6 ++++-- .../tests/test_partial_dependence.py | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 9d372ff392f7d..8b550a9796a9a 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -80,8 +80,10 @@ def _grid_from_X(X, percentiles, grid_resolution): emp_percentiles = mquantiles(X, prob=percentiles, axis=0) if np.allclose(emp_percentiles[0, feature], emp_percentiles[1, feature]): - raise ValueError('percentiles are too close to each other, ' - 'unable to build the grid.') + raise 
ValueError( + 'percentiles are too close to each other, ' + 'unable to build the grid. Please choose percentiles ' + 'that are further apart.') axis = np.linspace(emp_percentiles[0, feature], emp_percentiles[1, feature], num=grid_resolution, endpoint=True) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index ce230552e8c5c..ebd9a695df4cc 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -396,6 +396,16 @@ def test_partial_dependence_sample_weight(): assert np.corrcoef(pdp, values)[0, 1] > 0.99 +def close_fig(fignum=None): + from matplotlib.pyplot import get_fignums, close as _close # noqa + + if fignum is None: + for fignum in get_fignums(): + _close(fignum) + else: + _close(fignum) + + @if_matplotlib def test_plot_partial_dependence(): # Test partial dependence plot function. @@ -436,6 +446,8 @@ def test_plot_partial_dependence(): assert len(axs) == 3 assert all(ax.has_data for ax in axs) + close_fig() + @if_matplotlib def test_plot_partial_dependence_multiclass(): @@ -468,6 +480,8 @@ def test_plot_partial_dependence_multiclass(): assert len(axs) == 2 assert all(ax.has_data for ax in axs) + close_fig() + @if_matplotlib def test_plot_partial_dependence_multioutput(): @@ -494,6 +508,8 @@ def test_plot_partial_dependence_multioutput(): assert len(axs) == 2 assert all(ax.has_data for ax in axs) + close_fig() + @if_matplotlib @pytest.mark.filterwarnings('ignore:Default solver will be changed ') # 0.22 @@ -559,6 +575,7 @@ def test_plot_partial_dependence_input(): plot_partial_dependence(lr, X, features=[0, 1, 2], feature_names=['a', 'b', 'a']) + close_fig() @if_matplotlib def test_plot_partial_dependence_fig(): @@ -576,3 +593,5 @@ def test_plot_partial_dependence_fig(): clf, X, [0, 1], target=0, grid_resolution=grid_resolution, fig=fig) assert plt.gcf() is fig + + close_fig() From 92795a98a8a59151b64a7d57a41a131a1ae04c5b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 23 Apr 2019 12:18:01 -0400 Subject: [PATCH 107/113] pep8 --- sklearn/inspection/tests/test_partial_dependence.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index ebd9a695df4cc..a2aaeb83ade66 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -577,6 +577,7 @@ def test_plot_partial_dependence_input(): close_fig() + @if_matplotlib def test_plot_partial_dependence_fig(): # Make sure fig object is correctly used if not None From 30bffb3ea833982b8615a3ddbd0e1cfd883ccb5c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 23 Apr 2019 13:20:36 -0400 Subject: [PATCH 108/113] Avoid re-computating quantiles --- sklearn/inspection/partial_dependence.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 8b550a9796a9a..d680695e1ebcc 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -77,15 +77,16 @@ def _grid_from_X(X, percentiles, grid_resolution): axis = uniques else: # create axis based on percentiles and grid resolution - emp_percentiles = mquantiles(X, prob=percentiles, axis=0) - if np.allclose(emp_percentiles[0, feature], - emp_percentiles[1, feature]): + emp_percentiles = mquantiles(X[:, feature], prob=percentiles, + axis=0) + if 
np.allclose(emp_percentiles[0], + emp_percentiles[1]): raise ValueError( 'percentiles are too close to each other, ' 'unable to build the grid. Please choose percentiles ' 'that are further apart.') - axis = np.linspace(emp_percentiles[0, feature], - emp_percentiles[1, feature], + axis = np.linspace(emp_percentiles[0], + emp_percentiles[1], num=grid_resolution, endpoint=True) values.append(axis) From e0094b67dae4c537ab8b538bbc41d909297e7547 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 23 Apr 2019 13:20:58 -0400 Subject: [PATCH 109/113] fixed example --- examples/inspection/plot_partial_dependence.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/inspection/plot_partial_dependence.py b/examples/inspection/plot_partial_dependence.py index dd2b7e0c4a220..e1bf49258a188 100644 --- a/examples/inspection/plot_partial_dependence.py +++ b/examples/inspection/plot_partial_dependence.py @@ -87,8 +87,9 @@ def main(): # We don't compute the 2-way PDP (5, 1) here, because it is a lot slower # with the brute method. features = [0, 5, 1, 2] - fig, axs = plot_partial_dependence(est, X, features, feature_names=names, - n_jobs=3, grid_resolution=50) + plot_partial_dependence(est, X, features, feature_names=names, + n_jobs=3, grid_resolution=50) + fig = plt.gcf() fig.suptitle('Partial dependence of house value on non-location features\n' 'for the California housing dataset, with MLPRegressor') plt.subplots_adjust(top=0.9) # tight_layout causes overlap with suptitle @@ -100,8 +101,9 @@ def main(): est.fit(X, y) print('Computing partial dependence plots...') features = [0, 5, 1, 2, (5, 1)] - fig, axs = plot_partial_dependence(est, X, features, feature_names=names, - n_jobs=3, grid_resolution=50) + plot_partial_dependence(est, X, features, feature_names=names, + n_jobs=3, grid_resolution=50) + fig = plt.gcf() fig.suptitle('Partial dependence of house value on non-location features\n' 'for the California housing dataset, with Gradient Boosting') plt.subplots_adjust(top=0.9) From d5b1559671d59709513a3ceffc02840d2dd3bfdd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 23 Apr 2019 13:48:07 -0400 Subject: [PATCH 110/113] removed warnings refs --- sklearn/inspection/partial_dependence.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index d680695e1ebcc..cde24cdb8991b 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -274,9 +274,6 @@ def partial_dependence(estimator, X, features, response_method='auto', -------- sklearn.inspection.plot_partial_dependence: Plot partial dependence - - .. _warning_recursion_init: - Warnings -------- The 'recursion' method only works for gradient boosting estimators, and @@ -480,8 +477,6 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, sklearn.inspection.partial_dependence: Return raw partial dependence values - .. 
From d5b1559671d59709513a3ceffc02840d2dd3bfdd Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Tue, 23 Apr 2019 13:48:07 -0400
Subject: [PATCH 110/113] removed warnings refs

---
 sklearn/inspection/partial_dependence.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py
index d680695e1ebcc..cde24cdb8991b 100644
--- a/sklearn/inspection/partial_dependence.py
+++ b/sklearn/inspection/partial_dependence.py
@@ -274,9 +274,6 @@ def partial_dependence(estimator, X, features, response_method='auto',
     --------
     sklearn.inspection.plot_partial_dependence: Plot partial dependence
 
-
-    .. _warning_recursion_init:
-
     Warnings
     --------
     The 'recursion' method only works for gradient boosting estimators, and
@@ -480,8 +477,6 @@ def plot_partial_dependence(estimator, X, features, feature_names=None,
     sklearn.inspection.partial_dependence: Return raw partial dependence
         values
 
-    .. _warning_recursion_init_plot:
-
     Warnings
     --------
     The 'recursion' method only works for gradient boosting estimators, and

From 3fce2c455cb1d50de9b9a3d6d17a6f8606525d5a Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Tue, 23 Apr 2019 22:07:35 +0200
Subject: [PATCH 111/113] MAINT: install matplotlib in conda latest build

---
 azure-pipelines.yml          | 1 +
 build_tools/azure/install.sh | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 3a950325812dd..ae27828dd22a3 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -38,6 +38,7 @@ jobs:
           PYAMG_VERSION: '*'
           PILLOW_VERSION: '*'
           JOBLIB_VERSION: '*'
+          MATPLOTLIB_VERSION: '*'
           COVERAGE: 'true'
           CHECK_PYTEST_SOFT_DEPENDENCY: 'true'
           TEST_DOCSTRINGS: 'true'
diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh
index 519f7de1e5037..82b9180f160ad 100755
--- a/build_tools/azure/install.sh
+++ b/build_tools/azure/install.sh
@@ -47,6 +47,10 @@ if [[ "$DISTRIB" == "conda" ]]; then
         TO_INSTALL="$TO_INSTALL pillow=$PILLOW_VERSION"
     fi
 
+    if [[ -n "$MATPLOTLIB_VERSION" ]]; then
+        TO_INSTALL="$TO_INSTALL matplotlib=$MATPLOTLIB_VERSION"
+    fi
+
     make_conda $TO_INSTALL
 
 elif [[ "$DISTRIB" == "ubuntu" ]]; then

From d22d7b282b9634dba0741cc84817fd1bf671eb20 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Wed, 24 Apr 2019 07:30:25 -0400
Subject: [PATCH 112/113] Addressed comments

---
 doc/modules/partial_dependence.rst                  |   7 +-
 doc/whats_new/v0.21.rst                             |   2 -
 sklearn/inspection/partial_dependence.py            |  13 +-
 .../tests/test_partial_dependence.py                | 113 ++++++------------
 sklearn/utils/testing.py                            |  18 +++
 5 files changed, 65 insertions(+), 88 deletions(-)

diff --git a/doc/modules/partial_dependence.rst b/doc/modules/partial_dependence.rst
index f9081678b7c77..eea9cbf063b81 100644
--- a/doc/modules/partial_dependence.rst
+++ b/doc/modules/partial_dependence.rst
@@ -55,7 +55,10 @@ and a two-way PDP between the two features::
     >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
     ...     max_depth=1, random_state=0).fit(X, y)
     >>> features = [0, 1, (0, 1)]
-    >>> fig, axs = plot_partial_dependence(clf, X, features) #doctest: +SKIP
+    >>> plot_partial_dependence(clf, X, features) #doctest: +SKIP
+
+You can access the newly created figure and Axes objects using ``plt.gcf()``
+and ``plt.gca()``.
 
 For multi-class classification, you need to set the class label for which
 the PDPs should be created via the ``target`` argument::
@@ -65,7 +68,7 @@ the PDPs should be created via the ``target`` argument::
     >>> mc_clf = GradientBoostingClassifier(n_estimators=10,
     ...     max_depth=1).fit(iris.data, iris.target)
     >>> features = [3, 2, (3, 2)]
-    >>> fig, axs = plot_partial_dependence(mc_clf, X, features, target=0) #doctest: +SKIP
+    >>> plot_partial_dependence(mc_clf, X, features, target=0) #doctest: +SKIP
 
 The same parameter ``target`` is used to specify the target in multi-output
 regression settings.
diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index dcf2d3c75b2b7..9f4ab65f8f6ff 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -316,8 +316,6 @@ Support for Python 3.4 and below has been officially dropped.
 :mod:`sklearn.inspection`
 .........................
 
-- |Feature| A new module ``sklearn.inspection`` is created.
-
 - |Feature| Partial dependence plots
   (:func:`inspection.plot_partial_dependence`) are now supported for
   any regressor or classifier (provided that they have a `predict_proba`
diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py
index cde24cdb8991b..8798fb459ec74 100644
--- a/sklearn/inspection/partial_dependence.py
+++ b/sklearn/inspection/partial_dependence.py
@@ -367,7 +367,7 @@ def plot_partial_dependence(estimator, X, features, feature_names=None,
                             target=None, response_method='auto', n_cols=3,
                             grid_resolution=100, percentiles=(0.05, 0.95),
                             method='auto', n_jobs=None, verbose=0, fig=None,
-                            line_kw=None, contour_kw=None, **fig_kw):
+                            line_kw=None, contour_kw=None):
     """Partial dependence plots.
 
     The ``len(features)`` plots are arranged in a grid with ``n_cols``
@@ -451,17 +451,13 @@ def plot_partial_dependence(estimator, X, features, feature_names=None,
         Verbose output during PD computations.
     fig : Matplotlib figure object, optional (default=None)
         A figure object onto which the plots will be drawn, after the figure
-        has been cleared.
+        has been cleared. By default, a new one is created.
     line_kw : dict, optional
         Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.
         For one-way partial dependence plots.
     contour_kw : dict, optional
         Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.
         For two-way partial dependence plots.
-    **fig_kw : dict, optional
-        Dict with keywords passed to the figure() call.
-        Note that all keywords not recognized above will be automatically
-        included here.
 
     Examples
     --------
@@ -469,8 +465,7 @@ def plot_partial_dependence(estimator, X, features, feature_names=None,
     >>> from sklearn.ensemble import GradientBoostingRegressor
     >>> X, y = make_friedman1()
     >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)
-    >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP
-    ...
+    >>> plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP
 
     See also
     --------
@@ -599,7 +594,7 @@ def convert_feature(fx):
         Z_level = np.linspace(*pdp_lim[2], num=8)
 
     if fig is None:
-        fig = plt.figure(**fig_kw)
+        fig = plt.figure()
     else:
         fig.clear()
diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py
index a2aaeb83ade66..109847291db87 100644
--- a/sklearn/inspection/tests/test_partial_dependence.py
+++ b/sklearn/inspection/tests/test_partial_dependence.py
@@ -28,6 +28,7 @@
 from sklearn.utils.testing import assert_allclose
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import if_matplotlib
+from sklearn.utils.testing import close_figure
 
 
 # toy sample
@@ -396,16 +397,6 @@ def test_partial_dependence_sample_weight():
     assert np.corrcoef(pdp, values)[0, 1] > 0.99
 
 
-def close_fig(fignum=None):
-    from matplotlib.pyplot import get_fignums, close as _close  # noqa
-
-    if fignum is None:
-        for fignum in get_fignums():
-            _close(fignum)
-    else:
-        _close(fignum)
-
-
 @if_matplotlib
 def test_plot_partial_dependence():
     # Test partial dependence plot function.
@@ -446,7 +437,7 @@ def test_plot_partial_dependence():
     assert len(axs) == 3
     assert all(ax.has_data for ax in axs)
 
-    close_fig()
+    close_figure()
 
 
 @if_matplotlib
@@ -480,7 +471,7 @@ def test_plot_partial_dependence_multiclass():
     assert len(axs) == 2
     assert all(ax.has_data for ax in axs)
 
-    close_fig()
+    close_figure()
 
 
 @if_matplotlib
@@ -508,74 +499,46 @@ def test_plot_partial_dependence_multioutput():
     assert len(axs) == 2
     assert all(ax.has_data for ax in axs)
 
-    close_fig()
+    close_figure()
 
 
-@if_matplotlib
+@pytest.mark.parametrize(
+    "data, params, err_msg",
+    [(multioutput_regression_data[0], {"target": None, 'features': [0]},
+      "target must be specified for multi-output"),
+     (multioutput_regression_data[0], {"target": -1, 'features': [0]},
+      r'target must be in \[0, n_tasks\]'),
+     (multioutput_regression_data[0], {"target": 100, 'features': [0]},
+      r'target must be in \[0, n_tasks\]'),
+     (make_classification(random_state=0),
+      {'features': ['foobar'], 'feature_names': None},
+      'Feature foobar not in feature_names'),
+     (make_classification(random_state=0),
+      {'features': ['foobar'], 'feature_names': ['abcd', 'def']},
+      'Feature foobar not in feature_names'),
+     (make_classification(random_state=0), {'features': [(1, 2, 3)]},
+      'Each entry in features must be either an int, '),
+     (make_classification(random_state=0), {'features': [1, {}]},
+      'Each entry in features must be either an int, '),
+     (make_classification(random_state=0), {'features': [tuple()]},
+      'Each entry in features must be either an int, '),
+     (make_classification(random_state=0),
+      {'features': [123], 'feature_names': ['blahblah']},
+      'All entries of features must be less than '),
+     (make_classification(random_state=0),
+      {'features': [0, 1, 2], 'feature_names': ['a', 'b', 'a']},
+      'feature_names should not contain duplicates')]
+)
 @pytest.mark.filterwarnings('ignore:Default solver will be changed ')  # 0.22
 @pytest.mark.filterwarnings('ignore:Default multi_class will be')  # 0.22
-def test_plot_partial_dependence_input():
-    X, y = make_classification(random_state=0)
+def test_plot_partial_dependence_error(data, params, err_msg):
+    X, y = data
+    estimator = LinearRegression().fit(X, y)
 
-    lr = LinearRegression()
-    lr.fit(X, y)
-    gbc = GradientBoostingClassifier(random_state=0)
-    gbc.fit(X, y)
-
-    # check target param for multiclass
-    (X_m, y_m), _ = multiclass_classification_data
-    lr_m = LogisticRegression()
-    lr_m.fit(X_m, y_m)
-    with pytest.raises(
-            ValueError,
-            match='target must be specified for multi-class'):
-        plot_partial_dependence(lr_m, X_m, [0], target=None)
-    for target in (-1, 100):
-        with pytest.raises(
-                ValueError,
-                match='target not in est.classes_'):
-            plot_partial_dependence(lr_m, X_m, [0], target=target)
-
-    # check target param for multioutput
-    (X_m, y_m), _ = multioutput_regression_data
-    lr_m = LinearRegression()
-    lr_m.fit(X_m, y_m)
-    with pytest.raises(
-            ValueError,
-            match='target must be specified for multi-output'):
-        plot_partial_dependence(lr_m, X_m, [0], target=None)
-    for target in (-1, 100):
-        with pytest.raises(
-                ValueError,
-                match=r'target must be in \[0, n_tasks\]'):
-            plot_partial_dependence(lr_m, X_m, [0], target=target)
-
-    for feature_names in (None, ['abcd', 'def']):
-        with pytest.raises(
-                ValueError,
-                match='Feature foobar not in feature_names'):
-            plot_partial_dependence(lr, X, features=['foobar'],
-                                    feature_names=feature_names)
-
-    for features in ([(1, 2, 3)], [1, {}], [tuple()]):
-        with pytest.raises(
-                ValueError,
-                match='Each entry in features must be either an int, '):
-            plot_partial_dependence(lr, X, features=features)
-
-    with pytest.raises(
-            ValueError,
-            match='All entries of features must be less than '):
-        plot_partial_dependence(lr, X, features=[123],
-                                feature_names=['blah'])
-
-    with pytest.raises(
-            ValueError,
-            match='feature_names should not contain duplicates'):
-        plot_partial_dependence(lr, X, features=[0, 1, 2],
-                                feature_names=['a', 'b', 'a'])
+    with pytest.raises(ValueError, match=err_msg):
+        plot_partial_dependence(estimator, X, **params)
 
-    close_fig()
+    close_figure()
 
 
 @if_matplotlib
@@ -595,4 +558,4 @@ def test_plot_partial_dependence_fig():
 
     assert plt.gcf() is fig
 
-    close_fig()
+    close_figure()
diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py
index 1662294189690..33b0da90fa9db 100644
--- a/sklearn/utils/testing.py
+++ b/sklearn/utils/testing.py
@@ -970,3 +970,21 @@ def check_docstring_parameters(func, doc=None, ignore=None, class_name=None):
         if n1 != n2:
             incorrect += [func_name + ' ' + n1 + ' != ' + n2]
     return incorrect
+
+
+def close_figure(fig=None):
+    """Close a matplotlib figure.
+
+    Parameters
+    ----------
+    fig : int or str or Figure, optional (default=None)
+        The figure, figure number or figure name to close. If ``None``, all
+        current figures are closed.
+    """
+    from matplotlib.pyplot import get_fignums, close as _close  # noqa
+
+    if fig is None:
+        for fig in get_fignums():
+            _close(fig)
+    else:
+        _close(fig)
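Note on PATCH 112: the ``close_figure`` helper added to ``sklearn.utils.testing``
keeps matplotlib state from leaking between tests. A small usage sketch, under
the assumption of a headless backend as on CI (the estimator setup is
illustrative only, not taken from the test suite):

    import matplotlib
    matplotlib.use('Agg')  # non-interactive backend, as on CI
    import matplotlib.pyplot as plt

    from sklearn.datasets import make_regression
    from sklearn.linear_model import LinearRegression
    from sklearn.inspection import plot_partial_dependence
    from sklearn.utils.testing import close_figure

    X, y = make_regression(n_features=3, random_state=0)
    est = LinearRegression().fit(X, y)

    plot_partial_dependence(est, X, [0, 1])
    assert plt.get_fignums()   # a figure was created by the call
    close_figure()             # no argument: close all open figures
    assert not plt.get_fignums()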
From 27c261edf483f31c446cc185486bc2d215f6a648 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Wed, 24 Apr 2019 08:01:15 -0400
Subject: [PATCH 113/113] forgot if_mpl decorator

---
 sklearn/inspection/tests/test_partial_dependence.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py
index 109847291db87..406a6b1bf2e42 100644
--- a/sklearn/inspection/tests/test_partial_dependence.py
+++ b/sklearn/inspection/tests/test_partial_dependence.py
@@ -502,6 +502,7 @@ def test_plot_partial_dependence_multioutput():
     close_figure()
 
 
+@if_matplotlib
 @pytest.mark.parametrize(
     "data, params, err_msg",
     [(multioutput_regression_data[0], {"target": None, 'features': [0]},