diff --git a/sklearn/__init__.py b/sklearn/__init__.py index dbb7862d8839e..1fa157fb68d4d 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -71,9 +71,10 @@ 'isotonic', 'kernel_approximation', 'kernel_ridge', 'learning_curve', 'linear_model', 'manifold', 'metrics', 'mixture', 'model_selection', 'multiclass', 'multioutput', - 'naive_bayes', 'neighbors', 'neural_network', 'pipeline', - 'preprocessing', 'random_projection', 'semi_supervised', - 'svm', 'tree', 'discriminant_analysis', 'impute', 'compose', + 'naive_bayes', 'neighbors', 'neural_network', + 'partial_dependence', 'pipeline', 'preprocessing', + 'random_projection', 'semi_supervised', 'svm', 'tree', + 'discriminant_analysis', 'impute', 'compose', # Non-modules: 'clone', 'get_config', 'set_config', 'config_context'] diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index e8bfc2110bb90..63d397d86c8a0 100644 --- a/sklearn/ensemble/partial_dependence.py +++ b/sklearn/ensemble/partial_dependence.py @@ -3,70 +3,9 @@ # Authors: Peter Prettenhofer # License: BSD 3 clause -from itertools import count -import numbers - -import numpy as np -from scipy.stats.mstats import mquantiles - -from ..utils.extmath import cartesian -from ..externals.joblib import Parallel, delayed -from ..externals import six -from ..externals.six.moves import map, range, zip -from ..utils import check_array -from ..utils.validation import check_is_fitted -from ..tree._tree import DTYPE - -from ._gradient_boosting import _partial_dependence_tree -from .gradient_boosting import BaseGradientBoosting - - -def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): - """Generate a grid of points based on the ``percentiles of ``X``. - - The grid is generated by placing ``grid_resolution`` equally - spaced points between the ``percentiles`` of each column - of ``X``. - - Parameters - ---------- - X : ndarray - The data - percentiles : tuple of floats - The percentiles which are used to construct the extreme - values of the grid axes. - grid_resolution : int - The number of equally spaced points that are placed - on the grid. - - Returns - ------- - grid : ndarray - All data points on the grid; ``grid.shape[1] == X.shape[1]`` - and ``grid.shape[0] == grid_resolution * X.shape[1]``. - axes : seq of ndarray - The axes with which the grid has been created. - """ - if len(percentiles) != 2: - raise ValueError('percentile must be tuple of len 2') - if not all(0. <= x <= 1. 
for x in percentiles): - raise ValueError('percentile values must be in [0, 1]') - - axes = [] - emp_percentiles = mquantiles(X, prob=percentiles, axis=0) - for col in range(X.shape[1]): - uniques = np.unique(X[:, col]) - if uniques.shape[0] < grid_resolution: - # feature has low resolution use unique vals - axis = uniques - else: - # create axis based on percentiles and grid resolution - axis = np.linspace(emp_percentiles[0, col], - emp_percentiles[1, col], - num=grid_resolution, endpoint=True) - axes.append(axis) - - return cartesian(axes), axes +import warnings +from ..partial_dependence import partial_dependence as new_pd +from ..partial_dependence import plot_partial_dependence as new_ppd def partial_dependence(gbrt, target_variables, grid=None, X=None, @@ -120,47 +59,17 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) """ - if not isinstance(gbrt, BaseGradientBoosting): - raise ValueError('gbrt has to be an instance of BaseGradientBoosting') - check_is_fitted(gbrt, 'estimators_') - if (grid is None and X is None) or (grid is not None and X is not None): - raise ValueError('Either grid or X must be specified') - - target_variables = np.asarray(target_variables, dtype=np.int32, - order='C').ravel() - - if any([not (0 <= fx < gbrt.n_features_) for fx in target_variables]): - raise ValueError('target_variables must be in [0, %d]' - % (gbrt.n_features_ - 1)) - - if X is not None: - X = check_array(X, dtype=DTYPE, order='C') - grid, axes = _grid_from_X(X[:, target_variables], percentiles, - grid_resolution) - else: - assert grid is not None - # dont return axes if grid is given - axes = None - # grid must be 2d - if grid.ndim == 1: - grid = grid[:, np.newaxis] - if grid.ndim != 2: - raise ValueError('grid must be 2d but is %dd' % grid.ndim) - - grid = np.asarray(grid, dtype=DTYPE, order='C') - assert grid.shape[1] == target_variables.shape[0] - - n_trees_per_stage = gbrt.estimators_.shape[1] - n_estimators = gbrt.estimators_.shape[0] - pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64, - order='C') - for stage in range(n_estimators): - for k in range(n_trees_per_stage): - tree = gbrt.estimators_[stage, k].tree_ - _partial_dependence_tree(tree, grid, target_variables, - gbrt.learning_rate, pdp[k]) - - return pdp, axes + warnings.warn("The function ensemble.partial_dependence has been moved to " + "partial_dependence in 0.20 and will be removed in 0.22.", + DeprecationWarning) + return new_pd(est=gbrt, + target_variables=target_variables, + grid=grid, + X=X, + output=None, + percentiles=percentiles, + grid_resolution=grid_resolution, + method='recursion') def plot_partial_dependence(gbrt, X, features, feature_names=None, @@ -237,159 +146,22 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP ... 
""" - import matplotlib.pyplot as plt - from matplotlib import transforms - from matplotlib.ticker import MaxNLocator - from matplotlib.ticker import ScalarFormatter - - if not isinstance(gbrt, BaseGradientBoosting): - raise ValueError('gbrt has to be an instance of BaseGradientBoosting') - check_is_fitted(gbrt, 'estimators_') - - # set label_idx for multi-class GBRT - if hasattr(gbrt, 'classes_') and np.size(gbrt.classes_) > 2: - if label is None: - raise ValueError('label is not given for multi-class PDP') - label_idx = np.searchsorted(gbrt.classes_, label) - if gbrt.classes_[label_idx] != label: - raise ValueError('label %s not in ``gbrt.classes_``' % str(label)) - else: - # regression and binary classification - label_idx = 0 - - X = check_array(X, dtype=DTYPE, order='C') - if gbrt.n_features_ != X.shape[1]: - raise ValueError('X.shape[1] does not match gbrt.n_features_') - - if line_kw is None: - line_kw = {'color': 'green'} - if contour_kw is None: - contour_kw = {} - - # convert feature_names to list - if feature_names is None: - # if not feature_names use fx indices as name - feature_names = [str(i) for i in range(gbrt.n_features_)] - elif isinstance(feature_names, np.ndarray): - feature_names = feature_names.tolist() - - def convert_feature(fx): - if isinstance(fx, six.string_types): - try: - fx = feature_names.index(fx) - except ValueError: - raise ValueError('Feature %s not in feature_names' % fx) - return fx - - # convert features into a seq of int tuples - tmp_features = [] - for fxs in features: - if isinstance(fxs, (numbers.Integral,) + six.string_types): - fxs = (fxs,) - try: - fxs = np.array([convert_feature(fx) for fx in fxs], dtype=np.int32) - except TypeError: - raise ValueError('features must be either int, str, or tuple ' - 'of int/str') - if not (1 <= np.size(fxs) <= 2): - raise ValueError('target features must be either one or two') - - tmp_features.append(fxs) - - features = tmp_features - - names = [] - try: - for fxs in features: - l = [] - # explicit loop so "i" is bound for exception below - for i in fxs: - l.append(feature_names[i]) - names.append(l) - except IndexError: - raise ValueError('All entries of features must be less than ' - 'len(feature_names) = {0}, got {1}.' 
- .format(len(feature_names), i)) - - # compute PD functions - pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(partial_dependence)(gbrt, fxs, X=X, - grid_resolution=grid_resolution, - percentiles=percentiles) - for fxs in features) - - # get global min and max values of PD grouped by plot type - pdp_lim = {} - for pdp, axes in pd_result: - min_pd, max_pd = pdp[label_idx].min(), pdp[label_idx].max() - n_fx = len(axes) - old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) - min_pd = min(min_pd, old_min_pd) - max_pd = max(max_pd, old_max_pd) - pdp_lim[n_fx] = (min_pd, max_pd) - - # create contour levels for two-way plots - if 2 in pdp_lim: - Z_level = np.linspace(*pdp_lim[2], num=8) - - if ax is None: - fig = plt.figure(**fig_kw) - else: - fig = ax.get_figure() - fig.clear() - - n_cols = min(n_cols, len(features)) - n_rows = int(np.ceil(len(features) / float(n_cols))) - axs = [] - for i, fx, name, (pdp, axes) in zip(count(), features, names, - pd_result): - ax = fig.add_subplot(n_rows, n_cols, i + 1) - - if len(axes) == 1: - ax.plot(axes[0], pdp[label_idx].ravel(), **line_kw) - else: - # make contour plot - assert len(axes) == 2 - XX, YY = np.meshgrid(axes[0], axes[1]) - Z = pdp[label_idx].reshape(list(map(np.size, axes))).T - CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, - colors='k') - ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], - vmin=Z_level[0], alpha=0.75, **contour_kw) - ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True) - - # plot data deciles + axes labels - deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1)) - trans = transforms.blended_transform_factory(ax.transData, - ax.transAxes) - ylim = ax.get_ylim() - ax.vlines(deciles, [0], 0.05, transform=trans, color='k') - ax.set_xlabel(name[0]) - ax.set_ylim(ylim) - - # prevent x-axis ticks from overlapping - ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower')) - tick_formatter = ScalarFormatter() - tick_formatter.set_powerlimits((-3, 4)) - ax.xaxis.set_major_formatter(tick_formatter) - - if len(axes) > 1: - # two-way PDP - y-axis deciles + labels - deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1)) - trans = transforms.blended_transform_factory(ax.transAxes, - ax.transData) - xlim = ax.get_xlim() - ax.hlines(deciles, [0], 0.05, transform=trans, color='k') - ax.set_ylabel(name[1]) - # hline erases xlim - ax.set_xlim(xlim) - else: - ax.set_ylabel('Partial dependence') - - if len(axes) == 1: - ax.set_ylim(pdp_lim[1]) - axs.append(ax) - - fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4, - hspace=0.3) - return fig, axs + warnings.warn("The function ensemble.plot_partial_dependence has been " + "moved to partial_dependence in 0.20 and will be removed " + "in 0.22.", + DeprecationWarning) + return new_ppd(est=gbrt, + X=X, + features=features, + feature_names=feature_names, + label=label, + n_cols=n_cols, + grid_resolution=grid_resolution, + method='recursion', + percentiles=percentiles, + n_jobs=n_jobs, + verbose=verbose, + ax=ax, + line_kw=line_kw, + contour_kw=contour_kw, + **fig_kw) diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py index cec7efc46f03b..3a45ade617f9e 100644 --- a/sklearn/ensemble/tests/test_partial_dependence.py +++ b/sklearn/ensemble/tests/test_partial_dependence.py @@ -12,6 +12,8 @@ from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor from sklearn import datasets +from 
sklearn.utils.testing import ignore_warnings +from sklearn.utils.testing import assert_warns_message # toy sample @@ -27,6 +29,7 @@ iris = datasets.load_iris() +@ignore_warnings(category=DeprecationWarning) def test_partial_dependence_classifier(): # Test partial dependence for classifier clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -47,6 +50,7 @@ def test_partial_dependence_classifier(): assert_array_equal(pdp, pdp_2) +@ignore_warnings(category=DeprecationWarning) def test_partial_dependence_multiclass(): # Test partial dependence for multi-class classifier clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -62,6 +66,7 @@ def test_partial_dependence_multiclass(): assert axes[0].shape[0] == grid_resolution +@ignore_warnings(category=DeprecationWarning) def test_partial_dependence_regressor(): # Test partial dependence for regressor clf = GradientBoostingRegressor(n_estimators=10, random_state=1) @@ -75,6 +80,7 @@ def test_partial_dependence_regressor(): assert axes[0].shape[0] == grid_resolution +@ignore_warnings(category=DeprecationWarning) def test_partial_dependecy_input(): # Test input validation of partial dependence. clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -103,6 +109,7 @@ def test_partial_dependecy_input(): assert_raises(ValueError, partial_dependence, clf, [0], grid=grid) +@ignore_warnings(category=DeprecationWarning) @if_matplotlib def test_plot_partial_dependence(): # Test partial dependence plot function. @@ -136,6 +143,7 @@ def test_plot_partial_dependence(): @if_matplotlib +@ignore_warnings(category=DeprecationWarning) def test_plot_partial_dependence_input(): # Test partial dependence plot function input checks. clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -171,6 +179,7 @@ def test_plot_partial_dependence_input(): @if_matplotlib +@ignore_warnings(category=DeprecationWarning) def test_plot_partial_dependence_multiclass(): # Test partial dependence plot function on multi-class input. 
     clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
@@ -204,3 +213,31 @@
     assert_raises(ValueError, plot_partial_dependence,
                   clf, iris.data, [0, 1],
                   grid_resolution=grid_resolution)
+
+
+def test_warning_raised_for_partial_dependence():
+    # Test that running the old partial_dependence function warns
+    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
+    clf.fit(boston.data, boston.target)
+    grid_resolution = 25
+
+    assert_warns_message(DeprecationWarning, "The function "
+                         "ensemble.partial_dependence has been moved to "
+                         "partial_dependence in 0.20 and will be removed in "
+                         "0.22.", partial_dependence, clf, [0], X=boston.data,
+                         grid_resolution=grid_resolution)
+
+
+@if_matplotlib
+def test_warning_raised_for_plot_partial_dependence():
+    # Test that running the old plot_partial_dependence function warns
+    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
+    clf.fit(boston.data, boston.target)
+    grid_resolution = 25
+
+    assert_warns_message(DeprecationWarning, "The function "
+                         "ensemble.plot_partial_dependence has been moved to "
+                         "partial_dependence in 0.20 and will be removed in "
+                         "0.22.", plot_partial_dependence, clf, boston.data,
+                         [0, 1, (0, 1)], grid_resolution=grid_resolution,
+                         feature_names=boston.feature_names)
diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py
new file mode 100644
index 0000000000000..436bc6f2b1134
--- /dev/null
+++ b/sklearn/partial_dependence.py
@@ -0,0 +1,614 @@
+"""Partial dependence plots for regression and classification models."""
+
+# Authors: Peter Prettenhofer
+#          Trevor Stephens
+# License: BSD 3 clause
+
+from itertools import count
+import numbers
+
+import numpy as np
+from scipy.stats.mstats import mquantiles
+
+from .utils.extmath import cartesian
+from .externals.joblib import Parallel, delayed
+from .externals import six
+from .externals.six.moves import map, range, zip
+from .utils import check_array
+from .utils.validation import check_is_fitted
+from .tree._tree import DTYPE
+
+from .exceptions import NotFittedError
+
+
+__all__ = ['partial_dependence', 'plot_partial_dependence']
+
+
+def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100):
+    """Generate a grid of points based on the ``percentiles`` of ``X``.
+
+    The grid is generated by placing ``grid_resolution`` equally
+    spaced points between the ``percentiles`` of each column
+    of ``X``.
+
+    Parameters
+    ----------
+    X : ndarray
+        The data.
+    percentiles : tuple of floats
+        The percentiles which are used to construct the extreme
+        values of the grid axes.
+    grid_resolution : int
+        The number of equally spaced points that are placed
+        on the grid.
+
+    Returns
+    -------
+    grid : ndarray
+        All data points on the grid; ``grid.shape[1] == X.shape[1]``
+        and ``grid.shape[0] == grid_resolution ** X.shape[1]``.
+    axes : seq of ndarray
+        The axes with which the grid has been created.
+    """
+    if len(percentiles) != 2:
+        raise ValueError('percentiles must be a tuple of len 2')
+    if not all(0. <= x <= 1. for x in percentiles):
+        raise ValueError('percentile values must be in [0, 1]')
+
+    axes = []
+    emp_percentiles = mquantiles(X, prob=percentiles, axis=0)
+    for col in range(X.shape[1]):
+        uniques = np.unique(X[:, col])
+        if uniques.shape[0] < grid_resolution:
+            # feature has low resolution: use its unique values
+            axis = uniques
+        else:
+            # create axis based on percentiles and grid resolution
+            axis = np.linspace(emp_percentiles[0, col],
+                               emp_percentiles[1, col],
+                               num=grid_resolution, endpoint=True)
+        axes.append(axis)
+
+    return cartesian(axes), axes
+
+
+def _predict(est, X_eval, output=None):
+    """Evaluate ``est`` on the grid points in ``X_eval``.
+
+    The values are obtained by calling the ``predict_proba`` method of
+    ``est`` for classification or the ``predict`` method for regression
+    on ``X_eval``.
+
+    Parameters
+    ----------
+    est : BaseEstimator
+        A fitted classification or regression model.
+    X_eval : array-like, shape=(n_samples, n_features)
+        The data on which the partial dependence of ``est`` should be
+        predicted.
+    output : int, optional (default=None)
+        The output index to use for multi-output estimators.
+
+    Returns
+    -------
+    out : array, shape=(n_samples,) or (n_samples, n_classes)
+        The predictions of ``est`` on ``X_eval``; for classification,
+        the log-transformed and centered class probabilities.
+    """
+    if est._estimator_type == 'regressor':
+        try:
+            out = est.predict(X_eval)
+        except NotFittedError:
+            raise ValueError('Call %s.fit before partial_dependence' %
+                             est.__class__.__name__)
+        if out.ndim != 1 and out.shape[1] == 1:
+            # Column output
+            out = out.ravel()
+        if out.ndim != 1 and out.shape[1] != 1:
+            # Multi-output
+            if output is None or not 0 <= output < out.shape[1]:
+                raise ValueError('Valid output must be specified for '
+                                 'multi-output models.')
+            out = out[:, output]
+    elif est._estimator_type == 'classifier':
+        try:
+            out = est.predict_proba(X_eval)
+        except NotFittedError:
+            raise ValueError('Call %s.fit before partial_dependence' %
+                             est.__class__.__name__)
+        if isinstance(out, list):
+            # Multi-output
+            if output is None or not 0 <= output < len(out):
+                raise ValueError('Valid output must be specified for '
+                                 'multi-output models.')
+            out = out[output]
+        out = np.log(np.clip(out, 1e-16, 1))
+        out = np.subtract(out, np.mean(out, 1)[:, np.newaxis])
+    else:
+        raise ValueError('est must be a fitted regressor or classifier '
+                         'model.')
+    return out
+
+
+def partial_dependence(est, target_variables, grid=None, X=None, output=None,
+                       percentiles=(0.05, 0.95), grid_resolution=100,
+                       method='auto'):
+    """Partial dependence of ``target_variables``.
+
+    Partial dependence plots show the dependence between the joint values
+    of the ``target_variables`` and the function represented
+    by ``est``.
+
+    Read more in the :ref:`User Guide `.
+
+    Parameters
+    ----------
+    est : BaseEstimator
+        A fitted classification or regression model.
+    target_variables : array-like, dtype=int
+        The target features for which the partial dependence should be
+        computed (size should be smaller than 3 for visual renderings).
+    grid : array-like, shape=(n_points, len(target_variables))
+        The grid of ``target_variables`` values for which the
+        partial dependence should be evaluated (either ``grid`` or ``X``
+        must be specified).
+    X : array-like, shape=(n_samples, n_features)
+        The data on which ``est`` was trained. It is used to generate
+        a ``grid`` for the ``target_variables``. The ``grid`` comprises
+        ``grid_resolution`` equally spaced points between the two
+        ``percentiles``.
+    output : int, optional (default=None)
+        The output index to use for multi-output estimators.
+    percentiles : (low, high), default=(0.05, 0.95)
+        The lower and upper percentile used to create the extreme values
+        for the ``grid``. Only used if ``X`` is not None.
+    grid_resolution : int, default=100
+        The number of equally spaced points on the ``grid``.
+    method : {'recursion', 'exact', 'estimated', 'auto'}, default='auto'
+        The method to use to calculate the partial dependence function:
+
+        - If 'recursion', the underlying trees of ``est`` will be recursed to
+          calculate the function. Only supported for BaseGradientBoosting and
+          ForestRegressor.
+        - If 'exact', the function will be calculated by calling the
+          ``predict_proba`` method of ``est`` for classification or
+          ``predict`` for regression on ``X`` for every point in the grid. To
+          speed up this method, you can use a subset of ``X`` or a coarser
+          grid.
+        - If 'estimated', the function will be calculated by calling the
+          ``predict_proba`` method of ``est`` for classification or
+          ``predict`` for regression on the median of ``X``.
+        - If 'auto', then 'recursion' will be used if ``est`` is
+          BaseGradientBoosting or ForestRegressor, and 'exact' used for other
+          estimators.
+
+    Returns
+    -------
+    pdp : array, shape=(n_classes, n_points)
+        The partial dependence function evaluated on the ``grid``.
+        For regression and binary classification ``n_classes==1``.
+    axes : seq of ndarray or None
+        The axes with which the grid has been created or None if
+        the grid has been given.
+
+    Examples
+    --------
+    >>> samples = [[0, 0, 2], [1, 0, 0]]
+    >>> labels = [0, 1]
+    >>> from sklearn.ensemble import GradientBoostingClassifier
+    >>> gb = GradientBoostingClassifier(random_state=0).fit(samples, labels)
+    >>> kwargs = dict(X=samples, percentiles=(0, 1), grid_resolution=2)
+    >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP
+    (array([[-4.52..., 4.52...]]), [array([ 0., 1.])])
+    """
+    # TODO: The pattern below is required to avoid a namespace collision.
+    # TODO: Move below imports to module level import at 0.22 release.
+    from .ensemble._gradient_boosting import _partial_dependence_tree
+    from .ensemble.gradient_boosting import BaseGradientBoosting
+    from .ensemble.forest import ForestRegressor
+
+    if method == 'auto':
+        if isinstance(est, (BaseGradientBoosting, ForestRegressor)):
+            method = 'recursion'
+        else:
+            method = 'exact'
+    if (not isinstance(est, (BaseGradientBoosting, ForestRegressor)) and
+            method == 'recursion'):
+        raise ValueError('est has to be an instance of BaseGradientBoosting '
+                         'or ForestRegressor for the "recursion" method. Try '
+                         'using method="exact" or "estimated".')
+    if (not hasattr(est, '_estimator_type') or
+            est._estimator_type not in ('classifier', 'regressor')):
+        raise ValueError('est must be a fitted regressor or classifier model.')
+    if (method != 'recursion' and est._estimator_type == 'classifier' and
+            not hasattr(est, 'predict_proba')):
+        raise ValueError('est requires a predict_proba method for '
+                         'method="exact" or "estimated" for classification.')
+    if X is not None:
+        X = check_array(X, dtype=DTYPE, order='C')
+    if method == 'recursion':
+        check_is_fitted(est, 'estimators_',
+                        msg='Call %s.fit before partial_dependence' %
+                        est.__class__.__name__)
+        n_features = est.n_features_
+    elif X is None:
+        raise ValueError('X is required for method="exact" or "estimated".')
+    else:
+        n_features = X.shape[1]
+    if (grid is None and X is None) or (grid is not None and X is not None):
+        raise ValueError('Exactly one of grid and X must be specified')
+
+    target_variables = np.asarray(target_variables, dtype=np.int32,
+                                  order='C').ravel()
+
+    if any([not (0 <= fx < n_features) for fx in target_variables]):
+        raise ValueError('target_variables must be in [0, %d]'
+                         % (n_features - 1))
+
+    if X is not None:
+        grid, axes = _grid_from_X(X[:, target_variables], percentiles,
+                                  grid_resolution)
+    else:
+        assert grid is not None
+        # don't return axes if grid is given
+        axes = None
+        # grid must be 2d
+        if grid.ndim == 1:
+            grid = grid[:, np.newaxis]
+        if grid.ndim != 2:
+            raise ValueError('grid must be 2d but is %dd' % grid.ndim)
+
+    grid = np.asarray(grid, dtype=DTYPE, order='C')
+    assert grid.shape[1] == target_variables.shape[0]
+
+    if method == 'recursion':
+        if isinstance(est, BaseGradientBoosting):
+            n_trees_per_stage = est.estimators_.shape[1]
+            n_estimators = est.estimators_.shape[0]
+            learning_rate = est.learning_rate
+        else:
+            n_trees_per_stage = 1
+            n_estimators = len(est.estimators_)
+            learning_rate = 1.
+        pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64,
+                       order='C')
+        for stage in range(n_estimators):
+            for k in range(n_trees_per_stage):
+                if isinstance(est, BaseGradientBoosting):
+                    tree = est.estimators_[stage, k].tree_
+                else:
+                    tree = est.estimators_[stage].tree_
+                _partial_dependence_tree(tree, grid, target_variables,
+                                         learning_rate, pdp[k])
+        if isinstance(est, ForestRegressor):
+            pdp /= n_estimators
+    elif method == 'exact':
+        n_samples = X.shape[0]
+        pdp = []
+        for row in range(grid.shape[0]):
+            X_eval = X.copy()
+            for i, variable in enumerate(target_variables):
+                X_eval[:, variable] = np.repeat(grid[row, i], n_samples)
+            pdp_row = _predict(est, X_eval, output=output)
+            if est._estimator_type == 'regressor':
+                pdp.append(np.mean(pdp_row))
+            else:
+                pdp.append(np.mean(pdp_row, 0))
+        pdp = np.array(pdp).transpose()
+        if pdp.shape[0] == 2:
+            # Binary classification
+            pdp = pdp[1, :][np.newaxis]
+        elif pdp.ndim == 1:
+            # Regression
+            pdp = pdp[np.newaxis]
+    elif method == 'estimated':
+        n_samples = grid.shape[0]
+        X_eval = np.tile(np.median(X, 0), [n_samples, 1])
+        for i, variable in enumerate(target_variables):
+            X_eval[:, variable] = grid[:, i]
+        pdp = _predict(est, X_eval, output=output)
+        if est._estimator_type == 'classifier' and pdp.shape[1] == 2:
+            # Binary classification
+            pdp = pdp[:, 1][np.newaxis]
+        elif est._estimator_type == 'classifier' and pdp.shape[1] > 2:
+            # Multi-class classification
+            pdp = pdp.T
+        if est._estimator_type == 'regressor':
+            pdp = pdp[np.newaxis]
+    else:
+        raise ValueError('method "%s" is invalid. Use "recursion", "exact", '
+                         '"estimated", or "auto".' % method)
+    return pdp, axes
+
+
+def plot_partial_dependence(est, X, features, feature_names=None,
+                            label=None, output=None,
+                            n_cols=3, grid_resolution=100,
+                            percentiles=(0.05, 0.95), method='auto', n_jobs=1,
+                            verbose=0, ax=None, line_kw=None,
+                            contour_kw=None, **fig_kw):
+    """Partial dependence plots for ``features``.
+
+    The ``len(features)`` plots are arranged in a grid with ``n_cols``
+    columns. Two-way partial dependence plots are plotted as contour
+    plots.
+
+    Read more in the :ref:`User Guide `.
+
+    Parameters
+    ----------
+    est : BaseEstimator
+        A fitted classification or regression model.
+    X : array-like, shape=(n_samples, n_features)
+        The data on which ``est`` was trained.
+    features : seq of ints, strings, or tuples of ints or strings
+        If seq[i] is an int or a tuple with one int value, a one-way
+        PDP is created; if seq[i] is a tuple of two ints, a two-way
+        PDP is created.
+        If feature_names is specified and seq[i] is an int, seq[i]
+        must be < len(feature_names).
+        If seq[i] is a string, feature_names must be specified, and
+        seq[i] must be in feature_names.
+    feature_names : seq of str
+        Name of each feature; feature_names[i] holds
+        the name of the feature with index i.
+    label : object
+        The class label for which the PDPs should be computed.
+        Only used if est is a multi-class model. Must be in ``est.classes_``.
+    output : int, optional (default=None)
+        The output index to use for multi-output estimators.
+    n_cols : int
+        The number of columns in the grid plot (default: 3).
+    grid_resolution : int, default=100
+        The number of equally spaced points on the axes.
+    percentiles : (low, high), default=(0.05, 0.95)
+        The lower and upper percentile used to create the extreme values
+        for the PDP axes.
+    method : {'recursion', 'exact', 'estimated', 'auto'}, default='auto'
+        The method to use to calculate the partial dependence function:
+
+        - If 'recursion', the underlying trees of ``est`` will be recursed to
+          calculate the function. Only supported for BaseGradientBoosting and
+          ForestRegressor.
+        - If 'exact', the function will be calculated by calling the
+          ``predict_proba`` method of ``est`` for classification or
+          ``predict`` for regression on ``X`` for every point in the grid. To
+          speed up this method, you can use a subset of ``X`` or a coarser
+          grid.
+        - If 'estimated', the function will be calculated by calling the
+          ``predict_proba`` method of ``est`` for classification or
+          ``predict`` for regression on the median of ``X``.
+        - If 'auto', then 'recursion' will be used if ``est`` is
+          BaseGradientBoosting or ForestRegressor, and 'exact' used for other
+          estimators.
+    n_jobs : int
+        The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
+        Defaults to 1.
+    verbose : int
+        Verbose output during PD computations. Defaults to 0.
+    ax : Matplotlib axis object, default None
+        An axis object onto which the plots will be drawn.
+    line_kw : dict
+        Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.
+        For one-way partial dependence plots.
+    contour_kw : dict
+        Dict with keywords passed to the ``matplotlib.pyplot.contourf`` call.
+        For two-way partial dependence plots.
+    **fig_kw : dict
+        Dict with keywords passed to the figure() call.
+        Note that all keywords not recognized above will be automatically
+        included here.
+
+    Returns
+    -------
+    fig : figure
+        The Matplotlib Figure object.
+    axs : seq of Axis objects
+        A seq of Axis objects, one for each subplot.
+
+    Examples
+    --------
+    >>> from sklearn.datasets import make_friedman1
+    >>> from sklearn.ensemble import GradientBoostingRegressor
+    >>> X, y = make_friedman1()
+    >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)
+    >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP
+    ...
+    """
+    import matplotlib.pyplot as plt
+    from matplotlib import transforms
+    from matplotlib.ticker import MaxNLocator
+    from matplotlib.ticker import ScalarFormatter
+    # TODO: The pattern below is required to avoid a namespace collision.
+    # TODO: Move below imports to module level import at 0.22 release.
+    from .ensemble.gradient_boosting import BaseGradientBoosting
+    from .ensemble.forest import ForestRegressor
+
+    if method == 'auto':
+        if isinstance(est, (BaseGradientBoosting, ForestRegressor)):
+            method = 'recursion'
+        else:
+            method = 'exact'
+    if (not isinstance(est, (BaseGradientBoosting, ForestRegressor)) and
+            method == 'recursion'):
+        raise ValueError('est has to be an instance of BaseGradientBoosting '
+                         'or ForestRegressor for the "recursion" method. Try '
+                         'using method="exact" or "estimated".')
+    if (not hasattr(est, '_estimator_type') or
+            est._estimator_type not in ('classifier', 'regressor')):
+        raise ValueError('est must be a fitted regressor or classifier model.')
+    if (method != 'recursion' and est._estimator_type == 'classifier' and
+            not hasattr(est, 'predict_proba')):
+        raise ValueError('est requires a predict_proba method for '
+                         'method="exact" or "estimated" for classification.')
+    if X is not None:
+        X = check_array(X, dtype=DTYPE, order='C')
+    if method == 'recursion':
+        check_is_fitted(est, 'estimators_',
+                        msg='Call %s.fit before partial_dependence' %
+                        est.__class__.__name__)
+        n_features = est.n_features_
+    elif X is None:
+        raise ValueError('X is required for method="exact" or "estimated".')
+    else:
+        n_features = X.shape[1]
+
+    # set label_idx for multi-class estimators
+    if hasattr(est, 'classes_') and np.size(est.classes_) > 2:
+        if label is None:
+            raise ValueError('label is not given for multi-class PDP')
+        if isinstance(est.classes_, list):
+            # multi-output classification
+            if output is None:
+                raise ValueError('output is required for multi-output '
+                                 'estimators')
+            if output >= len(est.classes_):
+                raise ValueError('output %d exceeds number of outputs in est, '
+                                 '%d' % (output, len(est.classes_)))
+            label_idx = np.searchsorted(est.classes_[output], label)
+            if est.classes_[output][label_idx] != label:
+                raise ValueError('label %s not in ``est.classes_``' %
+                                 str(label))
+        else:
+            label_idx = np.searchsorted(est.classes_, label)
+            if est.classes_[label_idx] != label:
+                raise ValueError('label %s not in ``est.classes_``' %
+                                 str(label))
+    else:
+        # regression and binary classification
+        label_idx = 0
+
+    if hasattr(est, 'n_features_') and est.n_features_ != X.shape[1]:
+        raise ValueError('X.shape[1] does not match est.n_features_')
+
+    if line_kw is None:
+        line_kw = {'color': 'green'}
+    if contour_kw is None:
+        contour_kw = {}
+
+    # convert feature_names to list
+    if feature_names is None:
+        # if no feature_names are given, use feature indices as names
+        feature_names = [str(i) for i in range(n_features)]
+    elif isinstance(feature_names, np.ndarray):
+        feature_names = feature_names.tolist()
+
+    def convert_feature(fx):
+        if isinstance(fx, six.string_types):
+            try:
+                fx = feature_names.index(fx)
+            except ValueError:
+                raise ValueError('Feature %s not in feature_names' % fx)
+        return fx
+
+    # convert features into a seq of int tuples
+    tmp_features = []
+    for
fxs in features: + if isinstance(fxs, (numbers.Integral,) + six.string_types): + fxs = (fxs,) + try: + fxs = np.array([convert_feature(fx) for fx in fxs], dtype=np.int32) + except TypeError: + raise ValueError('features must be either int, str, or tuple ' + 'of int/str') + if not (1 <= np.size(fxs) <= 2): + raise ValueError('target features must be either one or two') + + tmp_features.append(fxs) + + features = tmp_features + + names = [] + try: + for fxs in features: + l = [] + # explicit loop so "i" is bound for exception below + for i in fxs: + l.append(feature_names[i]) + names.append(l) + except IndexError: + raise ValueError('All entries of features must be less than ' + 'len(feature_names) = {0}, got {1}.' + .format(len(feature_names), i)) + + # compute PD functions + pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( + delayed(partial_dependence)(est, fxs, X=X, output=output, + method=method, + grid_resolution=grid_resolution, + percentiles=percentiles) + for fxs in features) + + # get global min and max values of PD grouped by plot type + pdp_lim = {} + for pdp, axes in pd_result: + min_pd, max_pd = pdp[label_idx].min(), pdp[label_idx].max() + n_fx = len(axes) + old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) + min_pd = min(min_pd, old_min_pd) + max_pd = max(max_pd, old_max_pd) + pdp_lim[n_fx] = (min_pd, max_pd) + + # create contour levels for two-way plots + if 2 in pdp_lim: + Z_level = np.linspace(*pdp_lim[2], num=8) + + if ax is None: + fig = plt.figure(**fig_kw) + else: + fig = ax.get_figure() + fig.clear() + + n_cols = min(n_cols, len(features)) + n_rows = int(np.ceil(len(features) / float(n_cols))) + axs = [] + for i, fx, name, (pdp, axes) in zip(count(), features, names, + pd_result): + ax = fig.add_subplot(n_rows, n_cols, i + 1) + + if len(axes) == 1: + ax.plot(axes[0], pdp[label_idx].ravel(), **line_kw) + else: + # make contour plot + assert len(axes) == 2 + XX, YY = np.meshgrid(axes[0], axes[1]) + Z = pdp[label_idx].reshape(list(map(np.size, axes))).T + CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, + colors='k') + ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], + vmin=Z_level[0], alpha=0.75, **contour_kw) + ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True) + + # plot data deciles + axes labels + deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1)) + trans = transforms.blended_transform_factory(ax.transData, + ax.transAxes) + ylim = ax.get_ylim() + ax.vlines(deciles, [0], 0.05, transform=trans, color='k') + ax.set_xlabel(name[0]) + ax.set_ylim(ylim) + + # prevent x-axis ticks from overlapping + ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower')) + tick_formatter = ScalarFormatter() + tick_formatter.set_powerlimits((-3, 4)) + ax.xaxis.set_major_formatter(tick_formatter) + + if len(axes) > 1: + # two-way PDP - y-axis deciles + labels + deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1)) + trans = transforms.blended_transform_factory(ax.transAxes, + ax.transData) + xlim = ax.get_xlim() + ax.hlines(deciles, [0], 0.05, transform=trans, color='k') + ax.set_ylabel(name[1]) + # hline erases xlim + ax.set_xlim(xlim) + else: + ax.set_ylabel('Partial dependence') + + if len(axes) == 1: + ax.set_ylim(pdp_lim[1]) + axs.append(ax) + + fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4, + hspace=0.3) + return fig, axs diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py new file mode 100644 index 0000000000000..aaa2e539cf594 --- 
/dev/null +++ b/sklearn/tests/test_partial_dependence.py @@ -0,0 +1,328 @@ +""" +Testing for the partial dependence module. +""" + +import numpy as np +from numpy.testing import assert_array_equal + +from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import if_matplotlib +from sklearn.utils.testing import all_estimators +from sklearn.utils.testing import ignore_warnings +from sklearn.partial_dependence import partial_dependence +from sklearn.partial_dependence import plot_partial_dependence +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.ensemble import GradientBoostingRegressor +from sklearn.ensemble.gradient_boosting import BaseGradientBoosting +from sklearn.ensemble.forest import ForestRegressor +from sklearn.datasets import load_boston, load_iris +from sklearn.datasets import make_classification, make_regression + +# toy sample +X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] +y = [-1, -1, -1, 1, 1, 1] + +# Make some sample data to test output shapes +X_c, y_c = make_classification(n_features=10, n_informative=5, random_state=0) +# Non-negative for MultinomialNB +X_c = X_c + np.abs(X_c.min()) +X_r, y_r = make_regression(n_features=10, n_informative=5, random_state=0) + +# Load the boston & iris datasets +boston = load_boston() +iris = load_iris() + + +@ignore_warnings() +def test_output_shape_recursion(): + # Test recursion partial_dependence has same output shape for everything + for name, Estimator in all_estimators(): + est = Estimator() + if not (isinstance(est, BaseGradientBoosting) or + isinstance(est, ForestRegressor)): + continue + if est._estimator_type == 'classifier': + est.fit(X_c, y_c) + pdp, axes = partial_dependence(est, + target_variables=[1], + X=X_c, + method='recursion', + grid_resolution=10) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_c, + method='recursion', + grid_resolution=10) + assert (pdp.shape == (1, 100)) + elif est._estimator_type == 'regressor': + est.fit(X_r, y_r) + pdp, axes = partial_dependence(est, + target_variables=[1], + X=X_r, + method='recursion', + grid_resolution=10) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_r, + method='recursion', + grid_resolution=10) + assert (pdp.shape == (1, 100)) + + +@ignore_warnings() +def test_output_shape_exact(): + # Test exact partial_dependence has same output shape for everything + for name, Estimator in all_estimators(): + est = Estimator() + if not hasattr(est, '_estimator_type') or 'MultiTask' in name: + continue + if est._estimator_type == 'classifier': + if not hasattr(est, 'predict_proba'): + continue + est.fit(X_c, y_c) + pdp, axes = partial_dependence(est, + target_variables=[1], + X=X_c, + method='exact', + grid_resolution=10) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_c, + method='exact', + grid_resolution=10) + assert (pdp.shape == (1, 100)) + elif est._estimator_type == 'regressor': + est.fit(X_r, y_r) + pdp, axes = partial_dependence(est, + target_variables=[1], + X=X_r, + method='exact', + grid_resolution=10) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_r, + method='exact', + grid_resolution=10) + assert (pdp.shape == (1, 100)) + + +@ignore_warnings() +def test_output_shape_estimated(): + # Test estimated partial_dependence has same output shape for everything + for name, Estimator in all_estimators(): + est 
= Estimator()
+        if not hasattr(est, '_estimator_type') or 'MultiTask' in name:
+            continue
+        if est._estimator_type == 'classifier':
+            if not hasattr(est, 'predict_proba'):
+                continue
+            est.fit(X_c, y_c)
+            pdp, axes = partial_dependence(est,
+                                           target_variables=[1],
+                                           X=X_c,
+                                           method='estimated',
+                                           grid_resolution=10)
+            assert (pdp.shape == (1, 10))
+            pdp, axes = partial_dependence(est,
+                                           target_variables=[1, 2],
+                                           X=X_c,
+                                           method='estimated',
+                                           grid_resolution=10)
+            assert (pdp.shape == (1, 100))
+        elif est._estimator_type == 'regressor':
+            est.fit(X_r, y_r)
+            pdp, axes = partial_dependence(est,
+                                           target_variables=[1],
+                                           X=X_r,
+                                           method='estimated',
+                                           grid_resolution=10)
+            assert (pdp.shape == (1, 10))
+            pdp, axes = partial_dependence(est,
+                                           target_variables=[1, 2],
+                                           X=X_r,
+                                           method='estimated',
+                                           grid_resolution=10)
+            assert (pdp.shape == (1, 100))
+
+
+def test_partial_dependence_classifier():
+    # Test partial dependence for classifier
+    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
+    clf.fit(X, y)
+
+    pdp, axes = partial_dependence(clf, [0], X=X, grid_resolution=5)
+
+    # only 4 grid points instead of 5 because only 4 unique X[:,0] vals
+    assert pdp.shape == (1, 4)
+    assert axes[0].shape[0] == 4
+
+    # now with our own grid
+    X_ = np.asarray(X)
+    grid = np.unique(X_[:, 0])
+    pdp_2, axes = partial_dependence(clf, [0], grid=grid)
+
+    assert axes is None
+    assert_array_equal(pdp, pdp_2)
+
+
+def test_partial_dependence_multiclass():
+    # Test partial dependence for multi-class classifier
+    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
+    clf.fit(iris.data, iris.target)
+
+    grid_resolution = 25
+    n_classes = clf.n_classes_
+    pdp, axes = partial_dependence(
+        clf, [0], X=iris.data, grid_resolution=grid_resolution)
+
+    assert pdp.shape == (n_classes, grid_resolution)
+    assert len(axes) == 1
+    assert axes[0].shape[0] == grid_resolution
+
+
+def test_partial_dependence_regressor():
+    # Test partial dependence for regressor
+    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
+    clf.fit(boston.data, boston.target)
+
+    grid_resolution = 25
+    pdp, axes = partial_dependence(
+        clf, [0], X=boston.data, grid_resolution=grid_resolution)
+
+    assert pdp.shape == (1, grid_resolution)
+    assert axes[0].shape[0] == grid_resolution
+
+
+def test_partial_dependency_input():
+    # Test input validation of partial dependence.
+    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
+    clf.fit(X, y)
+
+    assert_raises(ValueError, partial_dependence,
+                  clf, [0], grid=None, X=None)
+
+    assert_raises(ValueError, partial_dependence,
+                  clf, [0], grid=[0, 1], X=X)
+
+    # first argument must be a classifier or regressor model
+    assert_raises(ValueError, partial_dependence,
+                  {}, [0], X=X)
+
+    # estimator must be fitted before computing partial dependence
+    assert_raises(ValueError, partial_dependence,
+                  GradientBoostingClassifier(), [0], X=X)
+
+    assert_raises(ValueError, partial_dependence, clf, [-1], X=X)
+
+    assert_raises(ValueError, partial_dependence, clf, [100], X=X)
+
+    # wrong ndim for grid
+    grid = np.random.rand(10, 2, 1)
+    assert_raises(ValueError, partial_dependence, clf, [0], grid=grid)
+
+
+@if_matplotlib
+def test_plot_partial_dependence():
+    # Test partial dependence plot function.
+    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
+    clf.fit(boston.data, boston.target)
+
+    grid_resolution = 25
+    fig, axs = plot_partial_dependence(clf, boston.data, [0, 1, (0, 1)],
+                                       grid_resolution=grid_resolution,
+                                       feature_names=boston.feature_names)
+    assert len(axs) == 3
+    assert all(ax.has_data() for ax in axs)
+
+    # check with str features and array feature names
+    fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN',
+                                                          ('CRIM', 'ZN')],
+                                       grid_resolution=grid_resolution,
+                                       feature_names=boston.feature_names)
+
+    assert len(axs) == 3
+    assert all(ax.has_data() for ax in axs)
+
+    # check with list feature_names
+    feature_names = boston.feature_names.tolist()
+    fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN',
+                                                          ('CRIM', 'ZN')],
+                                       grid_resolution=grid_resolution,
+                                       feature_names=feature_names)
+    assert len(axs) == 3
+    assert all(ax.has_data() for ax in axs)
+
+
+@if_matplotlib
+def test_plot_partial_dependence_input():
+    # Test partial dependence plot function input checks.
+    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
+
+    # not fitted yet
+    assert_raises(ValueError, plot_partial_dependence,
+                  clf, X, [0])
+
+    clf.fit(X, y)
+
+    assert_raises(ValueError, plot_partial_dependence,
+                  clf, np.array(X)[:, :0], [0])
+
+    # first argument must be a classifier or regressor model
+    assert_raises(ValueError, plot_partial_dependence,
+                  {}, X, [0])
+
+    # must be larger than -1
+    assert_raises(ValueError, plot_partial_dependence,
+                  clf, X, [-1])
+
+    # too large feature value
+    assert_raises(ValueError, plot_partial_dependence,
+                  clf, X, [100])
+
+    # str feature but no feature_names
+    assert_raises(ValueError, plot_partial_dependence,
+                  clf, X, ['foobar'])
+
+    # not valid features value
+    assert_raises(ValueError, plot_partial_dependence,
+                  clf, X, [{'foo': 'bar'}])
+
+
+@if_matplotlib
+def test_plot_partial_dependence_multiclass():
+    # Test partial dependence plot function on multi-class input.
+    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
+    clf.fit(iris.data, iris.target)
+
+    grid_resolution = 25
+    fig, axs = plot_partial_dependence(clf, iris.data, [0, 1],
+                                       label=0,
+                                       grid_resolution=grid_resolution)
+    assert len(axs) == 2
+    assert all(ax.has_data() for ax in axs)
+
+    # now with symbol labels
+    target = iris.target_names[iris.target]
+    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
+    clf.fit(iris.data, target)
+
+    grid_resolution = 25
+    fig, axs = plot_partial_dependence(clf, iris.data, [0, 1],
+                                       label='setosa',
+                                       grid_resolution=grid_resolution)
+    assert len(axs) == 2
+    assert all(ax.has_data() for ax in axs)
+
+    # label not in est.classes_
+    assert_raises(ValueError, plot_partial_dependence,
+                  clf, iris.data, [0, 1], label='foobar',
+                  grid_resolution=grid_resolution)
+
+    # label not provided
+    assert_raises(ValueError, plot_partial_dependence,
+                  clf, iris.data, [0, 1],
+                  grid_resolution=grid_resolution)
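
A minimal usage sketch of the new top-level API introduced by this diff. The estimator, dataset, and feature indices below are illustrative only (any fitted regressor or classifier works for method='exact'), and the plotting call assumes matplotlib is installed:

# Sketch of the new sklearn.partial_dependence API added in this diff.
# RandomForestRegressor and the Boston features RM (5) / LSTAT (12) are
# illustrative choices, not part of the change itself.
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.partial_dependence import (partial_dependence,
                                        plot_partial_dependence)

boston = load_boston()
est = RandomForestRegressor(n_estimators=10, random_state=0)
est.fit(boston.data, boston.target)

# method='auto' resolves to the fast 'recursion' path for forests and
# gradient boosting; pdp has shape (1, n_points) for regression.
pdp, axes = partial_dependence(est, target_variables=[5], X=boston.data,
                               grid_resolution=50)
print(pdp.shape)  # (1, 50)

# Model-agnostic path: average real predict() calls over X for each grid
# point, at the cost of one predict pass per point.
pdp_exact, _ = partial_dependence(est, target_variables=[5], X=boston.data,
                                  method='exact', grid_resolution=50)

# One-way PDPs for features 5 and 12, plus their two-way contour plot.
fig, axs = plot_partial_dependence(est, boston.data, [5, 12, (5, 12)],
                                   feature_names=boston.feature_names,
                                   grid_resolution=50)

The 'estimated' variant evaluates the model on the median of X instead of averaging over all rows, trading fidelity for speed.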