From b84ba1c45222c089d2d58ab73066f234f08f033b Mon Sep 17 00:00:00 2001 From: Juan Orduz Date: Thu, 26 Dec 2024 21:39:55 +0100 Subject: [PATCH 01/10] Update MyPy 14 (#210) * move mypy config * some fixes * some fixes * some fixes * some fixes * some fixes * some fixes * remove reference np.float64 * remove unnesserary casting * fix type * fix import --- .pre-commit-config.yaml | 4 +-- mypy.ini | 15 --------- pymc_bart/bart.py | 6 ++-- pymc_bart/pgbart.py | 73 ++++++++++++++++++++++++----------------- pymc_bart/tree.py | 58 +++++++++++++++++++------------- pymc_bart/utils.py | 50 ++++++++++++++-------------- pyproject.toml | 17 ++++++++++ 7 files changed, 123 insertions(+), 100 deletions(-) delete mode 100644 mypy.ini diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fe00024..4f55bc1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,14 +12,14 @@ ci: repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.3 + rev: v0.8.4 hooks: - id: ruff args: ["--fix", "--output-format=full"] - id: ruff-format args: ["--line-length=100"] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.13.0 + rev: v1.14.0 hooks: - id: mypy args: [--ignore-missing-imports] diff --git a/mypy.ini b/mypy.ini deleted file mode 100644 index 56088d7..0000000 --- a/mypy.ini +++ /dev/null @@ -1,15 +0,0 @@ -[mypy] -files = pymc_bart/*.py -plugins = numpy.typing.mypy_plugin - -[mypy-matplotlib.*] -ignore_missing_imports = True - -[mypy-numba.*] -ignore_missing_imports = True - -[mypy-pymc.*] -ignore_missing_imports = True - -[mypy-scipy.*] -ignore_missing_imports = True diff --git a/pymc_bart/bart.py b/pymc_bart/bart.py index decb499..ac2be35 100644 --- a/pymc_bart/bart.py +++ b/pymc_bart/bart.py @@ -132,7 +132,7 @@ def __new__( alpha: float = 0.95, beta: float = 2.0, response: str = "constant", - split_prior: Optional[npt.NDArray[np.float64]] = None, + split_prior: Optional[npt.NDArray] = None, split_rules: Optional[list[SplitRule]] = None, separate_trees: Optional[bool] = False, **kwargs, @@ -203,9 +203,7 @@ def get_moment(cls, rv, size, *rv_inputs): return mean -def preprocess_xy( - X: TensorLike, Y: TensorLike -) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: +def preprocess_xy(X: TensorLike, Y: TensorLike) -> tuple[npt.NDArray, npt.NDArray]: if isinstance(Y, (Series, DataFrame)): Y = Y.to_numpy() if isinstance(X, (Series, DataFrame)): diff --git a/pymc_bart/pgbart.py b/pymc_bart/pgbart.py index 1505f15..014313a 100644 --- a/pymc_bart/pgbart.py +++ b/pymc_bart/pgbart.py @@ -16,6 +16,8 @@ import numpy as np import numpy.typing as npt +import pymc as pm +import pytensor.tensor as pt from numba import njit from pymc.initial_point import PointType from pymc.model import Model, modelcontext @@ -120,15 +122,15 @@ class PGBART(ArrayStepShared): "tune": (bool, []), } - def __init__( # noqa: PLR0915 + def __init__( # noqa: PLR0912, PLR0915 self, - vars=None, # pylint: disable=redefined-builtin + vars: list[pm.Distribution] | None = None, num_particles: int = 10, batch: tuple[float, float] = (0.1, 0.1), model: Optional[Model] = None, initial_point: PointType | None = None, - compile_kwargs: dict | None = None, # pylint: disable=unused-argument - ): + compile_kwargs: dict | None = None, + ) -> None: model = modelcontext(model) if initial_point is None: initial_point = model.initial_point() @@ -137,6 +139,10 @@ def __init__( # noqa: PLR0915 else: vars = [model.rvs_to_values.get(var, var) for var in vars] vars = inputvars(vars) + + if vars is None: + raise 
ValueError("Unable to find variables to sample") + value_bart = vars[0] self.bart = model.values_to_rvs[value_bart].owner.op @@ -325,7 +331,7 @@ def normalize(self, particles: list[ParticleTree]) -> float: return wei / wei.sum() def resample( - self, particles: list[ParticleTree], normalized_weights: npt.NDArray[np.float64] + self, particles: list[ParticleTree], normalized_weights: npt.NDArray ) -> list[ParticleTree]: """ Use systematic resample for all but the first particle @@ -347,7 +353,7 @@ def resample( return particles def get_particle_tree( - self, particles: list[ParticleTree], normalized_weights: npt.NDArray[np.float64] + self, particles: list[ParticleTree], normalized_weights: npt.NDArray ) -> tuple[ParticleTree, Tree]: """ Sample a new particle and associated tree @@ -359,7 +365,7 @@ def get_particle_tree( return new_particle, new_particle.tree - def systematic(self, normalized_weights: npt.NDArray[np.float64]) -> npt.NDArray[np.int_]: + def systematic(self, normalized_weights: npt.NDArray) -> npt.NDArray[np.int_]: """ Systematic resampling. @@ -395,7 +401,7 @@ def update_weight(self, particle: ParticleTree, odim: int) -> None: particle.log_weight = new_likelihood @staticmethod - def competence(var, has_grad): + def competence(var: pm.Distribution, has_grad: bool) -> Competence: """PGBART is only suitable for BART distributions.""" dist = getattr(var.owner, "op", None) if isinstance(dist, BARTRV): @@ -406,12 +412,12 @@ def competence(var, has_grad): class RunningSd: """Welford's online algorithm for computing the variance/standard deviation""" - def __init__(self, shape: tuple) -> None: + def __init__(self, shape: tuple[int, ...]) -> None: self.count = 0 # number of data points self.mean = np.zeros(shape) # running mean self.m_2 = np.zeros(shape) # running second moment - def update(self, new_value: npt.NDArray[np.float64]) -> Union[float, npt.NDArray[np.float64]]: + def update(self, new_value: npt.NDArray) -> Union[float, npt.NDArray]: self.count = self.count + 1 self.mean, self.m_2, std = _update(self.count, self.mean, self.m_2, new_value) return fast_mean(std) @@ -420,10 +426,10 @@ def update(self, new_value: npt.NDArray[np.float64]) -> Union[float, npt.NDArray @njit def _update( count: int, - mean: npt.NDArray[np.float64], - m_2: npt.NDArray[np.float64], - new_value: npt.NDArray[np.float64], -) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64], Union[float, npt.NDArray[np.float64]]]: + mean: npt.NDArray, + m_2: npt.NDArray, + new_value: npt.NDArray, +) -> tuple[npt.NDArray, npt.NDArray, Union[float, npt.NDArray]]: delta = new_value - mean mean += delta / count delta2 = new_value - mean @@ -434,7 +440,7 @@ def _update( class SampleSplittingVariable: - def __init__(self, alpha_vec: npt.NDArray[np.float64]) -> None: + def __init__(self, alpha_vec: npt.NDArray) -> None: """ Sample splitting variables proportional to `alpha_vec`. 
@@ -547,16 +553,16 @@ def filter_missing_values(available_splitting_values, idx_data_points, missing_d def draw_leaf_value( - y_mu_pred: npt.NDArray[np.float64], - x_mu: npt.NDArray[np.float64], + y_mu_pred: npt.NDArray, + x_mu: npt.NDArray, m: int, - norm: npt.NDArray[np.float64], + norm: npt.NDArray, shape: int, response: str, -) -> tuple[npt.NDArray[np.float64], Optional[npt.NDArray[np.float64]]]: +) -> tuple[npt.NDArray, Optional[npt.NDArray]]: """Draw Gaussian distributed leaf values.""" linear_params = None - mu_mean = np.empty(shape) + mu_mean: npt.NDArray if y_mu_pred.size == 0: return np.zeros(shape), linear_params @@ -571,7 +577,7 @@ def draw_leaf_value( @njit -def fast_mean(ari: npt.NDArray[np.float64]) -> Union[float, npt.NDArray[np.float64]]: +def fast_mean(ari: npt.NDArray) -> Union[float, npt.NDArray]: """Use Numba to speed up the computation of the mean.""" if ari.ndim == 1: count = ari.shape[0] @@ -590,11 +596,11 @@ def fast_mean(ari: npt.NDArray[np.float64]) -> Union[float, npt.NDArray[np.float @njit def fast_linear_fit( - x: npt.NDArray[np.float64], - y: npt.NDArray[np.float64], + x: npt.NDArray, + y: npt.NDArray, m: int, - norm: npt.NDArray[np.float64], -) -> tuple[npt.NDArray[np.float64], list[npt.NDArray[np.float64]]]: + norm: npt.NDArray, +) -> tuple[npt.NDArray, list[npt.NDArray]]: n = len(x) y = y / m + np.expand_dims(norm, axis=1) @@ -678,17 +684,17 @@ def update(self): @njit def inverse_cdf( - single_uniform: npt.NDArray[np.float64], normalized_weights: npt.NDArray[np.float64] + single_uniform: npt.NDArray, normalized_weights: npt.NDArray ) -> npt.NDArray[np.int_]: """ Inverse CDF algorithm for a finite distribution. Parameters ---------- - single_uniform: npt.NDArray[np.float64] + single_uniform: npt.NDArray Ordered points in [0,1] - normalized_weights: npt.NDArray[np.float64]) + normalized_weights: npt.NDArray) Normalized weights Returns @@ -711,7 +717,7 @@ def inverse_cdf( @njit -def jitter_duplicated(array: npt.NDArray[np.float64], std: float) -> npt.NDArray[np.float64]: +def jitter_duplicated(array: npt.NDArray, std: float) -> npt.NDArray: """ Jitter duplicated values. """ @@ -727,12 +733,17 @@ def jitter_duplicated(array: npt.NDArray[np.float64], std: float) -> npt.NDArray @njit -def are_whole_number(array: npt.NDArray[np.float64]) -> np.bool_: +def are_whole_number(array: npt.NDArray) -> np.bool_: """Check if all values in array are whole numbers""" return np.all(np.mod(array[~np.isnan(array)], 1) == 0) -def logp(point, out_vars, vars, shared): # pylint: disable=redefined-builtin +def logp( + point, + out_vars: list[pm.Distribution], + vars: list[pm.Distribution], + shared: list[pt.TensorVariable], +): """Compile PyTensor function of the model and the input and output variables. 
Parameters diff --git a/pymc_bart/tree.py b/pymc_bart/tree.py index 7655175..61e5050 100644 --- a/pymc_bart/tree.py +++ b/pymc_bart/tree.py @@ -28,7 +28,7 @@ class Node: Attributes ---------- - value : npt.NDArray[np.float64] + value : npt.NDArray idx_data_points : Optional[npt.NDArray[np.int_]] idx_split_variable : int linear_params: Optional[list[float]] = None @@ -38,11 +38,11 @@ class Node: def __init__( self, - value: npt.NDArray[np.float64] = np.array([-1.0]), + value: npt.NDArray = np.array([-1.0]), nvalue: int = 0, idx_data_points: Optional[npt.NDArray[np.int_]] = None, idx_split_variable: int = -1, - linear_params: Optional[list[npt.NDArray[np.float64]]] = None, + linear_params: Optional[list[npt.NDArray]] = None, ) -> None: self.value = value self.nvalue = nvalue @@ -53,11 +53,11 @@ def __init__( @classmethod def new_leaf_node( cls, - value: npt.NDArray[np.float64], + value: npt.NDArray, nvalue: int = 0, idx_data_points: Optional[npt.NDArray[np.int_]] = None, idx_split_variable: int = -1, - linear_params: Optional[list[npt.NDArray[np.float64]]] = None, + linear_params: Optional[list[npt.NDArray]] = None, ) -> "Node": return cls( value=value, @@ -101,7 +101,7 @@ class Tree: The dictionary's keys are integers that represent the nodes position. The dictionary's values are objects of type Node that represent the split and leaf nodes of the tree itself. - output: Optional[npt.NDArray[np.float64]] + output: Optional[npt.NDArray] Array of shape number of observations, shape split_rules : list[SplitRule] List of SplitRule objects, one per column in input data. @@ -122,7 +122,7 @@ class Tree: def __init__( self, tree_structure: dict[int, Node], - output: npt.NDArray[np.float64], + output: npt.NDArray, split_rules: list[SplitRule], idx_leaf_nodes: Optional[list[int]] = None, ) -> None: @@ -134,7 +134,7 @@ def __init__( @classmethod def new_tree( cls, - leaf_node_value: npt.NDArray[np.float64], + leaf_node_value: npt.NDArray, idx_data_points: Optional[npt.NDArray[np.int_]], num_observations: int, shape: int, @@ -190,7 +190,7 @@ def grow_leaf_node( self, current_node: Node, selected_predictor: int, - split_value: npt.NDArray[np.float64], + split_value: npt.NDArray, index_leaf_node: int, ) -> None: current_node.value = split_value @@ -222,7 +222,7 @@ def get_split_variables(self) -> Generator[int, None, None]: if node.is_split_node(): yield node.idx_split_variable - def _predict(self) -> npt.NDArray[np.float64]: + def _predict(self) -> npt.NDArray: output = self.output if self.idx_leaf_nodes is not None: @@ -233,23 +233,23 @@ def _predict(self) -> npt.NDArray[np.float64]: def predict( self, - x: npt.NDArray[np.float64], + x: npt.NDArray, excluded: Optional[list[int]] = None, shape: int = 1, - ) -> npt.NDArray[np.float64]: + ) -> npt.NDArray: """ Predict output of tree for an (un)observed point x. Parameters ---------- - x : npt.NDArray[np.float64] + x : npt.NDArray Unobserved point excluded: Optional[list[int]] Indexes of the variables to exclude when computing predictions Returns ------- - npt.NDArray[np.float64] + npt.NDArray Value of the leaf value where the unobserved point lies. """ if excluded is None: @@ -259,16 +259,16 @@ def predict( def _traverse_tree( self, - X: npt.NDArray[np.float64], + X: npt.NDArray, excluded: Optional[list[int]] = None, shape: Union[int, tuple[int, ...]] = 1, - ) -> npt.NDArray[np.float64]: + ) -> npt.NDArray: """ Traverse the tree starting from the root node given an (un)observed point. 
Parameters ---------- - X : npt.NDArray[np.float64] + X : npt.NDArray (Un)observed point(s) node_index : int Index of the node to start the traversal from @@ -279,14 +279,16 @@ def _traverse_tree( Returns ------- - npt.NDArray[np.float64] + npt.NDArray Leaf node value or mean of leaf node values """ x_shape = (1,) if len(X.shape) == 1 else X.shape[:-1] nd_dims = (...,) + (None,) * len(x_shape) - stack = [(0, np.ones(x_shape), 0)] # (node_index, weight, idx_split_variable) initial state + stack: list[tuple[int, npt.NDArray, int]] = [ + (0, np.ones(x_shape), 0) + ] # (node_index, weight, idx_split_variable) initial state p_d = ( np.zeros(shape + x_shape) if isinstance(shape, tuple) else np.zeros((shape,) + x_shape) ) @@ -309,9 +311,19 @@ def _traverse_tree( ) if excluded is not None and idx_split_variable in excluded: prop_nvalue_left = self.get_node(left_node_index).nvalue / node.nvalue - stack.append((left_node_index, weights * prop_nvalue_left, idx_split_variable)) stack.append( - (right_node_index, weights * (1 - prop_nvalue_left), idx_split_variable) + ( + left_node_index, + weights * prop_nvalue_left, + idx_split_variable, + ) + ) + stack.append( + ( + right_node_index, + weights * (1 - prop_nvalue_left), + idx_split_variable, + ) ) else: to_left = ( @@ -328,14 +340,14 @@ def _traverse_tree( return p_d def _traverse_leaf_values( - self, leaf_values: list[npt.NDArray[np.float64]], leaf_n_values: list[int], node_index: int + self, leaf_values: list[npt.NDArray], leaf_n_values: list[int], node_index: int ) -> None: """ Traverse the tree appending leaf values starting from a particular node. Parameters ---------- - leaf_values : list[npt.NDArray[np.float64]] + leaf_values : list[npt.NDArray] node_index : int """ node = self.get_node(node_index) diff --git a/pymc_bart/utils.py b/pymc_bart/utils.py index d9d5241..58d14b8 100644 --- a/pymc_bart/utils.py +++ b/pymc_bart/utils.py @@ -17,7 +17,7 @@ from .tree import Tree -TensorLike = Union[npt.NDArray[np.float64], pt.TensorVariable] +TensorLike = Union[npt.NDArray, pt.TensorVariable] def _sample_posterior( @@ -27,7 +27,7 @@ def _sample_posterior( size: Optional[Union[int, tuple[int, ...]]] = None, excluded: Optional[list[int]] = None, shape: int = 1, -) -> npt.NDArray[np.float64]: +) -> npt.NDArray: """ Generate samples from the BART-posterior. @@ -139,8 +139,8 @@ def plot_convergence( def plot_ice( bartrv: Variable, - X: npt.NDArray[np.float64], - Y: Optional[npt.NDArray[np.float64]] = None, + X: npt.NDArray, + Y: Optional[npt.NDArray] = None, var_idx: Optional[list[int]] = None, var_discrete: Optional[list[int]] = None, func: Optional[Callable] = None, @@ -165,9 +165,9 @@ def plot_ice( ---------- bartrv : BART Random Variable BART variable once the model that include it has been fitted. - X : npt.NDArray[np.float64] + X : npt.NDArray The covariate matrix. - Y : Optional[npt.NDArray[np.float64]], by default None. + Y : Optional[npt.NDArray], by default None. The response vector. var_idx : Optional[list[int]], by default None. List of the indices of the covariate for which to compute the pdp or ice. 
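
The `_traverse_tree` hunks above keep the traversal stack explicit: each entry
is `(node_index, weight, idx_split_variable)`, and when a split variable is in
`excluded` both children are visited, each weighted by its share of training
observations (`nvalue`). A toy, self-contained version of that weighted
descent (hypothetical dict-based nodes, for illustration only, not the `Tree`
class above):

    import numpy as np

    def traverse(nodes, x, excluded=()):
        """Weighted prediction from a binary tree stored as a dict of nodes."""
        total, stack = 0.0, [(0, 1.0)]  # (node_id, weight), start at the root
        while stack:
            node_id, weight = stack.pop()
            node = nodes[node_id]
            if "value" in node:  # leaf: accumulate the weighted leaf value
                total += weight * node["value"]
            elif node["var"] in excluded:  # marginalize: follow both children
                p_left = nodes[node["left"]]["nvalue"] / node["nvalue"]
                stack.append((node["left"], weight * p_left))
                stack.append((node["right"], weight * (1.0 - p_left)))
            else:  # ordinary split: follow a single child
                go_left = x[node["var"]] <= node["split"]
                stack.append((node["left"] if go_left else node["right"], weight))
        return total

    # tiny tree: one split on x[0] at 0.5, leaves -1.0 and 2.0
    tree = {
        0: {"var": 0, "split": 0.5, "left": 1, "right": 2, "nvalue": 10},
        1: {"value": -1.0, "nvalue": 6},
        2: {"value": 2.0, "nvalue": 4},
    }
    print(traverse(tree, np.array([0.3])))                 # -1.0
    print(traverse(tree, np.array([0.3]), excluded=(0,)))  # 0.6*(-1.0) + 0.4*2.0 = 0.2
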
@@ -283,8 +283,8 @@ def identity(x): def plot_pdp( bartrv: Variable, - X: npt.NDArray[np.float64], - Y: Optional[npt.NDArray[np.float64]] = None, + X: npt.NDArray, + Y: Optional[npt.NDArray] = None, xs_interval: str = "quantiles", xs_values: Optional[Union[int, list[float]]] = None, var_idx: Optional[list[int]] = None, @@ -310,9 +310,9 @@ def plot_pdp( ---------- bartrv : BART Random Variable BART variable once the model that include it has been fitted. - X : npt.NDArray[np.float64] + X : npt.NDArray The covariate matrix. - Y : Optional[npt.NDArray[np.float64]], by default None. + Y : Optional[npt.NDArray], by default None. The response vector. xs_interval : str Method used to compute the values X used to evaluate the predicted function. "linear", @@ -526,14 +526,14 @@ def _get_axes(grid, n_plots, sharex, sharey, figsize): def _prepare_plot_data( - X: npt.NDArray[np.float64], - Y: Optional[npt.NDArray[np.float64]] = None, + X: npt.NDArray, + Y: Optional[npt.NDArray] = None, xs_interval: str = "quantiles", xs_values: Optional[Union[int, list[float]]] = None, var_idx: Optional[list[int]] = None, var_discrete: Optional[list[int]] = None, ) -> tuple[ - npt.NDArray[np.float64], + npt.NDArray, list[str], str, list[int], @@ -619,10 +619,10 @@ def _prepare_plot_data( def _create_pdp_data( - X: npt.NDArray[np.float64], + X: npt.NDArray, xs_interval: str, xs_values: Optional[Union[int, list[float]]] = None, -) -> npt.NDArray[np.float64]: +) -> npt.NDArray: """ Create data for partial dependence plot. @@ -637,7 +637,7 @@ def _create_pdp_data( Returns ------- - npt.NDArray[np.float64] + npt.NDArray A 2D array for the fake_X data. """ if xs_interval == "insample": @@ -654,8 +654,8 @@ def _create_pdp_data( def _smooth_mean( - new_x: npt.NDArray[np.float64], - p_di: npt.NDArray[np.float64], + new_x: npt.NDArray, + p_di: npt.NDArray, kind: str = "pdp", smooth_kwargs: Optional[dict[str, Any]] = None, ) -> tuple[np.ndarray, np.ndarray]: @@ -701,7 +701,7 @@ def plot_variable_inclusion(idata, X, labels=None, figsize=None, plot_kwargs=Non ---------- idata : InferenceData InferenceData containing a collection of BART_trees in sample_stats group - X : npt.NDArray[np.float64] + X : npt.NDArray The covariate matrix. labels : Optional[list[str]] List of the names of the covariates. If X is a DataFrame the names of the covariables will @@ -767,7 +767,7 @@ def plot_variable_inclusion(idata, X, labels=None, figsize=None, plot_kwargs=Non def compute_variable_importance( # noqa: PLR0915 PLR0912 idata: az.InferenceData, bartrv: Variable, - X: npt.NDArray[np.float64], + X: npt.NDArray, method: str = "VI", fixed: int = 0, samples: int = 50, @@ -782,7 +782,7 @@ def compute_variable_importance( # noqa: PLR0915 PLR0912 InferenceData containing a collection of BART_trees in sample_stats group bartrv : BART Random Variable BART variable once the model that include it has been fitted. - X : npt.NDArray[np.float64] + X : npt.NDArray The covariate matrix. method : str Method used to rank variables. 
Available options are "VI" (default), "backward" @@ -826,9 +826,9 @@ def compute_variable_importance( # noqa: PLR0915 PLR0912 else: labels = np.arange(n_vars).astype(str) - r2_mean = np.zeros(n_vars) - r2_hdi = np.zeros((n_vars, 2)) - preds = np.zeros((n_vars, samples, *bartrv.eval().T.shape)) + r2_mean: npt.NDArray = np.zeros(n_vars) + r2_hdi: npt.NDArray = np.zeros((n_vars, 2)) + preds: npt.NDArray = np.zeros((n_vars, samples, *bartrv.eval().T.shape)) if method == "backward_VI": if fixed >= n_vars: @@ -848,7 +848,7 @@ def compute_variable_importance( # noqa: PLR0915 PLR0912 idxs = np.argsort( idata["sample_stats"]["variable_inclusion"].mean(("chain", "draw")).values ) - subsets = [idxs[:-i].tolist() for i in range(1, len(idxs))] + subsets: list[list[int]] = [list(idxs[:-i]) for i in range(1, len(idxs))] subsets.append(None) # type: ignore if method == "backward_VI": diff --git a/pyproject.toml b/pyproject.toml index f8f3e7a..4a2273d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,3 +33,20 @@ exclude_lines = [ isort = 1 black = 1 pyupgrade = 1 + + +[tool.mypy] +files = "pymc_bart/*.py" +plugins = "numpy.typing.mypy_plugin" + +[tool.mypy-matplotlib] +ignore_missing_imports = true + +[tool.mypy-numba] +ignore_missing_imports = true + +[tool.mypy-pymc] +ignore_missing_imports = true + +[tool.mypy-scipy] +ignore_missing_imports = true From 139aeacc360914ef21cf42db2534a4512437198c Mon Sep 17 00:00:00 2001 From: Osvaldo A Martin Date: Sat, 28 Dec 2024 15:17:47 -0300 Subject: [PATCH 02/10] Automatic Changelog (#213) * automatic changelog * add changelog to docs --- .github/workflows/post-release.yml | 19 +++++++++++++++++++ CHANGELOG.md | 0 docs/changelog.rst | 5 +++++ docs/index.rst | 12 +++++++----- 4 files changed, 31 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/post-release.yml create mode 100644 CHANGELOG.md create mode 100644 docs/changelog.rst diff --git a/.github/workflows/post-release.yml b/.github/workflows/post-release.yml new file mode 100644 index 0000000..5526a27 --- /dev/null +++ b/.github/workflows/post-release.yml @@ -0,0 +1,19 @@ +name: Post-release +on: + release: + types: [published, released] + workflow_dispatch: + +jobs: + changelog: + name: Update changelog + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + ref: main + - uses: rhysd/changelog-from-release/action@v3 + with: + file: CHANGELOG.md + github_token: ${{ secrets.GITHUB_TOKEN }} + commit_summary_template: 'update changelog for %s changes' diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/changelog.rst b/docs/changelog.rst new file mode 100644 index 0000000..f83d445 --- /dev/null +++ b/docs/changelog.rst @@ -0,0 +1,5 @@ +Changelog +********* + +.. include:: ../CHANGELOG.md + :parser: myst_parser.sphinx_ diff --git a/docs/index.rst b/docs/index.rst index 4b1dd0e..c73500c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -93,10 +93,12 @@ Contents :maxdepth: 2 examples - api_reference -Indices -======= +References +========== + +.. 
toctree::
+   :maxdepth: 1
 
-* :ref:`genindex`
-* :ref:`modindex`
+   api_reference
+   changelog

From 3bad2c68df2e766ff65091ad64d1fe8472eca143 Mon Sep 17 00:00:00 2001
From: Osvaldo A Martin 
Date: Sat, 28 Dec 2024 18:37:17 -0300
Subject: [PATCH 03/10] Update index.rst

---
 docs/index.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/index.rst b/docs/index.rst
index c73500c..78a59fb 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -29,7 +29,7 @@ interpretation of those models and perform variable selection.
 Installation
 ============
 
-PyMC-BART requires a working Python interpreter (3.8+). We recommend installing Python and key numerical libraries using the `Anaconda distribution `_, which has one-click installers available on all major platforms.
+PyMC-BART requires a working Python interpreter (3.10+). We recommend installing Python and key numerical libraries using the `Anaconda distribution `_, which has one-click installers available on all major platforms.
 
 Assuming a standard Python environment is installed on your machine, PyMC-BART itself can be installed either using pip or conda-forge.
 
From 064457e34d3041bc3886b66a2707b94f5554aac4 Mon Sep 17 00:00:00 2001
From: Osvaldo A Martin 
Date: Sun, 29 Dec 2024 08:11:29 -0300
Subject: [PATCH 04/10] Adds get_variable_inclusion function (#214)

* add get_variable_inclusion function
* add elements to API reference
---
 docs/api_reference.rst |  2 +-
 pymc_bart/__init__.py  |  2 ++
 pymc_bart/utils.py     | 68 +++++++++++++++++++++++++++++++-----------
 3 files changed, 53 insertions(+), 19 deletions(-)

diff --git a/docs/api_reference.rst b/docs/api_reference.rst
index 93afde1..b6fb8a5 100644
--- a/docs/api_reference.rst
+++ b/docs/api_reference.rst
@@ -13,4 +13,4 @@ methods in the current release of PyMC-BART.
 =============================
 
 .. automodule:: pymc_bart
-   :members: BART, PGBART, plot_pdp, plot_ice, plot_variable_importance, plot_convergence, ContinuousSplitRule, OneHotSplitRule, SubsetSplitRule
+   :members: BART, PGBART, compute_variable_importance, get_variable_inclusion, plot_convergence, plot_ice, plot_pdp, plot_scatter_submodels, plot_variable_importance, plot_variable_inclusion, ContinuousSplitRule, OneHotSplitRule, SubsetSplitRule

diff --git a/pymc_bart/__init__.py b/pymc_bart/__init__.py
index 361be83..f4a1f7a 100644
--- a/pymc_bart/__init__.py
+++ b/pymc_bart/__init__.py
@@ -18,6 +18,7 @@
 from pymc_bart.split_rules import ContinuousSplitRule, OneHotSplitRule, SubsetSplitRule
 from pymc_bart.utils import (
     compute_variable_importance,
+    get_variable_inclusion,
     plot_convergence,
     plot_ice,
     plot_pdp,
@@ -33,6 +34,7 @@
     "OneHotSplitRule",
     "SubsetSplitRule",
     "compute_variable_importance",
+    "get_variable_inclusion",
     "plot_convergence",
     "plot_ice",
     "plot_pdp",

diff --git a/pymc_bart/utils.py b/pymc_bart/utils.py
index 58d14b8..df8f76f 100644
--- a/pymc_bart/utils.py
+++ b/pymc_bart/utils.py
@@ -693,6 +693,50 @@ def _smooth_mean(
     return x_data, y_data
 
 
+def get_variable_inclusion(idata, X, labels=None, to_kulprit=False):
+    """
+    Get the normalized variable inclusion from a BART model.
+
+    Parameters
+    ----------
+    idata : InferenceData
+        InferenceData containing a collection of BART_trees in sample_stats group
+    X : npt.NDArray
+        The covariate matrix.
+    labels : Optional[list[str]]
+        List of the names of the covariates. If X is a DataFrame the names of the covariates will
+        be taken from it and this argument will be ignored.
+    to_kulprit : bool
+        If True, the function will return a list of lists with the variable names. 
+ This list can be passed as a path to Kulprit's project method. Defaults to False. + Returns + ------- + VI_norm : npt.NDArray + Normalized variable inclusion. + labels : list[str] + List of the names of the covariates. + """ + VIs = idata["sample_stats"]["variable_inclusion"].mean(("chain", "draw")).values + VI_norm = VIs / VIs.sum() + idxs = np.argsort(VI_norm) + + indices = idxs[::-1] + n_vars = len(indices) + + if hasattr(X, "columns") and hasattr(X, "to_numpy"): + labels = X.columns + + if labels is None: + labels = np.arange(n_vars).astype(str) + + label_list = labels.to_list() + + if to_kulprit: + return [label_list[:idx] for idx in range(n_vars)] + else: + return VI_norm[indices], label_list + + def plot_variable_inclusion(idata, X, labels=None, figsize=None, plot_kwargs=None, ax=None): """ Plot normalized variable inclusion from BART model. @@ -720,26 +764,15 @@ def plot_variable_inclusion(idata, X, labels=None, figsize=None, plot_kwargs=Non Returns ------- - idxs: indexes of the covariates from higher to lower relative importance axes: matplotlib axes """ if plot_kwargs is None: plot_kwargs = {} - VIs = idata["sample_stats"]["variable_inclusion"].mean(("chain", "draw")).values - VIs = VIs / VIs.sum() - idxs = np.argsort(VIs) - - indices = idxs[::-1] - n_vars = len(indices) - - if hasattr(X, "columns") and hasattr(X, "to_numpy"): - labels = X.columns + VI_norm, labels = get_variable_inclusion(idata, X, labels) + n_vars = len(labels) - if labels is None: - labels = np.arange(n_vars).astype(str) - - new_labels = ["+ " + ele if index != 0 else ele for index, ele in enumerate(labels[indices])] + new_labels = ["+ " + ele if index != 0 else ele for index, ele in enumerate(labels)] ticks = np.arange(n_vars, dtype=int) @@ -749,19 +782,18 @@ def plot_variable_inclusion(idata, X, labels=None, figsize=None, plot_kwargs=Non if ax is None: _, ax = plt.subplots(1, 1, figsize=figsize) + ax.axhline(1 / n_vars, color="0.5", linestyle="--") ax.plot( - VIs[indices], + VI_norm, color=plot_kwargs.get("color", "k"), marker=plot_kwargs.get("marker", "o"), ls=plot_kwargs.get("ls", "-"), ) ax.set_xticks(ticks, new_labels, rotation=plot_kwargs.get("rotation", 0)) - - ax.axhline(1 / n_vars, color="0.5", linestyle="--") ax.set_ylim(0, 1) - return idxs, ax + return ax def compute_variable_importance( # noqa: PLR0915 PLR0912 From cd5dfbe4e09e3e450b384eacbc2d3292734ea9e7 Mon Sep 17 00:00:00 2001 From: Osvaldo A Martin Date: Sun, 29 Dec 2024 08:11:49 -0300 Subject: [PATCH 05/10] refactor rng_fn method (#212) --- pymc_bart/bart.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pymc_bart/bart.py b/pymc_bart/bart.py index ac2be35..5114b6e 100644 --- a/pymc_bart/bart.py +++ b/pymc_bart/bart.py @@ -55,12 +55,12 @@ def rng_fn( # pylint: disable=W0237 if not size: size = None - if isinstance(cls.Y, (TensorSharedVariable, TensorVariable)): - Y = cls.Y.eval() - else: - Y = cls.Y - if not cls.all_trees: + if isinstance(cls.Y, (TensorSharedVariable, TensorVariable)): + Y = cls.Y.eval() + else: + Y = cls.Y + if size is not None: return np.full((size[0], Y.shape[0]), Y.mean()) else: From 0d4d6f55a077f05fe93c5973d9512e4cafa374f1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 Jan 2025 12:18:59 +0100 Subject: [PATCH 06/10] [pre-commit.ci] pre-commit autoupdate (#215) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.8.4 → 
v0.8.6](https://github.com/astral-sh/ruff-pre-commit/compare/v0.8.4...v0.8.6) - [github.com/pre-commit/mirrors-mypy: v1.14.0 → v1.14.1](https://github.com/pre-commit/mirrors-mypy/compare/v1.14.0...v1.14.1) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4f55bc1..8a5992a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,14 +12,14 @@ ci: repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.4 + rev: v0.8.6 hooks: - id: ruff args: ["--fix", "--output-format=full"] - id: ruff-format args: ["--line-length=100"] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.14.0 + rev: v1.14.1 hooks: - id: mypy args: [--ignore-missing-imports] From 44c787cc7b2a8473ca6c1f9fb62171004a27167f Mon Sep 17 00:00:00 2001 From: Juan Orduz Date: Tue, 11 Feb 2025 07:54:38 +0100 Subject: [PATCH 07/10] Fix docs by adding path of config (#217) * pre-commit update * add conf.py path --- .pre-commit-config.yaml | 4 ++-- .readthedocs.yaml | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8a5992a..1bc3739 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,14 +12,14 @@ ci: repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.6 + rev: v0.9.6 hooks: - id: ruff args: ["--fix", "--output-format=full"] - id: ruff-format args: ["--line-length=100"] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.14.1 + rev: v1.15.0 hooks: - id: mypy args: [--ignore-missing-imports] diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 6e5cef0..0ce9313 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -1,6 +1,9 @@ # Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details version: 2 +sphinx: + # Path to your Sphinx configuration file. 
+ configuration: docs/conf.py build: os: ubuntu-20.04 From 16a78df60b874005ee3b6dd06a0d2a2e892f0946 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 7 Mar 2025 10:38:37 +0200 Subject: [PATCH 08/10] [pre-commit.ci] pre-commit autoupdate (#219) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.9.6 → v0.9.9](https://github.com/astral-sh/ruff-pre-commit/compare/v0.9.6...v0.9.9) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1bc3739..6a3b804 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,7 +12,7 @@ ci: repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.6 + rev: v0.9.9 hooks: - id: ruff args: ["--fix", "--output-format=full"] From 73813e1308163d0571676600d1a2af2b45719592 Mon Sep 17 00:00:00 2001 From: Alexandre Andorra Date: Mon, 10 Mar 2025 03:23:11 -0400 Subject: [PATCH 09/10] Enhance `plot_pdp` and fix `plot_scatter_submodels` (#218) * Add YML env files * Expand scatter_submodels to categorical likelihood * Add softmax option to plot_pdp * Remove comments * Use func for softmax * handle func upstream * move func upstream * ensure p_d is an array --------- Co-authored-by: aloctavodia --- env-dev.yml | 23 +++++++++++ env.yml | 14 +++++++ pymc_bart/utils.py | 96 +++++++++++++++++++++++++++++++--------------- 3 files changed, 102 insertions(+), 31 deletions(-) create mode 100644 env-dev.yml create mode 100644 env.yml diff --git a/env-dev.yml b/env-dev.yml new file mode 100644 index 0000000..1e28429 --- /dev/null +++ b/env-dev.yml @@ -0,0 +1,23 @@ +name: pymc-bart-dev +channels: + - conda-forge + - defaults +dependencies: + - pymc>=5.16.2,<=5.19.1 + - arviz>=0.18.0 + - numba + - matplotlib + - numpy + - pytensor + # Development dependencies + - pytest>=4.4.0 + - pytest-cov>=2.6.1 + - click==8.0.4 + - pylint==2.17.4 + - pre-commit + - black + - isort + - flake8 + - pip + - pip: + - -e . 
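
Beyond the new environment files, the utils.py hunks below move the `func`
application upstream, so `plot_ice`/`plot_pdp` now transform the stacked
posterior draws once (`p_d = func(np.array(y_pred))`) before anything is
averaged or plotted. For a nonlinear link such as the softmax the order of
those two steps matters, since the mean of transformed draws is not the
transform of the mean. A small self-contained check (NumPy/SciPy only,
synthetic draws):

    import numpy as np
    from scipy.special import softmax

    rng = np.random.default_rng(0)
    draws = rng.normal(size=(1000, 3))  # fake posterior draws of 3 class logits

    mean_of_softmax = softmax(draws, axis=-1).mean(axis=0)  # transform, then average
    softmax_of_mean = softmax(draws.mean(axis=0))           # average, then transform

    print(mean_of_softmax)  # differs from the line below (Jensen's inequality)
    print(softmax_of_mean)
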
diff --git a/env.yml b/env.yml new file mode 100644 index 0000000..bd814ae --- /dev/null +++ b/env.yml @@ -0,0 +1,14 @@ +name: pymc-bart +channels: + - conda-forge + - defaults +dependencies: + - pymc>=5.16.2,<=5.19.1 + - arviz>=0.18.0 + - numba + - matplotlib + - numpy + - pytensor + - pip + - pip: + - pymc-bart diff --git a/pymc_bart/utils.py b/pymc_bart/utils.py index df8f76f..3ba6e58 100644 --- a/pymc_bart/utils.py +++ b/pymc_bart/utils.py @@ -254,13 +254,13 @@ def identity(x): ) new_x = fake_X[:, var] - p_d = np.array(y_pred) + p_d = func(np.array(y_pred)) for s_i in range(shape): if centered: - p_di = func(p_d[:, :, s_i]) - func(p_d[:, :, s_i][:, 0][:, None]) + p_di = p_d[:, :, s_i] - p_d[:, :, s_i][:, 0][:, None] else: - p_di = func(p_d[:, :, s_i]) + p_di = p_d[:, :, s_i] if var in var_discrete: axes[count].plot(new_x, p_di.mean(0), "o", color=color_mean) axes[count].plot(new_x, p_di.T, ".", color=color, alpha=alpha) @@ -393,14 +393,17 @@ def identity(x): for var in range(len(var_idx)): excluded = indices[:] excluded.remove(var) - p_d = _sample_posterior( - all_trees, X=fake_X, rng=rng, size=samples, excluded=excluded, shape=shape + p_d = func( + _sample_posterior( + all_trees, X=fake_X, rng=rng, size=samples, excluded=excluded, shape=shape + ) ) + with warnings.catch_warnings(): warnings.filterwarnings("ignore", message="hdi currently interprets 2d data") new_x = fake_X[:, var] for s_i in range(shape): - p_di = func(p_d[:, :, s_i]) + p_di = p_d[:, :, s_i] null_pd.append(p_di.mean()) if var in var_discrete: _, idx_uni = np.unique(new_x, return_index=True) @@ -1125,8 +1128,11 @@ def plot_scatter_submodels( plot_kwargs : dict Additional keyword arguments for the plot. Defaults to None. Valid keys are: - - color_ref: matplotlib valid color for the 45 degree line + - marker_scatter: matplotlib valid marker for the scatter plot - color_scatter: matplotlib valid color for the scatter plot + - alpha_scatter: matplotlib valid alpha for the scatter plot + - color_ref: matplotlib valid color for the 45 degree line + - ls_ref: matplotlib valid linestyle for the reference line axes : axes Matplotlib axes. 
@@ -1140,41 +1146,69 @@ def plot_scatter_submodels( submodels = np.sort(submodels) indices = vi_results["indices"][submodels] - preds = vi_results["preds"][submodels] + preds_sub = vi_results["preds"][submodels] preds_all = vi_results["preds_all"] + if labels is None: + labels = vi_results["labels"][submodels] + + # handle categorical regression case: + n_cats = None + if preds_all.ndim > 2: + n_cats = preds_all.shape[-1] + indices = np.tile(indices, n_cats) + if ax is None: _, ax = _get_axes(grid, len(indices), True, True, figsize) if plot_kwargs is None: plot_kwargs = {} - if labels is None: - labels = vi_results["labels"][submodels] - if func is not None: - preds = func(preds) + preds_sub = func(preds_sub) preds_all = func(preds_all) - min_ = min(np.min(preds), np.min(preds_all)) - max_ = max(np.max(preds), np.max(preds_all)) - - for pred, x_label, axi in zip(preds, labels, ax.ravel()): - axi.plot( - pred, - preds_all, - marker=plot_kwargs.get("marker_scatter", "."), - ls="", - color=plot_kwargs.get("color_scatter", "C0"), - alpha=plot_kwargs.get("alpha_scatter", 0.1), - ) - axi.set_xlabel(x_label) - axi.axline( - [min_, min_], - [max_, max_], - color=plot_kwargs.get("color_ref", "0.5"), - ls=plot_kwargs.get("ls_ref", "--"), - ) + min_ = min(np.min(preds_sub), np.min(preds_all)) + max_ = max(np.max(preds_sub), np.max(preds_all)) + + # handle categorical regression case: + if n_cats is not None: + i = 0 + for cat in range(n_cats): + for pred_sub, x_label in zip(preds_sub, labels): + ax[i].plot( + pred_sub[..., cat], + preds_all[..., cat], + marker=plot_kwargs.get("marker_scatter", "."), + ls="", + color=plot_kwargs.get("color_scatter", f"C{cat}"), + alpha=plot_kwargs.get("alpha_scatter", 0.1), + ) + ax[i].set(xlabel=x_label, ylabel="ref model", title=f"Category {cat}") + ax[i].axline( + [min_, min_], + [max_, max_], + color=plot_kwargs.get("color_ref", "0.5"), + ls=plot_kwargs.get("ls_ref", "--"), + ) + i += 1 + else: + for pred_sub, x_label, axi in zip(preds_sub, labels, ax.ravel()): + axi.plot( + pred_sub, + preds_all, + marker=plot_kwargs.get("marker_scatter", "."), + ls="", + color=plot_kwargs.get("color_scatter", "C0"), + alpha=plot_kwargs.get("alpha_scatter", 0.1), + ) + axi.set(xlabel=x_label, ylabel="ref model") + axi.axline( + [min_, min_], + [max_, max_], + color=plot_kwargs.get("color_ref", "0.5"), + ls=plot_kwargs.get("ls_ref", "--"), + ) return ax From 7986e2325aa4f20b558d5c223b685f0fef6bb986 Mon Sep 17 00:00:00 2001 From: Osvaldo A Martin Date: Mon, 10 Mar 2025 09:38:58 +0200 Subject: [PATCH 10/10] bump release (#220) --- pymc_bart/__init__.py | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pymc_bart/__init__.py b/pymc_bart/__init__.py index f4a1f7a..ed1a29a 100644 --- a/pymc_bart/__init__.py +++ b/pymc_bart/__init__.py @@ -42,7 +42,7 @@ "plot_variable_importance", "plot_variable_inclusion", ] -__version__ = "0.8.2" +__version__ = "0.9.0" pm.STEP_METHODS = list(pm.STEP_METHODS) + [PGBART] diff --git a/requirements.txt b/requirements.txt index da634d4..785de62 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -pymc>=5.16.2, <=5.19.1 +pymc>=5.16.2, <=5.20.1 arviz>=0.18.0 numba matplotlib
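
Taken together, the series ships as version 0.9.0, with
`get_variable_inclusion` exposed alongside the plotting helpers. A minimal
end-to-end sketch of the new function on synthetic data (a usage illustration,
not part of the patches; note that the labels handling calls `to_list()` on
the labels object, so a DataFrame with named columns is the safe input here):

    import numpy as np
    import pandas as pd
    import pymc as pm
    import pymc_bart as pmb

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.uniform(size=(100, 3)), columns=["x0", "x1", "x2"])
    Y = np.sin(2 * np.pi * X["x0"].to_numpy()) + rng.normal(0, 0.1, 100)

    with pm.Model():
        mu = pmb.BART("mu", X, Y, m=20)
        sigma = pm.HalfNormal("sigma", 1)
        pm.Normal("y", mu=mu, sigma=sigma, observed=Y)
        idata = pm.sample(tune=200, draws=200, chains=2, random_seed=0)

    # Normalized inclusion values plus covariate labels; with to_kulprit=True
    # the function instead returns nested variable subsets for Kulprit.
    vi_norm, labels = pmb.get_variable_inclusion(idata, X)
    pmb.plot_variable_inclusion(idata, X)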