From 0ac8a71ee2ad670d7b3a39201f63e1b48938204c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 28 Mar 2023 19:58:02 -0700 Subject: [PATCH 1/8] [EXAMPLE DIFF] (Tree featuresv2) Fork of sklearn that maintains all necessary refactorings to enable downstream functionality (#32) #### Reference Issues/PRs This is the most up-to-date PR branch to consolidate all proposed refactor changes that work with: - unsupervised trees - oblique trees - no performance/runtime regressions against main #### What does this implement/fix? Explain your changes. Incorporates refactors to: Internal Cython of scikit-learn's: - criterion - splitter - tree Internals of Python in scikit-learns: - python Tree Adds the basic implementation of oblique trees. The implementation of oblique trees has been tested on all sklearn's `check_estimator` testing function and has error-checking bounds for the new hyperparameter introduced, which is `feature_combinations` that defaults to ``min(1.5, n_features)``. TODO: 1. [ ] ~Add honest support for trees (splitting the data at the Python API level)~ 2. [x] Build wheels 3. [ ] ~Brainstorm unit-tests, or weekly checks to determine when our fork is out-of-date compared to upstream sklearn~ 4. [x] Revamp README for the fork #### Any other comments? [cd build] --------- Signed-off-by: Adam Li Co-authored-by: Chester Huynh Co-authored-by: Parth Vora --- .circleci/config.yml | 33 +- .cirrus.star | 4 +- .github/workflows/check-changelog.yml | 3 +- .github/workflows/check-manifest.yml | 2 +- .github/workflows/labeler-module.yml | 4 +- .github/workflows/update_tracking_issue.yml | 2 +- .github/workflows/wheels.yml | 33 +- .gitignore | 1 + Makefile | 3 + README.rst | 322 ++++++---- build_tools/azure/install.sh | 2 +- build_tools/azure/install_win.sh | 2 +- doc/Makefile | 2 + doc/conf.py | 3 +- doc/modules/tree.rst | 61 +- examples/tree/plot_iris_dtc.py | 4 - setup.py | 45 +- sklearn/ensemble/_forest.py | 108 +++- sklearn/ensemble/tests/test_forest.py | 171 +++++ sklearn/tree/_classes.py | 162 +++-- sklearn/tree/_criterion.pxd | 45 +- sklearn/tree/_criterion.pyx | 285 ++++----- sklearn/tree/_splitter.pxd | 41 +- sklearn/tree/_splitter.pyx | 165 +++-- sklearn/tree/_tree.pxd | 90 ++- sklearn/tree/_tree.pyx | 659 ++++++++++++-------- sklearn/tree/tests/test_tree.py | 32 +- 27 files changed, 1499 insertions(+), 785 deletions(-) mode change 100755 => 100644 setup.py diff --git a/.circleci/config.yml b/.circleci/config.yml index e2f54c0665c78..e4e66b5c57f49 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -94,22 +94,23 @@ jobs: root: doc/_build/html paths: . - deploy: - docker: - - image: cimg/python:3.8.12 - steps: - - checkout - - run: ./build_tools/circle/checkout_merge_commit.sh - # Attach documentation generated in the 'doc' step so that it can be - # deployed. - - attach_workspace: - at: doc/_build/html - - run: ls -ltrh doc/_build/html/stable - - deploy: - command: | - if [[ "${CIRCLE_BRANCH}" =~ ^main$|^[0-9]+\.[0-9]+\.X$ ]]; then - bash build_tools/circle/push_doc.sh doc/_build/html/stable - fi + # XXX: in order to make sure our fork passes all the CIs and not remove too many LOC, we don't want to deploy + # deploy: + # docker: + # - image: cimg/python:3.8.12 + # steps: + # - checkout + # - run: ./build_tools/circle/checkout_merge_commit.sh + # # Attach documentation generated in the 'doc' step so that it can be + # # deployed. 
+ # - attach_workspace: + # at: doc/_build/html + # - run: ls -ltrh doc/_build/html/stable + # - deploy: + # command: | + # if [[ "${CIRCLE_BRANCH}" =~ ^main$|^[0-9]+\.[0-9]+\.X$ ]]; then + # bash build_tools/circle/push_doc.sh doc/_build/html/stable + # fi workflows: version: 2 diff --git a/.cirrus.star b/.cirrus.star index 8b3de0d10c532..2dd1e50144987 100644 --- a/.cirrus.star +++ b/.cirrus.star @@ -4,9 +4,9 @@ load("cirrus", "env", "fs", "http") def main(ctx): - # Only run for scikit-learn/scikit-learn. For debugging on a fork, you can + # Only run for neurodata/scikit-learn. For debugging on a fork, you can # comment out the following condition. - if env.get("CIRRUS_REPO_FULL_NAME") != "scikit-learn/scikit-learn": + if env.get("CIRRUS_REPO_FULL_NAME") != "neurodata/scikit-learn": return [] arm_wheel_yaml = "build_tools/cirrus/arm_wheel.yml" diff --git a/.github/workflows/check-changelog.yml b/.github/workflows/check-changelog.yml index d5bfc8ef0f430..53f64ba5c886b 100644 --- a/.github/workflows/check-changelog.yml +++ b/.github/workflows/check-changelog.yml @@ -10,12 +10,13 @@ jobs: check: name: A reviewer will let you know if it is required or can be bypassed runs-on: ubuntu-latest - if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 }} + if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 && github.repository == 'scikit-learn/scikit-learn' }} steps: - name: Get PR number and milestone run: | echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }}" >> $GITHUB_ENV + echo "${{ github.repository }}" - uses: actions/checkout@v3 with: fetch-depth: '0' diff --git a/.github/workflows/check-manifest.yml b/.github/workflows/check-manifest.yml index 004cc452e385e..5ef9ce2213e90 100644 --- a/.github/workflows/check-manifest.yml +++ b/.github/workflows/check-manifest.yml @@ -7,7 +7,7 @@ on: jobs: check-manifest: # Don't run on forks - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' runs-on: ubuntu-latest steps: diff --git a/.github/workflows/labeler-module.yml b/.github/workflows/labeler-module.yml index 061d0094b38c5..8092711f07e45 100644 --- a/.github/workflows/labeler-module.yml +++ b/.github/workflows/labeler-module.yml @@ -16,7 +16,7 @@ jobs: steps: - uses: thomasjpfan/labeler@v2.5.0 continue-on-error: true - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" max-labels: "3" @@ -27,7 +27,7 @@ jobs: steps: - uses: thomasjpfan/labeler@v2.5.0 continue-on-error: true - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" configuration-path: ".github/labeler-file-extensions.yml" diff --git a/.github/workflows/update_tracking_issue.yml b/.github/workflows/update_tracking_issue.yml index 124ea1e8c6ac4..c176ce356a4cf 100644 --- a/.github/workflows/update_tracking_issue.yml +++ b/.github/workflows/update_tracking_issue.yml @@ -24,7 +24,7 @@ on: jobs: update_tracking_issue: runs-on: ubuntu-latest - if: github.repository == 'scikit-learn/scikit-learn' && github.event_name == 'schedule' + if: github.repository == 'neurodata/scikit-learn' && github.event_name == 'schedule' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 diff --git a/.github/workflows/wheels.yml 
b/.github/workflows/wheels.yml index b43f29ffa4f7f..4ab75fd361586 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -7,12 +7,12 @@ on: - cron: "42 3 */1 * *" push: branches: - - main + - fork # Release branches - "[0-9]+.[0-9]+.X" pull_request: branches: - - main + - fork - "[0-9]+.[0-9]+.X" # Manual run workflow_dispatch: @@ -26,7 +26,7 @@ jobs: check_build_trigger: name: Check build trigger runs-on: ubuntu-latest - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' outputs: build: ${{ steps.check_build_trigger.outputs.build }} @@ -178,31 +178,8 @@ jobs: with: path: dist/*.tar.gz - # Upload the wheels and the source distribution - upload_anaconda: - name: Upload to Anaconda - runs-on: ubuntu-latest - needs: [build_wheels, build_sdist] - # The artifacts cannot be uploaded on PRs - if: github.event_name != 'pull_request' - - steps: - - name: Checkout scikit-learn - uses: actions/checkout@v3 - - - name: Download artifacts - uses: actions/download-artifact@v3 + - uses: actions/upload-artifact@v3 with: path: dist + name: ${{ matrix.python[0] }}-${{ matrix.os[1] }} - - name: Setup Python - uses: actions/setup-python@v4 - - - name: Upload artifacts - env: - # Secret variables need to be mapped to environment variables explicitly - SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN }} - SCIKIT_LEARN_STAGING_UPLOAD_TOKEN: ${{ secrets.SCIKIT_LEARN_STAGING_UPLOAD_TOKEN }} - ARTIFACTS_PATH: dist/artifact - # Force a replacement if the remote file already exists - run: bash build_tools/github/upload_anaconda.sh diff --git a/.gitignore b/.gitignore index 89600846100a8..1e28896f50be6 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ build sklearn/datasets/__config__.py sklearn/**/*.html +scikit_learn_tree.egg-info/* dist/ MANIFEST diff --git a/Makefile b/Makefile index 5ea64dc0d6cac..148027b30f59f 100644 --- a/Makefile +++ b/Makefile @@ -63,3 +63,6 @@ doc-noplot: inplace code-analysis: flake8 sklearn | grep -v __init__ | grep -v external pylint -E -i y sklearn/ -d E1103,E0611,E1101 + +build-dev: + pip install --verbose --no-build-isolation --editable . diff --git a/README.rst b/README.rst index 5e2de6a6d8b46..fbdfdaa95ef4c 100644 --- a/README.rst +++ b/README.rst @@ -44,20 +44,36 @@ .. |PytestMinVersion| replace:: 5.3.1 .. |PlotlyMinVersion| replace:: 5.10.0 -.. image:: https://raw.githubusercontent.com/scikit-learn/scikit-learn/main/doc/logos/scikit-learn-logo.png - :target: https://scikit-learn.org/ +``scikit-learn-tree`` is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line +with changes from upstream scikit-learn. It is an exact stand-in for ``sklearn`` in package imports, but is +released under the name ``scikit-learn-tree`` to avoid confusion. -**scikit-learn** is a Python module for machine learning built on top of -SciPy and is distributed under the 3-Clause BSD license. +It is currently maintained by a team of volunteers. -The project was started in 2007 by David Cournapeau as a Google Summer -of Code project, and since then many volunteers have contributed. See -the `About us `__ page -for a list of core contributors. +The upstream package **scikit-learn** is a Python module for machine learning built on top of +SciPy and is distributed under the 3-Clause BSD license. Refer to their website for all documentation +needs: https://scikit-learn.org. -It is currently maintained by a team of volunteers. +Why a fork? 
+-----------
+Currently, the scikit-learn tree submodule is difficult to extend. Requests to modularize
+and improve the extensibility of the code are currently unsupported, or may take a long time
+to be addressed. At the same time, advanced tree models that also leverage the robustness of
+scikit-learn are desirable.
+
+However, "hard-forking" via copy/pasting the explicit Python/Cython code into another tree package
+altogether is undesirable because it results in a tree codebase that is inherently different
+and not compatible with ``scikit-learn``. For example, `quantile-forests `_
+and `EconML `_ do this, and their current tree submodules
+cannot take advantage of improvements made in upstream ``scikit-learn``.
+
+An example of seamless integration would be `scikit-survival `_, which
+only needs to implement a subclass of the Cython ``Criterion`` object in its code to enable survival trees.
 
-Website: https://scikit-learn.org
+Maintaining a "soft-fork" of ``scikit-learn`` in the form of a repository fork allows us to develop
+a separate package that serves as a stand-in for ``sklearn`` in any package, extends the tree submodule,
+and can also be synced with upstream changes in ``scikit-learn``. This enables the fork to always
+take advantage of improvements made upstream in ``scikit-learn`` main, while providing a customizable
+tree API.
 
 Installation
 ------------
@@ -73,133 +89,195 @@ scikit-learn requires:
 - joblib (>= |JoblibMinVersion|)
 - threadpoolctl (>= |ThreadpoolctlMinVersion|)
 
-=======
+============================
+Installing scikit-learn-tree
+============================
 
-**Scikit-learn 0.20 was the last version to support Python 2.7 and Python 3.4.**
-scikit-learn 1.0 and later require Python 3.7 or newer.
-scikit-learn 1.1 and later require Python 3.8 or newer.
+Scikit-learn-tree is a maintained fork of scikit-learn, which extends the
+tree submodule in a few ways documented in the :ref:`changelog of the fork
+`.
 
-Scikit-learn plotting capabilities (i.e., functions start with ``plot_`` and
-classes end with "Display") require Matplotlib (>= |MatplotlibMinVersion|).
-For running the examples Matplotlib >= |MatplotlibMinVersion| is required.
-A few examples require scikit-image >= |Scikit-ImageMinVersion|, a few examples
-require pandas >= |PandasMinVersion|, some examples require seaborn >=
-|SeabornMinVersion| and plotly >= |PlotlyMinVersion|.
+We release versions of scikit-learn-tree in an analogous fashion to
+scikit-learn main. Due to maintenance resources, we only release on PyPI
+and therefore recommend installing with ``pip``.
 
-User installation
-~~~~~~~~~~~~~~~~~
+There are different ways to install scikit-learn-tree:
 
-If you already have a working installation of numpy and scipy,
-the easiest way to install scikit-learn is using ``pip``::
+  * :ref:`Install the latest official release `. This
+    is the best approach for most users. It will provide a stable version,
+    and pre-built packages are available for most platforms.
+
+  * :ref:`Building the package from source
+    `. This is best for users who want the
+    latest-and-greatest features and aren't afraid of running
+    brand-new code. This is also needed for users who wish to contribute to the
+    project.
 
-   pip install -U scikit-learn
+.. _install_fork_release:
 
-or ``conda``::
+Installing the latest release
+-----------------------------
+We release wheels for common distributions, so the latest release is installable via pip.
 
-   conda install -c conda-forge scikit-learn
+.. 
prompt:: bash $ + + pip install scikit-learn-tree -The documentation includes more detailed `installation instructions `_. +This will install ``scikit-learn-tree`` under the namespace of ``sklearn``, which then +can be used as a stand-in for any package that relies on the public API of ``sklearn``. +For example, any usage of ``scikit-learn`` is preserved with ``scikit-learn-tree`` -Changelog ---------- + >>> # the sklearn installed is that of scikit-learn-tree and is equivalent to scikit-learn + >>> from sklearn.ensemble import RandomForestClassifier + >>> clf = RandomForestClassifier(random_state=0) + >>> X = [[ 1, 2, 3], # 2 samples, 3 features + ... [11, 12, 13]] + >>> y = [0, 1] # classes of each sample + >>> clf.fit(X, y) + RandomForestClassifier(random_state=0) -See the `changelog `__ -for a history of notable changes to scikit-learn. +.. _install_source: + +Building from source +-------------------- +If you are a developer and are interested in helping maintain, or add some new +features to the fork, the building from source instructions are exactly the same +as that of scikit-learn main, so please refer to `scikit-learn documentation `_ +for instructions on building from source. Development ------------ +=========== -We welcome new contributors of all experience levels. The scikit-learn -community goals are to be helpful, welcoming, and effective. The +We welcome new contributors of all experience levels, specifically to maintain the fork. +Any contributions that make sure our fork is "better in-line" with scikit-learn upstream, +or improves the tree submodule in anyway will be appreciated. + +The scikit-learn community goals are to be helpful, welcoming, and effective. The `Development Guide `_ has detailed information about contributing code, documentation, tests, and more. We've included some basic information in this README. -Important links -~~~~~~~~~~~~~~~ - -- Official source code repo: https://github.com/scikit-learn/scikit-learn -- Download releases: https://pypi.org/project/scikit-learn/ -- Issue tracker: https://github.com/scikit-learn/scikit-learn/issues - -Source code -~~~~~~~~~~~ - -You can check the latest sources with the command:: - - git clone https://github.com/scikit-learn/scikit-learn.git - -Contributing -~~~~~~~~~~~~ - -To learn more about making a contribution to scikit-learn, please see our -`Contributing guide -`_. - -Testing -~~~~~~~ - -After installation, you can launch the test suite from outside the source -directory (you will need to have ``pytest`` >= |PyTestMinVersion| installed):: - - pytest sklearn - -See the web page https://scikit-learn.org/dev/developers/contributing.html#testing-and-improving-test-coverage -for more information. - - Random number generation can be controlled during testing by setting - the ``SKLEARN_SEED`` environment variable. - -Submitting a Pull Request -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Before opening a Pull Request, have a look at the -full Contributing page to make sure your code complies -with our guidelines: https://scikit-learn.org/stable/developers/index.html - -Project History ---------------- - -The project was started in 2007 by David Cournapeau as a Google Summer -of Code project, and since then many volunteers have contributed. See -the `About us `__ page -for a list of core contributors. - -The project is currently maintained by a team of volunteers. - -**Note**: `scikit-learn` was previously referred to as `scikits.learn`. 
-
-Help and Support
-----------------
-
-Documentation
-~~~~~~~~~~~~~
-
-- HTML documentation (stable release): https://scikit-learn.org
-- HTML documentation (development version): https://scikit-learn.org/dev/
-- FAQ: https://scikit-learn.org/stable/faq.html
-
-Communication
-~~~~~~~~~~~~~
-
-- Mailing list: https://mail.python.org/mailman/listinfo/scikit-learn
-- Gitter: https://gitter.im/scikit-learn/scikit-learn
-- Logos & Branding: https://github.com/scikit-learn/scikit-learn/tree/main/doc/logos
-- Blog: https://blog.scikit-learn.org
-- Calendar: https://blog.scikit-learn.org/calendar/
-- Twitter: https://twitter.com/scikit_learn
-- Twitter (commits): https://twitter.com/sklearn_commits
-- Stack Overflow: https://stackoverflow.com/questions/tagged/scikit-learn
-- Github Discussions: https://github.com/scikit-learn/scikit-learn/discussions
-- Website: https://scikit-learn.org
-- LinkedIn: https://www.linkedin.com/company/scikit-learn
-- YouTube: https://www.youtube.com/channel/UCJosFjYm0ZYVUARxuOZqnnw/playlists
-- Facebook: https://www.facebook.com/scikitlearnofficial/
-- Instagram: https://www.instagram.com/scikitlearnofficial/
-- TikTok: https://www.tiktok.com/@scikit.learn
-
-Citation
-~~~~~~~~
-
-If you use scikit-learn in a scientific publication, we would appreciate citations: https://scikit-learn.org/stable/about.html#citing-scikit-learn
+.. _fork-changelog:
+
+Major Changes of the Fork
+=========================
+
+The purpose of this page is to illustrate some of the main features that
+``scikit-learn-tree`` provides compared to ``scikit-learn``. It assumes an
+understanding of the core package ``scikit-learn`` and of decision tree
+models. Please refer to our :ref:`installation instructions
+` for installing ``scikit-learn-tree``.
+
+Scikit-learn-tree otherwise operates as a stand-in for upstream ``scikit-learn``.
+It is used in packages exactly the same way and will support all features
+in the corresponding version of ``scikit-learn``. For example, if you
+are interested in features of ``scikit-learn`` v1.2.2 for the ``NearestNeighbors`` algorithm,
+and ``scikit-learn-tree`` has a v1.2.2 release, then it will have
+all of those features.
+
+The breaking API changes are restricted to the ``tree`` submodule
+and the related forest ensemble models. See below for a detailed list of breaking changes.
+
+See https://scikit-learn.org/ for documentation on scikit-learn main.
+
+Our Philosophy
+--------------
+Our design philosophy with this fork of ``scikit-learn`` is to maintain as few changes
+as possible, such that incorporating upstream changes into the fork requires minimal effort.
+
+Candidate changes and PRs accepted into the fork are those that:
+
+- improve compatibility with upstream ``scikit-learn`` main
+- enable improved extensibility of tree models
+
+Decision tree generalizations
+-----------------------------
+
+``Scikit-learn`` provides an axis-aligned :class:`~sklearn.tree.DecisionTreeClassifier`
+decision tree model (classifier and regressor), which has a few fundamental limitations
+that prevent 3rd parties from utilizing the existing class without forking a large
+amount of copy/pasted Python and Cython code. We highlight those limitations here
+and then describe how we generalize past them.
+
+Cython Internal Private API:
+
+Note that the Cython API for scikit-learn is still not a publicly supported API, so it may
+change without warning. 
+
+- leaf and split nodes: These nodes are treated the same way and there is no internal
+  API for setting them differently. Quantile trees and causal trees inherently generalize
+  how leaf nodes are set.
+- Criterion class: The criterion class currently assumes a supervised learning interface.
+  - Our fix: We implement a ``BaseCriterion`` object that provides an abstract API for unsupervised criteria.
+- Splitter class: The splitter class currently assumes a supervised learning interface and
+  does not provide a way of generalizing the way split candidates are proposed.
+  - Our fix: We implement a ``BaseSplitter`` object that provides an abstract API for unsupervised splitters and also implement an API to allow generalizations of the ``SplitRecord`` struct and the ``Splitter.node_split`` function. For example, this enables oblique splits to be considered.
+- Tree class: The tree class currently assumes a supervised learning interface and does not
+  provide a way of generalizing the type of tree.
+  - Our fix: We implement a ``BaseTree`` object that provides an abstract API for general tree models and also implement an API that allows generalization of the type of tree. For example, oblique trees are now trivially implementable as an extension.
+- stopping conditions for splitter: Currently, the ``Splitter.node_split`` function has various
+  stopping conditions for the splitter based on hyperparameters. It is plausible that these conditions
+  may be extended. For example, in causal trees, one may want the splitter to also account for
+  a minimal degree of heterogeneity (i.e. variance) in its child nodes.
+
+Python API:
+
+- ``sklearn.tree.BaseDecisionTree`` assumes the underlying tree model is supervised: The ``y``
+  parameter is required to be passed in, which is not necessary for general tree-based models.
+  For example, an unsupervised tree may pass in ``y=None``.
+  - Our fix: We fix this API, so that ``BaseDecisionTree`` is subclassable by unsupervised tree models that do not require ``y`` to be defined.
+- ``sklearn.tree.BaseDecisionTree`` does not provide a way to generalize the ``Criterion``, ``Splitter``
+  and ``Tree`` Cython classes used: The current codebase requires users to define custom
+  criteria and/or splitters outside the instantiation of the ``BaseDecisionTree``. This prevents
+  users from generalizing the ``Criterion`` and ``Splitter`` and creating a neat Python API wrapper.
+  Moreover, the ``Tree`` class is not customizable.
+  - Our fix: We internally implement a private function to actually build the entire tree, ``BaseDecisionTree._build_tree``, which can be overridden in subclasses that customize the criterion, splitter, or tree, or any combination of them (see the sketch following this list).
+- ``sklearn.ensemble.BaseForest`` and its subclass algorithms are slow when ``n_samples`` is very high. Binning
+  features into a histogram, which is the basis of LightGBM and ``HistGradientBoostingClassifier``, is a computational
+  trick that can not only significantly increase runtime efficiency, but also help prevent overfitting in trees, since
+  the sorting in ``BestSplitter`` is done on bins rather than the continuous feature values. This would enable
+  random forests and their variants to scale to millions of samples.
+  - Our fix: We added a ``max_bins=None`` keyword argument to the ``BaseForest`` class and all its subclasses. The default behavior is no binning. The current implementation is not necessarily efficient; there are several improvements to be made. See the roadmap below and the usage sketch following this list. 
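+
+As a rough illustration of the ``_build_tree`` hook described in the list above, a 3rd-party
+estimator might override it as in the following sketch. The subclass name and the customization
+point are hypothetical; only the hook's signature is taken from this fork's ``BaseDecisionTree``::
+
+    from sklearn.tree import DecisionTreeClassifier
+
+    class MyCustomTree(DecisionTreeClassifier):
+        """Hypothetical subclass that customizes how the tree is built."""
+
+        def _build_tree(
+            self,
+            X,
+            y,
+            sample_weight,
+            min_samples_leaf,
+            min_weight_leaf,
+            max_leaf_nodes,
+            min_samples_split,
+            max_depth,
+            random_state,
+        ):
+            # A real subclass would construct its own Criterion/Splitter/Tree
+            # objects here; this sketch simply delegates to the parent
+            # implementation to show the override pattern.
+            return super()._build_tree(
+                X,
+                y,
+                sample_weight,
+                min_samples_leaf,
+                min_weight_leaf,
+                max_leaf_nodes,
+                min_samples_split,
+                max_depth,
+                random_state,
+            )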
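+
+The ``max_bins`` keyword can then be passed like any other forest hyperparameter. A minimal
+sketch follows; the values are illustrative only and the printed estimator repr may differ
+depending on the release::
+
+    >>> from sklearn.ensemble import RandomForestClassifier
+    >>> X = [[1, 2, 3], [11, 12, 13], [4, 5, 6], [14, 15, 16]]
+    >>> y = [0, 1, 0, 1]
+    >>> clf = RandomForestClassifier(n_estimators=10, max_bins=255, random_state=0)
+    >>> clf.fit(X, y)
+    RandomForestClassifier(max_bins=255, n_estimators=10, random_state=0)
+    >>> clf.apply(X).shape  # one leaf index per (sample, tree) pair
+    (4, 10)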
+ +Overall, the existing tree models, such as :class:`~sklearn.tree.DecisionTreeClassifier` +and :class:`~sklearn.ensemble.RandomForestClassifier` all work exactly the same as they +would in ``scikit-learn`` main, but these extensions enable 3rd-party packages to extend +the Cython/Python API easily. + +Roadmap +------- +There are several improvements that can be made in this fork. Primarily, the binning feature +promises to make Random Forests and their variants ultra-fast. However, the binning needs +to be implemented in a similar fashion to ``HistGradientBoostingClassifier``, which passes +in the binning thresholds throughout the tree construction step, such that the split nodes +store the actual numerical value of the bin rather than the "bin index". This requires +modifying the tree Cython code to take in a ``binning_thresholds`` parameter that is part +of the ``_BinMapper`` fitted class. This also allows us not to do any binning during prediction/apply +time because the tree already stores the "numerical" threshold value we would want to apply +to any incoming ``X`` that is not binned. + +Besides that modification, the tree and splitter need to be able to handle not just ``np.float32`` +data (the type for X normally in Random Forests), but also ``uint8`` data (the type for X when it +is binned in to e.g. 255 bins). This would not only save RAM since ``uint8`` storage of millions +of samples would result in many GB saved, but also improved runtime. + +So in summary, the Cython code of the tree submodule needs to take in an extra parameter for +the binning thresholds if binning occurs and also be able to handle ``X`` being of dtype ``uint8``. +Afterwards, Random Forests will have fully leveraged the binning feature. + +Something to keep in mind is that upstream scikit-learn is actively working on incorporating +missing-value handling and categorical handling into Random Forests. + +Next steps +---------- + +We have briefly covered how the tree submodule has changed with respect to ``scikit-learn``. +This enables packages to leverage these changes in developing more complex tree models +that may, or may not eventually be PRed into ``scikit-learn``. For example, + +- `scikit-tree `_ is a scikit-learn + compatible package for more complex and advanced tree models. + +If you are developing tree models, we encourage you to take a look at that package, or +if you have suggestions to make the tree submodule of our fork, ``scikit-learn-tree`` +more \ No newline at end of file diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 5238cd1121d2e..db5b5d9414053 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -7,7 +7,7 @@ set -x source build_tools/shared.sh UNAMESTR=`uname` -CCACHE_LINKS_DIR="/tmp/ccache" +CCACHE_LINKS_DIR="/tmp/ccachev2" setup_ccache() { CCACHE_BIN=`which ccache || echo ""` diff --git a/build_tools/azure/install_win.sh b/build_tools/azure/install_win.sh index ab559a1878971..011e962885d45 100755 --- a/build_tools/azure/install_win.sh +++ b/build_tools/azure/install_win.sh @@ -22,4 +22,4 @@ show_installed_libraries python setup.py bdist_wheel # Install the generated wheel package to test it -pip install --pre --no-index --find-links dist scikit-learn +pip install --pre --no-index --find-links dist scikit-learn-tree diff --git a/doc/Makefile b/doc/Makefile index b56a1289cd581..c728bbbfd033e 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -53,6 +53,8 @@ html: @echo @echo "Build finished. 
The HTML pages are in $(BUILDDIR)/html/stable" +# rm $(BUILDDIR)/html/stable/index.html +# mv $(BUILDDIR)/html/stable/fork_index.html $(BUILDDIR)/html/stable/index.html html-noplot: $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html/stable @echo diff --git a/doc/conf.py b/doc/conf.py index 52b084b331c8c..01e0a332dd54f 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -103,7 +103,8 @@ # source_encoding = 'utf-8' # The main toctree document. -root_doc = "contents" +# root_doc = "contents" +root_doc = "index" # General information about the project. project = "scikit-learn" diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index 789b0bab616ca..7fa12fd16d487 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -141,7 +141,7 @@ Once trained, you can plot the tree with the :func:`plot_tree` function:: >>> tree.plot_tree(clf) [...] -.. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_002.png +.. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_003.png :target: ../auto_examples/tree/plot_iris_dtc.html :scale: 75 :align: center @@ -331,6 +331,8 @@ total cost over the entire trees (by summing the cost at each node) of :math:`O(n_{features}n_{samples}^{2}\log(n_{samples}))`. +.. _tree_tips_usage: + Tips on practical use ===================== @@ -612,11 +614,66 @@ be pruned. This process stops when the pruned tree's minimal * :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` +Classification, regression and multi-output problems +---------------------------------------------------- + +OTs can be used for both classification and regression, and can handle multi-output +problems in the same manner as DTs. + +Complexity +---------- + +The run time cost to construct an OT will be similar to that of a DT, with the +added complexity of a (possibly sparse) matrix multiplication to combine random +data columns into candidate split values. The cost at each node is +:math:`O(n_{features}n_{samples}\log(n_{samples}) + n_{features}n_{samples}max\_features \lambda)` +where the additional :math:`n_{features}n_{samples}max\_features \lambda` term +comes from the (possibly sparse) matrix multiplication controlled by both the +number of candidate splits to generate ("max_features") and the sparsity of +the projection matrix that combines the data features (":math:`\lambda`"). + +Another consideration is space-complexity. + +Space-complexity and storing the OT pickled on disc is also a consideration. OTs +at every node need to store an additional vector of feature indices and vector of +feature weights that are used together to form the candidate splits. + +Tips on practical use +--------------------- + +Similar to DTs, the intuition for most parameters are the same. Therefore refer +to :ref:`tips for using decision trees ` for information on standard +tree parameters. Specific parameters, such as ``max_features`` and +``feature_combinations`` are different or special to OTs. + + * As specified earlier, ``max_features`` is not constrained to ``n_features`` + as it is in DTs. Setting ``max_features`` higher requires more computation time because + the algorithm needs to sample more candidate splits at every node. However, it also possibly + lets the user to sample more informative splits, thereby improving the model fit. This + presents a tradeoff between runtime resources and improvements to the model. 
In practice, + we found that sampling more splits, say up to ``max_features=n_features**2``, is desirable + if one is willing to spend the computational resources. + + * ``feature_combinations`` is the :math:`\lambda` term presented in the complexity + analysis, which specifies how sparse our combination of features is. If + ``feature_combinations=n_features``, then OT is the ``Forest-RC`` version. However, + in practice, ``feature_combinations`` can be set much lower, therefore improving runtime + and storage complexity. + +Finally, when asking the question of when to use OTs vs DTs, scikit-learn recommends +always trying both model using some type of cross-validation procedure and hyperparameter +optimization (e.g. `GridSearchCV`). If one has prior knowledge about how the data is +distributed along its features, such as data being axis-aligned, then one might use a DT. +Other considerations are runtime and space complexity. + .. topic:: References: .. [BRE] L. Breiman, J. Friedman, R. Olshen, and C. Stone. Classification and Regression Trees. Wadsworth, Belmont, CA, 1984. - + + .. [RF] L. Breiman. Random Forests. Machine Learning 45, 5–32 (2001). + https://doi.org/10.1023/A:1010933404324. + * https://en.wikipedia.org/wiki/Decision_tree_learning * https://en.wikipedia.org/wiki/Predictive_analytics diff --git a/examples/tree/plot_iris_dtc.py b/examples/tree/plot_iris_dtc.py index 14f6506b5810f..0dcca718bc6f0 100644 --- a/examples/tree/plot_iris_dtc.py +++ b/examples/tree/plot_iris_dtc.py @@ -2,16 +2,12 @@ ======================================================================= Plot the decision surface of decision trees trained on the iris dataset ======================================================================= - Plot the decision surface of a decision tree trained on pairs of features of the iris dataset. - See :ref:`decision tree ` for more information on the estimator. - For each pair of iris features, the decision tree learns decision boundaries made of combinations of simple thresholding rules inferred from the training samples. - We also show the tree structure of a model built on all of the features. """ # %% diff --git a/setup.py b/setup.py old mode 100755 new mode 100644 index f5522600f623f..e39e39455b7bc --- a/setup.py +++ b/setup.py @@ -30,19 +30,19 @@ builtins.__SKLEARN_SETUP__ = True -DISTNAME = "scikit-learn" -DESCRIPTION = "A set of python modules for machine learning and data mining" +DISTNAME = "scikit-learn-tree" +DESCRIPTION = "A maintained fork of scikit-learn that extends the tree submodule." 
with open("README.rst") as f: LONG_DESCRIPTION = f.read() -MAINTAINER = "Andreas Mueller" -MAINTAINER_EMAIL = "amueller@ais.uni-bonn.de" +MAINTAINER = "Adam Li" +MAINTAINER_EMAIL = "adam.li@columbia.edu" URL = "http://scikit-learn.org" -DOWNLOAD_URL = "https://pypi.org/project/scikit-learn/#files" +DOWNLOAD_URL = "https://pypi.org/project/scikit-learn-tree/#files" LICENSE = "new BSD" PROJECT_URLS = { - "Bug Tracker": "https://github.com/scikit-learn/scikit-learn/issues", + "Bug Tracker": "https://github.com/neurodata/scikit-learn/issues", "Documentation": "https://scikit-learn.org/stable/documentation.html", - "Source Code": "https://github.com/scikit-learn/scikit-learn", + "Source Code": "https://github.com/neurodata/scikit-learn", } # We can actually import a restricted version of sklearn that @@ -170,11 +170,11 @@ def check_package_status(package, min_version): package_status["up_to_date"] = False package_status["version"] = "" - req_str = "scikit-learn requires {} >= {}.\n".format(package, min_version) + req_str = "scikit-learn-tree requires {} >= {}.\n".format(package, min_version) instructions = ( "Installation instructions are available on the " - "scikit-learn website: " + "scikit-learn-tree website: " "http://scikit-learn.org/stable/install.html\n" ) @@ -221,10 +221,10 @@ def check_package_status(package, min_version): {"sources": ["_cdnmf_fast.pyx"], "include_np": True}, ], "ensemble": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, + {"sources": ["_gradient_boosting.pyx"], "language": "c++", "include_np": True}, ], "ensemble._hist_gradient_boosting": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, + {"sources": ["_gradient_boosting.pyx"], "language": "c++", "include_np": True}, {"sources": ["histogram.pyx"], "include_np": True}, {"sources": ["splitting.pyx"], "include_np": True}, {"sources": ["_binning.pyx"], "include_np": True}, @@ -306,7 +306,7 @@ def check_package_status(package, min_version): {"sources": ["_ball_tree.pyx"], "include_np": True}, {"sources": ["_kd_tree.pyx"], "include_np": True}, {"sources": ["_partition_nodes.pyx"], "language": "c++", "include_np": True}, - {"sources": ["_quad_tree.pyx"], "include_np": True}, + {"sources": ["_quad_tree.pyx"], "language": "c++", "include_np": True}, ], "svm": [ { @@ -374,9 +374,24 @@ def check_package_status(package, min_version): "include_np": True, "optimization_level": "O3", }, - {"sources": ["_splitter.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_criterion.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_utils.pyx"], "include_np": True, "optimization_level": "O3"}, + { + "sources": ["_splitter.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, + { + "sources": ["_criterion.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, + { + "sources": ["_utils.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, ], "utils": [ {"sources": ["sparsefuncs_fast.pyx"], "include_np": True}, diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 19203da4fce1f..a3c29e4a269ce 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -40,6 +40,7 @@ class calls the ``fit`` method of each sub-estimator on random samples # License: BSD 3 clause +from time import time from numbers import Integral, Real from warnings import catch_warnings, simplefilter, warn import threading @@ -72,10 +73,11 @@ class calls the ``fit`` method 
of each sub-estimator on random samples _check_sample_weight, _check_feature_names_in, ) +from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils.validation import _num_samples from ..utils._param_validation import Interval, StrOptions from ..utils._param_validation import RealNotInt - +from ._hist_gradient_boosting.binning import _BinMapper __all__ = [ "RandomForestClassifier", @@ -210,6 +212,10 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): Interval(RealNotInt, 0.0, 1.0, closed="right"), Interval(Integral, 1, None, closed="left"), ], + "max_bins": [ + None, + Interval(Integral, 1, None, closed="left"), + ], } @abstractmethod @@ -228,6 +234,7 @@ def __init__( class_weight=None, max_samples=None, base_estimator="deprecated", + max_bins=None, ): super().__init__( estimator=estimator, @@ -244,6 +251,7 @@ def __init__( self.warm_start = warm_start self.class_weight = class_weight self.max_samples = max_samples + self.max_bins = max_bins def apply(self, X): """ @@ -263,6 +271,15 @@ def apply(self, X): return the index of the leaf x ends up in. """ X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + results = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, @@ -420,6 +437,38 @@ def fit(self, X, y, sample_weight=None): n_more_estimators = self.n_estimators - len(self.estimators_) + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. + # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " @@ -628,6 +677,35 @@ def feature_importances_(self): all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) + def _bin_data(self, X, is_training_data): + """Bin data X. 
+ + If is_training_data, then fit the _bin_mapper attribute. + Else, the binned data is converted to a C-contiguous array. + """ + + description = "training" if is_training_data else "validation" + if self.verbose: + print( + "Binning {:.3f} GB of {} data: ".format(X.nbytes / 1e9, description), + end="", + flush=True, + ) + tic = time() + if is_training_data: + X_binned = self._bin_mapper.fit_transform(X) # F-aligned array + else: + X_binned = self._bin_mapper.transform(X) # F-aligned array + # We convert the array to C-contiguous since predicting is faster + # with this layout (training is faster on F-arrays though) + X_binned = np.ascontiguousarray(X_binned) + toc = time() + if self.verbose: + duration = toc - tic + print("{:.3f} s".format(duration)) + + return X_binned + def _accumulate_prediction(predict, X, out, lock): """ @@ -669,6 +747,7 @@ def __init__( class_weight=None, max_samples=None, base_estimator="deprecated", + max_bins=None, ): super().__init__( estimator=estimator, @@ -683,6 +762,7 @@ def __init__( class_weight=class_weight, max_samples=max_samples, base_estimator=base_estimator, + max_bins=max_bins, ) @staticmethod @@ -856,6 +936,14 @@ def predict_proba(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -937,6 +1025,7 @@ def __init__( warm_start=False, max_samples=None, base_estimator="deprecated", + max_bins=None, ): super().__init__( estimator, @@ -950,6 +1039,7 @@ def __init__( warm_start=warm_start, max_samples=max_samples, base_estimator=base_estimator, + max_bins=max_bins, ) def predict(self, X): @@ -975,6 +1065,14 @@ def predict(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1399,6 +1497,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=DecisionTreeClassifier(), @@ -1423,6 +1522,7 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion @@ -1734,6 +1834,7 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=DecisionTreeRegressor(), @@ -1757,6 +1858,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion @@ -2084,6 +2186,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=ExtraTreeClassifier(), @@ -2108,6 +2211,7 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion @@ -2406,6 +2510,7 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2429,6 +2534,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 9bf0bb2becd9b..0150340f24bc6 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -118,6 +118,120 @@ FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_REGRESSORS) +def _sparse_parity(n, p=20, p_star=3, random_state=None): + """Generate sparse parity dataset. + + Sparse parity is a multivariate generalization of the + XOR problem. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset, by default 20 + p_star : int, optional + The number of informative dimensions, by default 3. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Sparse parity dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + y[i] = sum(X[i, :p_star] > 0) % 2 + + return X, y + + +def _orthant(n, p=6, random_state=None): + """Generate orthant dataset. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 6. + rec : int, optional + _description_, by default 1 + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Orthant dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + orth_labels = np.asarray([2**i for i in range(0, p)][::-1]) + + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + idx = np.where(X[i, :] > 0)[0] + y[i] = sum(orth_labels[idx]) + + if len(np.unique(y)) < 2**p: + raise RuntimeError("Increase sample size to get a label in each orthant.") + + return X, y + + +def _trunk(n, p=10, random_state=None): + """Generate trunk dataset. 
+ + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 10. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Trunk dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + + References + ---------- + [1] Gerard V. Trunk. A problem of dimensionality: A + simple example. IEEE Transactions on Pattern Analysis + and Machine Intelligence, 1(3):306–307, 1979. + """ + rng = np.random.RandomState(seed=random_state) + + mu_1 = np.array([1 / i for i in range(1, p + 1)]) + mu_0 = -1 * mu_1 + cov = np.identity(p) + + X = np.vstack( + ( + rng.multivariate_normal(mu_0, cov, int(n / 2)), + rng.multivariate_normal(mu_1, cov, int(n / 2)), + ) + ) + y = np.concatenate((np.zeros(int(n / 2)), np.ones(int(n / 2)))) + return X, y + + def check_classification_toy(name): """Check classification on a toy dataset.""" ForestClassifier = FOREST_CLASSIFIERS[name] @@ -1791,3 +1905,60 @@ def test_round_samples_to_one_when_samples_too_low(class_weight): n_estimators=10, max_samples=1e-4, class_weight=class_weight, random_state=0 ) forest.fit(X, y) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_classification_toy_withbins(name): + """Check classification on a toy dataset.""" + ForestClassifier = FOREST_CLASSIFIERS[name] + + clf = ForestClassifier(n_estimators=10, random_state=1, max_bins=255) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + clf = ForestClassifier( + n_estimators=10, max_features=1, random_state=1, max_bins=255 + ) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + # also test apply + leaf_indices = clf.apply(X) + assert leaf_indices.shape == (len(X), clf.n_estimators) + + +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +@pytest.mark.parametrize( + "criterion", ("squared_error", "absolute_error", "friedman_mse") +) +def test_regression_criterion_withbins(name, criterion): + # Check consistency on regression dataset. 
+ ForestRegressor = FOREST_REGRESSORS[name] + + reg = ForestRegressor( + n_estimators=5, criterion=criterion, random_state=1, max_bins=250 + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert ( + score > 0.93 + ), "Failed with max_features=None, criterion %s and score = %f" % ( + criterion, + score, + ) + + reg = ForestRegressor( + n_estimators=5, + criterion=criterion, + max_features=6, + random_state=1, + max_bins=250, + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert score > 0.92, "Failed with max_features=6, criterion %s and score = %f" % ( + criterion, + score, + ) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index b175275ea92dc..bd54483bf2dfe 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -40,8 +40,8 @@ from ..utils._param_validation import Hidden, Interval, StrOptions from ..utils._param_validation import RealNotInt -from ._criterion import Criterion -from ._splitter import Splitter +from ._criterion import BaseCriterion +from ._splitter import BaseSplitter from ._tree import DepthFirstTreeBuilder from ._tree import BestFirstTreeBuilder from ._tree import Tree @@ -174,7 +174,7 @@ def get_n_leaves(self): check_is_fitted(self) return self.tree_.n_leaves - def fit(self, X, y, sample_weight=None, check_input=True): + def fit(self, X, y=None, sample_weight=None, check_input=True): self._validate_params() random_state = check_random_state(self.random_state) @@ -184,9 +184,12 @@ def fit(self, X, y, sample_weight=None, check_input=True): # csr. check_X_params = dict(dtype=DTYPE, accept_sparse="csc") check_y_params = dict(ensure_2d=False, dtype=None) - X, y = self._validate_data( - X, y, validate_separately=(check_X_params, check_y_params) - ) + if y is not None or self._get_tags()["requires_y"]: + X, y = self._validate_data( + X, y, validate_separately=(check_X_params, check_y_params) + ) + else: + X = self._validate_data(X, **check_X_params) if issparse(X): X.sort_indices() @@ -195,7 +198,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): "No support for np.int64 index based sparse matrices" ) - if self.criterion == "poisson": + if y is not None and self.criterion == "poisson": if np.any(y < 0): raise ValueError( "Some value(s) of y are negative which is" @@ -209,45 +212,56 @@ def fit(self, X, y, sample_weight=None, check_input=True): # Determine output settings n_samples, self.n_features_in_ = X.shape - is_classification = is_classifier(self) - y = np.atleast_1d(y) - expanded_class_weight = None + # Do preprocessing if 'y' is passed + is_classification = False + if y is not None: + is_classification = is_classifier(self) + + y = np.atleast_1d(y) + expanded_class_weight = None - if y.ndim == 1: - # reshape is necessary to preserve the data contiguity against vs - # [:, np.newaxis] that does not. - y = np.reshape(y, (-1, 1)) + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. 
+ y = np.reshape(y, (-1, 1)) - self.n_outputs_ = y.shape[1] + self.n_outputs_ = y.shape[1] - if is_classification: - check_classification_targets(y) - y = np.copy(y) + if is_classification: + check_classification_targets(y) + y = np.copy(y) - self.classes_ = [] - self.n_classes_ = [] + self.classes_ = [] + self.n_classes_ = [] - if self.class_weight is not None: - y_original = np.copy(y) + if self.class_weight is not None: + y_original = np.copy(y) - y_encoded = np.zeros(y.shape, dtype=int) - for k in range(self.n_outputs_): - classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) - y = y_encoded - - if self.class_weight is not None: - expanded_class_weight = compute_sample_weight( - self.class_weight, y_original - ) + y_encoded = np.zeros(y.shape, dtype=int) + for k in range(self.n_outputs_): + classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + y = y_encoded + + if self.class_weight is not None: + expanded_class_weight = compute_sample_weight( + self.class_weight, y_original + ) - self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) + self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) - if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: - y = np.ascontiguousarray(y, dtype=DOUBLE) + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + if len(y) != n_samples: + raise ValueError( + "Number of labels=%d does not match number of samples=%d" + % (len(y), n_samples) + ) + + # set decision-tree model parameters max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth if isinstance(self.min_samples_leaf, numbers.Integral): @@ -299,16 +313,10 @@ def fit(self, X, y, sample_weight=None, check_input=True): max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes - if len(y) != n_samples: - raise ValueError( - "Number of labels=%d does not match number of samples=%d" - % (len(y), n_samples) - ) - if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DOUBLE) - if expanded_class_weight is not None: + if y is not None and expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: @@ -320,10 +328,63 @@ def fit(self, X, y, sample_weight=None, check_input=True): else: min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) + # build the actual tree now with the parameters + self._build_tree( + X, + y, + sample_weight, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ) + + return self + + def _build_tree( + self, + X, + y, + sample_weight, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ): + """Build the actual tree. + + Parameters + ---------- + X : Array-like + X dataset. + y : Array-like + Y targets. + sample_weight : Array-like + Sample weights + min_samples_leaf : float + Number of samples required to be a leaf. + min_weight_leaf : float + Weight of samples required to be a leaf. + max_leaf_nodes : float + Maximum number of leaf nodes allowed in tree. + min_samples_split : float + Minimum number of samples to split on. + max_depth : int + The maximum depth of any tree. + random_state : int + Random seed. 
+ """ + + n_samples = X.shape[0] + # Build tree criterion = self.criterion - if not isinstance(criterion, Criterion): - if is_classification: + if not isinstance(criterion, BaseCriterion): + if is_classifier(self): criterion = CRITERIA_CLF[self.criterion]( self.n_outputs_, self.n_classes_ ) @@ -337,7 +398,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS splitter = self.splitter - if not isinstance(self.splitter, Splitter): + if not isinstance(self.splitter, BaseSplitter): splitter = SPLITTERS[self.splitter]( criterion, self.max_features_, @@ -385,8 +446,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): self._prune_tree() - return self - def _validate_X_predict(self, X, check_input): """Validate the training data on predict (probabilities).""" if check_input: @@ -817,7 +876,10 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): _parameter_constraints: dict = { **BaseDecisionTree._parameter_constraints, - "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)], + "criterion": [ + StrOptions({"gini", "entropy", "log_loss"}), + Hidden(BaseCriterion), + ], "class_weight": [dict, list, StrOptions({"balanced"}), None], } @@ -1173,7 +1235,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): **BaseDecisionTree._parameter_constraints, "criterion": [ StrOptions({"squared_error", "friedman_mse", "absolute_error", "poisson"}), - Hidden(Criterion), + Hidden(BaseCriterion), ], } diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 47f616c6bad50..2e179e78e8c3f 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -4,6 +4,8 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Adam Li +# Jong Shin # # License: BSD 3 clause @@ -15,13 +17,11 @@ from ._tree cimport SIZE_t # Type for indices and counters from ._tree cimport INT32_t # Signed 32 bit integer from ._tree cimport UINT32_t # Unsigned 32 bit integer -cdef class Criterion: - # The criterion computes the impurity of a node and the reduction of - # impurity of a split on that node. It also computes the output statistics - # such as the mean in regression and class probabilities in classification. + +cdef class BaseCriterion: + """Abstract interface for criterion.""" # Internal structures - cdef const DOUBLE_t[:, ::1] y # Values of y cdef const DOUBLE_t[:] sample_weight # Sample weights cdef const SIZE_t[:] sample_indices # Sample indices in X, y @@ -37,19 +37,7 @@ cdef class Criterion: cdef double weighted_n_left # Weighted number of samples in the left node cdef double weighted_n_right # Weighted number of samples in the right node - # The criterion object is maintained such that left and right collected - # statistics correspond to samples[start:pos] and samples[pos:end]. - - # Methods - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end - ) except -1 nogil + # Core methods that criterion class _must_ implement. 
cdef int reset(self) except -1 nogil cdef int reverse_reset(self) except -1 nogil cdef int update(self, SIZE_t new_pos) except -1 nogil @@ -71,6 +59,25 @@ cdef class Criterion: ) noexcept nogil cdef double proxy_impurity_improvement(self) noexcept nogil + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil + +cdef class Criterion(BaseCriterion): + """Abstract interface for supervised impurity criteria.""" + + cdef const DOUBLE_t[:, ::1] y + + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) except -1 nogil + cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" @@ -88,4 +95,4 @@ cdef class RegressionCriterion(Criterion): cdef double[::1] sum_total # The sum of w*y. cdef double[::1] sum_left # Same as above, but for the left side of the split - cdef double[::1] sum_right # Same as above, but for the right side of the split + cdef double[::1] sum_right # Same as above, but for the right side of the split \ No newline at end of file diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 7cd7bbb0e3c1b..c94914daa0e0b 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -9,6 +9,8 @@ # Fares Hedayati # Jacob Schreiber # Nelson Liu +# Adam Li +# Jong Shin # # License: BSD 3 clause @@ -29,11 +31,20 @@ from ._utils cimport WeightedMedianCalculator # EPSILON is used in the Poisson criterion cdef double EPSILON = 10 * np.finfo('double').eps -cdef class Criterion: - """Interface for impurity criteria. - +cdef class BaseCriterion: + """This is an abstract interface for criterion. For example, a tree model could + be either supervisedly, or unsupervisedly computing impurity on samples of + covariates, or labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for criteria. + The downstream classes _must_ implement methods to compute the impurity + in current node and in children nodes. This object stores methods on how to calculate how good a split is using - different metrics. + a set API. + Samples in the "current" node are stored in `samples[start:end]` which is + partitioned around `pos` (an index in `start:end`) so that: + - the samples of left child node are stored in `samples[start:pos]` + - the samples of right child node are stored in `samples[pos:end]` """ def __getstate__(self): return {} @@ -41,61 +52,23 @@ cdef class Criterion: def __setstate__(self, d): pass - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, - ) except -1 nogil: - """Placeholder for a method which will initialize the criterion. - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - - Parameters - ---------- - y : ndarray, dtype=DOUBLE_t - y is a buffer that can store values for n_outputs target variables - stored as a Cython memoryview. - sample_weight : ndarray, dtype=DOUBLE_t - The weight of each sample stored as a Cython memoryview. - weighted_n_samples : double - The total weight of the samples being considered - sample_indices : ndarray, dtype=SIZE_t - A mask on the samples. Indices of the samples in X and y we want to use, - where sample_indices[start:end] correspond to the samples in this node. 
- start : SIZE_t - The first sample to be used on this node - end : SIZE_t - The last sample used on this node - - """ - pass - cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. - This method must be implemented by the subclass. """ pass cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. - This method must be implemented by the subclass. """ pass cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. - This updates the collected statistics by moving sample_indices[pos:new_pos] from the right child to the left child. It must be implemented by the subclass. - Parameters ---------- new_pos : SIZE_t @@ -105,7 +78,6 @@ cdef class Criterion: cdef double node_impurity(self) noexcept nogil: """Placeholder for calculating the impurity of the node. - Placeholder for a method which will evaluate the impurity of the current node, i.e. the impurity of sample_indices[start:end]. This is the primary function of the criterion class. The smaller the impurity the @@ -116,11 +88,9 @@ cdef class Criterion: cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Placeholder for calculating the impurity of children. - Placeholder for a method which evaluates the impurity in children nodes, i.e. the impurity of sample_indices[start:pos] + the impurity of sample_indices[pos:end]. - Parameters ---------- impurity_left : double pointer @@ -134,10 +104,8 @@ cdef class Criterion: cdef void node_value(self, double* dest) noexcept nogil: """Placeholder for storing the node value. - Placeholder for a method which will compute the node value of sample_indices[start:end] and save the value into dest. - Parameters ---------- dest : double pointer @@ -147,12 +115,10 @@ cdef class Criterion: cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -167,28 +133,21 @@ cdef class Criterion: double impurity_left, double impurity_right) noexcept nogil: """Compute the improvement in impurity. - This method computes the improvement in impurity when a split occurs. The weighted impurity improvement equation is the following: - N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) - where N is the total number of samples, N_t is the number of samples at the current node, N_t_L is the number of samples in the left child, and N_t_R is the number of samples in the right child, - Parameters ---------- impurity_parent : double The initial impurity of the parent node before the split - impurity_left : double The impurity of the left child - impurity_right : double The impurity of the right child - Return ------ double : improvement in impurity after the split occurs @@ -199,6 +158,61 @@ cdef class Criterion: - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Abstract method which will set sample pointers in the criterion. 
+ The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. + This function should also update relevant statistics that the class uses to compute the final criterion. + Parameters + ---------- + start : SIZE_t + The index of the first sample to be used on computation of criteria of the current node. + end : SIZE_t + The last sample used on this node + """ + pass + + +cdef class Criterion(BaseCriterion): + """Interface for impurity criteria. + The supervised criterion computes the impurity of a node and the reduction of + impurity of a split on that node using the distribution of labels in parent and + children nodes. It also computes the output statistics + such as the mean in regression and class probabilities in classification. + Instances of this class are responsible for compute splits' impurity difference + Criterion is the base class for criteria used in supervised tree-based models + with a homogeneous float64-dtyped y. + """ + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) except -1 nogil: + """Placeholder for a method which will initialize the criterion. + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + Parameters + ---------- + y : ndarray, dtype=DOUBLE_t + y is a buffer that can store values for n_outputs target variables + stored as a Cython memoryview. + sample_weight : ndarray, dtype=DOUBLE_t + The weight of each sample stored as a Cython memoryview. + weighted_n_samples : double + The total weight of the samples being considered + sample_indices : ndarray, dtype=SIZE_t + A mask on the samples. Indices of the samples in X and y we want to use, + where sample_indices[start:end] correspond to the samples in this node. + """ + pass + cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" @@ -206,7 +220,6 @@ cdef class ClassificationCriterion(Criterion): def __cinit__(self, SIZE_t n_outputs, cnp.ndarray[SIZE_t, ndim=1] n_classes): """Initialize attributes for this criterion. - Parameters ---------- n_outputs : SIZE_t @@ -254,18 +267,11 @@ cdef class ClassificationCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end + const SIZE_t[:] sample_indices ) except -1 nogil: """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. - Parameters ---------- y : ndarray, dtype=DOUBLE_t @@ -277,18 +283,24 @@ cdef class ClassificationCriterion(Criterion): sample_indices : ndarray, dtype=SIZE_t A mask on the samples. Indices of the samples in X and y we want to use, where sample_indices[start:end] correspond to the samples in this node. 
- start : SIZE_t - The first sample to use in the mask - end : SIZE_t - The last sample to use in the mask """ self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + self.n_node_samples = end - start self.start = start self.end = end - self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + self.weighted_n_node_samples = 0.0 cdef SIZE_t i @@ -301,12 +313,12 @@ cdef class ClassificationCriterion(Criterion): memset(&self.sum_total[k, 0], 0, self.n_classes[k] * sizeof(double)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] # w is originally set to be 1.0, meaning that if no sample weights # are given, the default weight of each sample is 1.0. - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] # Count weighted class frequency for each target for k in range(self.n_outputs): @@ -317,11 +329,9 @@ cdef class ClassificationCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -338,7 +348,6 @@ cdef class ClassificationCriterion(Criterion): cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -355,10 +364,8 @@ cdef class ClassificationCriterion(Criterion): cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. - Parameters ---------- new_pos : SIZE_t @@ -428,7 +435,6 @@ cdef class ClassificationCriterion(Criterion): cdef void node_value(self, double* dest) noexcept nogil: """Compute the node value of sample_indices[start:end] and save it into dest. - Parameters ---------- dest : double pointer @@ -443,23 +449,17 @@ cdef class ClassificationCriterion(Criterion): cdef class Entropy(ClassificationCriterion): r"""Cross Entropy impurity criterion. - This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let - count_k = 1 / Nm \sum_{x_i in Rm} I(yi = k) - be the proportion of class k observations in node m. - The cross-entropy is then defined as - cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k) """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the cross-entropy criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -481,10 +481,8 @@ cdef class Entropy(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). - Parameters ---------- impurity_left : double pointer @@ -516,24 +514,18 @@ cdef class Entropy(ClassificationCriterion): cdef class Gini(ClassificationCriterion): r"""Gini Index impurity criterion. 
- This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let - count_k = 1/ Nm \sum_{x_i in Rm} I(yi = k) - be the proportion of class k observations in node m. - The Gini Index is then defined as: - index = \sum_{k=0}^{K-1} count_k (1 - count_k) = 1 - \sum_{k=0}^{K-1} count_k ** 2 """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the Gini criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -559,10 +551,8 @@ cdef class Gini(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]) using the Gini index. - Parameters ---------- impurity_left : double pointer @@ -601,24 +591,20 @@ cdef class Gini(ClassificationCriterion): cdef class RegressionCriterion(Criterion): r"""Abstract regression criterion. - This handles cases where the target is a continuous value, and is evaluated by computing the variance of the target values left and right of the split point. The computation takes linear time with `n_samples` by using :: - var = \sum_i^n (y_i - y_bar) ** 2 = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. - Parameters ---------- n_outputs : SIZE_t The number of targets to be predicted - n_samples : SIZE_t The total number of samples to fit on """ @@ -648,23 +634,29 @@ cdef class RegressionCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, + const SIZE_t[:] sample_indices ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - """ + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + + self.sq_sum_total = 0.0 self.weighted_n_node_samples = 0. cdef SIZE_t i @@ -673,14 +665,14 @@ cdef class RegressionCriterion(Criterion): cdef DOUBLE_t y_ik cdef DOUBLE_t w_y_ik cdef DOUBLE_t w = 1.0 - self.sq_sum_total = 0.0 + memset(&self.sum_total[0], 0, self.n_outputs * sizeof(double)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] @@ -692,7 +684,6 @@ cdef class RegressionCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start.""" @@ -785,13 +776,11 @@ cdef class RegressionCriterion(Criterion): cdef class MSE(RegressionCriterion): """Mean squared error impurity criterion. 
- MSE = var_left + var_right """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the MSE criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -807,22 +796,16 @@ cdef class MSE(RegressionCriterion): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. - The MSE proxy is derived from - sum_{i left}(y_i - y_pred_L)^2 + sum_{i right}(y_i - y_pred_R)^2 = sum(y_i^2) - n_L * mean_{i left}(y_i)^2 - n_R * mean_{i right}(y_i)^2 - Neglecting constant terms, this gives: - - 1/n_L * sum_{i left}(y_i)^2 - 1/n_R * sum_{i right}(y_i)^2 """ cdef SIZE_t k @@ -839,7 +822,6 @@ cdef class MSE(RegressionCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). """ @@ -883,7 +865,6 @@ cdef class MSE(RegressionCriterion): cdef class MAE(RegressionCriterion): r"""Mean absolute error impurity criterion. - MAE = (1 / n)*(\sum_i |y_i - f_i|), where y_i is the true value and f_i is the predicted value.""" @@ -895,12 +876,10 @@ cdef class MAE(RegressionCriterion): def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. - Parameters ---------- n_outputs : SIZE_t The number of targets to be predicted - n_samples : SIZE_t The total number of samples to fit on """ @@ -933,26 +912,30 @@ cdef class MAE(RegressionCriterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, + const SIZE_t[:] sample_indices ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - """ - cdef SIZE_t i, p, k - cdef DOUBLE_t w = 1.0 - + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + cdef SIZE_t i, p, k + cdef DOUBLE_t w = 1.0 + self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples self.weighted_n_node_samples = 0. 
cdef void** left_child = self.left_child_ptr @@ -963,10 +946,10 @@ cdef class MAE(RegressionCriterion): ( right_child[k]).reset() for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): # push method ends up calling safe_realloc, hence `except -1` @@ -981,11 +964,9 @@ cdef class MAE(RegressionCriterion): # Reset to pos=start self.reset() - return 0 cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1016,7 +997,6 @@ cdef class MAE(RegressionCriterion): cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1044,7 +1024,6 @@ cdef class MAE(RegressionCriterion): cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1107,7 +1086,6 @@ cdef class MAE(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the MAE criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -1132,7 +1110,6 @@ cdef class MAE(RegressionCriterion): cdef void children_impurity(self, double* p_impurity_left, double* p_impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). """ @@ -1179,21 +1156,17 @@ cdef class MAE(RegressionCriterion): cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman. - Uses the formula (35) in Friedman's original Gradient Boosting paper: - diff = mean_left - mean_right improvement = n_left * n_right * diff^2 / (n_left + n_right) """ cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -1234,9 +1207,7 @@ cdef class FriedmanMSE(MSE): cdef class Poisson(RegressionCriterion): """Half Poisson deviance as impurity criterion. - Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) - Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. It remains the implemented impurity (factor 2 is skipped): @@ -1255,7 +1226,6 @@ cdef class Poisson(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the Poisson criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. 
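For reference, the half Poisson deviance kept in the docstrings above can be reproduced with a few lines of NumPy. This helper is only a sanity-check sketch of the formula (the function name and the zero-handling convention are assumptions; the criterion itself accumulates the sums incrementally in Cython and requires non-negative targets):

```python
import numpy as np

def half_poisson_deviance_sketch(y):
    """mean(y * log(y / y_pred) + y_pred - y) with y_pred = mean(y);
    the factor 2 of the full deviance is skipped, as in the criterion."""
    y = np.asarray(y, dtype=float)
    y_pred = y.mean()
    # terms with y == 0 contribute only y_pred - y (xlogy convention)
    logs = np.zeros_like(y)
    mask = y > 0
    logs[mask] = np.log(y[mask] / y_pred)
    return float(np.mean(y * logs + y_pred - y))

# A pure node has zero impurity, e.g. half_poisson_deviance_sketch([3, 3, 3]) == 0.0,
# while a mixed node such as [0, 1, 2, 5] gives a strictly positive value.
```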
@@ -1265,24 +1235,18 @@ cdef class Poisson(RegressionCriterion): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. - The Poisson proxy is derived from: - sum_{i left }(y_i * log(y_i / y_pred_L)) + sum_{i right}(y_i * log(y_i / y_pred_R)) = sum(y_i * log(y_i) - n_L * mean_{i left}(y_i) * log(mean_{i left}(y_i)) - n_R * mean_{i right}(y_i) * log(mean_{i right}(y_i)) - Neglecting constant terms, this gives - - sum{i left }(y_i) * log(mean{i left}(y_i)) - sum{i right}(y_i) * log(mean{i right}(y_i)) """ @@ -1312,7 +1276,6 @@ cdef class Poisson(RegressionCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity of the right child (sample_indices[pos:end]) for Poisson. """ diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 13fec5974c3c5..b0207ab0a715d 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -4,12 +4,14 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Adam Li +# Jong Shin # # License: BSD 3 clause # See _splitter.pyx for details. -from ._criterion cimport Criterion +from ._criterion cimport BaseCriterion, Criterion from ._tree cimport DTYPE_t # Type of X from ._tree cimport DOUBLE_t # Type of y, sample_weight @@ -28,14 +30,15 @@ cdef struct SplitRecord: double impurity_left # Impurity of the left split. double impurity_right # Impurity of the right split. -cdef class Splitter: +cdef class BaseSplitter: + """Abstract interface for splitter.""" + # The splitter searches in the input space for a feature and a threshold # to split the samples samples[start:end]. # # The impurity computations are delegated to a criterion object. # Internal structures - cdef public Criterion criterion # Impurity criterion cdef public SIZE_t max_features # Number of features to test cdef public SIZE_t min_samples_leaf # Min samples in a leaf cdef public double min_weight_leaf # Minimum weight in a leaf @@ -54,7 +57,6 @@ cdef class Splitter: cdef SIZE_t start # Start position for the current node cdef SIZE_t end # End position for the current node - cdef const DOUBLE_t[:, ::1] y cdef const DOUBLE_t[:] sample_weight # The samples vector `samples` is maintained by the Splitter object such @@ -74,27 +76,38 @@ cdef class Splitter: # This allows optimization with depth-based tree building. 
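The split of the old `Splitter` into `BaseSplitter` plus a supervised `Splitter` (which keeps `criterion`, `y` and `init`) leaves the per-node protocol that the tree builders rely on unchanged. A rough Python paraphrase of that protocol is sketched below; the real methods are `nogil` `cdef` functions that use out-parameters rather than return values, and the function name here is made up:

```python
def grow_node_sketch(splitter, start, end, impurity):
    # 1) Point the splitter (and, for supervised splitters, its criterion)
    #    at samples[start:end] of the current node.
    weighted_n_node_samples = splitter.node_reset(start, end)

    # 2) Search for the best (feature, threshold) pair; in Cython this fills
    #    a SplitRecord passed by pointer plus an n_constant_features counter.
    split = splitter.node_split(impurity)

    # 3) Read off the statistics stored in the tree arrays for this node.
    value = splitter.node_value()
    node_impurity = splitter.node_impurity()
    return split, value, node_impurity, weighted_n_node_samples
```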
# Methods - cdef int init( - self, - object X, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight - ) except -1 - cdef int node_reset( self, SIZE_t start, SIZE_t end, double* weighted_n_node_samples ) except -1 nogil - cdef int node_split( self, double impurity, # Impurity of the node SplitRecord* split, SIZE_t* n_constant_features ) except -1 nogil - cdef void node_value(self, double* dest) noexcept nogil - cdef double node_impurity(self) noexcept nogil + cdef int pointer_size(self) noexcept nogil + +cdef class Splitter(BaseSplitter): + cdef public Criterion criterion # Impurity criterion + cdef const DOUBLE_t[:, ::1] y + + cdef int init( + self, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight + ) except -1 + + # Methods that allow modifications to stopping conditions + cdef bint check_presplit_conditions( + self, + SplitRecord current_split, + ) noexcept nogil + cdef bint check_postsplit_conditions( + self + ) noexcept nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 83a80d90cc1b9..17a747433d1a8 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -8,7 +8,10 @@ # Joel Nothman # Fares Hedayati # Jacob Schreiber +# Adam Li +# Jong Shin # + # License: BSD 3 clause from ._criterion cimport Criterion @@ -43,16 +46,78 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) noexcept nogil self.threshold = 0. self.improvement = -INFINITY -cdef class Splitter: - """Abstract splitter class. +cdef class BaseSplitter: + """This is an abstract interface for splitters. + + For example, a tree model could be either supervisedly, or unsupervisedly computing splits on samples of + covariates, labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for splitting. + + A splitter is usually used in conjunction with a criterion class, which explicitly handles + computing the criteria, which we split on. The setting of that criterion class is handled + by downstream classes. - Splitters are called by tree builders to find the best splits on both - sparse and dense data, one split at a time. + The downstream classes _must_ implement methods to compute the split in a node. """ + def __getstate__(self): + return {} + + def __setstate__(self, d): + pass + + cdef int node_reset(self, SIZE_t start, SIZE_t end, + double* weighted_n_node_samples) except -1 nogil: + """Reset splitter on node samples[start:end]. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + start : SIZE_t + The index of the first sample to consider + end : SIZE_t + The index of the last sample to consider + weighted_n_node_samples : ndarray, dtype=double pointer + The total weight of those samples + """ + pass + + cdef int node_split(self, double impurity, SplitRecord* split, + SIZE_t* n_constant_features) except -1 nogil: + """Find the best split on node samples[start:end]. + + This is a placeholder method. The majority of computation will be done + here. + + It should return -1 upon errors. + """ + pass + + cdef void node_value(self, double* dest) noexcept nogil: + """Copy the value of node samples[start:end] into dest.""" + pass + + cdef double node_impurity(self) noexcept nogil: + """Return the impurity of the current node.""" + pass + + cdef int pointer_size(self) noexcept nogil: + """Size of the pointer for split records. 
+ + Overriding this function allows one to use different subclasses of + `SplitRecord`. + """ + return sizeof(SplitRecord) + +cdef class Splitter(BaseSplitter): + """Abstract interface for supervised splitters.""" + def __cinit__(self, Criterion criterion, SIZE_t max_features, SIZE_t min_samples_leaf, double min_weight_leaf, - object random_state): + object random_state, *argv): """ Parameters ---------- @@ -75,7 +140,6 @@ cdef class Splitter: random_state : object The user inputted random state to be used for pseudo-randomness """ - self.criterion = criterion self.n_samples = 0 @@ -86,11 +150,6 @@ cdef class Splitter: self.min_weight_leaf = min_weight_leaf self.random_state = random_state - def __getstate__(self): - return {} - - def __setstate__(self, d): - pass def __reduce__(self): return (type(self), (self.criterion, @@ -127,7 +186,6 @@ cdef class Splitter: are assumed to have uniform weight. This is represented as a Cython memoryview. """ - self.rand_r_state = self.random_state.randint(0, RAND_R_MAX) cdef SIZE_t n_samples = X.shape[0] @@ -165,6 +223,19 @@ cdef class Splitter: self.y = y self.sample_weight = sample_weight + + self.criterion.init( + self.y, + self.sample_weight, + self.weighted_n_samples, + self.samples + ) + + self.criterion.set_sample_pointers( + self.start, + self.end + ) + return 0 cdef int node_reset(self, SIZE_t start, SIZE_t end, @@ -187,30 +258,11 @@ cdef class Splitter: self.start = start self.end = end - self.criterion.init( - self.y, - self.sample_weight, - self.weighted_n_samples, - self.samples, - start, - end - ) + self.criterion.set_sample_pointers(start, end) weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples return 0 - cdef int node_split(self, double impurity, SplitRecord* split, - SIZE_t* n_constant_features) except -1 nogil: - """Find the best split on node samples[start:end]. - - This is a placeholder method. The majority of computation will be done - here. - - It should return -1 upon errors. - """ - - pass - cdef void node_value(self, double* dest) noexcept nogil: """Copy the value of node samples[start:end] into dest.""" @@ -221,6 +273,41 @@ cdef class Splitter: return self.criterion.node_impurity() + cdef bint check_presplit_conditions( + self, + SplitRecord current_split, + ) noexcept nogil: + """Check stopping conditions pre-split. + + This is typically a metric that is cheaply computed given the + current proposed split, which is stored as a the `current_split` + argument. + """ + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + + if (((current_split.pos - self.start) < min_samples_leaf) or + ((self.end - current_split.pos) < min_samples_leaf)): + return 1 + + return 0 + + cdef bint check_postsplit_conditions( + self + ) noexcept nogil: + """Check stopping conditions after evaluating the split. + + This takes some metric that is stored in the Criterion + object and checks against internal stop metrics. + """ + cdef double min_weight_leaf = self.min_weight_leaf + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + return 1 + + return 0 + # Introduce a fused-class to make it possible to share the split implementation # between the dense and sparse cases in the node_split_best and node_split_random # functions. 
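Two consequences of the refactor above are worth spelling out: `criterion.init` is now called once per fit inside `Splitter.init`, while `node_reset` only moves the sample pointers via `set_sample_pointers(start, end)`; and the `min_samples_leaf` / `min_weight_leaf` rejections are funnelled through `check_presplit_conditions` / `check_postsplit_conditions`, so a downstream splitter can change its stopping rules without touching the split search. The subclass below is purely hypothetical and uses simplified Python signatures (the Cython versions read the counts from `self` and a `SplitRecord`):

```python
class LopsidedSplitRejecterSketch:
    """Hypothetical splitter tweak: keep the stock split search but also
    reject splits that send almost everything to one child."""

    def __init__(self, min_samples_leaf, min_weight_leaf, max_fraction=0.95):
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_leaf = min_weight_leaf
        self.max_fraction = max_fraction  # made-up extra stopping rule

    def check_presplit_conditions(self, pos, start, end):
        n_left, n_right = pos - start, end - pos
        # stock rule: both children need at least min_samples_leaf samples
        if n_left < self.min_samples_leaf or n_right < self.min_samples_leaf:
            return 1
        # extra rule: reject very unbalanced splits
        if max(n_left, n_right) > self.max_fraction * (n_left + n_right):
            return 1
        return 0

    def check_postsplit_conditions(self, weighted_n_left, weighted_n_right):
        # stock rule: both children need at least min_weight_leaf total weight
        if (weighted_n_left < self.min_weight_leaf
                or weighted_n_right < self.min_weight_leaf):
            return 1
        return 0
```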
The alternative would have been to use inheritance-based polymorphism @@ -229,7 +316,7 @@ cdef class Splitter: ctypedef fused Partitioner: DensePartitioner SparsePartitioner - + cdef inline int node_split_best( Splitter splitter, Partitioner partitioner, @@ -349,15 +436,13 @@ cdef inline int node_split_best( current_split.pos = p # Reject if min_samples_leaf is not guaranteed - if (((current_split.pos - start) < min_samples_leaf) or - ((end - current_split.pos) < min_samples_leaf)): + if splitter.check_presplit_conditions(current_split) == 1: continue criterion.update(current_split.pos) # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() @@ -645,8 +730,7 @@ cdef inline int node_split_random( current_split.pos = partitioner.partition_samples(current_split.threshold) # Reject if min_samples_leaf is not guaranteed - if (((current_split.pos - start) < min_samples_leaf) or - ((end - current_split.pos) < min_samples_leaf)): + if splitter.check_presplit_conditions(current_split) == 1: continue # Evaluate split @@ -656,8 +740,7 @@ cdef inline int node_split_random( criterion.update(current_split.pos) # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 1966651d8c89a..8140733a9fc26 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -13,6 +13,8 @@ import numpy as np cimport numpy as cnp +from libcpp.vector cimport vector + ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef cnp.npy_intp SIZE_t # Type for indices and counters @@ -33,40 +35,32 @@ cdef struct Node: SIZE_t n_node_samples # Number of samples at the node DOUBLE_t weighted_n_node_samples # Weighted number of samples at the node - -cdef class Tree: - # The Tree object is a binary tree structure constructed by the - # TreeBuilder. The tree structure is used for predictions and - # feature importances. - - # Input/Output layout - cdef public SIZE_t n_features # Number of features in X - cdef SIZE_t* n_classes # Number of classes in y[:, k] - cdef public SIZE_t n_outputs # Number of outputs in y - cdef public SIZE_t max_n_classes # max(n_classes) - +cdef class BaseTree: # Inner structures: values are stored separately from node structure, # since size is determined at runtime. cdef public SIZE_t max_depth # Max depth of the tree cdef public SIZE_t node_count # Counter for node IDs cdef public SIZE_t capacity # Capacity of tree, in terms of nodes cdef Node* nodes # Array of nodes - cdef double* value # (capacity, n_outputs, max_n_classes) array of values - cdef SIZE_t value_stride # = n_outputs * max_n_classes - # Methods - cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples) except -1 nogil + cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample + cdef double* value # Array of values prediction values for each node + + # Generic Methods: These are generic methods used by any tree. 
cdef int _resize(self, SIZE_t capacity) except -1 nogil cdef int _resize_c(self, SIZE_t capacity=*) except -1 nogil - - cdef cnp.ndarray _get_value_ndarray(self) - cdef cnp.ndarray _get_node_ndarray(self) - - cpdef cnp.ndarray predict(self, object X) - + cdef SIZE_t _add_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples + ) except -1 nogil + + # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) cdef cnp.ndarray _apply_sparse_csr(self, object X) @@ -78,6 +72,49 @@ cdef class Tree: cpdef compute_node_depths(self) cpdef compute_feature_importances(self, normalize=*) + # Abstract methods: these functions must be implemented by any decision tree + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil + cdef DTYPE_t _compute_feature( + self, + const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node + ) noexcept nogil + cdef void _compute_feature_importances( + self, + cnp.float64_t[:] importances, + Node* node, + ) noexcept nogil + +cdef class Tree(BaseTree): + # The Supervised Tree object is a binary tree structure constructed by the + # TreeBuilder. The tree structure is used for predictions and + # feature importances. + # + # Value of upstream properties: + # - value_stride = n_outputs * max_n_classes + # - value = (capacity, n_outputs, max_n_classes) array of values + + # Input/Output layout for supervised tree + cdef public SIZE_t n_features # Number of features in X + cdef SIZE_t* n_classes # Number of classes in y[:, k] + cdef public SIZE_t n_outputs # Number of outputs in y + cdef public SIZE_t max_n_classes # max(n_classes) + + # Methods + cdef cnp.ndarray _get_value_ndarray(self) + cdef cnp.ndarray _get_node_ndarray(self) + + cpdef cnp.ndarray predict(self, object X) # ============================================================================= # Tree builder @@ -91,8 +128,7 @@ cdef class TreeBuilder: # This class controls the various stopping criteria and the node splitting # evaluation order, e.g. depth-first or best-first. 
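The abstract hooks declared for `BaseTree` above are what make the traversal code reusable: `apply` and `decision_path` compare `_compute_feature(...)` against `node.threshold`, so an axis-aligned tree simply returns `X[i, node.feature]`, while an oblique subclass could return a linear projection instead. Only the first function below mirrors what this patch implements; the second is an assumption about how such a subclass might store its projections:

```python
import numpy as np

def compute_feature_axis_aligned(X, sample_index, node_feature):
    # What BaseTree._compute_feature does for ordinary decision trees.
    return X[sample_index, node_feature]

def compute_feature_oblique_sketch(X, sample_index, proj_indices, proj_weights):
    # Hypothetical oblique variant: a sparse linear combination of features.
    return float(np.dot(X[sample_index, proj_indices], proj_weights))

# In both cases the traversal itself is unchanged:
#   go to node.left_child  if computed_feature <= node.threshold
#   go to node.right_child otherwise
```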
- cdef Splitter splitter # Splitting algorithm - + cdef Splitter splitter cdef SIZE_t min_samples_split # Minimum number of samples in an internal node cdef SIZE_t min_samples_leaf # Minimum number of samples in a leaf cdef double min_weight_leaf # Minimum weight in a leaf diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 75eed058bfd4e..e5b759aee23df 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -22,6 +22,8 @@ from libcpp.vector cimport vector from libcpp.algorithm cimport pop_heap from libcpp.algorithm cimport push_heap from libcpp cimport bool +from cython.operator cimport dereference as deref +from libc.stdlib cimport malloc, free import struct @@ -83,6 +85,7 @@ NODE_DTYPE = np.asarray((&dummy)).dtype # TreeBuilder # ============================================================================= + cdef class TreeBuilder: """Interface for different tree building strategies.""" @@ -196,9 +199,11 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef bint is_left cdef SIZE_t n_node_samples = splitter.n_samples cdef double weighted_n_node_samples - cdef SplitRecord split cdef SIZE_t node_id + cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef double impurity = INFINITY cdef SIZE_t n_constant_features cdef bint is_leaf @@ -248,7 +253,12 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): is_leaf = is_leaf or impurity <= EPSILON if not is_leaf: - splitter.node_split(impurity, &split, &n_constant_features) + splitter.node_split(impurity, split_ptr, &n_constant_features) + + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are # dissimilar to v0.18 @@ -256,8 +266,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): (split.improvement + EPSILON < min_impurity_decrease)) - node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.threshold, impurity, n_node_samples, + node_id = tree._add_node(parent, is_left, is_leaf, split_ptr, + impurity, n_node_samples, weighted_n_node_samples) if node_id == INTPTR_MAX: @@ -297,6 +307,10 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if rc >= 0: tree.max_depth = max_depth_seen + + # free the memory created for the SplitRecord pointer + free(split_ptr) + if rc == -1: raise MemoryError() @@ -462,6 +476,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): FrontierRecord* res) except -1 nogil: """Adds node w/ partition ``[start, end)`` to the frontier. 
""" cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef SIZE_t node_id cdef SIZE_t n_node_samples cdef SIZE_t n_constant_features = 0 @@ -483,7 +499,11 @@ cdef class BestFirstTreeBuilder(TreeBuilder): ) if not is_leaf: - splitter.node_split(impurity, &split, &n_constant_features) + splitter.node_split(impurity, split_ptr, &n_constant_features) + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision issues stop # splitting early, producing trees that are dissimilar to v0.18 is_leaf = (is_leaf or split.pos >= end or @@ -493,7 +513,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if parent != NULL else _TREE_UNDEFINED, is_left, is_leaf, - split.feature, split.threshold, impurity, n_node_samples, + split_ptr, impurity, n_node_samples, weighted_n_node_samples) if node_id == INTPTR_MAX: return -1 @@ -522,7 +542,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): res.improvement = 0.0 res.impurity_left = impurity res.impurity_right = impurity - + + free(split_ptr) return 0 @@ -530,190 +551,15 @@ cdef class BestFirstTreeBuilder(TreeBuilder): # Tree # ============================================================================= -cdef class Tree: - """Array-based representation of a binary decision tree. - - The binary tree is represented as a number of parallel arrays. The i-th - element of each array holds information about the node `i`. Node 0 is the - tree's root. You can find a detailed description of all arrays in - `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split - nodes, resp. In this case the values of nodes of the other type are - arbitrary! - - Attributes - ---------- - node_count : int - The number of nodes (internal nodes + leaves) in the tree. - - capacity : int - The current capacity (i.e., size) of the arrays, which is at least as - great as `node_count`. - - max_depth : int - The depth of the tree, i.e. the maximum depth of its leaves. - - children_left : array of int, shape [node_count] - children_left[i] holds the node id of the left child of node i. - For leaves, children_left[i] == TREE_LEAF. Otherwise, - children_left[i] > i. This child handles the case where - X[:, feature[i]] <= threshold[i]. - - children_right : array of int, shape [node_count] - children_right[i] holds the node id of the right child of node i. - For leaves, children_right[i] == TREE_LEAF. Otherwise, - children_right[i] > i. This child handles the case where - X[:, feature[i]] > threshold[i]. - - feature : array of int, shape [node_count] - feature[i] holds the feature to split on, for the internal node i. - - threshold : array of double, shape [node_count] - threshold[i] holds the threshold for the internal node i. - - value : array of double, shape [node_count, n_outputs, max_n_classes] - Contains the constant prediction value of each node. - - impurity : array of double, shape [node_count] - impurity[i] holds the impurity (i.e., the value of the splitting - criterion) at node i. - - n_node_samples : array of int, shape [node_count] - n_node_samples[i] holds the number of training samples reaching node i. - - weighted_n_node_samples : array of double, shape [node_count] - weighted_n_node_samples[i] holds the weighted number of training samples - reaching node i. +cdef class BaseTree: + """Base class for Cython tree models. + + Downstream classes must implement """ - # Wrap for outside world. 
- # WARNING: these reference the current `nodes` and `value` buffers, which - # must not be freed by a subsequent memory allocation. - # (i.e. through `_resize` or `__setstate__`) - property n_classes: - def __get__(self): - return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) - - property children_left: - def __get__(self): - return self._get_node_ndarray()['left_child'][:self.node_count] - - property children_right: - def __get__(self): - return self._get_node_ndarray()['right_child'][:self.node_count] - - property n_leaves: - def __get__(self): - return np.sum(np.logical_and( - self.children_left == -1, - self.children_right == -1)) - - property feature: - def __get__(self): - return self._get_node_ndarray()['feature'][:self.node_count] - - property threshold: - def __get__(self): - return self._get_node_ndarray()['threshold'][:self.node_count] - - property impurity: - def __get__(self): - return self._get_node_ndarray()['impurity'][:self.node_count] - - property n_node_samples: - def __get__(self): - return self._get_node_ndarray()['n_node_samples'][:self.node_count] - - property weighted_n_node_samples: - def __get__(self): - return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] - - property value: - def __get__(self): - return self._get_value_ndarray()[:self.node_count] - - # TODO: Convert n_classes to cython.integral memory view once - # https://github.com/cython/cython/issues/5243 is fixed - def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): - """Constructor.""" - cdef SIZE_t dummy = 0 - size_t_dtype = np.array(dummy).dtype - - n_classes = _check_n_classes(n_classes, size_t_dtype) - - # Input/Output layout - self.n_features = n_features - self.n_outputs = n_outputs - self.n_classes = NULL - safe_realloc(&self.n_classes, n_outputs) - - self.max_n_classes = np.max(n_classes) - self.value_stride = n_outputs * self.max_n_classes - - cdef SIZE_t k - for k in range(n_outputs): - self.n_classes[k] = n_classes[k] - - # Inner structures - self.max_depth = 0 - self.node_count = 0 - self.capacity = 0 - self.value = NULL - self.nodes = NULL - - def __dealloc__(self): - """Destructor.""" - # Free all inner structures - free(self.n_classes) - free(self.value) - free(self.nodes) - - def __reduce__(self): - """Reduce re-implementation, for pickling.""" - return (Tree, (self.n_features, - sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), - self.n_outputs), self.__getstate__()) - - def __getstate__(self): - """Getstate re-implementation, for pickling.""" - d = {} - # capacity is inferred during the __setstate__ using nodes - d["max_depth"] = self.max_depth - d["node_count"] = self.node_count - d["nodes"] = self._get_node_ndarray() - d["values"] = self._get_value_ndarray() - return d - - def __setstate__(self, d): - """Setstate re-implementation, for unpickling.""" - self.max_depth = d["max_depth"] - self.node_count = d["node_count"] - - if 'nodes' not in d: - raise ValueError('You have loaded Tree version which ' - 'cannot be imported') - - node_ndarray = d['nodes'] - value_ndarray = d['values'] - - value_shape = (node_ndarray.shape[0], self.n_outputs, - self.max_n_classes) - - node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) - value_ndarray = _check_value_ndarray( - value_ndarray, - expected_dtype=np.dtype(np.float64), - expected_shape=value_shape - ) - - self.capacity = node_ndarray.shape[0] - if self._resize_c(self.capacity) != 0: - raise MemoryError("resizing tree to %d" % self.capacity) - - nodes = 
memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), - self.capacity * sizeof(Node)) - value = memcpy(self.value, cnp.PyArray_DATA(value_ndarray), - self.capacity * self.value_stride * sizeof(double)) - - cdef int _resize(self, SIZE_t capacity) except -1 nogil: + cdef int _resize( + self, + SIZE_t capacity + ) except -1 nogil: """Resize all inner arrays to `capacity`, if `capacity` == -1, then double the size of the inner arrays. @@ -725,7 +571,10 @@ cdef class Tree: with gil: raise MemoryError() - cdef int _resize_c(self, SIZE_t capacity=INTPTR_MAX) except -1 nogil: + cdef int _resize_c( + self, + SIZE_t capacity=INTPTR_MAX + ) except -1 nogil: """Guts of _resize Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -756,14 +605,87 @@ cdef class Tree: self.capacity = capacity return 0 - cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples) except -1 nogil: - """Add a node to the tree. + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil: + """Set split node data. + + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the split node data. + node : Node* + The pointer to the node that will hold the split node. + """ + # left_child and right_child will be set later for a split node + node.feature = split_node.feature + node.threshold = split_node.threshold + return 1 + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil: + """Set leaf node data. + + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the leaf node data. + node : Node* + The pointer to the node that will hold the leaf node. + """ + node.left_child = _TREE_LEAF + node.right_child = _TREE_LEAF + node.feature = _TREE_UNDEFINED + node.threshold = _TREE_UNDEFINED + return 1 + + cdef DTYPE_t _compute_feature(self, const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node) noexcept nogil: + """Compute feature from a given data matrix, X. + + In axis-aligned trees, this is simply the value in the column of X + for this specific feature. + """ + # the feature index + cdef DTYPE_t feature = X_ndarray[sample_index, node.feature] + return feature + + cdef SIZE_t _add_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples + ) except -1 nogil: + """Add a node to the tree. The new node registers itself as the child of its parent. - + Parameters + ---------- + parent : SIZE_t + The index of the parent. If '_TREE_UNDEFINED', then the current + node is a root node. + is_left : bint + Whether or not the current node is to the left of the parent node. + is_leaf : bint + Whether or not the current node is a leaf node. + split_node : SplitRecord* + A pointer to a SplitRecord pointer address. + impurity : double + The impurity of the node to be added. + n_node_samples : SIZE_t + The number of samples in the node. + weighted_n_node_samples : double + The weight of the samples in the node. + Returns (size_t)(-1) on error. 
""" cdef SIZE_t node_id = self.node_count @@ -784,28 +706,18 @@ cdef class Tree: self.nodes[parent].right_child = node_id if is_leaf: - node.left_child = _TREE_LEAF - node.right_child = _TREE_LEAF - node.feature = _TREE_UNDEFINED - node.threshold = _TREE_UNDEFINED - + if self._set_leaf_node(split_node, node) != 1: + with gil: + raise RuntimeError else: - # left_child and right_child will be set later - node.feature = feature - node.threshold = threshold + if self._set_split_node(split_node, node) != 1: + with gil: + raise RuntimeError self.node_count += 1 return node_id - cpdef cnp.ndarray predict(self, object X): - """Predict target for X.""" - out = self._get_value_ndarray().take(self.apply(X), axis=0, - mode='clip') - if self.n_outputs == 1: - out = out.reshape(X.shape[0], self.max_n_classes) - return out - cpdef cnp.ndarray apply(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" if issparse(X): @@ -835,13 +747,20 @@ cdef class Tree: cdef Node* node = NULL cdef SIZE_t i = 0 + # the feature value + cdef DTYPE_t feature_value = 0 + with nogil: for i in range(n_samples): node = self.nodes + # While node not a leaf while node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - if X_ndarray[i, node.feature] <= node.threshold: + + # compute the feature value to compare against threshold + feature_value = self._compute_feature(X_ndarray, i, node) + if feature_value <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -902,7 +821,6 @@ cdef class Tree: # ... and node.right_child != _TREE_LEAF: if feature_to_sample[node.feature] == i: feature_value = X_sample[node.feature] - else: feature_value = 0. @@ -951,6 +869,9 @@ cdef class Tree: cdef Node* node = NULL cdef SIZE_t i = 0 + # the feature index + cdef DOUBLE_t feature + with nogil: for i in range(n_samples): node = self.nodes @@ -962,7 +883,9 @@ cdef class Tree: indices[indptr[i + 1]] = (node - self.nodes) indptr[i + 1] += 1 - if X_ndarray[i, node.feature] <= node.threshold: + # compute the feature value to compare against threshold + feature = self._compute_feature(X_ndarray, i, node) + if feature <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -1091,8 +1014,6 @@ cdef class Tree: cpdef compute_feature_importances(self, normalize=True): """Computes the importance of each feature (aka variable).""" - cdef Node* left - cdef Node* right cdef Node* nodes = self.nodes cdef Node* node = nodes cdef Node* end_node = node + self.node_count @@ -1105,13 +1026,9 @@ cdef class Tree: while node != end_node: if node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - left = &nodes[node.left_child] - right = &nodes[node.right_child] - - importances[node.feature] += ( - node.weighted_n_node_samples * node.impurity - - left.weighted_n_node_samples * left.impurity - - right.weighted_n_node_samples * right.impurity) + self._compute_feature_importances( + importances, node) + node += 1 for i in range(self.n_features): @@ -1127,44 +1044,27 @@ cdef class Tree: return np.asarray(importances) - cdef cnp.ndarray _get_value_ndarray(self): - """Wraps value as a 3-d NumPy array. - - The array keeps a reference to this Tree, which manages the underlying - memory. + cdef void _compute_feature_importances( + self, + cnp.float64_t[:] importances, + Node* node + ) noexcept nogil: + """Compute feature importances from a Node in the Tree. 
+ + Wrapped in a private function to allow subclassing that + computes feature importances. """ - cdef cnp.npy_intp shape[3] - shape[0] = self.node_count - shape[1] = self.n_outputs - shape[2] = self.max_n_classes - cdef cnp.ndarray arr - arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr + cdef Node* nodes = self.nodes + cdef Node* left + cdef Node* right - cdef cnp.ndarray _get_node_ndarray(self): - """Wraps nodes as a NumPy struct array. + left = &nodes[node.left_child] + right = &nodes[node.right_child] - The array keeps a reference to this Tree, which manages the underlying - memory. Individual fields are publicly accessible as properties of the - Tree. - """ - cdef cnp.npy_intp shape[1] - shape[0] = self.node_count - cdef cnp.npy_intp strides[1] - strides[0] = sizeof(Node) - cdef cnp.ndarray arr - Py_INCREF(NODE_DTYPE) - arr = PyArray_NewFromDescr( cnp.ndarray, - NODE_DTYPE, 1, shape, - strides, self.nodes, - cnp.NPY_ARRAY_DEFAULT, None) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr + importances[node.feature] += ( + node.weighted_n_node_samples * node.impurity - + left.weighted_n_node_samples * left.impurity - + right.weighted_n_node_samples * right.impurity) def compute_partial_dependence(self, DTYPE_t[:, ::1] X, int[::1] target_features, @@ -1273,6 +1173,237 @@ cdef class Tree: total_weight) +cdef class Tree(BaseTree): + """Array-based representation of a binary decision tree. + + The binary tree is represented as a number of parallel arrays. The i-th + element of each array holds information about the node `i`. Node 0 is the + tree's root. You can find a detailed description of all arrays in + `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split + nodes, resp. In this case the values of nodes of the other type are + arbitrary! + + Attributes + ---------- + node_count : int + The number of nodes (internal nodes + leaves) in the tree. + + capacity : int + The current capacity (i.e., size) of the arrays, which is at least as + great as `node_count`. + + max_depth : int + The depth of the tree, i.e. the maximum depth of its leaves. + + children_left : array of int, shape [node_count] + children_left[i] holds the node id of the left child of node i. + For leaves, children_left[i] == TREE_LEAF. Otherwise, + children_left[i] > i. This child handles the case where + X[:, feature[i]] <= threshold[i]. + + children_right : array of int, shape [node_count] + children_right[i] holds the node id of the right child of node i. + For leaves, children_right[i] == TREE_LEAF. Otherwise, + children_right[i] > i. This child handles the case where + X[:, feature[i]] > threshold[i]. + + feature : array of int, shape [node_count] + feature[i] holds the feature to split on, for the internal node i. + + threshold : array of double, shape [node_count] + threshold[i] holds the threshold for the internal node i. + + value : array of double, shape [node_count, n_outputs, max_n_classes] + Contains the constant prediction value of each node. + + impurity : array of double, shape [node_count] + impurity[i] holds the impurity (i.e., the value of the splitting + criterion) at node i. + + n_node_samples : array of int, shape [node_count] + n_node_samples[i] holds the number of training samples reaching node i. 
+ + weighted_n_node_samples : array of double, shape [node_count] + weighted_n_node_samples[i] holds the weighted number of training samples + reaching node i. + """ + # Wrap for outside world. + # WARNING: these reference the current `nodes` and `value` buffers, which + # must not be freed by a subsequent memory allocation. + # (i.e. through `_resize` or `__setstate__`) + property n_classes: + def __get__(self): + return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) + + property children_left: + def __get__(self): + return self._get_node_ndarray()['left_child'][:self.node_count] + + property children_right: + def __get__(self): + return self._get_node_ndarray()['right_child'][:self.node_count] + + property n_leaves: + def __get__(self): + return np.sum(np.logical_and( + self.children_left == -1, + self.children_right == -1)) + + property feature: + def __get__(self): + return self._get_node_ndarray()['feature'][:self.node_count] + + property threshold: + def __get__(self): + return self._get_node_ndarray()['threshold'][:self.node_count] + + property impurity: + def __get__(self): + return self._get_node_ndarray()['impurity'][:self.node_count] + + property n_node_samples: + def __get__(self): + return self._get_node_ndarray()['n_node_samples'][:self.node_count] + + property weighted_n_node_samples: + def __get__(self): + return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] + + property value: + def __get__(self): + return self._get_value_ndarray()[:self.node_count] + + # TODO: Convert n_classes to cython.integral memory view once + # https://github.com/cython/cython/issues/5243 is fixed + def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): + """Constructor.""" + cdef SIZE_t dummy = 0 + size_t_dtype = np.array(dummy).dtype + + n_classes = _check_n_classes(n_classes, size_t_dtype) + + # Input/Output layout + self.n_features = n_features + self.n_outputs = n_outputs + self.n_classes = NULL + safe_realloc(&self.n_classes, n_outputs) + + self.max_n_classes = np.max(n_classes) + self.value_stride = n_outputs * self.max_n_classes + + cdef SIZE_t k + for k in range(n_outputs): + self.n_classes[k] = n_classes[k] + + # Inner structures + self.max_depth = 0 + self.node_count = 0 + self.capacity = 0 + self.value = NULL + self.nodes = NULL + + def __dealloc__(self): + """Destructor.""" + # Free all inner structures + free(self.n_classes) + free(self.value) + free(self.nodes) + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return (Tree, (self.n_features, + sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), + self.n_outputs), self.__getstate__()) + + def __getstate__(self): + """Getstate re-implementation, for pickling.""" + d = {} + # capacity is inferred during the __setstate__ using nodes + d["max_depth"] = self.max_depth + d["node_count"] = self.node_count + d["nodes"] = self._get_node_ndarray() + d["values"] = self._get_value_ndarray() + return d + + def __setstate__(self, d): + """Setstate re-implementation, for unpickling.""" + self.max_depth = d["max_depth"] + self.node_count = d["node_count"] + + if 'nodes' not in d: + raise ValueError('You have loaded Tree version which ' + 'cannot be imported') + + node_ndarray = d['nodes'] + value_ndarray = d['values'] + + value_shape = (node_ndarray.shape[0], self.n_outputs, + self.max_n_classes) + + node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) + value_ndarray = _check_value_ndarray( + value_ndarray, + expected_dtype=np.dtype(np.float64), + 
expected_shape=value_shape + ) + + self.capacity = node_ndarray.shape[0] + if self._resize_c(self.capacity) != 0: + raise MemoryError("resizing tree to %d" % self.capacity) + + nodes = memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), + self.capacity * sizeof(Node)) + value = memcpy(self.value, cnp.PyArray_DATA(value_ndarray), + self.capacity * self.value_stride * sizeof(double)) + + cdef cnp.ndarray _get_value_ndarray(self): + """Wraps value as a 3-d NumPy array. + + The array keeps a reference to this Tree, which manages the underlying + memory. + """ + cdef cnp.npy_intp shape[3] + shape[0] = self.node_count + shape[1] = self.n_outputs + shape[2] = self.max_n_classes + cdef cnp.ndarray arr + arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cdef cnp.ndarray _get_node_ndarray(self): + """Wraps nodes as a NumPy struct array. + + The array keeps a reference to this Tree, which manages the underlying + memory. Individual fields are publicly accessible as properties of the + Tree. + """ + cdef cnp.npy_intp shape[1] + shape[0] = self.node_count + cdef cnp.npy_intp strides[1] + strides[0] = sizeof(Node) + cdef cnp.ndarray arr + Py_INCREF(NODE_DTYPE) + arr = PyArray_NewFromDescr( cnp.ndarray, + NODE_DTYPE, 1, shape, + strides, self.nodes, + cnp.NPY_ARRAY_DEFAULT, None) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cpdef cnp.ndarray predict(self, object X): + """Predict target for X.""" + out = self._get_value_ndarray().take(self.apply(X), axis=0, + mode='clip') + if self.n_outputs == 1: + out = out.reshape(X.shape[0], self.max_n_classes) + return out + + def _check_n_classes(n_classes, expected_dtype): if n_classes.ndim != 1: raise ValueError( @@ -1755,6 +1886,8 @@ cdef _build_pruned_tree( stack[BuildPrunedRecord] prune_stack BuildPrunedRecord stack_record + SplitRecord split + with nogil: # push root node onto stack prune_stack.push({"start": 0, "depth": 0, "parent": _TREE_UNDEFINED, "is_left": 0}) @@ -1771,8 +1904,12 @@ cdef _build_pruned_tree( is_leaf = leaves_in_subtree[orig_node_id] node = &orig_tree.nodes[orig_node_id] + # redefine to a SplitRecord to pass into _add_node + split.feature = node.feature + split.threshold = node.threshold + new_node_id = tree._add_node( - parent, is_left, is_leaf, node.feature, node.threshold, + parent, is_left, is_leaf, &split, node.impurity, node.n_node_samples, node.weighted_n_node_samples) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 1f3a9bf394b9b..69f948839259a 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -300,7 +300,7 @@ def test_xor(): clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) - clf = Tree(random_state=0, max_features=1) + clf = Tree(random_state=0, max_features=X.shape[1]) clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) @@ -440,7 +440,7 @@ def test_importances(): X, y = datasets.make_classification( n_samples=5000, n_features=10, - n_informative=3, + n_informative=4, n_redundant=0, n_repeated=0, shuffle=False, @@ -455,7 +455,7 @@ def test_importances(): n_important = np.sum(importances > 0.1) assert importances.shape[0] == 10, "Failed with {0}".format(name) - assert n_important == 3, "Failed with {0}".format(name) + assert n_important == 4, "Failed with {0}".format(name) # Check on iris 
that importances are the same for all builders clf = DecisionTreeClassifier(random_state=0) @@ -466,9 +466,9 @@ def test_importances(): assert_array_equal(clf.feature_importances_, clf2.feature_importances_) -def test_importances_raises(): +@pytest.mark.parametrize("clf", [DecisionTreeClassifier()]) +def test_importances_raises(clf): # Check if variable importance before fit raises ValueError. - clf = DecisionTreeClassifier() with pytest.raises(ValueError): getattr(clf, "feature_importances_") @@ -653,6 +653,7 @@ def test_min_samples_leaf(): est.fit(X, y) out = est.tree_.apply(X) node_counts = np.bincount(out) + # drop inner nodes leaf_count = node_counts[node_counts != 0] assert np.min(leaf_count) > 4, "Failed with {0}".format(name) @@ -677,7 +678,7 @@ def check_min_weight_fraction_leaf(name, datasets, sparse=False): else: X = DATASETS[datasets]["X"].astype(np.float32) y = DATASETS[datasets]["y"] - + rng = np.random.RandomState(42) weights = rng.rand(X.shape[0]) total_weight = np.sum(weights) @@ -828,7 +829,7 @@ def test_min_impurity_decrease(): ) # Check with a much lower value of 0.0001 est3 = TreeEstimator( - max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=0 + max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=1 ) # Check with a much lower value of 0.1 est4 = TreeEstimator( @@ -918,6 +919,7 @@ def test_pickle(): est2 = pickle.loads(serialized_object) assert type(est2) == est.__class__ + # score should match before/after pickling score2 = est2.score(X, y) assert ( score == score2 @@ -1031,7 +1033,6 @@ def test_memory_layout(): ALL_TREES.items(), [np.float64, np.float32] ): est = TreeEstimator(random_state=0) - # Nothing X = np.asarray(iris.data, dtype=dtype) y = iris.target @@ -1052,6 +1053,11 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) + # Strided + X = np.asarray(iris.data[::3], dtype=dtype) + y = iris.target[::3] + assert_array_equal(est.fit(X, y).predict(X), y) + # csr matrix X = csr_matrix(iris.data, dtype=dtype) y = iris.target @@ -1062,11 +1068,6 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) - # Strided - X = np.asarray(iris.data[::3], dtype=dtype) - y = iris.target[::3] - assert_array_equal(est.fit(X, y).predict(X), y) - def test_sample_weight(): # Check sample weighting. 
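The memory-layout hunks above only reorder the strided-input case relative to the sparse-matrix cases; the behaviour they exercise — that C-contiguous, F-contiguous, and strided inputs are all accepted by ``fit``/``predict`` — can be sketched outside the test suite as follows (a standalone illustration, not part of the patch; the estimator and dataset choices are arbitrary)::

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier

    iris = load_iris()

    # Contiguous layouts: both should be accepted without complaint.
    for X in (np.ascontiguousarray(iris.data),   # C-contiguous
              np.asfortranarray(iris.data)):     # F-contiguous
        est = DecisionTreeClassifier(random_state=0).fit(X, iris.target)
        print(est.score(X, iris.target))

    # Strided view (every third sample), mirroring the hunk above.
    X, y = iris.data[::3], iris.target[::3]
    est = DecisionTreeClassifier(random_state=0).fit(X, y)
    print(est.score(X, y))
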
@@ -1260,7 +1261,7 @@ def test_behaviour_constant_feature_after_splits():
     y = [0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3]
     for name, TreeEstimator in ALL_TREES.items():
         # do not check extra random trees
-        if "ExtraTree" not in name:
+        if all(_name not in name for _name in ["ExtraTree"]):
             est = TreeEstimator(random_state=0, max_features=1)
             est.fit(X, y)
             assert est.tree_.max_depth == 2
@@ -1586,6 +1587,7 @@ def check_min_weight_leaf_split_level(name):
     sample_weight = [0.2, 0.2, 0.2, 0.2, 0.2]
     _check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight)

+    # skip for sparse inputs
     _check_min_weight_leaf_split_level(TreeEstimator, csc_matrix(X), y, sample_weight)

@@ -1644,6 +1646,7 @@ def check_decision_path(name):
     # Assert that leaves index are correct
     leaves = est.apply(X)
     leave_indicator = [node_indicator[i, j] for i, j in enumerate(leaves)]
+
     assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples))

     # Ensure only one leave node per sample
@@ -1930,6 +1933,7 @@ def assert_is_subtree(tree, subtree):
 def test_apply_path_readonly_all_trees(name, splitter, X_format):
     dataset = DATASETS["clf_small"]
     X_small = dataset["X"].astype(tree._tree.DTYPE, copy=False)
+
     if X_format == "dense":
         X_readonly = create_memmap_backed_data(X_small)
     else:

From 475bd05f779a4be4f301f751ac86ba6a998a219a Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Wed, 29 Mar 2023 09:41:10 -0700
Subject: [PATCH 2/8] Docs (#39)

#### Reference Issues/PRs

Fixes README and wheel building

---------

Signed-off-by: Adam Li
---
 README.rst                                  | 36 ++++++++++++---------
 build_tools/azure/install.sh                |  2 +-
 build_tools/github/repair_windows_wheels.sh |  2 +-
 3 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/README.rst b/README.rst
index fbdfdaa95ef4c..7a7bd41c42846 100644
--- a/README.rst
+++ b/README.rst
@@ -44,6 +44,10 @@
 .. |PytestMinVersion| replace:: 5.3.1
 .. |PlotlyMinVersion| replace:: 5.10.0

+=================
+Scikit-learn-tree
+=================
+
 ``scikit-learn-tree`` is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line
 with changes from upstream scikit-learn. It is an exact stand-in for ``sklearn`` in package imports, but is
 released under the name ``scikit-learn-tree`` to avoid confusion.
@@ -94,8 +98,7 @@ Installing scikit-learn-tree
 ============================

 Scikit-learn-tree is a maintained fork of scikit-learn, which extends the
-tree submodule in a few ways documented in :ref:`changelog of the fork
-`.
+tree submodule in a few ways documented in `fork_changelog`_.

 We release versions of scikit-learn-tree in an analogous fashion to
 scikit-learn main. Due to maintenance resources, we only release on PyPI
@@ -103,12 +106,11 @@ and therefore recommend installing with ``pip``.

 There are different ways to install scikit-learn-tree:

-  * :ref:`Install the latest official release `. This
+  * Install the latest official release `install_fork_release`_. This
     is the best approach for most users. It will provide a stable version
     and pre-built packages are available for most platforms.

-  * :ref:`Building the package from source
-    `. This is best for users who want the
+  * Building the package from source `install_source`_. This is best for users who want the
     latest-and-greatest features and aren't afraid of running brand-new code. This is also
     needed for users who wish to contribute to the project.

@@ -119,9 +121,7 @@ Installing the latest release
 -----------------------------
 We release wheels for common distributions and this is thus installable via pip.

-.. prompt:: bash $
-
-  pip install scikit-learn-tree
+  pip install scikit-learn-tree

 This will install ``scikit-learn-tree`` under the namespace of ``sklearn``, which then
 can be used as a stand-in for any package that relies on the public API of ``sklearn``.
@@ -146,9 +146,11 @@ features to the fork,
 the building from source instructions are exactly the same as that of scikit-learn main, so
 please refer to `scikit-learn documentation `_ for instructions on building from source.

-Development
-===========
+Development
+-----------
+
 We welcome new contributors of all experience levels, specifically to maintain the fork.
 Any contributions that make sure our fork is "better in-line" with scikit-learn upstream,
 or improves the tree submodule in any way will be appreciated.

@@ -158,15 +160,17 @@ The scikit-learn community goals are to be helpful, welcoming, and effective.
 The development guide has detailed information about contributing code, documentation,
 tests, and more. We've included some basic information in this README.

-.. _fork-changelog:
-Major Changes of the Fork
-=========================
+.. _fork_changelog:
+
+Major Changes of the Fork
+-------------------------
+
 The purpose of this page is to illustrate some of the main features that
 ``scikit-learn-tree`` provides compared to ``scikit-learn``.

 It assumes an understanding of the core package ``scikit-learn`` and also of decision tree
-models. Please refer to our :ref:`installation instructions
-` for installing ``scikit-learn-tree``.
+models. Please refer to our installation instructions `install_fork_release`_ for installing ``scikit-learn-tree``.

 Scikit-learn-tree, though, operates as a stand-in for upstream ``scikit-learn``.
 It is used in packages exactly the same way and will support all features
@@ -193,7 +197,7 @@ Candidate changes and PRs accepted into the fork are those that:

 Decision tree generalizations
 -----------------------------

-``Scikit-learn`` provides an axis-aligned :class:`~sklearn.tree.DecisionTreeClassifier`
+``Scikit-learn`` provides an axis-aligned `sklearn.tree.DecisionTreeClassifier `_
 decision tree model (classifier and regressor), which has a few fundamental limitations
 that prevent 3rd parties from utilizing the existing class, without forking a large
 amount of copy/pasted Python and Cython code. We highlight those limitations here
@@ -239,8 +243,8 @@ Python API:
   random forests and their variants to scale to millions of samples.
   - Our fix: We added a ``max_bins=None`` keyword argument to the ``BaseForest`` class, and all its subclasses. The default behavior is no binning. The current implementation is not necessarily efficient. There are several improvements to be made. See below.

-Overall, the existing tree models, such as :class:`~sklearn.tree.DecisionTreeClassifier`
-and :class:`~sklearn.ensemble.RandomForestClassifier` all work exactly the same as they
+Overall, the existing tree models, such as `sklearn.tree.DecisionTreeClassifier `_
+and `sklearn.ensemble.RandomForestClassifier `_ all work exactly the same as they
 would in ``scikit-learn`` main, but these extensions enable 3rd-party packages
 to extend the Cython/Python API easily.
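To make the stand-in behaviour above concrete, here is a minimal usage sketch. It assumes ``scikit-learn-tree`` has been installed as described in the installation section; ``max_bins`` is the fork-specific forest keyword mentioned above, and the dataset, estimator, and the value ``255`` are only illustrative::

    # scikit-learn-tree installs under the ``sklearn`` namespace,
    # so existing imports keep working unchanged.
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)

    # Stock usage, identical to upstream scikit-learn.
    clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)

    # Fork-specific extension described above: optional feature binning in forests.
    # ``max_bins`` defaults to None (no binning); 255 is just an example value.
    clf_binned = RandomForestClassifier(
        n_estimators=10, random_state=0, max_bins=255
    ).fit(X, y)

    print(clf.score(X, y), clf_binned.score(X, y))
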
diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index db5b5d9414053..5238cd1121d2e 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -7,7 +7,7 @@ set -x source build_tools/shared.sh UNAMESTR=`uname` -CCACHE_LINKS_DIR="/tmp/ccachev2" +CCACHE_LINKS_DIR="/tmp/ccache" setup_ccache() { CCACHE_BIN=`which ccache || echo ""` diff --git a/build_tools/github/repair_windows_wheels.sh b/build_tools/github/repair_windows_wheels.sh index cdd0c0c79d8c4..a857e61067960 100755 --- a/build_tools/github/repair_windows_wheels.sh +++ b/build_tools/github/repair_windows_wheels.sh @@ -9,7 +9,7 @@ DEST_DIR=$2 # By default, the Windows wheels are not repaired. # In this case, we need to vendor VCRUNTIME140.dll wheel unpack "$WHEEL" -WHEEL_DIRNAME=$(ls -d scikit_learn-*) +WHEEL_DIRNAME=$(ls -d scikit_learn_tree-*) python build_tools/github/vendor.py "$WHEEL_DIRNAME" wheel pack "$WHEEL_DIRNAME" -d "$DEST_DIR" rm -rf "$WHEEL_DIRNAME" From 706a74273bf736066b1d71eeed9da08c0943e311 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 4 Apr 2023 14:47:24 -0700 Subject: [PATCH 3/8] Release v1.2.2 #### Reference Issues/PRs #### What does this implement/fix? Explain your changes. #### Any other comments? --------- Signed-off-by: Adam Li --- .github/workflows/check-upstream.yml | 27 +++++++++++++++++++++++++++ sklearn/__init__.py | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/check-upstream.yml diff --git a/.github/workflows/check-upstream.yml b/.github/workflows/check-upstream.yml new file mode 100644 index 0000000000000..80e8ace610607 --- /dev/null +++ b/.github/workflows/check-upstream.yml @@ -0,0 +1,27 @@ +# Create Github Actions workflow that checks upstream scikit-learn 'main' branch and +# creates or updates +# an existing pull request to https://github.com/neurodata/scikit-learn:fork. +# Runs the check weekly. +# Creates a pull request if there are changes. + +# name: Check upstream scikit-learn + +# on: +# schedule: +# - cron: '0 0 * * 0' + +# jobs: +# check-upstream: +# runs-on: ubuntu-latest +# steps: +# - uses: actions/checkout@v2 +# - name: Check upstream scikit-learn +# uses: neurodata/check-upstream@main +# with: +# upstream: scikit-learn/scikit-learn +# fork: neurodata/scikit-learn +# branch: fork +# token: ${{ secrets.GITHUB_TOKEN }} + +# # Creates a pull request if there are changes. + diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 47bb893bd00a0..6d5af7c771fb8 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -39,7 +39,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = "1.3.dev0" +__version__ = "1.2.2" # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded From a22db039704399a31d466be861f2b5a86bbc51b3 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 11 Apr 2023 15:25:44 -0400 Subject: [PATCH 4/8] Update README Signed-off-by: Adam Li --- README.rst | 4 ++-- sklearn/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 7a7bd41c42846..444ead93017b9 100644 --- a/README.rst +++ b/README.rst @@ -48,7 +48,7 @@ Scikit-learn-tree ================= -``scikit-learn-tree`` is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line +``scikit-learn-tree`` is an alias of scikit-learn. 
It is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line with changes from upstream scikit-learn. It is an exact stand-in for ``sklearn`` in package imports, but is released under the name ``scikit-learn-tree`` to avoid confusion. @@ -85,7 +85,7 @@ Installation Dependencies ~~~~~~~~~~~~ -scikit-learn requires: +scikit-learn-tree requires: - Python (>= |PythonMinVersion|) - NumPy (>= |NumPyMinVersion|) diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 6d5af7c771fb8..4d7badd6b678e 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -39,7 +39,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = "1.2.2" +__version__ = "1.3.0dev0" # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded From f7b70429c5ca9d5d2babbf2b038ad3b21f53240d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 28 Apr 2023 10:11:24 -0400 Subject: [PATCH 5/8] Try it Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 35 ++++++++++++++++++++--------------- sklearn/tree/_classes.py | 28 ++++++++++++++-------------- sklearn/tree/_utils.pxd | 2 +- sklearn/tree/_utils.pyx | 2 +- sklearn/tree/meson.build | 32 ++++++++++++++++++++++++++++++++ 5 files changed, 68 insertions(+), 31 deletions(-) create mode 100644 sklearn/tree/meson.build diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index b521235967b79..32e65cc7c0447 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -50,11 +50,16 @@ class calls the ``fit`` method of each sub-estimator on random samples from scipy.sparse import issparse from scipy.sparse import hstack as sparse_hstack -from ..base import is_classifier -from ..base import ClassifierMixin, MultiOutputMixin, RegressorMixin, TransformerMixin +from sklearn.base import is_classifier +from sklearn.base import ( + ClassifierMixin, + MultiOutputMixin, + RegressorMixin, + TransformerMixin, +) -from ..metrics import accuracy_score, r2_score -from ..preprocessing import OneHotEncoder +from sklearn.metrics import accuracy_score, r2_score +from sklearn.preprocessing import OneHotEncoder from ..tree import ( BaseDecisionTree, DecisionTreeClassifier, @@ -63,21 +68,21 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeRegressor, ) from ..tree._tree import DTYPE, DOUBLE -from ..utils import check_random_state, compute_sample_weight -from ..exceptions import DataConversionWarning -from ._base import BaseEnsemble, _partition_estimators -from ..utils.parallel import delayed, Parallel -from ..utils.multiclass import check_classification_targets, type_of_target -from ..utils.validation import ( +from sklearn.utils import check_random_state, compute_sample_weight +from sklearn.exceptions import DataConversionWarning +from sklearn.ensemble._base import BaseEnsemble, _partition_estimators +from sklearn.utils.parallel import delayed, Parallel +from sklearn.utils.multiclass import check_classification_targets, type_of_target +from sklearn.utils.validation import ( check_is_fitted, _check_sample_weight, _check_feature_names_in, ) -from ..utils._openmp_helpers import _openmp_effective_n_threads -from ..utils.validation import _num_samples -from ..utils._param_validation import Interval, StrOptions -from ..utils._param_validation import RealNotInt -from ._hist_gradient_boosting.binning import _BinMapper +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from 
sklearn.utils.validation import _num_samples +from sklearn.utils._param_validation import Interval, StrOptions +from sklearn.utils._param_validation import RealNotInt +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper __all__ = [ "RandomForestClassifier", diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index a795008268acf..b299a1f8a1b37 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -25,20 +25,20 @@ import numpy as np from scipy.sparse import issparse -from ..base import BaseEstimator -from ..base import ClassifierMixin -from ..base import clone -from ..base import RegressorMixin -from ..base import is_classifier -from ..base import MultiOutputMixin -from ..utils import Bunch -from ..utils import check_random_state -from ..utils.validation import _check_sample_weight -from ..utils import compute_sample_weight -from ..utils.multiclass import check_classification_targets -from ..utils.validation import check_is_fitted -from ..utils._param_validation import Hidden, Interval, StrOptions -from ..utils._param_validation import RealNotInt +from sklearn.base import BaseEstimator +from sklearn.base import ClassifierMixin +from sklearn.base import clone +from sklearn.base import RegressorMixin +from sklearn.base import is_classifier +from sklearn.base import MultiOutputMixin +from sklearn.utils import Bunch +from sklearn.utils import check_random_state +from sklearn.utils.validation import _check_sample_weight +from sklearn.utils import compute_sample_weight +from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import check_is_fitted +from sklearn.utils._param_validation import Hidden, Interval, StrOptions +from sklearn.utils._param_validation import RealNotInt from ._criterion import BaseCriterion from ._splitter import BaseSplitter diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 4938d3030245f..f7bae4c5c8553 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -10,7 +10,7 @@ cimport numpy as cnp from ._tree cimport Node -from ..neighbors._quad_tree cimport Cell +from sklearn.neighbors._quad_tree cimport Cell ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 0bde50c315ee8..05eea549e641a 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -14,7 +14,7 @@ from libc.math cimport log as ln cimport numpy as cnp cnp.import_array() -from ..utils._random cimport our_rand_r +from sklearn.utils._random cimport our_rand_r # ============================================================================= # Helper functions diff --git a/sklearn/tree/meson.build b/sklearn/tree/meson.build new file mode 100644 index 0000000000000..3750dae12eaf1 --- /dev/null +++ b/sklearn/tree/meson.build @@ -0,0 +1,32 @@ +extensions = [ + '_splitter', + '_tree', + '_criterion', + '_utils', +] + +foreach ext: extensions + py3.extension_module(ext, + cython_gen_cpp.process(ext + '.pyx'), + c_args: cython_c_args, + include_directories: [incdir_numpy], + install: true, + subdir: 'sktree/_lib/scikit_learn/', + ) +endforeach + +# TODO: comment in _classes.py when we have a working Cython unsupervised tree with a Python API +python_sources = [ + '__init__.py', + '_classes.py', +] + +py3.install_sources( + python_sources, + subdir: 'sktree/tree' # Folder relative to site-packages to install to +) + +# TODO: comment in if we include tests +subdir('tests') 
+subdir('unsupervised') +subdir('manifold') \ No newline at end of file From bd2869bfe62daeaca062106f2aeb6bab71c311c5 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 7 Jun 2023 09:41:02 -0400 Subject: [PATCH 6/8] Trying to merge in changes from main Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 36 ++---------- sklearn/tree/_criterion.pxd | 2 +- sklearn/tree/_criterion.pyx | 114 ++++++------------------------------ sklearn/tree/_splitter.pxd | 1 + sklearn/tree/_splitter.pyx | 73 ++++++++++++----------- sklearn/tree/_tree.pxd | 5 +- sklearn/tree/_tree.pyx | 86 ++++++++++++++------------- 7 files changed, 112 insertions(+), 205 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 5ff7b515b9cfa..b3b4a49937db2 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -25,7 +25,6 @@ import numpy as np from scipy.sparse import issparse -<<<<<<< HEAD from sklearn.base import BaseEstimator from sklearn.base import ClassifierMixin from sklearn.base import clone @@ -37,27 +36,10 @@ from sklearn.utils.validation import _check_sample_weight from sklearn.utils import compute_sample_weight from sklearn.utils.multiclass import check_classification_targets -from sklearn.utils.validation import check_is_fitted +from sklearn.utils.validation import (check_is_fitted, assert_all_finite, + _assert_all_finite_element_wise) from sklearn.utils._param_validation import Hidden, Interval, StrOptions from sklearn.utils._param_validation import RealNotInt -======= -from ..base import BaseEstimator -from ..base import ClassifierMixin -from ..base import clone -from ..base import RegressorMixin -from ..base import is_classifier -from ..base import MultiOutputMixin -from ..utils import Bunch -from ..utils import check_random_state -from ..utils.validation import _check_sample_weight -from ..utils.validation import assert_all_finite -from ..utils.validation import _assert_all_finite_element_wise -from ..utils import compute_sample_weight -from ..utils.multiclass import check_classification_targets -from ..utils.validation import check_is_fitted -from ..utils._param_validation import Hidden, Interval, StrOptions -from ..utils._param_validation import RealNotInt ->>>>>>> main from ._criterion import BaseCriterion from ._splitter import BaseSplitter @@ -194,9 +176,6 @@ def get_n_leaves(self): check_is_fitted(self) return self.tree_.n_leaves -<<<<<<< HEAD - def fit(self, X, y=None, sample_weight=None, check_input=True): -======= def _support_missing_values(self, X): return not issparse(X) and self._get_tags()["allow_nan"] @@ -239,7 +218,6 @@ def _compute_feature_has_missing(self, X): def _fit( self, X, y, sample_weight=None, check_input=True, feature_has_missing=None ): ->>>>>>> main self._validate_params() random_state = check_random_state(self.random_state) @@ -254,20 +232,14 @@ def _fit( dtype=DTYPE, accept_sparse="csc", force_all_finite=False ) check_y_params = dict(ensure_2d=False, dtype=None) -<<<<<<< HEAD if y is not None or self._get_tags()["requires_y"]: X, y = self._validate_data( X, y, validate_separately=(check_X_params, check_y_params) ) else: X = self._validate_data(X, **check_X_params) -======= - X, y = self._validate_data( - X, y, validate_separately=(check_X_params, check_y_params) - ) feature_has_missing = self._compute_feature_has_missing(X) ->>>>>>> main if issparse(X): X.sort_indices() @@ -415,6 +387,7 @@ def _fit( X, y, sample_weight, + feature_has_missing, min_samples_leaf, min_weight_leaf, max_leaf_nodes, @@ -430,6 +403,7 @@ def _build_tree( X, y, 
sample_weight, + feature_has_missing, min_samples_leaf, min_weight_leaf, max_leaf_nodes, @@ -447,6 +421,8 @@ def _build_tree( Y targets. sample_weight : Array-like Sample weights + feature_has_missing : Array-like + Boolean array indicating whether or not a feature has missing values. min_samples_leaf : float Number of samples required to be a leaf. min_weight_leaf : float diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index f3cad9ec11cc7..075309d93ee21 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -102,4 +102,4 @@ cdef class RegressionCriterion(Criterion): cdef double[::1] sum_total # The sum of w*y. cdef double[::1] sum_left # Same as above, but for the left side of the split cdef double[::1] sum_right # Same as above, but for the right side of the split cdef double[::1] sum_missing # Same as above, but for missing values in X - cdef double[:, ::1] sum_missing # Same as above, but for missing values in X \ No newline at end of file + cdef double[::1] sum_missing # Same as above, but for missing values in X \ No newline at end of file diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 47c5ad36c7685..f1a64e772310a 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -170,6 +170,25 @@ cdef class BaseCriterion: - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Abstract method which will set sample pointers in the criterion. + The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. + This function should also update relevant statistics that the class uses to compute the final criterion. + Parameters + ---------- + start : SIZE_t + The index of the first sample to be used on computation of criteria of the current node. + end : SIZE_t + The last sample used on this node + """ + pass + cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" @@ -214,25 +233,6 @@ cdef inline void _move_sums_classification( weighted_n_1[0] = 0.0 weighted_n_2[0] = criterion.weighted_n_node_samples - cdef void set_sample_pointers( - self, - SIZE_t start, - SIZE_t end - ) noexcept nogil: - """Abstract method which will set sample pointers in the criterion. - The dataset array that we compute criteria on is assumed to consist of 'N' - ordered samples or rows (i.e. sorted). Since we pass this by reference, we - use sample pointers to move the start and end around to consider only a subset of data. - This function should also update relevant statistics that the class uses to compute the final criterion. - Parameters - ---------- - start : SIZE_t - The index of the first sample to be used on computation of criteria of the current node. - end : SIZE_t - The last sample used on this node - """ - pass - cdef class Criterion(BaseCriterion): """Interface for impurity criteria. @@ -421,39 +421,6 @@ cdef class ClassificationCriterion(Criterion): self.weighted_n_missing += w - cdef void init_sum_missing(self): - """Init sum_missing to hold sums for missing values.""" - self.sum_missing = np.zeros((self.n_outputs, self.max_n_classes), dtype=np.float64) - - cdef void init_missing(self, SIZE_t n_missing) noexcept nogil: - """Initialize sum_missing if there are missing values. 
- - This method assumes that caller placed the missing samples in - self.sample_indices[-n_missing:] - """ - cdef SIZE_t i, p, k, c - cdef DOUBLE_t w = 1.0 - - self.n_missing = n_missing - if n_missing == 0: - return - - memset(&self.sum_missing[0, 0], 0, self.max_n_classes * self.n_outputs * sizeof(double)) - - self.weighted_n_missing = 0.0 - - # The missing samples are assumed to be in self.sample_indices[-n_missing:] - for p in range(self.end - n_missing, self.end): - i = self.sample_indices[p] - if self.sample_weight is not None: - w = self.sample_weight[i] - - for k in range(self.n_outputs): - c = self.y[i, k] - self.sum_missing[k, c] += w - - self.weighted_n_missing += w - cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -887,42 +854,6 @@ cdef class RegressionCriterion(Criterion): self.weighted_n_missing += w - cdef void init_sum_missing(self): - """Init sum_missing to hold sums for missing values.""" - self.sum_missing = np.zeros(self.n_outputs, dtype=np.float64) - - cdef void init_missing(self, SIZE_t n_missing) noexcept nogil: - """Initialize sum_missing if there are missing values. - - This method assumes that caller placed the missing samples in - self.sample_indices[-n_missing:] - """ - cdef SIZE_t i, p, k - cdef DOUBLE_t y_ik - cdef DOUBLE_t w_y_ik - cdef DOUBLE_t w = 1.0 - - self.n_missing = n_missing - if n_missing == 0: - return - - memset(&self.sum_missing[0], 0, self.n_outputs * sizeof(double)) - - self.weighted_n_missing = 0.0 - - # The missing samples are assumed to be in self.sample_indices[-n_missing:] - for p in range(self.end - n_missing, self.end): - i = self.sample_indices[p] - if self.sample_weight is not None: - w = self.sample_weight[i] - - for k in range(self.n_outputs): - y_ik = self.y[i, k] - w_y_ik = w * y_ik - self.sum_missing[k] += w_y_ik - - self.weighted_n_missing += w - cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start.""" self.pos = self.start @@ -1218,13 +1149,6 @@ cdef class MAE(RegressionCriterion): with gil: raise ValueError("missing values is not supported for MAE.") - cdef void init_missing(self, SIZE_t n_missing) noexcept nogil: - """Raise error if n_missing != 0.""" - if n_missing == 0: - return - with gil: - raise ValueError("missing values is not supported for MAE.") - cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. Returns -1 in case of failure to allocate memory (and raise MemoryError) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 3419c6fa08819..4e790521a92ac 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -110,6 +110,7 @@ cdef class Splitter(BaseSplitter): cdef bint check_presplit_conditions( self, SplitRecord current_split, + SIZE_t n_missing ) noexcept nogil cdef bint check_postsplit_conditions( self diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 28daee229c89a..951bc2366b751 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -283,27 +283,10 @@ cdef class Splitter(BaseSplitter): return self.criterion.node_impurity() -cdef inline void shift_missing_values_to_left_if_required( - SplitRecord* best, - SIZE_t[::1] samples, - SIZE_t end, -) nogil: - cdef SIZE_t i, p, current_end - # The partitioner partitions the data such that the missing values are in - # samples[-n_missing:] for the criterion to consume. 
If the missing values - # are going to the right node, then the missing values are already in the - # correct position. If the missing values go left, then we move the missing - # values to samples[best.pos:best.pos+n_missing] and update `best.pos`. - if best.n_missing > 0 and best.missing_go_to_left: - for p in range(best.n_missing): - i = best.pos + p - current_end = end - 1 - p - samples[i], samples[current_end] = samples[current_end], samples[i] - best.pos += best.n_missing - cdef bint check_presplit_conditions( self, SplitRecord current_split, + SIZE_t n_missing, ) noexcept nogil: """Check stopping conditions pre-split. @@ -312,11 +295,19 @@ cdef inline void shift_missing_values_to_left_if_required( argument. """ cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef SIZE_t p = current_split.pos + cdef SIZE_t start = self.start + cdef SIZE_t end_non_missing = self.end - n_missing + + if n_missing > 0: + n_left = p - start + n_missing + n_right = end_non_missing - p + else: + n_left = p - start + n_right = end_non_missing - p + n_missing - if (((current_split.pos - self.start) < min_samples_leaf) or - ((self.end - current_split.pos) < min_samples_leaf)): + if n_left < min_samples_leaf or n_right < min_samples_leaf: return 1 - return 0 cdef bint check_postsplit_conditions( @@ -336,6 +327,25 @@ cdef inline void shift_missing_values_to_left_if_required( return 0 + +cdef inline void shift_missing_values_to_left_if_required( + SplitRecord* best, + SIZE_t[::1] samples, + SIZE_t end, +) nogil: + cdef SIZE_t i, p, current_end + # The partitioner partitions the data such that the missing values are in + # samples[-n_missing:] for the criterion to consume. If the missing values + # are going to the right node, then the missing values are already in the + # correct position. If the missing values go left, then we move the missing + # values to samples[best.pos:best.pos+n_missing] and update `best.pos`. + if best.n_missing > 0 and best.missing_go_to_left: + for p in range(best.n_missing): + i = best.pos + p + current_end = end - 1 - p + samples[i], samples[current_end] = samples[current_end], samples[i] + best.pos += best.n_missing + # Introduce a fused-class to make it possible to share the split implementation # between the dense and sparse cases in the node_split_best and node_split_random # functions. 
The alternative would have been to use inheritance-based polymorphism @@ -490,23 +500,16 @@ cdef inline int node_split_best( if p >= end_non_missing: continue - if missing_go_to_left: - n_left = p - start + n_missing - n_right = end_non_missing - p - else: - n_left = p - start - n_right = end_non_missing - p + n_missing - - # Reject if min_samples_leaf is not guaranteed - if splitter.check_presplit_conditions(current_split) == 1: - continue + # Reject if min_samples_leaf is not guaranteed + if splitter.check_presplit_conditions(current_split, n_missing) == 1: + continue current_split.pos = p criterion.update(current_split.pos) - # Reject if min_weight_leaf is not satisfied - if splitter.check_postsplit_conditions() == 1: - continue + # Reject if min_weight_leaf is not satisfied + if splitter.check_postsplit_conditions() == 1: + continue current_proxy_improvement = criterion.proxy_impurity_improvement() @@ -828,7 +831,7 @@ cdef inline int node_split_random( current_split.pos = partitioner.partition_samples(current_split.threshold) # Reject if min_samples_leaf is not guaranteed - if splitter.check_presplit_conditions(current_split) == 1: + if splitter.check_presplit_conditions(current_split, 0) == 1: continue # Evaluate split diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 0d7bbe9a4a464..9af470857cb6a 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -44,8 +44,8 @@ cdef class BaseTree: cdef public SIZE_t capacity # Capacity of tree, in terms of nodes cdef Node* nodes # Array of nodes - cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample cdef double* value # Array of values prediction values for each node + cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample # Generic Methods: These are generic methods used by any tree. cdef int _resize(self, SIZE_t capacity) except -1 nogil @@ -55,8 +55,7 @@ cdef class BaseTree: SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, - double threshold, + SplitRecord* split_node, double impurity, SIZE_t n_node_samples, double weighted_n_node_samples, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index c1558b8db3e58..c756db229b6a9 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1242,47 +1242,51 @@ cdef class Tree(BaseTree): # WARNING: these reference the current `nodes` and `value` buffers, which # must not be freed by a subsequent memory allocation. # (i.e. 
through `_resize` or `__setstate__`) - property n_classes: - def __get__(self): - return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) - - property children_left: - def __get__(self): - return self._get_node_ndarray()['left_child'][:self.node_count] - - property children_right: - def __get__(self): - return self._get_node_ndarray()['right_child'][:self.node_count] - - property n_leaves: - def __get__(self): - return np.sum(np.logical_and( - self.children_left == -1, - self.children_right == -1)) - - property feature: - def __get__(self): - return self._get_node_ndarray()['feature'][:self.node_count] - - property threshold: - def __get__(self): - return self._get_node_ndarray()['threshold'][:self.node_count] - - property impurity: - def __get__(self): - return self._get_node_ndarray()['impurity'][:self.node_count] - - property n_node_samples: - def __get__(self): - return self._get_node_ndarray()['n_node_samples'][:self.node_count] - - property weighted_n_node_samples: - def __get__(self): - return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] - - property value: - def __get__(self): - return self._get_value_ndarray()[:self.node_count] + @property + def n_classes(self): + return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) + + @property + def children_left(self): + return self._get_node_ndarray()['left_child'][:self.node_count] + + @property + def children_right(self): + return self._get_node_ndarray()['right_child'][:self.node_count] + + @property + def n_leaves(self): + return np.sum(np.logical_and( + self.children_left == -1, + self.children_right == -1)) + + @property + def feature(self): + return self._get_node_ndarray()['feature'][:self.node_count] + + @property + def threshold(self): + return self._get_node_ndarray()['threshold'][:self.node_count] + + @property + def impurity(self): + return self._get_node_ndarray()['impurity'][:self.node_count] + + @property + def n_node_samples(self): + return self._get_node_ndarray()['n_node_samples'][:self.node_count] + + @property + def weighted_n_node_samples(self): + return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] + + @property + def missing_go_to_left(self): + return self._get_node_ndarray()['missing_go_to_left'][:self.node_count] + + @property + def value(self): + return self._get_value_ndarray()[:self.node_count] # TODO: Convert n_classes to cython.integral memory view once # https://github.com/cython/cython/issues/5243 is fixed From a659a104ed4d737f447fd0be4d65e1a52b716321 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 7 Jun 2023 09:59:36 -0400 Subject: [PATCH 7/8] Figure out if tree code works properly Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 12 +++++++----- sklearn/tree/meson.build | 32 -------------------------------- 2 files changed, 7 insertions(+), 37 deletions(-) delete mode 100644 sklearn/tree/meson.build diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index c756db229b6a9..9352d25bb8f63 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -760,18 +760,20 @@ cdef class BaseTree: cdef Node* node = NULL cdef SIZE_t i = 0 - # the feature value - cdef DTYPE_t feature_value = 0 - with nogil: for i in range(n_samples): node = self.nodes # While node not a leaf while node.left_child != _TREE_LEAF: - X_i_node_feature = X_ndarray[i, node.feature] + X_i_node_feature = self._compute_feature(X_ndarray, i, node) # ... 
and node.right_child != _TREE_LEAF: - if X_ndarray[i, node.feature] <= node.threshold: + if isnan(X_i_node_feature): + if node.missing_go_to_left: + node = &self.nodes[node.left_child] + else: + node = &self.nodes[node.right_child] + elif X_i_node_feature <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] diff --git a/sklearn/tree/meson.build b/sklearn/tree/meson.build deleted file mode 100644 index 3750dae12eaf1..0000000000000 --- a/sklearn/tree/meson.build +++ /dev/null @@ -1,32 +0,0 @@ -extensions = [ - '_splitter', - '_tree', - '_criterion', - '_utils', -] - -foreach ext: extensions - py3.extension_module(ext, - cython_gen_cpp.process(ext + '.pyx'), - c_args: cython_c_args, - include_directories: [incdir_numpy], - install: true, - subdir: 'sktree/_lib/scikit_learn/', - ) -endforeach - -# TODO: comment in _classes.py when we have a working Cython unsupervised tree with a Python API -python_sources = [ - '__init__.py', - '_classes.py', -] - -py3.install_sources( - python_sources, - subdir: 'sktree/tree' # Folder relative to site-packages to install to -) - -# TODO: comment in if we include tests -subdir('tests') -subdir('unsupervised') -subdir('manifold') \ No newline at end of file From 1f6202560897e175b703eb1730350cfea40de6da Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 09:59:02 -0400 Subject: [PATCH 8/8] Fixing diff Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 9 ++++++--- sklearn/tree/tests/test_tree.py | 30 +++++++++++++----------------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 9352d25bb8f63..82a3c1092871f 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -654,9 +654,12 @@ cdef class BaseTree: node.threshold = _TREE_UNDEFINED return 1 - cdef DTYPE_t _compute_feature(self, const DTYPE_t[:, :] X_ndarray, - SIZE_t sample_index, - Node *node) noexcept nogil: + cdef DTYPE_t _compute_feature( + self, + const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node + ) noexcept nogil: """Compute feature from a given data matrix, X. In axis-aligned trees, this is simply the value in the column of X diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 6be168e4c8e7c..45578843fc71f 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -300,7 +300,7 @@ def test_xor(): clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) - clf = Tree(random_state=0, max_features=X.shape[1]) + clf = Tree(random_state=0, max_features=1) clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) @@ -440,7 +440,7 @@ def test_importances(): X, y = datasets.make_classification( n_samples=5000, n_features=10, - n_informative=4, + n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, @@ -455,7 +455,7 @@ def test_importances(): n_important = np.sum(importances > 0.1) assert importances.shape[0] == 10, "Failed with {0}".format(name) - assert n_important == 4, "Failed with {0}".format(name) + assert n_important == 3, "Failed with {0}".format(name) # Check on iris that importances are the same for all builders clf = DecisionTreeClassifier(random_state=0) @@ -466,9 +466,9 @@ def test_importances(): assert_array_equal(clf.feature_importances_, clf2.feature_importances_) -@pytest.mark.parametrize("clf", [DecisionTreeClassifier()]) def test_importances_raises(clf): # Check if variable importance before fit raises ValueError. 
+ clf = DecisionTreeClassifier() with pytest.raises(ValueError): getattr(clf, "feature_importances_") @@ -653,7 +653,6 @@ def test_min_samples_leaf(): est.fit(X, y) out = est.tree_.apply(X) node_counts = np.bincount(out) - # drop inner nodes leaf_count = node_counts[node_counts != 0] assert np.min(leaf_count) > 4, "Failed with {0}".format(name) @@ -678,7 +677,7 @@ def check_min_weight_fraction_leaf(name, datasets, sparse=False): else: X = DATASETS[datasets]["X"].astype(np.float32) y = DATASETS[datasets]["y"] - rng = np.random.RandomState(42) + weights = rng.rand(X.shape[0]) total_weight = np.sum(weights) @@ -829,7 +828,7 @@ def test_min_impurity_decrease(): ) # Check with a much lower value of 0.0001 est3 = TreeEstimator( - max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=1 + max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=0 ) # Check with a much lower value of 0.1 est4 = TreeEstimator( @@ -919,7 +918,6 @@ def test_pickle(): est2 = pickle.loads(serialized_object) assert type(est2) == est.__class__ - # score should match before/after pickling score2 = est2.score(X, y) assert ( score == score2 @@ -1033,6 +1031,7 @@ def test_memory_layout(): ALL_TREES.items(), [np.float64, np.float32] ): est = TreeEstimator(random_state=0) + # Nothing X = np.asarray(iris.data, dtype=dtype) y = iris.target @@ -1053,11 +1052,6 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) - # Strided - X = np.asarray(iris.data[::3], dtype=dtype) - y = iris.target[::3] - assert_array_equal(est.fit(X, y).predict(X), y) - # csr matrix X = csr_matrix(iris.data, dtype=dtype) y = iris.target @@ -1068,6 +1062,11 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) + # Strided + X = np.asarray(iris.data[::3], dtype=dtype) + y = iris.target[::3] + assert_array_equal(est.fit(X, y).predict(X), y) + def test_sample_weight(): # Check sample weighting. @@ -1261,7 +1260,7 @@ def test_behaviour_constant_feature_after_splits(): y = [0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3] for name, TreeEstimator in ALL_TREES.items(): # do not check extra random trees - if all(_name not in name for _name in ["ExtraTree"]): + if "ExtraTree" not in name: est = TreeEstimator(random_state=0, max_features=1) est.fit(X, y) assert est.tree_.max_depth == 2 @@ -1587,7 +1586,6 @@ def check_min_weight_leaf_split_level(name): sample_weight = [0.2, 0.2, 0.2, 0.2, 0.2] _check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight) - # skip for sparse inputs _check_min_weight_leaf_split_level(TreeEstimator, csc_matrix(X), y, sample_weight) @@ -1646,7 +1644,6 @@ def check_decision_path(name): # Assert that leaves index are correct leaves = est.apply(X) leave_indicator = [node_indicator[i, j] for i, j in enumerate(leaves)] - assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples)) # Ensure only one leave node per sample @@ -1933,7 +1930,6 @@ def assert_is_subtree(tree, subtree): def test_apply_path_readonly_all_trees(name, splitter, X_format): dataset = DATASETS["clf_small"] X_small = dataset["X"].astype(tree._tree.DTYPE, copy=False) - if X_format == "dense": X_readonly = create_memmap_backed_data(X_small) else: