diff --git a/.binder/postBuild b/.binder/postBuild old mode 100644 new mode 100755 index c33605a68456c..00e8d39b93549 --- a/.binder/postBuild +++ b/.binder/postBuild @@ -6,9 +6,9 @@ set -e # inside a git checkout of the scikit-learn/scikit-learn repo. This script is # generating notebooks from the scikit-learn python examples. -if [[ ! -f /.dockerenv ]]; then - echo "This script was written for repo2docker and is supposed to run inside a docker container." - echo "Exiting because this script can delete data if run outside of a docker container." +if [[ -z "${REPO_DIR}" ]]; then + echo "This script was written for repo2docker and the REPO_DIR environment variable is supposed to be set." + echo "Exiting because this script can delete data if run outside of a repo2docker context." exit 1 fi @@ -23,7 +23,7 @@ find . -delete GENERATED_NOTEBOOKS_DIR=.generated-notebooks cp -r $TMP_CONTENT_DIR/examples $GENERATED_NOTEBOOKS_DIR -find $GENERATED_NOTEBOOKS_DIR -name '*.py' -exec sphx_glr_python_to_jupyter.py '{}' + +find $GENERATED_NOTEBOOKS_DIR -name '*.py' -exec sphinx_gallery_py2jupyter '{}' + NON_NOTEBOOKS=$(find $GENERATED_NOTEBOOKS_DIR -type f | grep -v '\.ipynb') rm -f $NON_NOTEBOOKS diff --git a/.binder/requirements.txt b/.binder/requirements.txt index 9ecc5c6fba79c..bd2b70f5f43b0 100644 --- a/.binder/requirements.txt +++ b/.binder/requirements.txt @@ -1,8 +1,10 @@ ---extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn +--find-links https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/scikit-learn --pre matplotlib scikit-image pandas +seaborn +Pillow sphinx-gallery scikit-learn - +polars diff --git a/.binder/runtime.txt b/.binder/runtime.txt new file mode 100644 index 0000000000000..8fdd90711cf30 --- /dev/null +++ b/.binder/runtime.txt @@ -0,0 +1 @@ +python-3.9 diff --git a/.circleci/artifact_path b/.circleci/artifact_path deleted file mode 100644 index 82181e4f2a5d1..0000000000000 --- a/.circleci/artifact_path +++ /dev/null @@ -1 +0,0 @@ -0/doc/_changed.html diff --git a/.circleci/config.yml b/.circleci/config.yml index bc4acd8a35fcb..bd4914056fe10 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,26 +1,38 @@ -version: 2 +version: 2.1 jobs: + lint: + docker: + - image: cimg/python:3.10.16 + steps: + - checkout + - run: + name: dependencies + command: | + source build_tools/shared.sh + # Include pytest compatibility with mypy + pip install pytest $(get_dep ruff min) $(get_dep mypy min) cython-lint + - run: + name: linting + command: ./build_tools/linting.sh + doc-min-dependencies: docker: - - image: circleci/python:3.7.3-stretch + - image: cimg/base:current-22.04 environment: - - OMP_NUM_THREADS: 2 - MKL_NUM_THREADS: 2 + - OPENBLAS_NUM_THREADS: 2 - CONDA_ENV_NAME: testenv - - PYTHON_VERSION: 3.6 - - NUMPY_VERSION: 'min' - - SCIPY_VERSION: 'min' - - MATPLOTLIB_VERSION: 'min' - - CYTHON_VERSION: 'min' - - SCIKIT_IMAGE_VERSION: 'min' - - SPHINX_VERSION: 'min' - - PANDAS_VERSION: 'min' + - LOCK_FILE: build_tools/circle/doc_min_dependencies_linux-64_conda.lock + # Do not fail if the documentation build generates warnings with minimum + # dependencies as long as we can avoid raising warnings with more recent + # versions of the same dependencies. 
+ - SKLEARN_WARNINGS_AS_ERRORS: '0' steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh - restore_cache: - key: v1-datasets-{{ .Branch }} + key: v1-doc-min-deps-datasets-{{ .Branch }} - restore_cache: keys: - doc-min-deps-ccache-{{ .Branch }} @@ -32,7 +44,7 @@ jobs: - ~/.ccache - ~/.cache/pip - save_cache: - key: v1-datasets-{{ .Branch }} + key: v1-doc-min-deps-datasets-{{ .Branch }} paths: - ~/scikit_learn_data - store_artifacts: @@ -44,24 +56,20 @@ jobs: doc: docker: - - image: circleci/python:3.7.3-stretch + - image: cimg/base:current-22.04 environment: - - OMP_NUM_THREADS: 2 - MKL_NUM_THREADS: 2 + - OPENBLAS_NUM_THREADS: 2 - CONDA_ENV_NAME: testenv - - PYTHON_VERSION: 3 - - NUMPY_VERSION: 'latest' - - SCIPY_VERSION: 'latest' - - MATPLOTLIB_VERSION: 'latest' - - CYTHON_VERSION: 'latest' - - SCIKIT_IMAGE_VERSION: 'latest' - - SPHINX_VERSION: 'min' - - PANDAS_VERSION: 'latest' + - LOCK_FILE: build_tools/circle/doc_linux-64_conda.lock + # Make sure that we fail if the documentation build generates warnings with + # recent versions of the dependencies. + - SKLEARN_WARNINGS_AS_ERRORS: '1' steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh - restore_cache: - key: v1-datasets-{{ .Branch }} + key: v1-doc-datasets-{{ .Branch }} - restore_cache: keys: - doc-ccache-{{ .Branch }} @@ -73,7 +81,7 @@ jobs: - ~/.ccache - ~/.cache/pip - save_cache: - key: v1-datasets-{{ .Branch }} + key: v1-doc-datasets-{{ .Branch }} paths: - ~/scikit_learn_data - store_artifacts: @@ -88,43 +96,9 @@ jobs: root: doc/_build/html paths: . - lint: - docker: - - image: circleci/python:3.6 - steps: - - checkout - - run: ./build_tools/circle/checkout_merge_commit.sh - - run: - name: dependencies - command: sudo pip install flake8 - - run: - name: linting - command: ./build_tools/circle/linting.sh - - pypy3: - docker: - - image: condaforge/miniforge3 - environment: - # Avoid the interactive dialog when installing tzdata - - DEBIAN_FRONTEND: noninteractive - steps: - - restore_cache: - keys: - - pypy3-ccache-{{ .Branch }} - - pypy3-ccache - - run: apt-get -yq update && apt-get -yq install git ssh - - checkout - - run: conda init bash && source ~/.bashrc - - run: ./build_tools/circle/build_test_pypy.sh - - save_cache: - key: pypy3-ccache-{{ .Branch }}-{{ .BuildNum }} - paths: - - ~/.ccache - - ~/.cache/pip - deploy: docker: - - image: circleci/python:3.6 + - image: cimg/base:current-22.04 steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh @@ -133,7 +107,7 @@ jobs: - attach_workspace: at: doc/_build/html - run: ls -ltrh doc/_build/html/stable - - deploy: + - run: command: | if [[ "${CIRCLE_BRANCH}" =~ ^main$|^[0-9]+\.[0-9]+\.X$ ]]; then bash build_tools/circle/push_doc.sh doc/_build/html/stable @@ -150,21 +124,6 @@ workflows: - doc-min-dependencies: requires: - lint - - pypy3: - filters: - branches: - only: - - 0.20.X - deploy: requires: - doc - pypy: - triggers: - - schedule: - cron: "0 0 * * *" - filters: - branches: - only: - - main - jobs: - - pypy3 diff --git a/.codecov.yml b/.codecov.yml index d430925ea7508..f4ecd6e7d8fee 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -28,4 +28,6 @@ codecov: ignore: - "sklearn/externals" - "sklearn/_build_utils" -- "**/setup.py" +- "sklearn/__check_build" +- "sklearn/_min_dependencies.py" +- "**/conftest.py" diff --git a/.coveragerc b/.coveragerc index a8601458a0b07..0d5f02b3edafc 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,9 +1,11 @@ [run] -branch = True +# Use statement coverage rather than branch coverage because +# 
COVERAGE_CORE=sysmon can make branch coverage slower rather than faster. See +# https://github.com/nedbat/coveragepy/issues/1812 for more details. +branch = False source = sklearn parallel = True omit = */sklearn/externals/* */sklearn/_build_utils/* */benchmarks/* - **/setup.py diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000000000..77fb878ee8fe7 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,48 @@ +# Since git version 2.23, git-blame has a feature to ignore +# certain commits. +# +# This file contains a list of commits that are not likely what +# you are looking for in `git blame`. You can set this file as +# a default ignore file for blame by running the following +# command. +# +# $ git config blame.ignoreRevsFile .git-blame-ignore-revs + +# PR 18948: Migrate code style to Black +82df48934eba1df9a1ed3be98aaace8eada59e6e + +# PR 20294: Use target_version >= 3.7 in Black +351ace7935a4ea685171cc6d174890f08facd561 + +# PR 20412: Use experimental_string_processing=true in Black +3ae7c7615343bbd36acece57825d8b0d70fd9da4 + +# PR 20502: Runs Black on examples +70a185ae59b4362633d18b0d0083abb1b6f7370c + +# PR 22474: Update to Black 22.1.0 +1fc86b6aacd89da44a3b4e8abf7c3e2ba4336ffe + +# PR 22983: Update to Black 22.3.0 +d4aad64b1eb2e42e76f49db2ccfbe4b4660d092b + +# PR 26110: Update black to 23.3.0 +893d5accaf9d16f447645e704f85a216187564f7 + +# PR 26649: Add isort and ruff rules +42173fdb34b5aded79664e045cada719dfbe39dc + +# PR 28802: Update black to 24.3.0 +c4c546355667b070edd5c892b206aa4a97af9a0b + +# PR 30694: Enforce ruff rules (RUF) +fe7c4176828af5231f526e76683fb9bdb9ea0367 + +# PR 30695: Apply ruff/flake8-implicit-str-concat rules (ISC) +5cdbbf15e3fade7cc2462ef66dc4ea0f37f390e3 + +# PR 31015: black -> ruff format +ff78e258ccf11068e2b3a433c51517ae56234f88 + +# PR 31226: Enforce ruff/pygrep-hooks rules +b98dc797c480b1b9495f918e201d45ee07f29feb diff --git a/.gitattributes b/.gitattributes index 163f2a4fe2030..f45e0f29ccfa2 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,6 @@ -/doc/whats_new.rst merge=union +.* export-ignore +asv_benchmarks export-ignore +azure-pipelines.yml export-ignore +benchmarks export-ignore +build_tools export-ignore +maint_tools export-ignore diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index 102ebd0770535..0000000000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,71 +0,0 @@ ---- -name: Bug report -about: Create a report to help us reproduce and correct the bug -title: '' -labels: 'Bug: triage' -assignees: '' - ---- - - - -#### Describe the bug - - -#### Steps/Code to Reproduce - - -``` -Sample code to reproduce the problem -``` - -#### Expected Results - - -#### Actual Results - - -#### Versions - - - - diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000000000..bc8e5b5ff70d1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,95 @@ +name: Bug Report +description: Create a report to help us reproduce and correct the bug +labels: ['Bug', 'Needs Triage'] + +body: +- type: markdown + attributes: + value: > + #### Before submitting a bug, please make sure the issue hasn't been already + addressed by searching through [the past issues](https://github.com/scikit-learn/scikit-learn/issues). +- type: textarea + attributes: + label: Describe the bug + description: > + A clear and concise description of what the bug is. 
+ validations: + required: true +- type: textarea + attributes: + label: Steps/Code to Reproduce + description: | + Please add a [minimal code example](https://scikit-learn.org/dev/developers/minimal_reproducer.html) that can reproduce the error when running it. Be as succinct as possible, **do not depend on external data files**: instead you can generate synthetic data using `numpy.random`, [sklearn.datasets.make_regression](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html), [sklearn.datasets.make_classification](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html) or a few lines of Python code. Example: + + ```python + from sklearn.feature_extraction.text import CountVectorizer + from sklearn.decomposition import LatentDirichletAllocation + docs = ["Help I have a bug" for i in range(1000)] + vectorizer = CountVectorizer(input=docs, analyzer='word') + lda_features = vectorizer.fit_transform(docs) + lda_model = LatentDirichletAllocation( + n_topics=10, + learning_method='online', + evaluate_every=10, + n_jobs=4, + ) + model = lda_model.fit(lda_features) + ``` + + If the code is too long, feel free to put it in a public gist and link it in the issue: https://gist.github.com. + + In short, **we are going to copy-paste your code** to run it and we expect to get the same result as you. + + We acknowledge that crafting a [minimal reproducible code example](https://scikit-learn.org/dev/developers/minimal_reproducer.html) requires some effort on your side but it really helps the maintainers quickly reproduce the problem and analyze its cause without any ambiguity. Ambiguous bug reports tend to be slower to fix because they will require more effort and back and forth discussion between the maintainers and the reporter to pin-point the precise conditions necessary to reproduce the problem. + placeholder: | + ``` + Sample code to reproduce the problem + ``` + validations: + required: true +- type: textarea + attributes: + label: Expected Results + description: > + Please paste or describe the expected results. + placeholder: > + Example: No error is thrown. + validations: + required: true +- type: textarea + attributes: + label: Actual Results + description: | + Please paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full traceback** of the exception. For instance the code above raises the following exception: + + ```python-traceback + --------------------------------------------------------------------------- + TypeError Traceback (most recent call last) + in + 4 vectorizer = CountVectorizer(input=docs, analyzer='word') + 5 lda_features = vectorizer.fit_transform(docs) + ----> 6 lda_model = LatentDirichletAllocation( + 7 n_topics=10, + 8 learning_method='online', + + TypeError: __init__() got an unexpected keyword argument 'n_topics' + ``` + placeholder: > + Please paste or specifically describe the actual output or traceback. + validations: + required: true +- type: textarea + attributes: + label: Versions + render: shell + description: | + Please run the following and paste the output below. + ```python + import sklearn; sklearn.show_versions() + ``` + validations: + required: true +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! 
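As context for the bug report form above: a reproducer of the kind its Steps/Code section asks for might look like the sketch below, using only synthetic data. The estimator, dataset parameters, and printed score are illustrative placeholders, not part of the template.

```python
# Minimal, self-contained reproducer in the style the form requests:
# synthetic data, a few lines of code, and the observed output printed.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=100, n_features=5, random_state=0)
model = LogisticRegression().fit(X, y)
print(model.score(X, y))  # include the actual output (or traceback) in the report
```

The Versions field then expects the output of `import sklearn; sklearn.show_versions()`, as noted in the form.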
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 7d39c399ca81b..0ebed8c85161b 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,14 +1,17 @@ blank_issues_enabled: false contact_links: - name: Discussions - url: https://github.com/scikit-learn/scikit-learn/discussions + url: https://github.com/scikit-learn/scikit-learn/discussions/new about: Ask questions and discuss with other scikit-learn community members - - name: Stack overflow + - name: Stack Overflow url: https://stackoverflow.com/questions/tagged/scikit-learn - about: Please ask and answer usage questions on stackoverflow - - name: Mailing list + about: Please ask and answer usage questions on Stack Overflow + - name: Mailing list url: https://mail.python.org/mailman/listinfo/scikit-learn about: General discussions and announcements on the mailing list - - name: Gitter - url: https://gitter.im/scikit-learn/scikit-learn - about: Users and developers can sometimes be found on the gitter channel + - name: Discord server + url: https://discord.gg/h9qyrK8Jc8 + about: Developers and users can be found on the Discord server + - name: Blank issue + url: https://github.com/scikit-learn/scikit-learn/issues/new?template=BLANK_ISSUE + about: Please note that GitHub Discussions should be used in most cases instead diff --git a/.github/ISSUE_TEMPLATE/doc_improvement.md b/.github/ISSUE_TEMPLATE/doc_improvement.md deleted file mode 100644 index 4c2906bb18418..0000000000000 --- a/.github/ISSUE_TEMPLATE/doc_improvement.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -name: Documentation improvement -about: Create a report to help us improve the documentation. Alternatively you can just open a pull request with the suggested change. -title: '' -labels: Documentation -assignees: '' - ---- - -#### Describe the issue linked to the documentation - - - -#### Suggest a potential alternative/fix - - diff --git a/.github/ISSUE_TEMPLATE/doc_improvement.yml b/.github/ISSUE_TEMPLATE/doc_improvement.yml new file mode 100644 index 0000000000000..48d0c3de89103 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/doc_improvement.yml @@ -0,0 +1,17 @@ +name: Documentation improvement +description: Create a report to help us improve the documentation. Alternatively you can just open a pull request with the suggested change. +labels: [Documentation, 'Needs Triage'] + +body: +- type: textarea + attributes: + label: Describe the issue linked to the documentation + description: > + Tell us about the confusion introduced in the documentation. + validations: + required: true +- type: textarea + attributes: + label: Suggest a potential alternative/fix + description: > + Tell us how we could improve the documentation in this regard. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md deleted file mode 100644 index b2ff110d69a04..0000000000000 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -name: Feature request -about: Suggest a new algorithm, enhancement to an existing algorithm, etc. 
-title: '' -labels: New Feature -assignees: '' - ---- - - - -#### Describe the workflow you want to enable - -#### Describe your proposed solution - -#### Describe alternatives you've considered, if relevant - -#### Additional context diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000000000..51a2cdd94920d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,25 @@ +name: Feature request +description: Suggest a new algorithm, enhancement to an existing algorithm, etc. +labels: ['New Feature', 'Needs Triage'] + +body: +- type: markdown + attributes: + value: > + #### If you want to propose a new algorithm, please refer first to the [scikit-learn inclusion criterion](https://scikit-learn.org/stable/faq.html#what-are-the-inclusion-criteria-for-new-algorithms). +- type: textarea + attributes: + label: Describe the workflow you want to enable + validations: + required: true +- type: textarea + attributes: + label: Describe your proposed solution + validations: + required: true +- type: textarea + attributes: + label: Describe alternatives you've considered, if relevant +- type: textarea + attributes: + label: Additional context diff --git a/.github/ISSUE_TEMPLATE/other_template.md b/.github/ISSUE_TEMPLATE/other_template.md deleted file mode 100644 index d46ae9e50b18f..0000000000000 --- a/.github/ISSUE_TEMPLATE/other_template.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -name: Other -about: For all other issues to reach the community... -title: '' -labels: '' -assignees: '' - ---- - - diff --git a/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md similarity index 92% rename from PULL_REQUEST_TEMPLATE.md rename to .github/PULL_REQUEST_TEMPLATE.md index 8528d5386b58a..f59f9bc2fbcd7 100644 --- a/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -26,7 +26,7 @@ review, either the pull request needs some benchmarking, tinkering, convincing, etc. or more likely the reviewers are simply busy. In either case, we ask for your understanding during the review process. For more information, see our FAQ on this topic: -http://scikit-learn.org/dev/faq.html#why-is-my-pull-request-not-getting-any-attention. +https://scikit-learn.org/dev/faq.html#why-is-my-pull-request-not-getting-any-attention. Thanks for contributing! --> diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000000..7ac17eb0442ad --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,21 @@ +version: 2 +updates: + # Maintain dependencies for GitHub Actions as recommended in SPEC8: + # https://github.com/scientific-python/specs/pull/325 + # At the time of writing, release critical workflows such as + # pypa/gh-action-pypi-publish should use hash-based versioning for security + # reasons. This strategy may be generalized to all other github actions + # in the future. 
+ - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" + groups: + actions: + patterns: + - "*" + labels: + - "Build / CI" + - "dependencies" + reviewers: + - "scikit-learn/core-devs" diff --git a/.github/labeler-file-extensions.yml b/.github/labeler-file-extensions.yml new file mode 100644 index 0000000000000..63fcfcacfeb17 --- /dev/null +++ b/.github/labeler-file-extensions.yml @@ -0,0 +1,8 @@ +cython: +- sklearn/**/*.pyx +- sklearn/**/*.pxd +- sklearn/**/*.pxi +# Tempita templates +- sklearn/**/*.pyx.tp +- sklearn/**/*.pxd.tp +- sklearn/**/*.pxi.tp diff --git a/.github/scripts/label_title_regex.py b/.github/scripts/label_title_regex.py index 26fc02b3aef38..9a689b8db09b4 100644 --- a/.github/scripts/label_title_regex.py +++ b/.github/scripts/label_title_regex.py @@ -1,25 +1,25 @@ """Labels PRs based on title. Must be run in a github action with the pull_request_target event.""" -from ghapi.all import context_github -from ghapi.all import GhApi -from ghapi.all import user_repo -from ghapi.all import github_token + +import json +import os import re -owner, repo = user_repo() -pull_request = context_github.event.pull_request -title = pull_request.title +from github import Github + +context_dict = json.loads(os.getenv("CONTEXT_GITHUB")) + +repo = context_dict["repository"] +g = Github(context_dict["token"]) +repo = g.get_repo(repo) +pr_number = context_dict["event"]["number"] +issue = repo.get_issue(number=pr_number) +title = issue.title + -regex_to_labels = [ - (r"\bDOC\b", "Documentation"), - (r"\bCI\b", "Build / CI") -] +regex_to_labels = [(r"\bDOC\b", "Documentation"), (r"\bCI\b", "Build / CI")] -labels_to_add = [ - label for regex, label in regex_to_labels - if re.search(regex, title) -] +labels_to_add = [label for regex, label in regex_to_labels if re.search(regex, title)] if labels_to_add: - api = GhApi(owner=owner, repo=repo, token=github_token()) - api.issues.add_labels(pull_request.number, labels=labels_to_add) + issue.add_to_labels(*labels_to_add) diff --git a/.github/workflows/arm-unit-tests.yml b/.github/workflows/arm-unit-tests.yml new file mode 100644 index 0000000000000..e7636d55d7945 --- /dev/null +++ b/.github/workflows/arm-unit-tests.yml @@ -0,0 +1,54 @@ +name: Unit test for ARM +permissions: + contents: read + +on: + push: + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + lint: + name: Lint + runs-on: ubuntu-latest + if: github.repository == 'scikit-learn/scikit-learn' + + steps: + - name: Checkout + uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' + - name: Install linters + run: | + source build_tools/shared.sh + # Include pytest compatibility with mypy + pip install pytest $(get_dep ruff min) $(get_dep mypy min) cython-lint + - name: Run linters + run: ./build_tools/linting.sh + - name: Run Meson OpenMP checks + run: | + pip install ninja meson scipy + python build_tools/check-meson-openmp-dependencies.py + + run-unit-tests: + name: Run unit tests + runs-on: ubuntu-24.04-arm + if: github.repository == 'scikit-learn/scikit-learn' + needs: [lint] + steps: + - name: Checkout + uses: actions/checkout@v4 + - uses: mamba-org/setup-micromamba@v2 + with: + environment-file: build_tools/github/pymin_conda_forge_arm_linux-aarch64_conda.lock + environment-name: ci + cache-environment: true + + - name: Build and run tests + shell: bash -el {0} + run: bash build_tools/github/build_test_arm.sh diff 
--git a/.github/workflows/artifact-redirector.yml b/.github/workflows/artifact-redirector.yml new file mode 100644 index 0000000000000..690cacefda935 --- /dev/null +++ b/.github/workflows/artifact-redirector.yml @@ -0,0 +1,24 @@ +name: CircleCI artifacts redirector +on: [status] + +# Restrict the permissions granted to the use of secrets.GITHUB_TOKEN in this +# github actions workflow: +# https://docs.github.com/en/actions/security-guides/automatic-token-authentication +permissions: + statuses: write + +jobs: + circleci_artifacts_redirector_job: + runs-on: ubuntu-latest + # For testing this action on a fork, remove the "github.repository ==" condition. + if: "github.repository == 'scikit-learn/scikit-learn' && github.event.context == 'ci/circleci: doc'" + name: Run CircleCI artifacts redirector + steps: + - name: GitHub Action step + uses: scientific-python/circleci-artifacts-redirector-action@v1 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + api-token: ${{ secrets.CIRCLECI_TOKEN }} + artifact-path: 0/doc/_changed.html + circleci-jobs: doc + job-title: Check the rendered docs here! diff --git a/.github/workflows/assign.yml b/.github/workflows/assign.yml index 4b067135bbfdb..a69b60ee0f0a0 100644 --- a/.github/workflows/assign.yml +++ b/.github/workflows/assign.yml @@ -4,15 +4,27 @@ on: issue_comment: types: created +# Restrict the permissions granted to the use of secrets.GITHUB_TOKEN in this +# github actions workflow: +# https://docs.github.com/en/actions/security-guides/automatic-token-authentication +permissions: + issues: write + jobs: one: runs-on: ubuntu-latest + # Note that string comparisons are not case sensitive. if: >- - (github.event.comment.body == 'take' || - github.event.comment.body == 'Take') - && !github.event.issue.assignee + startsWith(github.event.comment.body, '/take') + && !github.event.issue.assignee steps: - run: | + # Using REST API directly because assigning through gh has some severe limitations. For more details, see + # https://github.com/scikit-learn/scikit-learn/issues/29395#issuecomment-2206776963 echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" - curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees - curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -X "DELETE" https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/labels/help%20wanted + curl -H "Authorization: token $GH_TOKEN" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' \ + https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees + gh issue edit $ISSUE --remove-label "help wanted" + env: + GH_TOKEN: ${{ github.token }} + ISSUE: ${{ github.event.issue.html_url }} diff --git a/.github/workflows/check-changelog.yml b/.github/workflows/check-changelog.yml index 753f473354131..00e6a81f8cd0b 100644 --- a/.github/workflows/check-changelog.yml +++ b/.github/workflows/check-changelog.yml @@ -1,54 +1,36 @@ name: Check Changelog +permissions: + contents: read + # This check makes sure that the changelog is properly updated # when a PR introduces a change in a test file. # To bypass this check, label the PR with "No Changelog Needed".
on: pull_request: - types: [opened, edited, labeled, unlabeled, synchronize] + types: [opened, synchronize, labeled, unlabeled] jobs: check: + name: A reviewer will let you know if it is required or can be bypassed runs-on: ubuntu-latest - if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 }} steps: - - name: Get PR number and milestone - run: | - echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV - echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }}" >> $GITHUB_ENV - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: '0' - - name: Check the changelog + - name: Check if tests have changed + id: tests_changed run: | set -xe changed_files=$(git diff --name-only origin/main) # Changelog should be updated only if tests have been modified - if [[ ! "$changed_files" =~ tests ]] + if [[ "$changed_files" =~ tests ]] then - exit 0 - fi - all_changelogs=$(cat ./doc/whats_new/v*.rst) - if [[ "$all_changelogs" =~ :pr:\`$PR_NUMBER\` ]] - then - echo "Changelog has been updated." - # If the pull request is milestoned check the correspondent changelog - if exist -f ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst - then - expected_changelog=$(cat ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst) - if [[ "$expected_changelog" =~ :pr:\`$PR_NUMBER\` ]] - then - echo "Changelog and milestone correspond." - else - echo "Changelog and milestone do not correspond." - echo "If you see this error make sure that the tagged milestone for the PR" - echo "and the changelog name properly match." - exit 1 - fi - fi - else - echo "Changelog entry is missing." - echo "If you see this error and there is already a changelog entry then make sure that" - echo "the PR number is correct. If no changelog entry is required for this PR," - echo "label the PR with 'No Changelog Needed' to bypass this check." 
- exit 1 + echo "check_changelog=true" >> $GITHUB_OUTPUT fi + + - name: Check changelog entry + if: steps.tests_changed.outputs.check_changelog == 'true' + uses: scientific-python/action-towncrier-changelog@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + BOT_USERNAME: changelog-bot diff --git a/.github/workflows/check-manifest.yml b/.github/workflows/check-manifest.yml deleted file mode 100644 index b9ce183890e72..0000000000000 --- a/.github/workflows/check-manifest.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: "Check Manifest" - -on: - schedule: - - cron: '0 0 * * *' - -jobs: - check: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - with: - python-version: '3.9' - - name: Install dependencies - # scipy and cython are required to build sdist - run: | - python -m pip install --upgrade pip - pip install check-manifest scipy cython - - run: | - check-manifest -v diff --git a/.github/workflows/check-sdist.yml b/.github/workflows/check-sdist.yml new file mode 100644 index 0000000000000..d97236dae1e40 --- /dev/null +++ b/.github/workflows/check-sdist.yml @@ -0,0 +1,35 @@ +name: "Check sdist" +permissions: + contents: read + +on: + schedule: + - cron: '0 0 * * *' + +jobs: + check-sdist: + # Don't run on forks + if: github.repository == 'scikit-learn/scikit-learn' + + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Install dependencies + # scipy and cython are required to build sdist + run: | + python -m pip install --upgrade pip + pip install check-sdist + - run: | + check-sdist --inject-junk + + update-tracker: + uses: ./.github/workflows/update_tracking_issue.yml + if: ${{ always() }} + needs: [check-sdist] + with: + job_status: ${{ needs.check-sdist.result }} + secrets: + BOT_GITHUB_TOKEN: ${{ secrets.BOT_GITHUB_TOKEN }} diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000000000..58b8fbf5c4ce7 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,73 @@ +name: "CodeQL" + +on: + push: + branches: [ "main", "*.X" ] + pull_request: + branches: [ "main", "*.X" ] + schedule: + - cron: '0 6 * * 1' + +jobs: + analyze: + name: Analyze + # Runner size impacts CodeQL analysis time. To learn more, please see: + # - https://gh.io/recommended-hardware-resources-for-running-codeql + # - https://gh.io/supported-runners-and-hardware-resources + # - https://gh.io/using-larger-runners + # Consider using larger runners for possible analysis time improvements. + runs-on: 'ubuntu-latest' + timeout-minutes: 360 + permissions: + # required for all workflows + security-events: write + + # only required for workflows in private repositories + actions: read + contents: read + + strategy: + fail-fast: false + matrix: + language: [ 'javascript-typescript', 'python', 'actions' ] + # CodeQL supports [ 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' ] + # Use only 'java-kotlin' to analyze code written in Java, Kotlin or both + # Use only 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both + # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Initializes the CodeQL tools for scanning. 
+ - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + + # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + # queries: security-extended,security-and-quality + + + # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v3 + + # ℹ️ Command-line programs to run using the OS shell. + # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + + # If the Autobuild fails above, remove it and uncomment the following three lines. + # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. + + # - run: | + # echo "Run, Build Application using script" + # ./location_of_script_within_repo/buildscript.sh + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{matrix.language}}" diff --git a/.github/workflows/cuda-ci.yml b/.github/workflows/cuda-ci.yml new file mode 100644 index 0000000000000..028ff06903e8a --- /dev/null +++ b/.github/workflows/cuda-ci.yml @@ -0,0 +1,78 @@ +name: CUDA GPU +permissions: + contents: read + +# Only run this workflow when a Pull Request is labeled with the +# 'CUDA CI' label. +on: + pull_request: + types: + - labeled + +jobs: + build_wheel: + if: contains(github.event.pull_request.labels.*.name, 'CUDA CI') + runs-on: "ubuntu-latest" + name: Build wheel for Pull Request + steps: + - uses: actions/checkout@v4 + + - name: Build wheels + uses: pypa/cibuildwheel@faf86a6ed7efa889faf6996aa23820831055001a + env: + CIBW_BUILD: cp313-manylinux_x86_64 + CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014 + CIBW_BUILD_VERBOSITY: 1 + CIBW_ARCHS: x86_64 + + - uses: actions/upload-artifact@v4 + with: + name: cibw-wheels + path: ./wheelhouse/*.whl + + tests: + if: contains(github.event.pull_request.labels.*.name, 'CUDA CI') + needs: [build_wheel] + runs-on: + group: cuda-gpu-runner-group + # Set this high enough so that the tests can comfortably run. We set a + # timeout to make abusing this workflow less attractive.
+ timeout-minutes: 20 + name: Run Array API unit tests + steps: + - uses: actions/download-artifact@v4 + with: + pattern: cibw-wheels + path: ~/dist + + - uses: actions/setup-python@v5 + with: + # XXX: The 3.12.4 release of Python on GitHub Actions is corrupted: + # https://github.com/actions/setup-python/issues/886 + python-version: '3.12.3' + - name: Checkout main repository + uses: actions/checkout@v4 + - name: Cache conda environment + id: cache-conda + uses: actions/cache@v4 + with: + path: ~/conda + key: ${{ runner.os }}-build-${{ hashFiles('build_tools/github/create_gpu_environment.sh') }}-${{ hashFiles('build_tools/github/pylatest_conda_forge_cuda_array-api_linux-64_conda.lock') }} + - name: Install miniforge + if: ${{ steps.cache-conda.outputs.cache-hit != 'true' }} + run: bash build_tools/github/create_gpu_environment.sh + - name: Install scikit-learn + run: | + source "${HOME}/conda/etc/profile.d/conda.sh" + conda activate sklearn + pip install ~/dist/cibw-wheels/$(ls ~/dist/cibw-wheels) + + - name: Run array API tests + run: | + source "${HOME}/conda/etc/profile.d/conda.sh" + conda activate sklearn + python -c "import sklearn; sklearn.show_versions()" + + SCIPY_ARRAY_API=1 pytest --pyargs sklearn -k 'array_api' -v + # Run in /home/runner to not load sklearn from the checkout repo + working-directory: /home/runner diff --git a/.github/workflows/cuda-label-remover.yml b/.github/workflows/cuda-label-remover.yml new file mode 100644 index 0000000000000..bb87f5419b662 --- /dev/null +++ b/.github/workflows/cuda-label-remover.yml @@ -0,0 +1,23 @@ +name: Remove "CUDA CI" Label + +# This workflow removes the "CUDA CI" label that triggers the actual +# CUDA CI. It is separate so that we can use the `pull_request_target` +# trigger which has a API token with write access. +on: + pull_request_target: + types: + - labeled + +# In order to remove the "CUDA CI" label we need to have write permissions for PRs +permissions: + pull-requests: write + +jobs: + label-remover: + if: contains(github.event.pull_request.labels.*.name, 'CUDA CI') + name: Remove "CUDA CI" Label + runs-on: ubuntu-24.04 + steps: + - uses: actions-ecosystem/action-remove-labels@v1 + with: + labels: CUDA CI diff --git a/.github/workflows/emscripten.yml b/.github/workflows/emscripten.yml new file mode 100644 index 0000000000000..6ed68496de8b2 --- /dev/null +++ b/.github/workflows/emscripten.yml @@ -0,0 +1,108 @@ +name: Test Emscripten/Pyodide build + +on: + schedule: + # Nightly build at 3:42 A.M. 
+ - cron: "42 3 */1 * *" + push: + branches: + - main + # Release branches + - "[0-9]+.[0-9]+.X" + pull_request: + branches: + - main + - "[0-9]+.[0-9]+.X" + # Manual run + workflow_dispatch: + +env: + FORCE_COLOR: 3 + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + check_build_trigger: + name: Check build trigger + runs-on: ubuntu-latest + if: github.repository == 'scikit-learn/scikit-learn' + outputs: + build: ${{ steps.check_build_trigger.outputs.build }} + steps: + - name: Checkout scikit-learn + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + persist-credentials: false + + - id: check_build_trigger + name: Check build trigger + shell: bash + run: | + set -e + set -x + + COMMIT_MSG=$(git log --no-merges -1 --oneline) + + # The commit marker "[pyodide]" will trigger the build when required + if [[ "$GITHUB_EVENT_NAME" == schedule || + "$GITHUB_EVENT_NAME" == workflow_dispatch || + "$COMMIT_MSG" =~ \[pyodide\] ]]; then + echo "build=true" >> $GITHUB_OUTPUT + fi + + build_wasm_wheel: + name: Build WASM wheel + runs-on: ubuntu-latest + needs: check_build_trigger + if: needs.check_build_trigger.outputs.build + steps: + - name: Checkout scikit-learn + uses: actions/checkout@v4 + with: + persist-credentials: false + + - uses: pypa/cibuildwheel@faf86a6ed7efa889faf6996aa23820831055001a + env: + CIBW_PLATFORM: pyodide + SKLEARN_SKIP_OPENMP_TEST: "true" + SKLEARN_SKIP_NETWORK_TESTS: 1 + # Temporary work-around to avoid joblib 1.5.0 until there is a joblib + # release with https://github.com/joblib/joblib/pull/1721 + CIBW_TEST_REQUIRES: "pytest pandas joblib!=1.5.0" + # -s pytest argument is needed to avoid an issue in pytest output capturing with Pyodide + CIBW_TEST_COMMAND: "python -m pytest -svra --pyargs sklearn --durations 20 --showlocals" + + - name: Upload wheel artifact + uses: actions/upload-artifact@v4 + with: + name: pyodide_wheel + path: ./wheelhouse/*.whl + if-no-files-found: error + + # Push to https://anaconda.org/scientific-python-nightly-wheels/scikit-learn + # WARNING: this job will overwrite any existing WASM wheels. 
+ upload-wheels: + name: Upload scikit-learn WASM wheels to Anaconda.org + runs-on: ubuntu-latest + permissions: {} + environment: upload_anaconda + needs: [build_wasm_wheel] + if: github.repository == 'scikit-learn/scikit-learn' && github.event_name != 'pull_request' + steps: + - name: Download wheel artifact + uses: actions/download-artifact@v4 + with: + path: wheelhouse/ + merge-multiple: true + + - name: Push to Anaconda PyPI index + uses: scientific-python/upload-nightly-action@b36e8c0c10dbcfd2e05bf95f17ef8c14fd708dbf # 0.6.2 + with: + artifacts_path: wheelhouse/ + anaconda_nightly_upload_token: ${{ secrets.SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN }} diff --git a/.github/workflows/label-blank-issue.yml b/.github/workflows/label-blank-issue.yml new file mode 100644 index 0000000000000..7c00984d1169f --- /dev/null +++ b/.github/workflows/label-blank-issue.yml @@ -0,0 +1,16 @@ +name: Labels Blank issues +permissions: + issues: write + +on: + issues: + types: [opened] + +jobs: + label-blank-issues: + runs-on: ubuntu-latest + steps: + - uses: andymckay/labeler@1.0.4 + with: + add-labels: "Needs Triage" + ignore-if-labeled: true diff --git a/.github/workflows/labeler-module.yml b/.github/workflows/labeler-module.yml index 3a9ed8d364f79..468d3282903f2 100644 --- a/.github/workflows/labeler-module.yml +++ b/.github/workflows/labeler-module.yml @@ -1,14 +1,33 @@ name: "Pull Request Labeler" -on: pull_request_target +on: + pull_request_target: + types: [opened] + +# Restrict the permissions granted to the use of secrets.GITHUB_TOKEN in this +# github actions workflow: +# https://docs.github.com/en/actions/security-guides/automatic-token-authentication +permissions: + contents: read + pull-requests: write jobs: triage: runs-on: ubuntu-latest steps: - - uses: thomasjpfan/labeler@v2.5.0 + - uses: thomasjpfan/labeler@v2.5.1 continue-on-error: true if: github.repository == 'scikit-learn/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" max-labels: "3" configuration-path: ".github/labeler-module.yml" + + triage_file_extensions: + runs-on: ubuntu-latest + steps: + - uses: thomasjpfan/labeler@v2.5.1 + continue-on-error: true + if: github.repository == 'scikit-learn/scikit-learn' + with: + repo-token: "${{ secrets.GITHUB_TOKEN }}" + configuration-path: ".github/labeler-file-extensions.yml" diff --git a/.github/workflows/labeler-title-regex.yml b/.github/workflows/labeler-title-regex.yml index e3c0812029d1b..8b127925cbdae 100644 --- a/.github/workflows/labeler-title-regex.yml +++ b/.github/workflows/labeler-title-regex.yml @@ -3,17 +3,24 @@ on: pull_request_target: types: [opened, edited] +# Restrict the permissions granted to the use of secrets.GITHUB_TOKEN in this +# github actions workflow: +# https://docs.github.com/en/actions/security-guides/automatic-token-authentication +permissions: + contents: read + pull-requests: write + jobs: labeler: - runs-on: ubuntu-20.04 + runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: '3.9' - - name: Install ghapi - run: pip install -Uq ghapi + - name: Install PyGithub + run: pip install -Uq PyGithub - name: Label pull request run: python .github/scripts/label_title_regex.py env: diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000000000..f8075e779c56b --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,103 @@ +# This linter job on GH actions is used to trigger the commenter bot 
+# in bot-lint-comment.yml file. It stores the output of the linter to be used +# by the commenter bot. +name: linter + +on: + - pull_request_target + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref }} + cancel-in-progress: true + +jobs: + lint: + runs-on: ubuntu-latest + + # setting any permission will set everything else to none for GITHUB_TOKEN + permissions: + pull-requests: none + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.11 + + - name: Install dependencies + run: | + curl https://raw.githubusercontent.com/${{ github.repository }}/main/build_tools/shared.sh --retry 5 -o ./build_tools/shared.sh + source build_tools/shared.sh + # Include pytest compatibility with mypy + pip install pytest $(get_dep ruff min) $(get_dep mypy min) cython-lint + # we save the versions of the linters to be used in the error message later. + python -c "from importlib.metadata import version; print(f\"ruff={version('ruff')}\")" >> /tmp/versions.txt + python -c "from importlib.metadata import version; print(f\"mypy={version('mypy')}\")" >> /tmp/versions.txt + python -c "from importlib.metadata import version; print(f\"cython-lint={version('cython-lint')}\")" >> /tmp/versions.txt + + - name: Run linting + id: lint-script + # We download the linting script from main, since this workflow is run + # from main itself. + run: | + curl https://raw.githubusercontent.com/${{ github.repository }}/main/build_tools/linting.sh --retry 5 -o ./build_tools/linting.sh + set +e + ./build_tools/linting.sh &> /tmp/linting_output.txt + cat /tmp/linting_output.txt + + - name: Upload Artifact + if: always() + uses: actions/upload-artifact@v4 + with: + name: lint-log + path: | + /tmp/linting_output.txt + /tmp/versions.txt + retention-days: 1 + + comment: + needs: lint + if: ${{ !cancelled() }} + runs-on: ubuntu-latest + + # We need these permissions to be able to post / update comments + permissions: + pull-requests: write + issues: write + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.11 + + - name: Install dependencies + run: python -m pip install requests + + - name: Download artifact + id: download-artifact + uses: actions/download-artifact@v4 + with: + name: lint-log + + - name: Print log + run: cat linting_output.txt + + - name: Process Comments + id: process-comments + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH_SHA: ${{ github.event.pull_request.head.sha }} + RUN_ID: ${{ github.run_id }} + LOG_FILE: linting_output.txt + VERSIONS_FILE: versions.txt + run: python ./build_tools/get_comment.py diff --git a/.github/workflows/publish_pypi.yml b/.github/workflows/publish_pypi.yml index 677188a3567b3..ad24ea805eb8a 100644 --- a/.github/workflows/publish_pypi.yml +++ b/.github/workflows/publish_pypi.yml @@ -13,9 +13,13 @@ on: jobs: publish: runs-on: ubuntu-latest + environment: publish_pypi + permissions: + # IMPORTANT: this permission is mandatory for trusted publishing + id-token: write steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: '3.8' - name: Install dependencies @@ -35,15 +39,13 @@ jobs: run: | python build_tools/github/check_wheels.py - name: Publish package to TestPyPI - uses: 
pypa/gh-action-pypi-publish@v1.4.1 + uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4 with: - user: __token__ - password: ${{ secrets.TEST_PYPI_TOKEN }} - repository_url: https://test.pypi.org/legacy/ + repository-url: https://test.pypi.org/legacy/ + print-hash: true if: ${{ github.event.inputs.pypi_repo == 'testpypi' }} - name: Publish package to PyPI - uses: pypa/gh-action-pypi-publish@v1.4.1 - with: - user: __token__ - password: ${{ secrets.PYPI_TOKEN }} + uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4 if: ${{ github.event.inputs.pypi_repo == 'pypi' }} + with: + print-hash: true diff --git a/.github/workflows/twitter.yml b/.github/workflows/twitter.yml deleted file mode 100644 index 96b32ec902efa..0000000000000 --- a/.github/workflows/twitter.yml +++ /dev/null @@ -1,26 +0,0 @@ -# Tweet the URL of a commit on @sklearn_commits whenever a push event -# happens on the main branch -name: Twitter Push Notification - - -on: - push: - branches: - - main - - -jobs: - tweet: - name: Twitter Notification - runs-on: ubuntu-latest - steps: - - name: Tweet URL of last commit as @sklearn_commits - if: github.repository == 'scikit-learn/scikit-learn' - uses: docker://thomasjpfan/twitter-action:0.3 - with: - args: "-message \"https://github.com/scikit-learn/scikit-learn/commit/${{ github.sha }}\"" - env: - TWITTER_CONSUMER_KEY: ${{ secrets.TWITTER_CONSUMER_KEY }} - TWITTER_CONSUMER_SECRET: ${{ secrets.TWITTER_CONSUMER_SECRET }} - TWITTER_ACCESS_TOKEN: ${{ secrets.TWITTER_ACCESS_TOKEN }} - TWITTER_ACCESS_SECRET: ${{ secrets.TWITTER_ACCESS_SECRET }} diff --git a/.github/workflows/unassign.yml b/.github/workflows/unassign.yml index 0f4e78478b810..94a50d49839d6 100644 --- a/.github/workflows/unassign.yml +++ b/.github/workflows/unassign.yml @@ -4,6 +4,12 @@ on: issues: types: unassigned +# Restrict the permissions granted to the use of secrets.GITHUB_TOKEN in this +# github actions workflow: +# https://docs.github.com/en/actions/security-guides/automatic-token-authentication +permissions: + issues: write + jobs: one: runs-on: ubuntu-latest @@ -12,4 +18,7 @@ jobs: if: github.event.issue.state == 'open' run: | echo "Marking issue ${{ github.event.issue.number }} as help wanted" - curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"labels": ["help wanted"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/labels + gh issue edit $ISSUE --add-label "help wanted" + env: + GH_TOKEN: ${{ github.token }} + ISSUE: ${{ github.event.issue.html_url }} diff --git a/.github/workflows/update-lock-files.yml b/.github/workflows/update-lock-files.yml new file mode 100644 index 0000000000000..3d67bd9f70701 --- /dev/null +++ b/.github/workflows/update-lock-files.yml @@ -0,0 +1,88 @@ +# Workflow to update lock files +name: Update lock files +permissions: + contents: read + +on: + workflow_dispatch: + schedule: + - cron: '0 5 * * 1' + +jobs: + update_lock_files: + if: github.repository == 'scikit-learn/scikit-learn' + runs-on: ubuntu-latest + + strategy: + # Ensure that each build will continue even if one build in the matrix fails + fail-fast: false + matrix: + include: + - name: main + update_script_args: "--select-tag main-ci" + additional_commit_message: "[doc build]" + - name: scipy-dev + update_script_args: "--select-tag scipy-dev" + additional_commit_message: "[scipy-dev]" + - name: free-threaded + update_script_args: "--select-tag free-threaded" + additional_commit_message: "[free-threaded]" 
+ - name: array-api + update_script_args: "--select-tag cuda" + + steps: + - uses: actions/checkout@v4 + - name: Generate lock files + run: | + source build_tools/shared.sh + source $CONDA/bin/activate + conda update -n base --all + conda install -n base conda conda-libmamba-solver -y + conda config --set solver libmamba + conda install -c conda-forge "$(get_dep conda-lock min)" -y + + python build_tools/update_environments_and_lock_files.py ${{ matrix.update_script_args }} + + - name: Create Pull Request + id: cpr + uses: peter-evans/create-pull-request@v7 + with: + token: ${{ secrets.BOT_GITHUB_TOKEN }} + push-to-fork: scikit-learn-bot/scikit-learn + commit-message: Update CI lock files ${{ matrix.additional_commit_message }} + committer: "Lock file bot " + author: "Lock file bot " + delete-branch: true + branch: auto-update-lock-files-${{ matrix.name }} + title: ":lock: :robot: CI Update lock files for ${{ matrix.name }} CI build(s) :lock: :robot:" + body: | + Update lock files. + + ### Note + If the CI tasks fail, create a new branch based on this PR and add the required fixes to that branch. + + # The CUDA workflow needs to be triggered explicitly as it uses an expensive runner + - name: Trigger additional tests + if: steps.cpr.outputs.pull-request-number != '' && matrix.name == 'array-api' + env: + GH_TOKEN: ${{ secrets.BOT_GITHUB_TOKEN }} + PR_NUMBER: ${{steps.cpr.outputs.pull-request-number}} + run: | + curl -L \ + -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GH_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/scikit-learn/scikit-learn/issues/$PR_NUMBER/labels \ + -d '{"labels":["CUDA CI"]}' + + - name: Check Pull Request + if: steps.cpr.outputs.pull-request-number != '' + run: | + echo "### :rocket: Pull-Request Summary" >> ${GITHUB_STEP_SUMMARY} + echo "" >> ${GITHUB_STEP_SUMMARY} + echo "The following lock files pull-request has been auto-generated:" + echo "- **PR** #${{ steps.cpr.outputs.pull-request-number }}" >> ${GITHUB_STEP_SUMMARY} + echo "- **URL** ${{ steps.cpr.outputs.pull-request-url }}" >> ${GITHUB_STEP_SUMMARY} + echo "- **Operation** [${{ steps.cpr.outputs.pull-request-operation }}]" >> ${GITHUB_STEP_SUMMARY} + echo "- **SHA** ${{ steps.cpr.outputs.pull-request-head-sha }}" >> ${GITHUB_STEP_SUMMARY} diff --git a/.github/workflows/update_tracking_issue.yml b/.github/workflows/update_tracking_issue.yml new file mode 100644 index 0000000000000..54db3f50bc43b --- /dev/null +++ b/.github/workflows/update_tracking_issue.yml @@ -0,0 +1,51 @@ +# For workflows to use this workflow include the following: +# +# update-tracker: +# uses: ./.github/workflows/update_tracking_issue.yml +# if: ${{ always() }} +# needs: [JOB_NAME] +# with: +# job_status: ${{ needs.JOB_NAME.result }} +# secrets: +# BOT_GITHUB_TOKEN: ${{ secrets.BOT_GITHUB_TOKEN }} +# Where JOB_NAME is contains the status of the job you are interested in + +name: "Update tracking issue" +permissions: + contents: read + +on: + workflow_call: + inputs: + job_status: + required: true + type: string + secrets: + BOT_GITHUB_TOKEN: + required: true + +jobs: + update_tracking_issue: + runs-on: ubuntu-latest + if: github.repository == 'scikit-learn/scikit-learn' && github.event_name == 'schedule' + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.9' + - name: Update tracking issue on GitHub + run: | + set -ex + if [[ ${{ inputs.job_status }} == "success" ]]; then + TESTS_PASSED=true + else + 
TESTS_PASSED=false + fi + + pip install defusedxml PyGithub + python maint_tools/update_tracking_issue.py \ + ${{ secrets.BOT_GITHUB_TOKEN }} \ + "$GITHUB_WORKFLOW" \ + "$GITHUB_REPOSITORY" \ + https://github.com/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID \ + --tests-passed $TESTS_PASSED diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index ffddf9ef88db3..33e8897c147f7 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -1,5 +1,7 @@ # Workflow to build and test wheels name: Wheel builder +permissions: + contents: read on: schedule: @@ -17,6 +19,10 @@ on: # Manual run workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + jobs: # Check whether to build the wheels and the source tarball check_build_trigger: @@ -28,7 +34,7 @@ jobs: steps: - name: Checkout scikit-learn - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: ref: ${{ github.event.pull_request.head.sha }} @@ -36,10 +42,15 @@ jobs: name: Check build trigger run: bash build_tools/github/check_build_trigger.sh - # Build the wheels for Linux, Windows and macOS for Python 3.6 and newer + # Build the wheels for Linux, Windows and macOS for Python 3.9 and newer build_wheels: name: Build wheel for cp${{ matrix.python }}-${{ matrix.platform_id }}-${{ matrix.manylinux_image }} runs-on: ${{ matrix.os }} + + # For conda-incubator/setup-miniconda to work + defaults: + run: + shell: bash -el {0} needs: check_build_trigger if: needs.check_build_trigger.outputs.build @@ -47,69 +58,157 @@ jobs: # Ensure that a wheel builder finishes even if another fails fail-fast: false matrix: - os: [windows-latest, ubuntu-latest, macos-latest] - python: [36, 37, 38, 39] - bitness: [32, 64] - manylinux_image: [manylinux1, manylinux2010] include: - # Run 32 and 64 bit version in parallel for Linux and Windows + # Window 64 bit + - os: windows-latest + python: 310 + platform_id: win_amd64 + - os: windows-latest + python: 311 + platform_id: win_amd64 + - os: windows-latest + python: 312 + platform_id: win_amd64 - os: windows-latest - bitness: 64 + python: 313 platform_id: win_amd64 - os: windows-latest - bitness: 32 - platform_id: win32 + python: 313t + platform_id: win_amd64 + free_threaded_support: True + + # Linux 64 bit manylinux2014 + - os: ubuntu-latest + python: 310 + platform_id: manylinux_x86_64 + manylinux_image: manylinux2014 + - os: ubuntu-latest + python: 311 + platform_id: manylinux_x86_64 + manylinux_image: manylinux2014 + - os: ubuntu-latest + python: 312 + platform_id: manylinux_x86_64 + manylinux_image: manylinux2014 - os: ubuntu-latest - bitness: 64 + python: 313 platform_id: manylinux_x86_64 + manylinux_image: manylinux2014 - os: ubuntu-latest - bitness: 32 - platform_id: manylinux_i686 - - os: macos-latest - bitness: 64 + python: 313t + platform_id: manylinux_x86_64 + manylinux_image: manylinux2014 + free_threaded_support: True + + # # Linux 64 bit manylinux2014 + - os: ubuntu-24.04-arm + python: 310 + platform_id: manylinux_aarch64 + manylinux_image: manylinux2014 + - os: ubuntu-24.04-arm + python: 311 + platform_id: manylinux_aarch64 + manylinux_image: manylinux2014 + - os: ubuntu-24.04-arm + python: 312 + platform_id: manylinux_aarch64 + manylinux_image: manylinux2014 + - os: ubuntu-24.04-arm + python: 313 + platform_id: manylinux_aarch64 + manylinux_image: manylinux2014 + + # MacOS x86_64 + - os: macos-13 + python: 310 platform_id: macosx_x86_64 - exclude: - - os: macos-latest - bitness: 32 - # 
Remove manylinux1 from the windows and osx build matrix since - # manylinux_image is not used for these platforms - - os: windows-latest - manylinux_image: manylinux1 - - os: macos-latest - manylinux_image: manylinux1 + - os: macos-13 + python: 311 + platform_id: macosx_x86_64 + - os: macos-13 + python: 312 + platform_id: macosx_x86_64 + - os: macos-13 + python: 313 + platform_id: macosx_x86_64 + - os: macos-13 + python: 313t + platform_id: macosx_x86_64 + free_threaded_support: True + + # MacOS arm64 + - os: macos-14 + python: 310 + platform_id: macosx_arm64 + - os: macos-14 + python: 311 + platform_id: macosx_arm64 + - os: macos-14 + python: 312 + platform_id: macosx_arm64 + - os: macos-14 + python: 313 + platform_id: macosx_arm64 + - os: macos-14 + python: 313t + platform_id: macosx_arm64 + free_threaded_support: True steps: - name: Checkout scikit-learn - uses: actions/checkout@v1 + uses: actions/checkout@v4 - name: Setup Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 + with: + python-version: "3.11" # update once build dependencies are available + + - uses: conda-incubator/setup-miniconda@v3 + if: ${{ startsWith(matrix.platform_id, 'macosx') }} - name: Build and test wheels env: - CONFTEST_PATH: ${{ github.workspace }}/conftest.py - CONFTEST_NAME: conftest.py - CIBW_ENVIRONMENT: OMP_NUM_THREADS=2 - OPENBLAS_NUM_THREADS=2 - SKLEARN_SKIP_NETWORK_TESTS=1 - SKLEARN_BUILD_PARALLEL=3 - MACOSX_DEPLOYMENT_TARGET=10.13 + CIBW_PRERELEASE_PYTHONS: ${{ matrix.prerelease_pythons }} + CIBW_FREE_THREADED_SUPPORT: ${{ matrix.free_threaded_support }} + CIBW_ENVIRONMENT: SKLEARN_SKIP_NETWORK_TESTS=1 CIBW_BUILD: cp${{ matrix.python }}-${{ matrix.platform_id }} + CIBW_ARCHS: all CIBW_MANYLINUX_X86_64_IMAGE: ${{ matrix.manylinux_image }} CIBW_MANYLINUX_I686_IMAGE: ${{ matrix.manylinux_image }} - CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: bash build_tools/github/repair_windows_wheels.sh {wheel} {dest_dir} ${{ matrix.bitness }} - CIBW_BEFORE_TEST_WINDOWS: bash build_tools/github/build_minimal_windows_image.sh ${{ matrix.python }} ${{ matrix.bitness }} - CIBW_TEST_REQUIRES: pytest pandas threadpoolctl - CIBW_TEST_COMMAND: bash {project}/build_tools/github/test_wheels.sh - CIBW_TEST_COMMAND_WINDOWS: bash {project}/build_tools/github/test_windows_wheels.sh ${{ matrix.python }} ${{ matrix.bitness }} + # Needed on Windows CI to compile with Visual Studio compiler + # otherwise Meson detects a MINGW64 platform and use MINGW64 + # toolchain + CIBW_CONFIG_SETTINGS_WINDOWS: "setup-args=--vsenv" + CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: bash build_tools/github/repair_windows_wheels.sh {wheel} {dest_dir} + CIBW_BEFORE_BUILD: bash {project}/build_tools/wheels/cibw_before_build.sh {project} + CIBW_BEFORE_TEST_WINDOWS: bash build_tools/github/build_minimal_windows_image.sh ${{ matrix.python }} + CIBW_ENVIRONMENT_PASS_LINUX: RUNNER_OS + CIBW_TEST_REQUIRES: pytest pandas + # On Windows, we use a custom Docker image and CIBW_TEST_REQUIRES_WINDOWS + # does not make sense because it would install dependencies in the host + # rather than inside the Docker image + CIBW_TEST_REQUIRES_WINDOWS: "" + CIBW_TEST_COMMAND: bash {project}/build_tools/wheels/test_wheels.sh {project} + CIBW_TEST_COMMAND_WINDOWS: bash {project}/build_tools/github/test_windows_wheels.sh ${{ matrix.python }} {project} + CIBW_BUILD_VERBOSITY: 1 - run: bash build_tools/github/build_wheels.sh + run: bash build_tools/wheels/build_wheels.sh - name: Store artifacts - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: + name: 
cibw-wheels-cp${{ matrix.python }}-${{ matrix.platform_id }} path: wheelhouse/*.whl + update-tracker: + uses: ./.github/workflows/update_tracking_issue.yml + if: ${{ always() }} + needs: [build_wheels] + with: + job_status: ${{ needs.build_wheels.result }} + secrets: + BOT_GITHUB_TOKEN: ${{ secrets.BOT_GITHUB_TOKEN }} + # Build the source distribution under Linux build_sdist: name: Source distribution @@ -119,52 +218,55 @@ jobs: steps: - name: Checkout scikit-learn - uses: actions/checkout@v1 + uses: actions/checkout@v4 - name: Setup Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 + with: + python-version: "3.12" - name: Build source distribution run: bash build_tools/github/build_source.sh - env: - SKLEARN_BUILD_PARALLEL: 3 - name: Test source distribution run: bash build_tools/github/test_source.sh env: - OMP_NUM_THREADS: 2 - OPENBLAS_NUM_THREADS: 2 SKLEARN_SKIP_NETWORK_TESTS: 1 - name: Store artifacts - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: + name: cibw-sdist path: dist/*.tar.gz # Upload the wheels and the source distribution upload_anaconda: name: Upload to Anaconda runs-on: ubuntu-latest + environment: upload_anaconda needs: [build_wheels, build_sdist] # The artifacts cannot be uploaded on PRs if: github.event_name != 'pull_request' steps: - name: Checkout scikit-learn - uses: actions/checkout@v1 + uses: actions/checkout@v4 - name: Download artifacts - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v4 with: + pattern: cibw-* path: dist + merge-multiple: true - name: Setup Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 - name: Upload artifacts env: # Secret variables need to be mapped to environment variables explicitly SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN }} SCIKIT_LEARN_STAGING_UPLOAD_TOKEN: ${{ secrets.SCIKIT_LEARN_STAGING_UPLOAD_TOKEN }} + ARTIFACTS_PATH: dist # Force a replacement if the remote file already exists run: bash build_tools/github/upload_anaconda.sh diff --git a/.gitignore b/.gitignore index 3ebd8e2bb1699..7e00b8802bd01 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -*.pyc +*.pyc* *.so *.pyd *~ @@ -13,12 +13,21 @@ sklearn/**/*.html dist/ MANIFEST +doc/sg_execution_times.rst doc/_build/ +doc/api/*.rst doc/auto_examples/ +doc/css/* +!doc/css/.gitkeep doc/modules/generated/ doc/datasets/generated/ +doc/developers/maintainer.rst +doc/index.rst doc/min_dependency_table.rst doc/min_dependency_substitutions.rst +# release notes generated by towncrier +doc/whats_new/notes-towncrier.rst + *.pdf pip-log.txt scikit_learn.egg-info/ @@ -53,11 +62,15 @@ nips2010_pdf/ examples/cluster/joblib reuters/ benchmarks/bench_covertype_data/ +benchmarks/HIGGS.csv.gz +bench_pca_solvers.csv *.prefs .pydevproject .idea .vscode +# used by pyenv +.python-version *.c *.cpp @@ -75,7 +88,11 @@ _configtest.o.d # Used by mypy .mypy_cache/ -# files generated from a template -sklearn/utils/_seq_dataset.pyx -sklearn/utils/_seq_dataset.pxd -sklearn/linear_model/_sag_fast.pyx +# virtualenv from advanced installation guide +sklearn-env/ + +# Default JupyterLite content +jupyterlite_contents + +# file recognised by vscode IDEs containing env variables +.env diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2ebdc1346b183..48871d2a4abed 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,20 +1,33 @@ +exclude: '^(.git/|sklearn/externals/|asv_benchmarks/env/)' repos: - repo: 
https://github.com/pre-commit/pre-commit-hooks - rev: v2.3.0 + rev: v5.0.0 hooks: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace -- repo: https://gitlab.com/pycqa/flake8 - rev: 3.7.8 +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.11.7 hooks: - - id: flake8 - types: [file, python] - # only check for unused imports for now, as long as - # the code is not fully PEP8 compatible - args: [--select=F401] + - id: ruff + args: ["--fix", "--output-format=full"] + - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.782 + rev: v1.15.0 hooks: - id: mypy files: sklearn/ + additional_dependencies: [pytest==6.2.4] +- repo: https://github.com/MarcoGorelli/cython-lint + rev: v0.16.6 + hooks: + # TODO: add the double-quote-cython-strings hook when its usability has improved: + # possibility to pass a directory and use it as a check instead of auto-formatter. + - id: cython-lint +- repo: https://github.com/pre-commit/mirrors-prettier + rev: v2.7.1 + hooks: + - id: prettier + files: ^doc/scss/|^doc/js/scripts/ + exclude: ^doc/js/scripts/vendor/ + types_or: ["scss", "javascript"] diff --git a/.spin/cmds.py b/.spin/cmds.py new file mode 100644 index 0000000000000..954749b8005c2 --- /dev/null +++ b/.spin/cmds.py @@ -0,0 +1,29 @@ +import shutil +import sys + +import click +from spin.cmds import util + + +@click.command() +def clean(): + """đŸĒĨ Clean build folder. + + Very rarely needed since meson-python recompiles as needed when sklearn is + imported. + + One known use case where "spin clean" is useful: avoid compilation errors + when switching from numpy<2 to numpy>=2 in the same conda environment or + virtualenv. + """ + util.run([sys.executable, "-m", "pip", "uninstall", "scikit-learn", "-y"]) + default_meson_build_dir = ( + f"build/cp{sys.version_info.major}{sys.version_info.minor}" + ) + click.secho( + f"removing default Meson build dir: {default_meson_build_dir}", + bold=True, + fg="bright_blue", + ) + + shutil.rmtree(default_meson_build_dir, ignore_errors=True) diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 1e6ed78d28ac2..0000000000000 --- a/.travis.yml +++ /dev/null @@ -1,90 +0,0 @@ -# Make it explicit that we favor the -# new container-based Travis workers -language: python -dist: xenial - -cache: - apt: true - directories: - - $HOME/.cache/pip - - $HOME/.ccache - -env: - global: - - CPU_COUNT=3 - - TEST_DIR=/tmp/sklearn # Test directory for continuous integration jobs - - PYTEST_VERSION=latest - - OMP_NUM_THREADS=2 - - OPENBLAS_NUM_THREADS=2 - - SKLEARN_BUILD_PARALLEL=3 - - SKLEARN_SKIP_NETWORK_TESTS=1 - # Custom environment variables for the ARM wheel builder - - CIBW_BUILD_VERBOSITY=1 - - CIBW_TEST_REQUIRES="pytest pytest-xdist threadpoolctl" - - CIBW_TEST_COMMAND="bash {project}/build_tools/travis/test_wheels.sh" - - CIBW_ENVIRONMENT="CPU_COUNT=2 - OMP_NUM_THREADS=2 - OPENBLAS_NUM_THREADS=2 - SKLEARN_BUILD_PARALLEL=3 - SKLEARN_SKIP_NETWORK_TESTS=1" - -jobs: - include: - # Manual trigger of linux/arm64 tests in PR without triggering the full - # wheel building process for all the Python versions. - - python: 3.9 - os: linux - arch: arm64 - if: commit_message =~ /\[arm64\]/ - env: - - CPU_COUNT=4 - - # Linux environments to build the scikit-learn wheels for the ARM64 - # architecture and Python 3.6 and newer. This is used both at release time - # with the manual trigger in the commit message in the release branch and as - # a scheduled task to build the weekly dev build on the main branch.
The - # weekly frequency is meant to avoid depleting the Travis CI credits too - # fast. - - python: 3.6 - os: linux - arch: arm64 - if: type = cron or commit_message =~ /\[cd build\]/ - env: - - BUILD_WHEEL=true - - CIBW_BUILD=cp36-manylinux_aarch64 - - - python: 3.7 - os: linux - arch: arm64 - if: type = cron or commit_message =~ /\[cd build\]/ - env: - - BUILD_WHEEL=true - - CIBW_BUILD=cp37-manylinux_aarch64 - - - python: 3.8 - os: linux - arch: arm64 - if: type = cron or commit_message =~ /\[cd build\]/ - env: - - BUILD_WHEEL=true - - CIBW_BUILD=cp38-manylinux_aarch64 - - - python: 3.9 - os: linux - arch: arm64 - if: type = cron or commit_message =~ /\[cd build\]/ - env: - - BUILD_WHEEL=true - - CIBW_BUILD=cp39-manylinux_aarch64 - -install: source build_tools/travis/install.sh || travis_terminate 1 -script: source build_tools/travis/script.sh || travis_terminate 1 -after_success: source build_tools/travis/after_success.sh || travis_terminate 1 - -notifications: - webhooks: - urls: - - https://webhooks.gitter.im/e/4ffabb4df010b70cd624 - on_success: change - on_failure: always - on_start: never diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000000..c3e367c124f81 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,48 @@ +cff-version: 1.2.0 +title: scikit-learn +type: software +authors: + - name: "The scikit-learn developers" +message: "If you use scikit-learn in a scientific publication, we would appreciate citations to the following paper:" +preferred-citation: + type: article + title: "Scikit-learn: Machine Learning in Python" + authors: + - family-names: "Pedregosa" + given-names: "Fabian" + - family-names: "Varoquaux" + given-names: "GaÃĢl" + - family-names: "Gramfort" + given-names: "Alexandre" + - family-names: "Michel" + given-names: "Vincent" + - family-names: "Thirion" + given-names: "Bertrand" + - family-names: "Grisel" + given-names: "Olivier" + - family-names: "Blondel" + given-names: "Mathieu" + - family-names: "Prettenhofer" + given-names: "Peter" + - family-names: "Weiss" + given-names: "Ron" + - family-names: "Dubourg" + given-names: "Vincent" + - family-names: "Vanderplas" + given-names: "Jake" + - family-names: "Passos" + given-names: "Alexandre" + - family-names: "Cournapeau" + given-names: "David" + - family-names: "Brucher" + given-names: "Matthieu" + - family-names: "Perrot" + given-names: "Matthieu" + - family-names: "Duchesnay" + given-names: "Édouard" + journal: "Journal of Machine Learning Research" + volume: 12 + start: 2825 + end: 2830 + year: 2011 + url: "https://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html" diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 23016563a5f6e..b4e1709e67c3f 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -13,4 +13,3 @@ all priceless contributions. We abide by the principles of openness, respect, and consideration of others of the Python Software Foundation: https://www.python.org/psf/codeofconduct/ - diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f6f65883c65b2..92a673462e3a6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -18,7 +18,7 @@ Documentation can be found under the But there are many other ways to help.
In particular answering queries on the [issue tracker](https://github.com/scikit-learn/scikit-learn/issues), investigating bugs, and [reviewing other developers' pull -requests](http://scikit-learn.org/dev/developers/contributing.html#code-review-guidelines) +requests](https://scikit-learn.org/dev/developers/contributing.html#code-review-guidelines) are very valuable contributions that decrease the burden on the project maintainers. @@ -30,8 +30,8 @@ link to it from your website, or simply star it in GitHub to say "I use it". Quick links ----------- -* [Submitting a bug report or feature request](http://scikit-learn.org/dev/developers/contributing.html#submitting-a-bug-report-or-a-feature-request) -* [Contributing code](http://scikit-learn.org/dev/developers/contributing.html#contributing-code) +* [Submitting a bug report or feature request](https://scikit-learn.org/dev/developers/contributing.html#submitting-a-bug-report-or-a-feature-request) +* [Contributing code](https://scikit-learn.org/dev/developers/contributing.html#contributing-code) * [Coding guidelines](https://scikit-learn.org/dev/developers/develop.html#coding-guidelines) * [Tips to read current code](https://scikit-learn.org/dev/developers/contributing.html#reading-the-existing-code-base) diff --git a/COPYING b/COPYING index 558c4c1245615..e1cd01d584578 100644 --- a/COPYING +++ b/COPYING @@ -1,6 +1,6 @@ BSD 3-Clause License -Copyright (c) 2007-2020 The scikit-learn developers. +Copyright (c) 2007-2024 The scikit-learn developers. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 5a4b582bd9886..0000000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,32 +0,0 @@ -include *.rst -recursive-include doc * -recursive-include examples * -recursive-include sklearn *.c *.h *.pyx *.pxd *.pxi *.tp -recursive-include sklearn/datasets *.csv *.csv.gz *.rst *.jpg *.txt *.arff.gz *.json.gz -include COPYING -include README.rst -include pyproject.toml -include sklearn/externals/README -include sklearn/svm/src/liblinear/COPYRIGHT -include sklearn/svm/src/libsvm/LIBSVM_CHANGES -include conftest.py -include Makefile -include MANIFEST.in -include .coveragerc - -# exclude from sdist -recursive-exclude asv_benchmarks * -recursive-exclude benchmarks * -recursive-exclude build_tools * -recursive-exclude maint_tools * -recursive-exclude benchmarks * -recursive-exclude .binder * -recursive-exclude .circleci * -exclude .codecov.yml -exclude .mailmap -exclude .pre-commit-config.yaml -exclude azure-pipelines.yml -exclude lgtm.yml -exclude CODE_OF_CONDUCT.md -exclude CONTRIBUTING.md -exclude PULL_REQUEST_TEMPLATE.md diff --git a/Makefile b/Makefile index 112b1e68188a0..eb6ec39edcbdc 100644 --- a/Makefile +++ b/Makefile @@ -1,68 +1,27 @@ # simple makefile to simplify repetitive build env management tasks under posix -# caution: testing won't work on windows, see README - PYTHON ?= python -CYTHON ?= cython -PYTEST ?= pytest -CTAGS ?= ctags - -# skip doctests on 32bit python -BITS := $(shell python -c 'import struct; print(8 * struct.calcsize("P"))') - -all: clean inplace test - -clean-ctags: - rm -f tags - -clean: clean-ctags - $(PYTHON) setup.py clean - rm -rf dist - -in: inplace # just a shortcut -inplace: - $(PYTHON) setup.py build_ext -i - -test-code: in - $(PYTEST) --showlocals -v sklearn --durations=20 -test-sphinxext: - $(PYTEST) --showlocals -v doc/sphinxext/ -test-doc: -ifeq ($(BITS),64) - $(PYTEST) $(shell find doc -name '*.rst' | sort) -endif 
-test-code-parallel: in - $(PYTEST) -n auto --showlocals -v sklearn --durations=20 - -test-coverage: - rm -rf coverage .coverage - $(PYTEST) sklearn --showlocals -v --cov=sklearn --cov-report=html:coverage -test-coverage-parallel: - rm -rf coverage .coverage .coverage.* - $(PYTEST) sklearn -n auto --showlocals -v --cov=sklearn --cov-report=html:coverage - -test: test-code test-sphinxext test-doc - -trailing-spaces: - find sklearn -name "*.py" -exec perl -pi -e 's/[ \t]*$$//' {} \; +DEFAULT_MESON_BUILD_DIR = build/cp$(shell python -c 'import sys; print(f"{sys.version_info.major}{sys.version_info.minor}")' ) -cython: - python setup.py build_src +all: + @echo "Please use 'make <target>' where <target> is one of" + @echo " dev build scikit-learn with Meson" + @echo " clean clean scikit-learn Meson build. Very rarely needed," + @echo " since meson-python recompiles on import." -ctags: - # make tags for symbol based navigation in emacs and vim - # Install with: sudo apt-get install exuberant-ctags - $(CTAGS) --python-kinds=-i -R sklearn +.PHONY: all -doc: inplace - $(MAKE) -C doc html +dev: dev-meson -doc-noplot: inplace - $(MAKE) -C doc html-noplot +dev-meson: + pip install --verbose --no-build-isolation --editable . --config-settings editable-verbose=true -code-analysis: - flake8 sklearn | grep -v __init__ | grep -v external - pylint -E -i y sklearn/ -d E1103,E0611,E1101 +clean: clean-meson -flake8-diff: - git diff upstream/main -u -- "*.py" | flake8 --diff +clean-meson: + pip uninstall -y scikit-learn + # It seems in some cases removing the folder avoids weird compilation + # errors (e.g. when switching from numpy>=2 to numpy<2). For some + # reason ninja clean -C $(DEFAULT_MESON_BUILD_DIR) is not + # enough. + rm -rf $(DEFAULT_MESON_BUILD_DIR) diff --git a/README.rst b/README.rst index 68f9ffee17d03..5885bce67baa7 100644 --- a/README.rst +++ b/README.rst @@ -1,43 +1,47 @@ .. -*- mode: rst -*- -|Azure|_ |Travis|_ |Codecov|_ |CircleCI|_ |Nightly wheels|_ |PythonVersion|_ |PyPi|_ |DOI|_ +|Azure| |Codecov| |CircleCI| |Nightly wheels| |Ruff| |PythonVersion| |PyPi| |DOI| |Benchmark| .. |Azure| image:: https://dev.azure.com/scikit-learn/scikit-learn/_apis/build/status/scikit-learn.scikit-learn?branchName=main -.. _Azure: https://dev.azure.com/scikit-learn/scikit-learn/_build/latest?definitionId=1&branchName=main + :target: https://dev.azure.com/scikit-learn/scikit-learn/_build/latest?definitionId=1&branchName=main -.. |Travis| image:: https://api.travis-ci.com/scikit-learn/scikit-learn.svg?branch=main -.. _Travis: https://travis-ci.com/scikit-learn/scikit-learn +.. |CircleCI| image:: https://circleci.com/gh/scikit-learn/scikit-learn/tree/main.svg?style=shield + :target: https://circleci.com/gh/scikit-learn/scikit-learn -.. |Codecov| image:: https://codecov.io/github/scikit-learn/scikit-learn/badge.svg?branch=main&service=github -.. _Codecov: https://codecov.io/github/scikit-learn/scikit-learn?branch=main +.. |Codecov| image:: https://codecov.io/gh/scikit-learn/scikit-learn/branch/main/graph/badge.svg?token=Pk8G9gg3y9 + :target: https://codecov.io/gh/scikit-learn/scikit-learn -.. |CircleCI| image:: https://circleci.com/gh/scikit-learn/scikit-learn/tree/main.svg?style=shield&circle-token=:circle-token -.. _CircleCI: https://circleci.com/gh/scikit-learn/scikit-learn +.. |Nightly wheels| image:: https://github.com/scikit-learn/scikit-learn/actions/workflows/wheels.yml/badge.svg?event=schedule + :target: https://github.com/scikit-learn/scikit-learn/actions?query=workflow%3A%22Wheel+builder%22+event%3Aschedule -..
|Nightly wheels| image:: https://github.com/scikit-learn/scikit-learn/workflows/Wheel%20builder/badge.svg?event=schedule -.. _`Nightly wheels`: https://github.com/scikit-learn/scikit-learn/actions?query=workflow%3A%22Wheel+builder%22+event%3Aschedule +.. |Ruff| image:: https://img.shields.io/badge/code%20style-ruff-000000.svg + :target: https://github.com/astral-sh/ruff -.. |PythonVersion| image:: https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-blue -.. _PythonVersion: https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-blue +.. |PythonVersion| image:: https://img.shields.io/pypi/pyversions/scikit-learn.svg + :target: https://pypi.org/project/scikit-learn/ -.. |PyPi| image:: https://badge.fury.io/py/scikit-learn.svg -.. _PyPi: https://badge.fury.io/py/scikit-learn +.. |PyPi| image:: https://img.shields.io/pypi/v/scikit-learn + :target: https://pypi.org/project/scikit-learn .. |DOI| image:: https://zenodo.org/badge/21369/scikit-learn/scikit-learn.svg -.. _DOI: https://zenodo.org/badge/latestdoi/21369/scikit-learn/scikit-learn - -.. |PythonMinVersion| replace:: 3.6 -.. |NumPyMinVersion| replace:: 1.13.3 -.. |SciPyMinVersion| replace:: 0.19.1 -.. |JoblibMinVersion| replace:: 0.11 -.. |ThreadpoolctlMinVersion| replace:: 2.0.0 -.. |MatplotlibMinVersion| replace:: 2.1.1 -.. |Scikit-ImageMinVersion| replace:: 0.13 -.. |PandasMinVersion| replace:: 0.25.0 + :target: https://zenodo.org/badge/latestdoi/21369/scikit-learn/scikit-learn + +.. |Benchmark| image:: https://img.shields.io/badge/Benchmarked%20by-asv-blue + :target: https://scikit-learn.org/scikit-learn-benchmarks + +.. |PythonMinVersion| replace:: 3.10 +.. |NumPyMinVersion| replace:: 1.22.0 +.. |SciPyMinVersion| replace:: 1.8.0 +.. |JoblibMinVersion| replace:: 1.2.0 +.. |ThreadpoolctlMinVersion| replace:: 3.1.0 +.. |MatplotlibMinVersion| replace:: 3.5.0 +.. |Scikit-ImageMinVersion| replace:: 0.19.0 +.. |PandasMinVersion| replace:: 1.4.0 .. |SeabornMinVersion| replace:: 0.9.0 -.. |PytestMinVersion| replace:: 5.0.1 +.. |PytestMinVersion| replace:: 7.1.2 +.. |PlotlyMinVersion| replace:: 5.14.0 -.. image:: doc/logos/scikit-learn-logo.png +.. image:: https://raw.githubusercontent.com/scikit-learn/scikit-learn/main/doc/logos/scikit-learn-logo.png :target: https://scikit-learn.org/ **scikit-learn** is a Python module for machine learning built on top of @@ -68,21 +72,18 @@ scikit-learn requires: ======= -**Scikit-learn 0.20 was the last version to support Python 2.7 and Python 3.4.** -scikit-learn 0.23 and later require Python 3.6 or newer. - Scikit-learn plotting capabilities (i.e., functions start with ``plot_`` and -classes end with "Display") require Matplotlib (>= |MatplotlibMinVersion|). +classes end with ``Display``) require Matplotlib (>= |MatplotlibMinVersion|). For running the examples Matplotlib >= |MatplotlibMinVersion| is required. A few examples require scikit-image >= |Scikit-ImageMinVersion|, a few examples require pandas >= |PandasMinVersion|, some examples require seaborn >= -|SeabornMinVersion|. +|SeabornMinVersion| and plotly >= |PlotlyMinVersion|. 
User installation ~~~~~~~~~~~~~~~~~ -If you already have a working installation of numpy and scipy, -the easiest way to install scikit-learn is using ``pip`` :: +If you already have a working installation of NumPy and SciPy, +the easiest way to install scikit-learn is using ``pip``:: pip install -U scikit-learn @@ -137,7 +138,7 @@ directory (you will need to have ``pytest`` >= |PyTestMinVersion| installed):: pytest sklearn -See the web page https://scikit-learn.org/dev/developers/advanced_installation.html#testing +See the web page https://scikit-learn.org/dev/developers/contributing.html#testing-and-improving-test-coverage for more information. Random number generation can be controlled during testing by setting @@ -175,12 +176,36 @@ Documentation Communication ~~~~~~~~~~~~~ -- Mailing list: https://mail.python.org/mailman/listinfo/scikit-learn -- IRC channel: ``#scikit-learn`` at ``webchat.freenode.net`` -- Gitter: https://gitter.im/scikit-learn/scikit-learn -- Twitter: https://twitter.com/scikit_learn -- Stack Overflow: https://stackoverflow.com/questions/tagged/scikit-learn -- Website: https://scikit-learn.org +Main Channels +^^^^^^^^^^^^^ + +- **Website**: https://scikit-learn.org +- **Blog**: https://blog.scikit-learn.org +- **Mailing list**: https://mail.python.org/mailman/listinfo/scikit-learn + +Developer & Support +^^^^^^^^^^^^^^^^^^^^^^ + +- **GitHub Discussions**: https://github.com/scikit-learn/scikit-learn/discussions +- **Stack Overflow**: https://stackoverflow.com/questions/tagged/scikit-learn +- **Discord**: https://discord.gg/h9qyrK8Jc8 + +Social Media Platforms +^^^^^^^^^^^^^^^^^^^^^^ + +- **LinkedIn**: https://www.linkedin.com/company/scikit-learn +- **YouTube**: https://www.youtube.com/channel/UCJosFjYm0ZYVUARxuOZqnnw/playlists +- **Facebook**: https://www.facebook.com/scikitlearnofficial/ +- **Instagram**: https://www.instagram.com/scikitlearnofficial/ +- **TikTok**: https://www.tiktok.com/@scikit.learn +- **Bluesky**: https://bsky.app/profile/scikit-learn.org +- **Mastodon**: https://mastodon.social/@sklearn@fosstodon.org + +Resources +^^^^^^^^^ + +- **Calendar**: https://blog.scikit-learn.org/calendar/ +- **Logos & Branding**: https://github.com/scikit-learn/scikit-learn/tree/main/doc/logos Citation ~~~~~~~~ diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000000000..56c3e982be28a --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,23 @@ +# Security Policy + +## Supported Versions + +| Version | Supported | +| ------------- | ------------------ | +| 1.7.0 | :white_check_mark: | +| < 1.7.0 | :x: | + +## Reporting a Vulnerability + +Please report security vulnerabilities by opening a new [GitHub security +advisory](https://github.com/scikit-learn/scikit-learn/security/advisories/new). + +You can also send an email to `security@scikit-learn.org`, which is an alias to +a subset of the scikit-learn maintainers' team. + +If the security vulnerability is accepted, a patch will be crafted privately +in order to prepare a dedicated bugfix release as timely as possible (depending +on the complexity of the fix). + +In addition to the options above, you can also report security vulnerabilities +to [tidelift](https://tidelift.com/security). 
diff --git a/asv_benchmarks/asv.conf.json b/asv_benchmarks/asv.conf.json index c6ba1de71c2d2..3b16389139c0c 100644 --- a/asv_benchmarks/asv.conf.json +++ b/asv_benchmarks/asv.conf.json @@ -7,31 +7,21 @@ "project": "scikit-learn", // The project's homepage - "project_url": "scikit-learn.org/", + "project_url": "https://scikit-learn.org/", // The URL or local path of the source code repository for the // project being benchmarked "repo": "..", - // The Python project's subdirectory in your repo. If missing or - // the empty string, the project is assumed to be located at the root - // of the repository. - // "repo_subdir": "", - // Customizable commands for building, installing, and // uninstalling the project. See asv.conf.json documentation. - // - // "install_command": ["python -mpip install {wheel_file}"], - // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], - // "build_command": [ - // "python setup.py build", - // "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" - // ], + "install_command": ["python -mpip install {wheel_file}"], + "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], + "build_command": ["python -m build --wheel -o {build_cache_dir} {build_dir}"], - // List of branches to benchmark. If not provided, defaults to "master + // List of branches to benchmark. If not provided, defaults to "main" // (for git) or "default" (for mercurial). "branches": ["main"], - // "branches": ["default"], // for mercurial // The DVCS being used. If not set, it will be automatically // determined from "repo" by looking at the protocol in the URL @@ -50,19 +40,19 @@ // defaults to 10 min //"install_timeout": 600, + // timeout in seconds all benchmarks, can be overridden per benchmark + // defaults to 1 min + //"default_benchmark_timeout": 60, + // the base URL to show a commit for the project. "show_commit_url": "https://github.com/scikit-learn/scikit-learn/commit/", - // The Pythons you'd like to test against. If not provided, defaults + // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. - // "pythons": ["3.6"], + // "pythons": ["3.12"], - // The list of conda channel names to be searched for benchmark - // dependency packages in the specified order - // "conda_channels": ["conda-forge", "defaults"] - - // The matrix of dependencies to test. Each key is the name of a - // package (in PyPI) and the values are version numbers. An empty + // The matrix of dependencies to test. Each key is the name of a + // package (in PyPI) and the values are version numbers. An empty // list or empty string indicates to just test against the default // (latest) version. null indicates that the package is to not be // installed. If the package to be tested is only available from @@ -71,12 +61,17 @@ // pip (with all the conda available packages installed first, // followed by the pip installed packages). // + // The versions of the dependencies should be bumped in a dedicated commit + // to easily identify regressions/improvements due to code changes from + // those due to dependency changes. 
+ // "matrix": { - "numpy": [], - "scipy": [], - "cython": [], - "joblib": [], - "threadpoolctl": [] + "numpy": ["2.0.0"], + "scipy": ["1.14.0"], + "cython": ["3.0.10"], + "joblib": ["1.3.2"], + "threadpoolctl": ["3.2.0"], + "pandas": ["2.2.2"] }, // Combinations of libraries/python versions can be excluded/included @@ -106,10 +101,10 @@ // ], // // "include": [ - // // additional env for python2.7 - // {"python": "2.7", "numpy": "1.8"}, + // // additional env for python3.12 + // {"python": "3.12", "numpy": "1.26"}, // // additional env if run on windows+conda - // {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""}, + // {"sys_platform": "win32", "environment_type": "conda", "python": "3.12", "libpython": ""}, // ], // The directory (relative to the current directory) that benchmarks are @@ -131,10 +126,10 @@ // The number of characters to retain in the commit hashes. // "hash_length": 8, - // `asv` will cache results of the recent builds in each + // `asv` will cache wheels of the recent builds in each // environment, making them faster to install next time. This is - // the number of builds to keep, per environment. - // "build_cache_size": 2, + // number of builds to keep, per environment. + // "build_cache_size": 0 // The commits after which the regression search in `asv publish` // should start looking for regressions. Dictionary whose keys are @@ -147,16 +142,5 @@ // "regressions_first_commits": { // "some_benchmark": "352cdf", // Consider regressions only after this commit // "another_benchmark": null, // Skip regression detection altogether - // }, - - // The thresholds for relative change in results, after which `asv - // publish` starts reporting regressions. Dictionary of the same - // form as in ``regressions_first_commits``, with values - // indicating the thresholds. If multiple entries match, the - // maximum is taken. If no entry matches, the default is 5%. - // - // "regressions_thresholds": { - // "some_benchmark": 0.01, // Threshold of 1% - // "another_benchmark": 0.5, // Threshold of 50% - // }, + // } } diff --git a/asv_benchmarks/benchmarks/cluster.py b/asv_benchmarks/benchmarks/cluster.py index 7e92f8cb6ddd2..457a15dd938e9 100644 --- a/asv_benchmarks/benchmarks/cluster.py +++ b/asv_benchmarks/benchmarks/cluster.py @@ -1,7 +1,7 @@ from sklearn.cluster import KMeans, MiniBatchKMeans from .common import Benchmark, Estimator, Predictor, Transformer -from .datasets import _blobs_dataset, _20newsgroups_highdim_dataset +from .datasets import _20newsgroups_highdim_dataset, _blobs_dataset from .utils import neg_mean_inertia @@ -10,8 +10,8 @@ class KMeansBenchmark(Predictor, Transformer, Estimator, Benchmark): Benchmarks for KMeans. 
""" - param_names = ['representation', 'algorithm', 'init'] - params = (['dense', 'sparse'], ['full', 'elkan'], ['random', 'k-means++']) + param_names = ["representation", "algorithm", "init"] + params = (["dense", "sparse"], ["lloyd", "elkan"], ["random", "k-means++"]) def setup_cache(self): super().setup_cache() @@ -19,7 +19,7 @@ def setup_cache(self): def make_data(self, params): representation, algorithm, init = params - if representation == 'sparse': + if representation == "sparse": data = _20newsgroups_highdim_dataset(n_samples=8000) else: data = _blobs_dataset(n_clusters=20) @@ -29,27 +29,29 @@ def make_data(self, params): def make_estimator(self, params): representation, algorithm, init = params - max_iter = 30 if representation == 'sparse' else 100 + max_iter = 30 if representation == "sparse" else 100 - estimator = KMeans(n_clusters=20, - algorithm=algorithm, - init=init, - n_init=1, - max_iter=max_iter, - tol=-1, - random_state=0) + estimator = KMeans( + n_clusters=20, + algorithm=algorithm, + init=init, + n_init=1, + max_iter=max_iter, + tol=0, + random_state=0, + ) return estimator def make_scorers(self): - self.train_scorer = ( - lambda _, __: neg_mean_inertia(self.X, - self.estimator.predict(self.X), - self.estimator.cluster_centers_)) - self.test_scorer = ( - lambda _, __: neg_mean_inertia(self.X_val, - self.estimator.predict(self.X_val), - self.estimator.cluster_centers_)) + self.train_scorer = lambda _, __: neg_mean_inertia( + self.X, self.estimator.predict(self.X), self.estimator.cluster_centers_ + ) + self.test_scorer = lambda _, __: neg_mean_inertia( + self.X_val, + self.estimator.predict(self.X_val), + self.estimator.cluster_centers_, + ) class MiniBatchKMeansBenchmark(Predictor, Transformer, Estimator, Benchmark): @@ -57,8 +59,8 @@ class MiniBatchKMeansBenchmark(Predictor, Transformer, Estimator, Benchmark): Benchmarks for MiniBatchKMeans. 
""" - param_names = ['representation', 'init'] - params = (['dense', 'sparse'], ['random', 'k-means++']) + param_names = ["representation", "init"] + params = (["dense", "sparse"], ["random", "k-means++"]) def setup_cache(self): super().setup_cache() @@ -66,7 +68,7 @@ def setup_cache(self): def make_data(self, params): representation, init = params - if representation == 'sparse': + if representation == "sparse": data = _20newsgroups_highdim_dataset() else: data = _blobs_dataset(n_clusters=20) @@ -76,25 +78,27 @@ def make_data(self, params): def make_estimator(self, params): representation, init = params - max_iter = 5 if representation == 'sparse' else 2 + max_iter = 5 if representation == "sparse" else 2 - estimator = MiniBatchKMeans(n_clusters=20, - init=init, - n_init=1, - max_iter=max_iter, - batch_size=1000, - max_no_improvement=None, - compute_labels=False, - random_state=0) + estimator = MiniBatchKMeans( + n_clusters=20, + init=init, + n_init=1, + max_iter=max_iter, + batch_size=1000, + max_no_improvement=None, + compute_labels=False, + random_state=0, + ) return estimator def make_scorers(self): - self.train_scorer = ( - lambda _, __: neg_mean_inertia(self.X, - self.estimator.predict(self.X), - self.estimator.cluster_centers_)) - self.test_scorer = ( - lambda _, __: neg_mean_inertia(self.X_val, - self.estimator.predict(self.X_val), - self.estimator.cluster_centers_)) + self.train_scorer = lambda _, __: neg_mean_inertia( + self.X, self.estimator.predict(self.X), self.estimator.cluster_centers_ + ) + self.test_scorer = lambda _, __: neg_mean_inertia( + self.X_val, + self.estimator.predict(self.X_val), + self.estimator.cluster_centers_, + ) diff --git a/asv_benchmarks/benchmarks/common.py b/asv_benchmarks/benchmarks/common.py index 70760dc47a9b7..c12da551010f6 100644 --- a/asv_benchmarks/benchmarks/common.py +++ b/asv_benchmarks/benchmarks/common.py @@ -1,11 +1,11 @@ -import os +import itertools import json -import timeit +import os import pickle -import itertools +import timeit from abc import ABC, abstractmethod -from pathlib import Path from multiprocessing import cpu_count +from pathlib import Path import numpy as np @@ -14,86 +14,102 @@ def get_from_config(): """Get benchmarks configuration from the config.json file""" current_path = Path(__file__).resolve().parent - config_path = current_path / 'config.json' - with open(config_path, 'r') as config_file: - config_file = ''.join(line for line in config_file - if line and '//' not in line) + config_path = current_path / "config.json" + with open(config_path, "r") as config_file: + config_file = "".join(line for line in config_file if line and "//" not in line) config = json.loads(config_file) - profile = os.getenv('SKLBENCH_PROFILE', config['profile']) + profile = os.getenv("SKLBENCH_PROFILE", config["profile"]) - n_jobs_vals_env = os.getenv('SKLBENCH_NJOBS') + n_jobs_vals_env = os.getenv("SKLBENCH_NJOBS") if n_jobs_vals_env: - n_jobs_vals = eval(n_jobs_vals_env) + n_jobs_vals = json.loads(n_jobs_vals_env) else: - n_jobs_vals = config['n_jobs_vals'] + n_jobs_vals = config["n_jobs_vals"] if not n_jobs_vals: n_jobs_vals = list(range(1, 1 + cpu_count())) - cache_path = current_path / 'cache' + cache_path = current_path / "cache" cache_path.mkdir(exist_ok=True) - (cache_path / 'estimators').mkdir(exist_ok=True) - (cache_path / 'tmp').mkdir(exist_ok=True) + (cache_path / "estimators").mkdir(exist_ok=True) + (cache_path / "tmp").mkdir(exist_ok=True) - save_estimators = os.getenv('SKLBENCH_SAVE_ESTIMATORS', - config['save_estimators']) - 
save_dir = os.getenv('ASV_COMMIT', 'new')[:8] + save_estimators = os.getenv("SKLBENCH_SAVE_ESTIMATORS", config["save_estimators"]) + save_dir = os.getenv("ASV_COMMIT", "new")[:8] if save_estimators: - (cache_path / 'estimators' / save_dir).mkdir(exist_ok=True) + (cache_path / "estimators" / save_dir).mkdir(exist_ok=True) - base_commit = os.getenv('SKLBENCH_BASE_COMMIT', config['base_commit']) + base_commit = os.getenv("SKLBENCH_BASE_COMMIT", config["base_commit"]) - bench_predict = os.getenv('SKLBENCH_PREDICT', config['bench_predict']) - bench_transform = os.getenv('SKLBENCH_TRANSFORM', - config['bench_transform']) + bench_predict = os.getenv("SKLBENCH_PREDICT", config["bench_predict"]) + bench_transform = os.getenv("SKLBENCH_TRANSFORM", config["bench_transform"]) - return (profile, n_jobs_vals, save_estimators, save_dir, base_commit, - bench_predict, bench_transform) + return ( + profile, + n_jobs_vals, + save_estimators, + save_dir, + base_commit, + bench_predict, + bench_transform, + ) def get_estimator_path(benchmark, directory, params, save=False): """Get path of pickled fitted estimator""" - path = Path(__file__).resolve().parent / 'cache' - path = (path / 'estimators' / directory) if save else (path / 'tmp') + path = Path(__file__).resolve().parent / "cache" + path = (path / "estimators" / directory) if save else (path / "tmp") - filename = (benchmark.__class__.__name__ - + '_estimator_' + '_'.join(list(map(str, params))) + '.pkl') + filename = ( + benchmark.__class__.__name__ + + "_estimator_" + + "_".join(list(map(str, params))) + + ".pkl" + ) return path / filename def clear_tmp(): """Clean the tmp directory""" - path = Path(__file__).resolve().parent / 'cache' / 'tmp' + path = Path(__file__).resolve().parent / "cache" / "tmp" for child in path.iterdir(): child.unlink() class Benchmark(ABC): """Abstract base class for all the benchmarks""" + timer = timeit.default_timer # wall time processes = 1 timeout = 500 - (profile, n_jobs_vals, save_estimators, save_dir, base_commit, - bench_predict, bench_transform) = get_from_config() - - if profile == 'fast': + ( + profile, + n_jobs_vals, + save_estimators, + save_dir, + base_commit, + bench_predict, + bench_transform, + ) = get_from_config() + + if profile == "fast": warmup_time = 0 repeat = 1 number = 1 min_run_count = 1 - data_size = 'small' - elif profile == 'regular': + data_size = "small" + elif profile == "regular": warmup_time = 1 repeat = (3, 100, 30) - data_size = 'small' - elif profile == 'large_scale': + data_size = "small" + elif profile == "large_scale": warmup_time = 1 repeat = 3 number = 1 - data_size = 'large' + data_size = "large" @property @abstractmethod @@ -103,6 +119,7 @@ def params(self): class Estimator(ABC): """Abstract base class for all benchmarks of estimators""" + @abstractmethod def make_data(self, params): """Return the dataset for a combination of parameters""" @@ -112,8 +129,7 @@ def make_data(self, params): @abstractmethod def make_estimator(self, params): - """Return an instance of the estimator for a combination of parameters - """ + """Return an instance of the estimator for a combination of parameters""" pass def skip(self, params): @@ -137,9 +153,10 @@ def setup_cache(self): estimator.fit(X, y) - est_path = get_estimator_path(self, Benchmark.save_dir, - params, Benchmark.save_estimators) - with est_path.open(mode='wb') as f: + est_path = get_estimator_path( + self, Benchmark.save_dir, params, Benchmark.save_estimators + ) + with est_path.open(mode="wb") as f: pickle.dump(estimator, f) def 
setup(self, *params): @@ -152,9 +169,10 @@ def setup(self, *params): self.X, self.X_val, self.y, self.y_val = self.make_data(params) - est_path = get_estimator_path(self, Benchmark.save_dir, - params, Benchmark.save_estimators) - with est_path.open(mode='rb') as f: + est_path = get_estimator_path( + self, Benchmark.save_dir, params, Benchmark.save_estimators + ) + with est_path.open(mode="rb") as f: self.estimator = pickle.load(f) self.make_scorers() @@ -166,14 +184,14 @@ def peakmem_fit(self, *args): self.estimator.fit(self.X, self.y) def track_train_score(self, *args): - if hasattr(self.estimator, 'predict'): + if hasattr(self.estimator, "predict"): y_pred = self.estimator.predict(self.X) else: y_pred = None return float(self.train_scorer(self.y, y_pred)) def track_test_score(self, *args): - if hasattr(self.estimator, 'predict'): + if hasattr(self.estimator, "predict"): y_val_pred = self.estimator.predict(self.X_val) else: y_val_pred = None @@ -182,7 +200,9 @@ def track_test_score(self, *args): class Predictor(ABC): """Abstract base class for benchmarks of estimators implementing predict""" + if Benchmark.bench_predict: + def time_predict(self, *args): self.estimator.predict(self.X) @@ -190,10 +210,10 @@ def peakmem_predict(self, *args): self.estimator.predict(self.X) if Benchmark.base_commit is not None: + def track_same_prediction(self, *args): - est_path = get_estimator_path(self, Benchmark.base_commit, - args, True) - with est_path.open(mode='rb') as f: + est_path = get_estimator_path(self, Benchmark.base_commit, args, True) + with est_path.open(mode="rb") as f: estimator_base = pickle.load(f) y_val_pred_base = estimator_base.predict(self.X_val) @@ -208,9 +228,10 @@ def params(self): class Transformer(ABC): - """Abstract base class for benchmarks of estimators implementing transform - """ + """Abstract base class for benchmarks of estimators implementing transform""" + if Benchmark.bench_transform: + def time_transform(self, *args): self.estimator.transform(self.X) @@ -218,10 +239,10 @@ def peakmem_transform(self, *args): self.estimator.transform(self.X) if Benchmark.base_commit is not None: + def track_same_transform(self, *args): - est_path = get_estimator_path(self, Benchmark.base_commit, - args, True) - with est_path.open(mode='rb') as f: + est_path = get_estimator_path(self, Benchmark.base_commit, args, True) + with est_path.open(mode="rb") as f: estimator_base = pickle.load(f) X_val_t_base = estimator_base.transform(self.X_val) diff --git a/asv_benchmarks/benchmarks/config.json b/asv_benchmarks/benchmarks/config.json index f50827cdbd7b7..b5a10b930e60b 100644 --- a/asv_benchmarks/benchmarks/config.json +++ b/asv_benchmarks/benchmarks/config.json @@ -9,7 +9,7 @@ // Can be overridden by environment variable SKLBENCH_PROFILE. "profile": "regular", - // List of values of n_jobs to use for estimators which accept this + // List of values of n_jobs to use for estimators which accept this // parameter (-1 means all cores). An empty list means all values from 1 to // the maximum number of available cores. // Can be overridden by environment variable SKLBENCH_NJOBS. 
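# A minimal sketch (not part of the patch above) of the SKLBENCH_NJOBS parsing
# now done in benchmarks/common.py: json.loads replaces the previous eval call,
# so the override is expected to be a JSON list of n_jobs values. The concrete
# value below is hypothetical and only for illustration.
import json
import os

os.environ["SKLBENCH_NJOBS"] = "[1, 2, 4]"  # hypothetical override
n_jobs_vals = json.loads(os.environ["SKLBENCH_NJOBS"])
assert n_jobs_vals == [1, 2, 4]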
diff --git a/asv_benchmarks/benchmarks/datasets.py b/asv_benchmarks/benchmarks/datasets.py index b00d5888fd2b2..bbf5029062448 100644 --- a/asv_benchmarks/benchmarks/datasets.py +++ b/asv_benchmarks/benchmarks/datasets.py @@ -1,25 +1,32 @@ +from pathlib import Path + import numpy as np import scipy.sparse as sp from joblib import Memory -from pathlib import Path +from sklearn.datasets import ( + fetch_20newsgroups, + fetch_olivetti_faces, + fetch_openml, + load_digits, + make_blobs, + make_classification, + make_regression, +) from sklearn.decomposition import TruncatedSVD -from sklearn.datasets import (make_blobs, fetch_20newsgroups, - fetch_openml, load_digits, make_regression, - make_classification, fetch_olivetti_faces) -from sklearn.preprocessing import MaxAbsScaler, StandardScaler from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.model_selection import train_test_split +from sklearn.preprocessing import MaxAbsScaler, StandardScaler # memory location for caching datasets -M = Memory(location=str(Path(__file__).resolve().parent / 'cache')) +M = Memory(location=str(Path(__file__).resolve().parent / "cache")) @M.cache -def _blobs_dataset(n_samples=500000, n_features=3, n_clusters=100, - dtype=np.float32): - X, _ = make_blobs(n_samples=n_samples, n_features=n_features, - centers=n_clusters, random_state=0) +def _blobs_dataset(n_samples=500000, n_features=3, n_clusters=100, dtype=np.float32): + X, _ = make_blobs( + n_samples=n_samples, n_features=n_features, centers=n_clusters, random_state=0 + ) X = X.astype(dtype, copy=False) X, X_val = train_test_split(X, test_size=0.1, random_state=0) @@ -27,8 +34,7 @@ def _blobs_dataset(n_samples=500000, n_features=3, n_clusters=100, @M.cache -def _20newsgroups_highdim_dataset(n_samples=None, ngrams=(1, 1), - dtype=np.float32): +def _20newsgroups_highdim_dataset(n_samples=None, ngrams=(1, 1), dtype=np.float32): newsgroups = fetch_20newsgroups(random_state=0) vectorizer = TfidfVectorizer(ngram_range=ngrams, dtype=dtype) X = vectorizer.fit_transform(newsgroups.data[:n_samples]) @@ -39,8 +45,7 @@ def _20newsgroups_highdim_dataset(n_samples=None, ngrams=(1, 1), @M.cache -def _20newsgroups_lowdim_dataset(n_components=100, ngrams=(1, 1), - dtype=np.float32): +def _20newsgroups_lowdim_dataset(n_components=100, ngrams=(1, 1), dtype=np.float32): newsgroups = fetch_20newsgroups() vectorizer = TfidfVectorizer(ngram_range=ngrams) X = vectorizer.fit_transform(newsgroups.data) @@ -55,8 +60,7 @@ def _20newsgroups_lowdim_dataset(n_components=100, ngrams=(1, 1), @M.cache def _mnist_dataset(dtype=np.float32): - X, y = fetch_openml('mnist_784', version=1, return_X_y=True, - as_frame=False) + X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False) X = X.astype(dtype, copy=False) X = MaxAbsScaler().fit_transform(X) @@ -77,11 +81,14 @@ def _digits_dataset(n_samples=None, dtype=np.float32): @M.cache -def _synth_regression_dataset(n_samples=100000, n_features=100, - dtype=np.float32): - X, y = make_regression(n_samples=n_samples, n_features=n_features, - n_informative=n_features // 10, noise=50, - random_state=0) +def _synth_regression_dataset(n_samples=100000, n_features=100, dtype=np.float32): + X, y = make_regression( + n_samples=n_samples, + n_features=n_features, + n_informative=n_features // 10, + noise=50, + random_state=0, + ) X = X.astype(dtype, copy=False) X = StandardScaler().fit_transform(X) @@ -90,10 +97,12 @@ def _synth_regression_dataset(n_samples=100000, n_features=100, @M.cache -def 
_synth_regression_sparse_dataset(n_samples=10000, n_features=10000, - density=0.01, dtype=np.float32): - X = sp.random(m=n_samples, n=n_features, density=density, format='csr', - random_state=0) +def _synth_regression_sparse_dataset( + n_samples=10000, n_features=10000, density=0.01, dtype=np.float32 +): + X = sp.random( + m=n_samples, n=n_features, density=density, format="csr", random_state=0 + ) X.data = np.random.RandomState(0).randn(X.getnnz()) X = X.astype(dtype, copy=False) coefs = sp.random(m=n_features, n=1, density=0.5, random_state=0) @@ -106,11 +115,17 @@ def _synth_regression_sparse_dataset(n_samples=10000, n_features=10000, @M.cache -def _synth_classification_dataset(n_samples=1000, n_features=10000, - n_classes=2, dtype=np.float32): - X, y = make_classification(n_samples=n_samples, n_features=n_features, - n_classes=n_classes, random_state=0, - n_informative=n_features, n_redundant=0) +def _synth_classification_dataset( + n_samples=1000, n_features=10000, n_classes=2, dtype=np.float32 +): + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_classes=n_classes, + random_state=0, + n_informative=n_features, + n_redundant=0, + ) X = X.astype(dtype, copy=False) X = StandardScaler().fit_transform(X) @@ -133,14 +148,21 @@ def _olivetti_faces_dataset(): @M.cache -def _random_dataset(n_samples=1000, n_features=1000, - representation='dense', dtype=np.float32): - if representation == 'dense': +def _random_dataset( + n_samples=1000, n_features=1000, representation="dense", dtype=np.float32 +): + if representation == "dense": X = np.random.RandomState(0).random_sample((n_samples, n_features)) X = X.astype(dtype, copy=False) else: - X = sp.random(n_samples, n_features, density=0.05, format='csr', - dtype=dtype, random_state=0) + X = sp.random( + n_samples, + n_features, + density=0.05, + format="csr", + dtype=dtype, + random_state=0, + ) X, X_val = train_test_split(X, test_size=0.1, random_state=0) return X, X_val, None, None diff --git a/asv_benchmarks/benchmarks/decomposition.py b/asv_benchmarks/benchmarks/decomposition.py index ea23b6d0d4c82..0a7bb7ad07f3e 100644 --- a/asv_benchmarks/benchmarks/decomposition.py +++ b/asv_benchmarks/benchmarks/decomposition.py @@ -1,9 +1,8 @@ -from sklearn.decomposition import (PCA, DictionaryLearning, - MiniBatchDictionaryLearning) +from sklearn.decomposition import PCA, DictionaryLearning, MiniBatchDictionaryLearning from .common import Benchmark, Estimator, Transformer -from .datasets import _olivetti_faces_dataset, _mnist_dataset -from .utils import make_pca_scorers, make_dict_learning_scorers +from .datasets import _mnist_dataset, _olivetti_faces_dataset +from .utils import make_dict_learning_scorers, make_pca_scorers class PCABenchmark(Transformer, Estimator, Benchmark): @@ -11,8 +10,8 @@ class PCABenchmark(Transformer, Estimator, Benchmark): Benchmarks for PCA. """ - param_names = ['svd_solver'] - params = (['full', 'arpack', 'randomized'],) + param_names = ["svd_solver"] + params = (["full", "arpack", "randomized"],) def setup_cache(self): super().setup_cache() @@ -21,11 +20,9 @@ def make_data(self, params): return _mnist_dataset() def make_estimator(self, params): - svd_solver, = params + (svd_solver,) = params - estimator = PCA(n_components=32, - svd_solver=svd_solver, - random_state=0) + estimator = PCA(n_components=32, svd_solver=svd_solver, random_state=0) return estimator @@ -38,8 +35,8 @@ class DictionaryLearningBenchmark(Transformer, Estimator, Benchmark): Benchmarks for DictionaryLearning. 
""" - param_names = ['fit_algorithm', 'n_jobs'] - params = (['lars', 'cd'], Benchmark.n_jobs_vals) + param_names = ["fit_algorithm", "n_jobs"] + params = (["lars", "cd"], Benchmark.n_jobs_vals) def setup_cache(self): super().setup_cache() @@ -50,13 +47,16 @@ def make_data(self, params): def make_estimator(self, params): fit_algorithm, n_jobs = params - estimator = DictionaryLearning(n_components=15, - fit_algorithm=fit_algorithm, - alpha=0.1, - max_iter=20, - tol=1e-16, - random_state=0, - n_jobs=n_jobs) + estimator = DictionaryLearning( + n_components=15, + fit_algorithm=fit_algorithm, + alpha=0.1, + transform_alpha=1, + max_iter=20, + tol=1e-16, + random_state=0, + n_jobs=n_jobs, + ) return estimator @@ -69,8 +69,8 @@ class MiniBatchDictionaryLearningBenchmark(Transformer, Estimator, Benchmark): Benchmarks for MiniBatchDictionaryLearning """ - param_names = ['fit_algorithm', 'n_jobs'] - params = (['lars', 'cd'], Benchmark.n_jobs_vals) + param_names = ["fit_algorithm", "n_jobs"] + params = (["lars", "cd"], Benchmark.n_jobs_vals) def setup_cache(self): super().setup_cache() @@ -81,12 +81,14 @@ def make_data(self, params): def make_estimator(self, params): fit_algorithm, n_jobs = params - estimator = MiniBatchDictionaryLearning(n_components=15, - fit_algorithm=fit_algorithm, - alpha=0.1, - batch_size=3, - random_state=0, - n_jobs=n_jobs) + estimator = MiniBatchDictionaryLearning( + n_components=15, + fit_algorithm=fit_algorithm, + alpha=0.1, + batch_size=3, + random_state=0, + n_jobs=n_jobs, + ) return estimator diff --git a/asv_benchmarks/benchmarks/ensemble.py b/asv_benchmarks/benchmarks/ensemble.py index c46ac07c84475..c336d1e5f8805 100644 --- a/asv_benchmarks/benchmarks/ensemble.py +++ b/asv_benchmarks/benchmarks/ensemble.py @@ -1,12 +1,15 @@ -from sklearn.experimental import enable_hist_gradient_boosting # noqa -from sklearn.ensemble import (RandomForestClassifier, - GradientBoostingClassifier, - HistGradientBoostingClassifier) +from sklearn.ensemble import ( + GradientBoostingClassifier, + HistGradientBoostingClassifier, + RandomForestClassifier, +) from .common import Benchmark, Estimator, Predictor -from .datasets import (_20newsgroups_highdim_dataset, - _20newsgroups_lowdim_dataset, - _synth_classification_dataset) +from .datasets import ( + _20newsgroups_highdim_dataset, + _20newsgroups_lowdim_dataset, + _synth_classification_dataset, +) from .utils import make_gen_classif_scorers @@ -15,8 +18,8 @@ class RandomForestClassifierBenchmark(Predictor, Estimator, Benchmark): Benchmarks for RandomForestClassifier. 
""" - param_names = ['representation', 'n_jobs'] - params = (['dense', 'sparse'], Benchmark.n_jobs_vals) + param_names = ["representation", "n_jobs"] + params = (["dense", "sparse"], Benchmark.n_jobs_vals) def setup_cache(self): super().setup_cache() @@ -24,7 +27,7 @@ def setup_cache(self): def make_data(self, params): representation, n_jobs = params - if representation == 'sparse': + if representation == "sparse": data = _20newsgroups_highdim_dataset() else: data = _20newsgroups_lowdim_dataset() @@ -34,13 +37,15 @@ def make_data(self, params): def make_estimator(self, params): representation, n_jobs = params - n_estimators = 500 if Benchmark.data_size == 'large' else 100 + n_estimators = 500 if Benchmark.data_size == "large" else 100 - estimator = RandomForestClassifier(n_estimators=n_estimators, - min_samples_split=10, - max_features='log2', - n_jobs=n_jobs, - random_state=0) + estimator = RandomForestClassifier( + n_estimators=n_estimators, + min_samples_split=10, + max_features="log2", + n_jobs=n_jobs, + random_state=0, + ) return estimator @@ -53,16 +58,16 @@ class GradientBoostingClassifierBenchmark(Predictor, Estimator, Benchmark): Benchmarks for GradientBoostingClassifier. """ - param_names = ['representation'] - params = (['dense', 'sparse'],) + param_names = ["representation"] + params = (["dense", "sparse"],) def setup_cache(self): super().setup_cache() def make_data(self, params): - representation, = params + (representation,) = params - if representation == 'sparse': + if representation == "sparse": data = _20newsgroups_highdim_dataset() else: data = _20newsgroups_lowdim_dataset() @@ -70,14 +75,16 @@ def make_data(self, params): return data def make_estimator(self, params): - representation, = params + (representation,) = params - n_estimators = 100 if Benchmark.data_size == 'large' else 10 + n_estimators = 100 if Benchmark.data_size == "large" else 10 - estimator = GradientBoostingClassifier(n_estimators=n_estimators, - max_features='log2', - subsample=0.5, - random_state=0) + estimator = GradientBoostingClassifier( + n_estimators=n_estimators, + max_features="log2", + subsample=0.5, + random_state=0, + ) return estimator @@ -97,17 +104,16 @@ def setup_cache(self): super().setup_cache() def make_data(self, params): - data = _synth_classification_dataset(n_samples=10000, - n_features=100, - n_classes=5) + data = _synth_classification_dataset( + n_samples=10000, n_features=100, n_classes=5 + ) return data def make_estimator(self, params): - estimator = HistGradientBoostingClassifier(max_iter=100, - max_leaf_nodes=15, - early_stopping=False, - random_state=0) + estimator = HistGradientBoostingClassifier( + max_iter=100, max_leaf_nodes=15, early_stopping=False, random_state=0 + ) return estimator diff --git a/asv_benchmarks/benchmarks/linear_model.py b/asv_benchmarks/benchmarks/linear_model.py index e8f41a97a80cd..24153895611df 100644 --- a/asv_benchmarks/benchmarks/linear_model.py +++ b/asv_benchmarks/benchmarks/linear_model.py @@ -1,11 +1,19 @@ -from sklearn.linear_model import (LogisticRegression, Ridge, ElasticNet, Lasso, - LinearRegression, SGDRegressor) +from sklearn.linear_model import ( + ElasticNet, + Lasso, + LinearRegression, + LogisticRegression, + Ridge, + SGDRegressor, +) from .common import Benchmark, Estimator, Predictor -from .datasets import (_20newsgroups_highdim_dataset, - _20newsgroups_lowdim_dataset, - _synth_regression_dataset, - _synth_regression_sparse_dataset) +from .datasets import ( + _20newsgroups_highdim_dataset, + _20newsgroups_lowdim_dataset, + 
_synth_regression_dataset, + _synth_regression_sparse_dataset, +) from .utils import make_gen_classif_scorers, make_gen_reg_scorers @@ -14,8 +22,8 @@ class LogisticRegressionBenchmark(Predictor, Estimator, Benchmark): Benchmarks for LogisticRegression. """ - param_names = ['representation', 'solver', 'n_jobs'] - params = (['dense', 'sparse'], ['lbfgs', 'saga'], Benchmark.n_jobs_vals) + param_names = ["representation", "solver", "n_jobs"] + params = (["dense", "sparse"], ["lbfgs", "saga"], Benchmark.n_jobs_vals) def setup_cache(self): super().setup_cache() @@ -23,13 +31,13 @@ def setup_cache(self): def make_data(self, params): representation, solver, n_jobs = params - if Benchmark.data_size == 'large': - if representation == 'sparse': + if Benchmark.data_size == "large": + if representation == "sparse": data = _20newsgroups_highdim_dataset(n_samples=10000) else: data = _20newsgroups_lowdim_dataset(n_components=1e3) else: - if representation == 'sparse': + if representation == "sparse": data = _20newsgroups_highdim_dataset(n_samples=2500) else: data = _20newsgroups_lowdim_dataset() @@ -39,14 +47,15 @@ def make_data(self, params): def make_estimator(self, params): representation, solver, n_jobs = params - penalty = 'l2' if solver == 'lbfgs' else 'l1' + penalty = "l2" if solver == "lbfgs" else "l1" - estimator = LogisticRegression(solver=solver, - penalty=penalty, - multi_class='multinomial', - tol=0.01, - n_jobs=n_jobs, - random_state=0) + estimator = LogisticRegression( + solver=solver, + penalty=penalty, + tol=0.01, + n_jobs=n_jobs, + random_state=0, + ) return estimator @@ -59,9 +68,11 @@ class RidgeBenchmark(Predictor, Estimator, Benchmark): Benchmarks for Ridge. """ - param_names = ['representation', 'solver'] - params = (['dense', 'sparse'], - ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']) + param_names = ["representation", "solver"] + params = ( + ["dense", "sparse"], + ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"], + ) def setup_cache(self): super().setup_cache() @@ -69,21 +80,19 @@ def setup_cache(self): def make_data(self, params): representation, solver = params - if representation == 'dense': + if representation == "dense": data = _synth_regression_dataset(n_samples=500000, n_features=100) else: - data = _synth_regression_sparse_dataset(n_samples=100000, - n_features=10000, - density=0.005) + data = _synth_regression_sparse_dataset( + n_samples=100000, n_features=10000, density=0.005 + ) return data def make_estimator(self, params): representation, solver = params - estimator = Ridge(solver=solver, - fit_intercept=False, - random_state=0) + estimator = Ridge(solver=solver, fit_intercept=False, random_state=0) return estimator @@ -93,31 +102,31 @@ def make_scorers(self): def skip(self, params): representation, solver = params - if representation == 'sparse' and solver == 'svd': + if representation == "sparse" and solver == "svd": return True return False class LinearRegressionBenchmark(Predictor, Estimator, Benchmark): """ - Benchmarks for Linear Reagression. + Benchmarks for Linear Regression. 
""" - param_names = ['representation'] - params = (['dense', 'sparse'],) + param_names = ["representation"] + params = (["dense", "sparse"],) def setup_cache(self): super().setup_cache() def make_data(self, params): - representation, = params + (representation,) = params - if representation == 'dense': + if representation == "dense": data = _synth_regression_dataset(n_samples=1000000, n_features=100) else: - data = _synth_regression_sparse_dataset(n_samples=10000, - n_features=100000, - density=0.01) + data = _synth_regression_sparse_dataset( + n_samples=10000, n_features=100000, density=0.01 + ) return data @@ -135,28 +144,30 @@ class SGDRegressorBenchmark(Predictor, Estimator, Benchmark): Benchmark for SGD """ - param_names = ['representation'] - params = (['dense', 'sparse'],) + param_names = ["representation"] + params = (["dense", "sparse"],) def setup_cache(self): super().setup_cache() def make_data(self, params): - representation, = params + (representation,) = params - if representation == 'dense': + if representation == "dense": data = _synth_regression_dataset(n_samples=100000, n_features=200) else: - data = _synth_regression_sparse_dataset(n_samples=100000, - n_features=1000, - density=0.01) + data = _synth_regression_sparse_dataset( + n_samples=100000, n_features=1000, density=0.01 + ) return data def make_estimator(self, params): - estimator = SGDRegressor(max_iter=1000, - tol=1e-16, - random_state=0) + (representation,) = params + + max_iter = 60 if representation == "dense" else 300 + + estimator = SGDRegressor(max_iter=max_iter, tol=None, random_state=0) return estimator @@ -169,8 +180,8 @@ class ElasticNetBenchmark(Predictor, Estimator, Benchmark): Benchmarks for ElasticNet. """ - param_names = ['representation', 'precompute'] - params = (['dense', 'sparse'], [True, False]) + param_names = ["representation", "precompute"] + params = (["dense", "sparse"], [True, False]) def setup_cache(self): super().setup_cache() @@ -178,21 +189,19 @@ def setup_cache(self): def make_data(self, params): representation, precompute = params - if representation == 'dense': + if representation == "dense": data = _synth_regression_dataset(n_samples=1000000, n_features=100) else: - data = _synth_regression_sparse_dataset(n_samples=50000, - n_features=5000, - density=0.01) + data = _synth_regression_sparse_dataset( + n_samples=50000, n_features=5000, density=0.01 + ) return data def make_estimator(self, params): representation, precompute = params - estimator = ElasticNet(precompute=precompute, - alpha=0.001, - random_state=0) + estimator = ElasticNet(precompute=precompute, alpha=0.001, random_state=0) return estimator @@ -202,7 +211,7 @@ def make_scorers(self): def skip(self, params): representation, precompute = params - if representation == 'sparse' and precompute is False: + if representation == "sparse" and precompute is False: return True return False @@ -212,8 +221,8 @@ class LassoBenchmark(Predictor, Estimator, Benchmark): Benchmarks for Lasso. 
""" - param_names = ['representation', 'precompute'] - params = (['dense', 'sparse'], [True, False]) + param_names = ["representation", "precompute"] + params = (["dense", "sparse"], [True, False]) def setup_cache(self): super().setup_cache() @@ -221,21 +230,19 @@ def setup_cache(self): def make_data(self, params): representation, precompute = params - if representation == 'dense': + if representation == "dense": data = _synth_regression_dataset(n_samples=1000000, n_features=100) else: - data = _synth_regression_sparse_dataset(n_samples=50000, - n_features=5000, - density=0.01) + data = _synth_regression_sparse_dataset( + n_samples=50000, n_features=5000, density=0.01 + ) return data def make_estimator(self, params): representation, precompute = params - estimator = Lasso(precompute=precompute, - alpha=0.001, - random_state=0) + estimator = Lasso(precompute=precompute, alpha=0.001, random_state=0) return estimator @@ -245,6 +252,6 @@ def make_scorers(self): def skip(self, params): representation, precompute = params - if representation == 'sparse' and precompute is False: + if representation == "sparse" and precompute is False: return True return False diff --git a/asv_benchmarks/benchmarks/manifold.py b/asv_benchmarks/benchmarks/manifold.py index 26197dc8bbc31..c32f3e061dc33 100644 --- a/asv_benchmarks/benchmarks/manifold.py +++ b/asv_benchmarks/benchmarks/manifold.py @@ -9,21 +9,21 @@ class TSNEBenchmark(Estimator, Benchmark): Benchmarks for t-SNE. """ - param_names = ['method'] - params = (['exact', 'barnes_hut'],) + param_names = ["method"] + params = (["exact", "barnes_hut"],) def setup_cache(self): super().setup_cache() def make_data(self, params): - method, = params + (method,) = params - n_samples = 500 if method == 'exact' else None + n_samples = 500 if method == "exact" else None return _digits_dataset(n_samples=n_samples) def make_estimator(self, params): - method, = params + (method,) = params estimator = TSNE(random_state=0, method=method) diff --git a/asv_benchmarks/benchmarks/metrics.py b/asv_benchmarks/benchmarks/metrics.py index 4a84cf1941a8f..597e5dc789f6c 100644 --- a/asv_benchmarks/benchmarks/metrics.py +++ b/asv_benchmarks/benchmarks/metrics.py @@ -9,34 +9,34 @@ class PairwiseDistancesBenchmark(Benchmark): Benchmarks for pairwise distances. 
""" - param_names = ['representation', 'metric', 'n_jobs'] - params = (['dense', 'sparse'], - ['cosine', 'euclidean', 'manhattan', 'correlation'], - Benchmark.n_jobs_vals) + param_names = ["representation", "metric", "n_jobs"] + params = ( + ["dense", "sparse"], + ["cosine", "euclidean", "manhattan", "correlation"], + Benchmark.n_jobs_vals, + ) def setup(self, *params): representation, metric, n_jobs = params - if representation == 'sparse' and metric == 'correlation': + if representation == "sparse" and metric == "correlation": raise NotImplementedError - if Benchmark.data_size == 'large': - if metric in ('manhattan', 'correlation'): + if Benchmark.data_size == "large": + if metric in ("manhattan", "correlation"): n_samples = 8000 else: n_samples = 24000 else: - if metric in ('manhattan', 'correlation'): + if metric in ("manhattan", "correlation"): n_samples = 4000 else: n_samples = 12000 - data = _random_dataset(n_samples=n_samples, - representation=representation) + data = _random_dataset(n_samples=n_samples, representation=representation) self.X, self.X_val, self.y, self.y_val = data - self.pdist_params = {'metric': metric, - 'n_jobs': n_jobs} + self.pdist_params = {"metric": metric, "n_jobs": n_jobs} def time_pairwise_distances(self, *args): pairwise_distances(self.X, **self.pdist_params) diff --git a/asv_benchmarks/benchmarks/model_selection.py b/asv_benchmarks/benchmarks/model_selection.py index 4e7058ffc2262..335ffe498adaa 100644 --- a/asv_benchmarks/benchmarks/model_selection.py +++ b/asv_benchmarks/benchmarks/model_selection.py @@ -13,23 +13,20 @@ class CrossValidationBenchmark(Benchmark): timeout = 20000 - param_names = ['n_jobs'] + param_names = ["n_jobs"] params = (Benchmark.n_jobs_vals,) def setup(self, *params): - n_jobs, = params + (n_jobs,) = params data = _synth_classification_dataset(n_samples=50000, n_features=100) self.X, self.X_val, self.y, self.y_val = data - self.clf = RandomForestClassifier(n_estimators=50, - max_depth=10, - random_state=0) + self.clf = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=0) - cv = 16 if Benchmark.data_size == 'large' else 4 + cv = 16 if Benchmark.data_size == "large" else 4 - self.cv_params = {'n_jobs': n_jobs, - 'cv': cv} + self.cv_params = {"n_jobs": n_jobs, "cv": cv} def time_crossval(self, *args): cross_val_score(self.clf, self.X, self.y, **self.cv_params) @@ -38,8 +35,7 @@ def peakmem_crossval(self, *args): cross_val_score(self.clf, self.X, self.y, **self.cv_params) def track_crossval(self, *args): - return float(cross_val_score(self.clf, self.X, - self.y, **self.cv_params).mean()) + return float(cross_val_score(self.clf, self.X, self.y, **self.cv_params).mean()) class GridSearchBenchmark(Predictor, Estimator, Benchmark): @@ -49,7 +45,7 @@ class GridSearchBenchmark(Predictor, Estimator, Benchmark): timeout = 20000 - param_names = ['n_jobs'] + param_names = ["n_jobs"] params = (Benchmark.n_jobs_vals,) def setup_cache(self): @@ -61,11 +57,11 @@ def make_data(self, params): return data def make_estimator(self, params): - n_jobs, = params + (n_jobs,) = params clf = RandomForestClassifier(random_state=0) - if Benchmark.data_size == 'large': + if Benchmark.data_size == "large": n_estimators_list = [10, 25, 50, 100, 500] max_depth_list = [5, 10, None] max_features_list = [0.1, 0.4, 0.8, 1.0] @@ -74,9 +70,11 @@ def make_estimator(self, params): max_depth_list = [5, 10] max_features_list = [0.1, 0.4, 0.8] - param_grid = {'n_estimators': n_estimators_list, - 'max_depth': max_depth_list, - 'max_features': 
max_features_list} + param_grid = { + "n_estimators": n_estimators_list, + "max_depth": max_depth_list, + "max_features": max_features_list, + } estimator = GridSearchCV(clf, param_grid, n_jobs=n_jobs, cv=4) diff --git a/asv_benchmarks/benchmarks/neighbors.py b/asv_benchmarks/benchmarks/neighbors.py index 2be6cc2f09364..b0bf6aba1d85b 100644 --- a/asv_benchmarks/benchmarks/neighbors.py +++ b/asv_benchmarks/benchmarks/neighbors.py @@ -10,10 +10,8 @@ class KNeighborsClassifierBenchmark(Predictor, Estimator, Benchmark): Benchmarks for KNeighborsClassifier. """ - param_names = ['algorithm', 'dimension', 'n_jobs'] - params = (['brute', 'kd_tree', 'ball_tree'], - ['low', 'high'], - Benchmark.n_jobs_vals) + param_names = ["algorithm", "dimension", "n_jobs"] + params = (["brute", "kd_tree", "ball_tree"], ["low", "high"], Benchmark.n_jobs_vals) def setup_cache(self): super().setup_cache() @@ -21,10 +19,10 @@ def setup_cache(self): def make_data(self, params): algorithm, dimension, n_jobs = params - if Benchmark.data_size == 'large': - n_components = 40 if dimension == 'low' else 200 + if Benchmark.data_size == "large": + n_components = 40 if dimension == "low" else 200 else: - n_components = 10 if dimension == 'low' else 50 + n_components = 10 if dimension == "low" else 50 data = _20newsgroups_lowdim_dataset(n_components=n_components) @@ -33,8 +31,7 @@ def make_data(self, params): def make_estimator(self, params): algorithm, dimension, n_jobs = params - estimator = KNeighborsClassifier(algorithm=algorithm, - n_jobs=n_jobs) + estimator = KNeighborsClassifier(algorithm=algorithm, n_jobs=n_jobs) return estimator diff --git a/asv_benchmarks/benchmarks/svm.py b/asv_benchmarks/benchmarks/svm.py index bbcc7a27edecf..36d3066484ee5 100644 --- a/asv_benchmarks/benchmarks/svm.py +++ b/asv_benchmarks/benchmarks/svm.py @@ -8,8 +8,8 @@ class SVCBenchmark(Predictor, Estimator, Benchmark): """Benchmarks for SVC.""" - param_names = ['kernel'] - params = (['linear', 'poly', 'rbf', 'sigmoid'],) + param_names = ["kernel"] + params = (["linear", "poly", "rbf", "sigmoid"],) def setup_cache(self): super().setup_cache() @@ -18,13 +18,11 @@ def make_data(self, params): return _synth_classification_dataset() def make_estimator(self, params): - kernel, = params + (kernel,) = params - estimator = SVC(max_iter=100, - tol=1e-16, - kernel=kernel, - random_state=0, - gamma='scale') + estimator = SVC( + max_iter=100, tol=1e-16, kernel=kernel, random_state=0, gamma="scale" + ) return estimator diff --git a/asv_benchmarks/benchmarks/utils.py b/asv_benchmarks/benchmarks/utils.py index 6a3073a634169..fca30579e529b 100644 --- a/asv_benchmarks/benchmarks/utils.py +++ b/asv_benchmarks/benchmarks/utils.py @@ -4,7 +4,7 @@ def neg_mean_inertia(X, labels, centers): - return - (np.asarray(X - centers[labels])**2).sum(axis=1).mean() + return -(np.asarray(X - centers[labels]) ** 2).sum(axis=1).mean() def make_gen_classif_scorers(caller): @@ -18,18 +18,22 @@ def make_gen_reg_scorers(caller): def neg_mean_data_error(X, U, V): - return - np.sqrt(((X - U.dot(V))**2).mean()) + return -np.sqrt(((X - U.dot(V)) ** 2).mean()) def make_dict_learning_scorers(caller): caller.train_scorer = lambda _, __: ( - neg_mean_data_error(caller.X, - caller.estimator.transform(caller.X), - caller.estimator.components_)) + neg_mean_data_error( + caller.X, caller.estimator.transform(caller.X), caller.estimator.components_ + ) + ) caller.test_scorer = lambda _, __: ( - neg_mean_data_error(caller.X_val, - caller.estimator.transform(caller.X_val), - 
caller.estimator.components_)) + neg_mean_data_error( + caller.X_val, + caller.estimator.transform(caller.X_val), + caller.estimator.components_, + ) + ) def explained_variance_ratio(Xt, X): @@ -37,8 +41,7 @@ def explained_variance_ratio(Xt, X): def make_pca_scorers(caller): - caller.train_scorer = ( - lambda _, __: caller.estimator.explained_variance_ratio_.sum()) + caller.train_scorer = lambda _, __: caller.estimator.explained_variance_ratio_.sum() caller.test_scorer = lambda _, __: ( - explained_variance_ratio(caller.estimator.transform(caller.X_val), - caller.X_val)) + explained_variance_ratio(caller.estimator.transform(caller.X_val), caller.X_val) + ) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3cd2b5bb4cd9f..a36daf39b50db 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -11,20 +11,9 @@ jobs: - job: git_commit displayName: Get Git Commit pool: - vmImage: ubuntu-18.04 + vmImage: ubuntu-24.04 steps: - - bash: | - set -ex - if [[ $BUILD_REASON == "PullRequest" ]]; then - # By default pull requests use refs/pull/PULL_ID/merge as the source branch - # which has a "Merge ID into ID" as a commit message. The latest commit - # message is the second to last commit - COMMIT_ID=$(echo $BUILD_SOURCEVERSIONMESSAGE | awk '{print $2}') - message=$(git log $COMMIT_ID -1 --pretty=%B) - else - message=$BUILD_SOURCEVERSIONMESSAGE - fi - echo "##vso[task.setvariable variable=message;isOutput=true]$message" + - bash: python build_tools/azure/get_commit_message.py name: commit displayName: Get source version message @@ -38,25 +27,29 @@ jobs: ) displayName: Linting pool: - vmImage: ubuntu-18.04 + vmImage: ubuntu-24.04 steps: - task: UsePythonVersion@0 inputs: - versionSpec: '3.9' + versionSpec: '3.12' - bash: | - pip install flake8 mypy==0.782 + source build_tools/shared.sh + # Include pytest compatibility with mypy + pip install pytest $(get_dep ruff min) $(get_dep mypy min) cython-lint displayName: Install linters - bash: | - ./build_tools/circle/linting.sh - displayName: Run linting + ./build_tools/linting.sh + displayName: Run linters - bash: | - mypy sklearn/ - displayName: Run mypy + pip install ninja meson scipy + python build_tools/check-meson-openmp-dependencies.py + displayName: Run Meson OpenMP checks + - template: build_tools/azure/posix.yml parameters: name: Linux_Nightly - vmImage: ubuntu-18.04 + vmImage: ubuntu-22.04 dependsOn: [git_commit, linting] condition: | and( @@ -70,41 +63,37 @@ jobs: matrix: pylatest_pip_scipy_dev: DISTRIB: 'conda-pip-scipy-dev' - PYTHON_VERSION: '*' - CHECK_WARNINGS: 'true' + LOCK_FILE: './build_tools/azure/pylatest_pip_scipy_dev_linux-64_conda.lock' + SKLEARN_WARNINGS_AS_ERRORS: '1' CHECK_PYTEST_SOFT_DEPENDENCY: 'true' - TEST_DOCSTRINGS: 'true' - # Tests that require large downloads over the networks are skipped in CI. - # Here we make sure, that they are still run on a regular basis. 
- SKLEARN_SKIP_NETWORK_TESTS: '0' -# Check compilation with intel C++ compiler (ICC) - template: build_tools/azure/posix.yml + # CPython 3.13 free-threaded build parameters: - name: Linux_Nightly_ICC - vmImage: ubuntu-18.04 + name: Linux_free_threaded + vmImage: ubuntu-22.04 dependsOn: [git_commit, linting] condition: | and( succeeded(), not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), or(eq(variables['Build.Reason'], 'Schedule'), - contains(dependencies['git_commit']['outputs']['commit.message'], '[icc-build]') + contains(dependencies['git_commit']['outputs']['commit.message'], '[free-threaded]' + ) ) ) matrix: - pylatest_conda_mkl: - DISTRIB: 'conda' - PYTHON_VERSION: '*' - BLAS: 'mkl' + pylatest_free_threaded: + DISTRIB: 'conda-free-threaded' + LOCK_FILE: './build_tools/azure/pylatest_free_threaded_linux-64_conda.lock' COVERAGE: 'false' - BUILD_WITH_ICC: 'true' + SKLEARN_FAULTHANDLER_TIMEOUT: '1800' # 30 * 60 seconds # Will run all the time regardless of linting outcome. - template: build_tools/azure/posix.yml parameters: name: Linux_Runs - vmImage: ubuntu-18.04 + vmImage: ubuntu-22.04 dependsOn: [git_commit] condition: | and( @@ -112,124 +101,171 @@ jobs: not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')) ) matrix: - pylatest_conda_mkl: + pylatest_conda_forge_mkl: DISTRIB: 'conda' - PYTHON_VERSION: '*' - BLAS: 'mkl' + LOCK_FILE: './build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock' COVERAGE: 'true' + SKLEARN_TESTS_GLOBAL_RANDOM_SEED: '42' # default global random seed + # Tests that require large downloads over the networks are skipped in CI. + # Here we make sure, that they are still run on a regular basis. + ${{ if eq(variables['Build.Reason'], 'Schedule') }}: + SKLEARN_SKIP_NETWORK_TESTS: '0' + SCIPY_ARRAY_API: '1' -- template: build_tools/azure/posix.yml +# Check compilation with Ubuntu 22.04 LTS (Jammy Jellyfish) and scipy from conda-forge +# By default the CI is sequential, where `Ubuntu_Jammy_Jellyfish` runs first and +# the others jobs are run only if `Ubuntu_Jammy_Jellyfish` succeeds. +# When "[azure parallel]" is in the commit message, `Ubuntu_Jammy_Jellyfish` will +# run in parallel with the rest of the jobs. On Azure, the job's name will be +# `Ubuntu_Jammy_Jellyfish_Parallel`. 
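The inline bash step removed above derived the commit message by hand: for pull requests, Azure builds the `refs/pull/<ID>/merge` commit whose message reads "Merge <commit> into <base>", so the step took the second word of `BUILD_SOURCEVERSIONMESSAGE` and asked git for that commit's real message. The new configuration delegates this to `build_tools/azure/get_commit_message.py` and then gates jobs on markers such as `[ci skip]`, `[free-threaded]` and `[azure parallel]` in that message. The script itself is not part of this diff; as a rough sketch only, the removed bash logic translates to Python along these lines:

# Illustrative sketch based on the removed bash step; NOT the actual contents
# of build_tools/azure/get_commit_message.py.
import os
import subprocess


def get_commit_message():
    """Return the message of the commit under test."""
    message = os.environ["BUILD_SOURCEVERSIONMESSAGE"]
    if os.environ.get("BUILD_REASON") == "PullRequest":
        # Pull request builds use refs/pull/<ID>/merge, whose message is
        # "Merge <commit> into <base>"; the commit of interest is the second word.
        commit_id = message.split()[1]
        message = subprocess.run(
            ["git", "log", commit_id, "-1", "--pretty=%B"],
            capture_output=True,
            text=True,
            check=True,
        ).stdout.strip()
    return message


if __name__ == "__main__":
    # Expose the message as a pipeline output variable, as the old step did.
    print(f"##vso[task.setvariable variable=message;isOutput=true]{get_commit_message()}")

The `posix-all-parallel.yml` template below receives the same message through its `commitMessage` parameter and uses the `[azure parallel]` marker to decide whether the remaining jobs wait for `Ubuntu_Jammy_Jellyfish` or start alongside it.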
+- template: build_tools/azure/posix-all-parallel.yml parameters: - name: Linux - vmImage: ubuntu-18.04 - dependsOn: [linting, git_commit] + name: Ubuntu_Jammy_Jellyfish + vmImage: ubuntu-22.04 + dependsOn: [git_commit, linting] condition: | and( succeeded(), - not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), - ne(variables['Build.Reason'], 'Schedule') + not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')) + ) + commitMessage: dependencies['git_commit']['outputs']['commit.message'] + matrix: + pymin_conda_forge_openblas_ubuntu_2204: + DISTRIB: 'conda' + LOCK_FILE: './build_tools/azure/pymin_conda_forge_openblas_ubuntu_2204_linux-64_conda.lock' + SKLEARN_WARNINGS_AS_ERRORS: '1' + COVERAGE: 'false' + SKLEARN_TESTS_GLOBAL_RANDOM_SEED: '0' # non-default seed + +- template: build_tools/azure/posix.yml + parameters: + name: Ubuntu_Atlas + vmImage: ubuntu-24.04 + dependsOn: [linting, git_commit, Ubuntu_Jammy_Jellyfish] + # Runs when dependencies succeeded or skipped + condition: | + and( + not(or(failed(), canceled())), + not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')) ) matrix: # Linux environment to test that scikit-learn can be built against - # versions of numpy, scipy with ATLAS that comes with Ubuntu Bionic 18.04 - # i.e. numpy 1.13.3 and scipy 0.19 - py36_ubuntu_atlas: + # versions of numpy, scipy with ATLAS that comes with Ubuntu 24.04 Noble Numbat + # i.e. numpy 1.26.4 and scipy 1.11.4 + ubuntu_atlas: DISTRIB: 'ubuntu' - PYTHON_VERSION: '3.6' - JOBLIB_VERSION: 'min' - PANDAS_VERSION: 'none' - THREADPOOLCTL_VERSION: 'min' - PYTEST_VERSION: 'min' - PYTEST_XDIST_VERSION: 'none' + LOCK_FILE: './build_tools/azure/ubuntu_atlas_lock.txt' COVERAGE: 'false' - # Linux + Python 3.6 build with OpenBLAS and without SITE_JOBLIB - py36_conda_openblas: + SKLEARN_TESTS_GLOBAL_RANDOM_SEED: '1' # non-default seed + +- template: build_tools/azure/posix.yml + parameters: + name: Linux + vmImage: ubuntu-22.04 + dependsOn: [linting, git_commit, Ubuntu_Jammy_Jellyfish] + # Runs when dependencies succeeded or skipped + condition: | + and( + not(or(failed(), canceled())), + not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')) + ) + matrix: + # Linux build with minimum supported version of dependencies + pymin_conda_forge_openblas_min_dependencies: DISTRIB: 'conda' - PYTHON_VERSION: '3.6' - BLAS: 'openblas' - NUMPY_VERSION: 'min' - SCIPY_VERSION: 'min' - MATPLOTLIB_VERSION: 'min' - # latest version of joblib available in conda for Python 3.6 - JOBLIB_VERSION: '0.13.2' - THREADPOOLCTL_VERSION: '2.0.0' - # temporary pin pytest due to unknown failure with pytest 5.4 and - # python 3.6 - PYTEST_VERSION: 'min' - PYTEST_XDIST_VERSION: 'none' - # Linux environment to test the latest available dependencies and MKL. + LOCK_FILE: './build_tools/azure/pymin_conda_forge_openblas_min_dependencies_linux-64_conda.lock' + # Enable debug Cython directives to capture IndexError exceptions in + # combination with the -Werror::pytest.PytestUnraisableExceptionWarning + # flag for pytest. + # https://github.com/scikit-learn/scikit-learn/pull/24438 + SKLEARN_ENABLE_DEBUG_CYTHON_DIRECTIVES: '1' + SKLEARN_RUN_FLOAT32_TESTS: '1' + SKLEARN_TESTS_GLOBAL_RANDOM_SEED: '2' # non-default seed + # Linux environment to test the latest available dependencies. # It runs tests requiring lightgbm, pandas and PyAMG. 
pylatest_pip_openblas_pandas: DISTRIB: 'conda-pip-latest' - PYTHON_VERSION: '3.9' - PANDAS_VERSION: 'none' + LOCK_FILE: './build_tools/azure/pylatest_pip_openblas_pandas_linux-64_conda.lock' CHECK_PYTEST_SOFT_DEPENDENCY: 'true' - TEST_DOCSTRINGS: 'true' - CHECK_WARNINGS: 'true' + SKLEARN_WARNINGS_AS_ERRORS: '1' + SKLEARN_TESTS_GLOBAL_RANDOM_SEED: '3' # non-default seed + # disable pytest-xdist to have 1 job where OpenMP and BLAS are not single + # threaded because by default the tests configuration (sklearn/conftest.py) + # makes sure that they are single threaded in each xdist subprocess. + PYTEST_XDIST_VERSION: 'none' + PIP_BUILD_ISOLATION: 'true' + SCIPY_ARRAY_API: '1' -- template: build_tools/azure/posix-32.yml +- template: build_tools/azure/posix-docker.yml parameters: - name: Linux32 - vmImage: ubuntu-18.04 - dependsOn: [linting, git_commit] + name: Linux_Docker + vmImage: ubuntu-24.04 + dependsOn: [linting, git_commit, Ubuntu_Jammy_Jellyfish] + # Runs when dependencies succeeded or skipped condition: | and( - succeeded(), - not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), - ne(variables['Build.Reason'], 'Schedule') + not(or(failed(), canceled())), + not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')) ) matrix: - py36_ubuntu_atlas_32bit: - DISTRIB: 'ubuntu-32' - PYTHON_VERSION: '3.6' - JOBLIB_VERSION: 'min' + debian_32bit: + DOCKER_CONTAINER: 'i386/debian:trixie' + DISTRIB: 'debian-32' + COVERAGE: "true" + LOCK_FILE: './build_tools/azure/debian_32bit_lock.txt' # disable pytest xdist due to unknown bug with 32-bit container PYTEST_XDIST_VERSION: 'none' - # temporary pin pytest due to unknown failure with pytest 5.4 and - # python 3.6 - PYTEST_VERSION: 'min' - THREADPOOLCTL_VERSION: 'min' + SKLEARN_TESTS_GLOBAL_RANDOM_SEED: '4' # non-default seed - template: build_tools/azure/posix.yml parameters: name: macOS - vmImage: macOS-10.14 - dependsOn: [linting, git_commit] + vmImage: macOS-13 + dependsOn: [linting, git_commit, Ubuntu_Jammy_Jellyfish] + # Runs when dependencies succeeded or skipped condition: | and( - succeeded(), - not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), - ne(variables['Build.Reason'], 'Schedule') + not(or(failed(), canceled())), + not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')) ) matrix: pylatest_conda_forge_mkl: DISTRIB: 'conda' - BLAS: 'mkl' - CONDA_CHANNEL: 'conda-forge' + LOCK_FILE: './build_tools/azure/pylatest_conda_forge_mkl_osx-64_conda.lock' + SKLEARN_TESTS_GLOBAL_RANDOM_SEED: '5' # non-default seed + SCIPY_ARRAY_API: '1' pylatest_conda_mkl_no_openmp: DISTRIB: 'conda' - BLAS: 'mkl' + LOCK_FILE: './build_tools/azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock' SKLEARN_TEST_NO_OPENMP: 'true' SKLEARN_SKIP_OPENMP_TEST: 'true' + SKLEARN_TESTS_GLOBAL_RANDOM_SEED: '6' # non-default seed - template: build_tools/azure/windows.yml parameters: name: Windows - vmImage: vs2017-win2016 - dependsOn: [linting, git_commit] + vmImage: windows-latest + dependsOn: [linting, git_commit, Ubuntu_Jammy_Jellyfish] + # Runs when dependencies succeeded or skipped condition: | and( - succeeded(), - not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), - ne(variables['Build.Reason'], 'Schedule') + not(or(failed(), canceled())), + not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')) ) matrix: - py37_conda_mkl: - PYTHON_VERSION: '3.7' - CHECK_WARNINGS: 'true' - PYTHON_ARCH: '64' - 
PYTEST_VERSION: '*' - COVERAGE: 'true' - py36_pip_openblas_32bit: - PYTHON_VERSION: '3.6' - PYTHON_ARCH: '32' + pymin_conda_forge_mkl: + DISTRIB: 'conda' + LOCK_FILE: ./build_tools/azure/pymin_conda_forge_mkl_win-64_conda.lock + SKLEARN_WARNINGS_AS_ERRORS: '1' + # The Azure Windows runner is typically much slower than other CI + # runners due to the lack of compiler cache. Running the tests with + # coverage enabled make them run extra slower. Since very few parts of + # code should have windows-specific code branches, it should be enable + # to restrict the code coverage collection to the non-windows runners. + COVERAGE: 'false' + # Enable debug Cython directives to capture IndexError exceptions in + # combination with the -Werror::pytest.PytestUnraisableExceptionWarning + # flag for pytest. + # https://github.com/scikit-learn/scikit-learn/pull/24438 + SKLEARN_ENABLE_DEBUG_CYTHON_DIRECTIVES: '1' + SKLEARN_TESTS_GLOBAL_RANDOM_SEED: '7' # non-default seed diff --git a/benchmarks/bench_20newsgroups.py b/benchmarks/bench_20newsgroups.py index 8efc740e937da..a559bc59b5f8a 100644 --- a/benchmarks/bench_20newsgroups.py +++ b/benchmarks/bench_20newsgroups.py @@ -1,25 +1,24 @@ -from time import time import argparse -import numpy as np +from time import time -from sklearn.dummy import DummyClassifier +import numpy as np from sklearn.datasets import fetch_20newsgroups_vectorized -from sklearn.metrics import accuracy_score -from sklearn.utils.validation import check_array - -from sklearn.ensemble import RandomForestClassifier -from sklearn.ensemble import ExtraTreesClassifier -from sklearn.ensemble import AdaBoostClassifier +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import ( + AdaBoostClassifier, + ExtraTreesClassifier, + RandomForestClassifier, +) from sklearn.linear_model import LogisticRegression +from sklearn.metrics import accuracy_score from sklearn.naive_bayes import MultinomialNB +from sklearn.utils.validation import check_array ESTIMATORS = { "dummy": DummyClassifier(), - "random_forest": RandomForestClassifier(max_features="sqrt", - min_samples_split=10), - "extra_trees": ExtraTreesClassifier(max_features="sqrt", - min_samples_split=10), + "random_forest": RandomForestClassifier(max_features="sqrt", min_samples_split=10), + "extra_trees": ExtraTreesClassifier(max_features="sqrt", min_samples_split=10), "logistic_regression": LogisticRegression(), "naive_bayes": MultinomialNB(), "adaboost": AdaBoostClassifier(n_estimators=10), @@ -30,34 +29,31 @@ # Data if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('-e', '--estimators', nargs="+", required=True, - choices=ESTIMATORS) + parser.add_argument( + "-e", "--estimators", nargs="+", required=True, choices=ESTIMATORS + ) args = vars(parser.parse_args()) data_train = fetch_20newsgroups_vectorized(subset="train") data_test = fetch_20newsgroups_vectorized(subset="test") - X_train = check_array(data_train.data, dtype=np.float32, - accept_sparse="csc") + X_train = check_array(data_train.data, dtype=np.float32, accept_sparse="csc") X_test = check_array(data_test.data, dtype=np.float32, accept_sparse="csr") y_train = data_train.target y_test = data_test.target print("20 newsgroups") print("=============") - print("X_train.shape = {0}".format(X_train.shape)) - print("X_train.format = {0}".format(X_train.format)) - print("X_train.dtype = {0}".format(X_train.dtype)) - print("X_train density = {0}" - "".format(X_train.nnz / np.product(X_train.shape))) - print("y_train {0}".format(y_train.shape)) - 
print("X_test {0}".format(X_test.shape)) - print("X_test.format = {0}".format(X_test.format)) - print("X_test.dtype = {0}".format(X_test.dtype)) - print("y_test {0}".format(y_test.shape)) + print(f"X_train.shape = {X_train.shape}") + print(f"X_train.format = {X_train.format}") + print(f"X_train.dtype = {X_train.dtype}") + print(f"X_train density = {X_train.nnz / np.prod(X_train.shape)}") + print(f"y_train {y_train.shape}") + print(f"X_test {X_test.shape}") + print(f"X_test.format = {X_test.format}") + print(f"X_test.dtype = {X_test.dtype}") + print(f"y_test {y_test.shape}") print() - print("Classifier Training") print("===================") accuracy, train_time, test_time = {}, {}, {} @@ -82,13 +78,17 @@ print("Classification performance:") print("===========================") print() - print("%s %s %s %s" % ("Classifier ", "train-time", "test-time", - "Accuracy")) + print("%s %s %s %s" % ("Classifier ", "train-time", "test-time", "Accuracy")) print("-" * 44) for name in sorted(accuracy, key=accuracy.get): - print("%s %s %s %s" % (name.ljust(16), - ("%.4fs" % train_time[name]).center(10), - ("%.4fs" % test_time[name]).center(10), - ("%.4f" % accuracy[name]).center(10))) + print( + "%s %s %s %s" + % ( + name.ljust(16), + ("%.4fs" % train_time[name]).center(10), + ("%.4fs" % test_time[name]).center(10), + ("%.4f" % accuracy[name]).center(10), + ) + ) print() diff --git a/benchmarks/bench_covertype.py b/benchmarks/bench_covertype.py index b74f74bbbbb76..243cce03a632f 100644 --- a/benchmarks/bench_covertype.py +++ b/benchmarks/bench_covertype.py @@ -25,13 +25,13 @@ The same task has been used in a number of papers including: - * `"SVM Optimization: Inverse Dependence on Training Set Size" - `_ + * :doi:`"SVM Optimization: Inverse Dependence on Training Set Size" S. Shalev-Shwartz, N. Srebro - In Proceedings of ICML '08. + <10.1145/1390156.1390273>` - * `"Pegasos: Primal estimated sub-gradient solver for svm" - `_ + * :doi:`"Pegasos: Primal estimated sub-gradient solver for svm" S. Shalev-Shwartz, Y. Singer, N. Srebro - In Proceedings of ICML '07. 
+ <10.1145/1273496.1273598>` * `"Training Linear SVMs in Linear Time" `_ @@ -41,42 +41,47 @@ """ -# Author: Peter Prettenhofer -# Arnaud Joly -# License: BSD 3 clause +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause +import argparse import os from time import time -import argparse + import numpy as np from joblib import Memory from sklearn.datasets import fetch_covtype, get_data_home -from sklearn.svm import LinearSVC -from sklearn.linear_model import SGDClassifier, LogisticRegression +from sklearn.ensemble import ( + ExtraTreesClassifier, + GradientBoostingClassifier, + RandomForestClassifier, +) +from sklearn.linear_model import LogisticRegression, SGDClassifier +from sklearn.metrics import zero_one_loss from sklearn.naive_bayes import GaussianNB +from sklearn.svm import LinearSVC from sklearn.tree import DecisionTreeClassifier -from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.metrics import zero_one_loss from sklearn.utils import check_array # Memoize the data extraction and memory map the resulting # train / test splits in readonly mode -memory = Memory(os.path.join(get_data_home(), 'covertype_benchmark_data'), - mmap_mode='r') +memory = Memory( + os.path.join(get_data_home(), "covertype_benchmark_data"), mmap_mode="r" +) @memory.cache -def load_data(dtype=np.float32, order='C', random_state=13): +def load_data(dtype=np.float32, order="C", random_state=13): """Load the data, then cache and memmap the train/test split""" ###################################################################### # Load dataset print("Loading dataset...") - data = fetch_covtype(download_if_missing=True, shuffle=True, - random_state=random_state) - X = check_array(data['data'], dtype=dtype, order=order) - y = (data['target'] != 1).astype(int) + data = fetch_covtype( + download_if_missing=True, shuffle=True, random_state=random_state + ) + X = check_array(data["data"], dtype=dtype, order=order) + y = (data["target"] != 1).astype(int) # Create train-test split (as [Joachims, 2006]) print("Creating train-test split...") @@ -97,39 +102,59 @@ def load_data(dtype=np.float32, order='C', random_state=13): ESTIMATORS = { - 'GBRT': GradientBoostingClassifier(n_estimators=250), - 'ExtraTrees': ExtraTreesClassifier(n_estimators=20), - 'RandomForest': RandomForestClassifier(n_estimators=20), - 'CART': DecisionTreeClassifier(min_samples_split=5), - 'SGD': SGDClassifier(alpha=0.001), - 'GaussianNB': GaussianNB(), - 'liblinear': LinearSVC(loss="l2", penalty="l2", C=1000, dual=False, - tol=1e-3), - 'SAG': LogisticRegression(solver='sag', max_iter=2, C=1000) + "GBRT": GradientBoostingClassifier(n_estimators=250), + "ExtraTrees": ExtraTreesClassifier(n_estimators=20), + "RandomForest": RandomForestClassifier(n_estimators=20), + "CART": DecisionTreeClassifier(min_samples_split=5), + "SGD": SGDClassifier(alpha=0.001), + "GaussianNB": GaussianNB(), + "liblinear": LinearSVC(loss="l2", penalty="l2", C=1000, dual=False, tol=1e-3), + "SAG": LogisticRegression(solver="sag", max_iter=2, C=1000), } if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--classifiers', nargs="+", - choices=ESTIMATORS, type=str, - default=['liblinear', 'GaussianNB', 'SGD', 'CART'], - help="list of classifiers to benchmark.") - parser.add_argument('--n-jobs', nargs="?", default=1, type=int, - help="Number of concurrently running workers for " - "models that support parallelism.") - 
parser.add_argument('--order', nargs="?", default="C", type=str, - choices=["F", "C"], - help="Allow to choose between fortran and C ordered " - "data") - parser.add_argument('--random-seed', nargs="?", default=13, type=int, - help="Common seed used by random number generator.") + parser.add_argument( + "--classifiers", + nargs="+", + choices=ESTIMATORS, + type=str, + default=["liblinear", "GaussianNB", "SGD", "CART"], + help="list of classifiers to benchmark.", + ) + parser.add_argument( + "--n-jobs", + nargs="?", + default=1, + type=int, + help=( + "Number of concurrently running workers for " + "models that support parallelism." + ), + ) + parser.add_argument( + "--order", + nargs="?", + default="C", + type=str, + choices=["F", "C"], + help="Allow to choose between fortran and C ordered data", + ) + parser.add_argument( + "--random-seed", + nargs="?", + default=13, + type=int, + help="Common seed used by random number generator.", + ) args = vars(parser.parse_args()) print(__doc__) X_train, X_test, y_train, y_test = load_data( - order=args["order"], random_state=args["random_seed"]) + order=args["order"], random_state=args["random_seed"] + ) print("") print("Dataset statistics:") @@ -137,14 +162,26 @@ def load_data(dtype=np.float32, order='C', random_state=13): print("%s %d" % ("number of features:".ljust(25), X_train.shape[1])) print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size)) print("%s %s" % ("data type:".ljust(25), X_train.dtype)) - print("%s %d (pos=%d, neg=%d, size=%dMB)" - % ("number of train samples:".ljust(25), - X_train.shape[0], np.sum(y_train == 1), - np.sum(y_train == 0), int(X_train.nbytes / 1e6))) - print("%s %d (pos=%d, neg=%d, size=%dMB)" - % ("number of test samples:".ljust(25), - X_test.shape[0], np.sum(y_test == 1), - np.sum(y_test == 0), int(X_test.nbytes / 1e6))) + print( + "%s %d (pos=%d, neg=%d, size=%dMB)" + % ( + "number of train samples:".ljust(25), + X_train.shape[0], + np.sum(y_train == 1), + np.sum(y_train == 0), + int(X_train.nbytes / 1e6), + ) + ) + print( + "%s %d (pos=%d, neg=%d, size=%dMB)" + % ( + "number of test samples:".ljust(25), + X_test.shape[0], + np.sum(y_test == 1), + np.sum(y_test == 0), + int(X_test.nbytes / 1e6), + ) + ) print() print("Training Classifiers") @@ -155,9 +192,13 @@ def load_data(dtype=np.float32, order='C', random_state=13): estimator = ESTIMATORS[name] estimator_params = estimator.get_params() - estimator.set_params(**{p: args["random_seed"] - for p in estimator_params - if p.endswith("random_state")}) + estimator.set_params( + **{ + p: args["random_seed"] + for p in estimator_params + if p.endswith("random_state") + } + ) if "n_jobs" in estimator_params: estimator.set_params(n_jobs=args["n_jobs"]) @@ -177,13 +218,17 @@ def load_data(dtype=np.float32, order='C', random_state=13): print() print("Classification performance:") print("===========================") - print("%s %s %s %s" - % ("Classifier ", "train-time", "test-time", "error-rate")) + print("%s %s %s %s" % ("Classifier ", "train-time", "test-time", "error-rate")) print("-" * 44) for name in sorted(args["classifiers"], key=error.get): - print("%s %s %s %s" % (name.ljust(12), - ("%.4fs" % train_time[name]).center(10), - ("%.4fs" % test_time[name]).center(10), - ("%.4f" % error[name]).center(10))) + print( + "%s %s %s %s" + % ( + name.ljust(12), + ("%.4fs" % train_time[name]).center(10), + ("%.4fs" % test_time[name]).center(10), + ("%.4f" % error[name]).center(10), + ) + ) print() diff --git a/benchmarks/bench_feature_expansions.py 
b/benchmarks/bench_feature_expansions.py index 412ab28598c9b..b9d9efbdea4f1 100644 --- a/benchmarks/bench_feature_expansions.py +++ b/benchmarks/bench_feature_expansions.py @@ -1,8 +1,10 @@ +from time import time + import matplotlib.pyplot as plt import numpy as np import scipy.sparse as sparse + from sklearn.preprocessing import PolynomialFeatures -from time import time degree = 2 trials = 3 @@ -11,8 +13,9 @@ densities = np.array([0.01, 0.1, 1.0]) csr_times = {d: np.zeros(len(dimensionalities)) for d in densities} dense_times = {d: np.zeros(len(dimensionalities)) for d in densities} -transform = PolynomialFeatures(degree=degree, include_bias=False, - interaction_only=False) +transform = PolynomialFeatures( + degree=degree, include_bias=False, interaction_only=False +) for trial in range(trials): for density in densities: @@ -34,16 +37,22 @@ fig, axes = plt.subplots(nrows=len(densities), ncols=1, figsize=(8, 10)) for density, ax in zip(densities, axes): - - ax.plot(dimensionalities, csr_times[density] / trials, - label='csr', linestyle=csr_linestyle) - ax.plot(dimensionalities, dense_times[density] / trials, - label='dense', linestyle=dense_linestyle) - ax.set_title("density %0.2f, degree=%d, n_samples=%d" % - (density, degree, num_rows)) + ax.plot( + dimensionalities, + csr_times[density] / trials, + label="csr", + linestyle=csr_linestyle, + ) + ax.plot( + dimensionalities, + dense_times[density] / trials, + label="dense", + linestyle=dense_linestyle, + ) + ax.set_title("density %0.2f, degree=%d, n_samples=%d" % (density, degree, num_rows)) ax.legend() - ax.set_xlabel('Dimensionality') - ax.set_ylabel('Time (seconds)') + ax.set_xlabel("Dimensionality") + ax.set_ylabel("Time (seconds)") plt.tight_layout() plt.show() diff --git a/benchmarks/bench_glm.py b/benchmarks/bench_glm.py index afb9f0d3bb0f1..84cf31858afa7 100644 --- a/benchmarks/bench_glm.py +++ b/benchmarks/bench_glm.py @@ -4,13 +4,14 @@ Data comes from a random square matrix. """ + from datetime import datetime -import numpy as np -from sklearn import linear_model +import numpy as np -if __name__ == '__main__': +from sklearn import linear_model +if __name__ == "__main__": import matplotlib.pyplot as plt n_iter = 40 @@ -22,8 +23,7 @@ dimensions = 500 * np.arange(1, n_iter + 1) for i in range(n_iter): - - print('Iteration %s of %s' % (i, n_iter)) + print("Iteration %s of %s" % (i, n_iter)) n_samples, n_features = 10 * i + 3, 10 * i + 3 @@ -31,7 +31,7 @@ Y = np.random.randn(n_samples) start = datetime.now() - ridge = linear_model.Ridge(alpha=1.) 
+ ridge = linear_model.Ridge(alpha=1.0) ridge.fit(X, Y) time_ridge[i] = (datetime.now() - start).total_seconds() @@ -45,13 +45,13 @@ lasso.fit(X, Y) time_lasso[i] = (datetime.now() - start).total_seconds() - plt.figure('scikit-learn GLM benchmark results') - plt.xlabel('Dimensions') - plt.ylabel('Time (s)') - plt.plot(dimensions, time_ridge, color='r') - plt.plot(dimensions, time_ols, color='g') - plt.plot(dimensions, time_lasso, color='b') + plt.figure("scikit-learn GLM benchmark results") + plt.xlabel("Dimensions") + plt.ylabel("Time (s)") + plt.plot(dimensions, time_ridge, color="r") + plt.plot(dimensions, time_ols, color="g") + plt.plot(dimensions, time_lasso, color="b") - plt.legend(['Ridge', 'OLS', 'LassoLars'], loc='upper left') - plt.axis('tight') + plt.legend(["Ridge", "OLS", "LassoLars"], loc="upper left") + plt.axis("tight") plt.show() diff --git a/benchmarks/bench_glmnet.py b/benchmarks/bench_glmnet.py index e8841cba46d57..1aaad99c10587 100644 --- a/benchmarks/bench_glmnet.py +++ b/benchmarks/bench_glmnet.py @@ -16,9 +16,12 @@ In both cases, only 10% of the features are informative. """ -import numpy as np + import gc from time import time + +import numpy as np + from sklearn.datasets import make_regression alpha = 0.1 @@ -35,7 +38,7 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef): # start time tstart = time() clf = factory(alpha=alpha).fit(X, Y) - delta = (time() - tstart) + delta = time() - tstart # stop time print("duration: %0.3fs" % delta) @@ -44,11 +47,12 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef): return delta -if __name__ == '__main__': - from glmnet.elastic_net import Lasso as GlmnetLasso - from sklearn.linear_model import Lasso as ScikitLasso +if __name__ == "__main__": # Delayed import of matplotlib.pyplot import matplotlib.pyplot as plt + from glmnet.elastic_net import Lasso as GlmnetLasso + + from sklearn.linear_model import Lasso as ScikitLasso scikit_results = [] glmnet_results = [] @@ -58,18 +62,22 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef): n_informative = n_features / 10 n_test_samples = 1000 for i in range(1, n + 1): - print('==================') - print('Iteration %s of %s' % (i, n)) - print('==================') + print("==================") + print("Iteration %s of %s" % (i, n)) + print("==================") X, Y, coef_ = make_regression( - n_samples=(i * step) + n_test_samples, n_features=n_features, - noise=0.1, n_informative=n_informative, coef=True) + n_samples=(i * step) + n_test_samples, + n_features=n_features, + noise=0.1, + n_informative=n_informative, + coef=True, + ) X_test = X[-n_test_samples:] Y_test = Y[-n_test_samples:] - X = X[:(i * step)] - Y = Y[:(i * step)] + X = X[: (i * step)] + Y = Y[: (i * step)] print("benchmarking scikit-learn: ") scikit_results.append(bench(ScikitLasso, X, Y, X_test, Y_test, coef_)) @@ -78,12 +86,12 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef): plt.clf() xx = range(0, n * step, step) - plt.title('Lasso regression on sample dataset (%d features)' % n_features) - plt.plot(xx, scikit_results, 'b-', label='scikit-learn') - plt.plot(xx, glmnet_results, 'r-', label='glmnet') + plt.title("Lasso regression on sample dataset (%d features)" % n_features) + plt.plot(xx, scikit_results, "b-", label="scikit-learn") + plt.plot(xx, glmnet_results, "r-", label="glmnet") plt.legend() - plt.xlabel('number of samples to classify') - plt.ylabel('Time (s)') + plt.xlabel("number of samples to classify") + plt.ylabel("Time (s)") plt.show() # now do a benchmark where the number of points is fixed @@ 
-96,15 +104,19 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef): n_samples = 500 for i in range(1, n + 1): - print('==================') - print('Iteration %02d of %02d' % (i, n)) - print('==================') + print("==================") + print("Iteration %02d of %02d" % (i, n)) + print("==================") n_features = i * step n_informative = n_features / 10 X, Y, coef_ = make_regression( - n_samples=(i * step) + n_test_samples, n_features=n_features, - noise=0.1, n_informative=n_informative, coef=True) + n_samples=(i * step) + n_test_samples, + n_features=n_features, + noise=0.1, + n_informative=n_informative, + coef=True, + ) X_test = X[-n_test_samples:] Y_test = Y[-n_test_samples:] @@ -117,12 +129,12 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef): glmnet_results.append(bench(GlmnetLasso, X, Y, X_test, Y_test, coef_)) xx = np.arange(100, 100 + n * step, step) - plt.figure('scikit-learn vs. glmnet benchmark results') - plt.title('Regression in high dimensional spaces (%d samples)' % n_samples) - plt.plot(xx, scikit_results, 'b-', label='scikit-learn') - plt.plot(xx, glmnet_results, 'r-', label='glmnet') + plt.figure("scikit-learn vs. glmnet benchmark results") + plt.title("Regression in high dimensional spaces (%d samples)" % n_samples) + plt.plot(xx, scikit_results, "b-", label="scikit-learn") + plt.plot(xx, glmnet_results, "r-", label="glmnet") plt.legend() - plt.xlabel('number of features') - plt.ylabel('Time (s)') - plt.axis('tight') + plt.xlabel("number of features") + plt.ylabel("Time (s)") + plt.axis("tight") plt.show() diff --git a/benchmarks/bench_hist_gradient_boosting.py b/benchmarks/bench_hist_gradient_boosting.py index 158b6fbb22d2b..c1dfffabe71c2 100644 --- a/benchmarks/bench_hist_gradient_boosting.py +++ b/benchmarks/bench_hist_gradient_boosting.py @@ -1,40 +1,48 @@ -from time import time import argparse +from time import time import matplotlib.pyplot as plt import numpy as np -from sklearn.model_selection import train_test_split -# To use this experimental feature, we need to explicitly ask for it: -from sklearn.experimental import enable_hist_gradient_boosting # noqa -from sklearn.ensemble import HistGradientBoostingRegressor -from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.datasets import make_classification -from sklearn.datasets import make_regression -from sklearn.ensemble._hist_gradient_boosting.utils import ( - get_equivalent_estimator) +from sklearn.datasets import make_classification, make_regression +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) +from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator +from sklearn.model_selection import train_test_split parser = argparse.ArgumentParser() -parser.add_argument('--n-leaf-nodes', type=int, default=31) -parser.add_argument('--n-trees', type=int, default=10) -parser.add_argument('--lightgbm', action="store_true", default=False, - help='also plot lightgbm') -parser.add_argument('--xgboost', action="store_true", default=False, - help='also plot xgboost') -parser.add_argument('--catboost', action="store_true", default=False, - help='also plot catboost') -parser.add_argument('--learning-rate', type=float, default=.1) -parser.add_argument('--problem', type=str, default='classification', - choices=['classification', 'regression']) -parser.add_argument('--loss', type=str, default='default') -parser.add_argument('--missing-fraction', type=float, default=0) -parser.add_argument('--n-classes', type=int, 
default=2) -parser.add_argument('--n-samples-max', type=int, default=int(1e6)) -parser.add_argument('--n-features', type=int, default=20) -parser.add_argument('--max-bins', type=int, default=255) -parser.add_argument('--random-sample-weights', action="store_true", - default=False, - help="generate and use random sample weights") +parser.add_argument("--n-leaf-nodes", type=int, default=31) +parser.add_argument("--n-trees", type=int, default=10) +parser.add_argument( + "--lightgbm", action="store_true", default=False, help="also plot lightgbm" +) +parser.add_argument( + "--xgboost", action="store_true", default=False, help="also plot xgboost" +) +parser.add_argument( + "--catboost", action="store_true", default=False, help="also plot catboost" +) +parser.add_argument("--learning-rate", type=float, default=0.1) +parser.add_argument( + "--problem", + type=str, + default="classification", + choices=["classification", "regression"], +) +parser.add_argument("--loss", type=str, default="default") +parser.add_argument("--missing-fraction", type=float, default=0) +parser.add_argument("--n-classes", type=int, default=2) +parser.add_argument("--n-samples-max", type=int, default=int(1e6)) +parser.add_argument("--n-features", type=int, default=20) +parser.add_argument("--max-bins", type=int, default=255) +parser.add_argument( + "--random-sample-weights", + action="store_true", + default=False, + help="generate and use random sample weights", +) args = parser.parse_args() n_leaf_nodes = args.n_leaf_nodes @@ -44,24 +52,26 @@ def get_estimator_and_data(): - if args.problem == 'classification': - X, y = make_classification(args.n_samples_max * 2, - n_features=args.n_features, - n_classes=args.n_classes, - n_clusters_per_class=1, - n_informative=args.n_classes, - random_state=0) + if args.problem == "classification": + X, y = make_classification( + args.n_samples_max * 2, + n_features=args.n_features, + n_classes=args.n_classes, + n_clusters_per_class=1, + n_informative=args.n_classes, + random_state=0, + ) return X, y, HistGradientBoostingClassifier - elif args.problem == 'regression': - X, y = make_regression(args.n_samples_max * 2, - n_features=args.n_features, random_state=0) + elif args.problem == "regression": + X, y = make_regression( + args.n_samples_max * 2, n_features=args.n_features, random_state=0 + ) return X, y, HistGradientBoostingRegressor X, y, Estimator = get_estimator_and_data() if args.missing_fraction: - mask = np.random.binomial(1, args.missing_fraction, size=X.shape).astype( - bool) + mask = np.random.binomial(1, args.missing_fraction, size=X.shape).astype(bool) X[mask] = np.nan if args.random_sample_weights: @@ -70,12 +80,13 @@ def get_estimator_and_data(): sample_weight = None if sample_weight is not None: - (X_train_, X_test_, y_train_, y_test_, - sample_weight_train_, _) = train_test_split( - X, y, sample_weight, test_size=0.5, random_state=0) + (X_train_, X_test_, y_train_, y_test_, sample_weight_train_, _) = train_test_split( + X, y, sample_weight, test_size=0.5, random_state=0 + ) else: X_train_, X_test_, y_train_, y_test_ = train_test_split( - X, y, test_size=0.5, random_state=0) + X, y, test_size=0.5, random_state=0 + ) sample_weight_train_ = None @@ -90,27 +101,26 @@ def one_run(n_samples): sample_weight_train = None assert X_train.shape[0] == n_samples assert X_test.shape[0] == n_samples - print("Data size: %d samples train, %d samples test." - % (n_samples, n_samples)) + print("Data size: %d samples train, %d samples test." 
% (n_samples, n_samples)) print("Fitting a sklearn model...") tic = time() - est = Estimator(learning_rate=lr, - max_iter=n_trees, - max_bins=max_bins, - max_leaf_nodes=n_leaf_nodes, - early_stopping=False, - random_state=0, - verbose=0) + est = Estimator( + learning_rate=lr, + max_iter=n_trees, + max_bins=max_bins, + max_leaf_nodes=n_leaf_nodes, + early_stopping=False, + random_state=0, + verbose=0, + ) loss = args.loss - if args.problem == 'classification': - if loss == 'default': - # loss='auto' does not work with get_equivalent_estimator() - loss = 'binary_crossentropy' if args.n_classes == 2 else \ - 'categorical_crossentropy' + if args.problem == "classification": + if loss == "default": + loss = "log_loss" else: # regression - if loss == 'default': - loss = 'least_squares' + if loss == "default": + loss = "squared_error" est.set_params(loss=loss) est.fit(X_train, y_train, sample_weight=sample_weight_train) sklearn_fit_duration = time() - tic @@ -126,7 +136,9 @@ def one_run(n_samples): lightgbm_score_duration = None if args.lightgbm: print("Fitting a LightGBM model...") - lightgbm_est = get_equivalent_estimator(est, lib='lightgbm') + lightgbm_est = get_equivalent_estimator( + est, lib="lightgbm", n_classes=args.n_classes + ) tic = time() lightgbm_est.fit(X_train, y_train, sample_weight=sample_weight_train) @@ -143,7 +155,7 @@ def one_run(n_samples): xgb_score_duration = None if args.xgboost: print("Fitting an XGBoost model...") - xgb_est = get_equivalent_estimator(est, lib='xgboost') + xgb_est = get_equivalent_estimator(est, lib="xgboost", n_classes=args.n_classes) tic = time() xgb_est.fit(X_train, y_train, sample_weight=sample_weight_train) @@ -160,7 +172,9 @@ def one_run(n_samples): cat_score_duration = None if args.catboost: print("Fitting a CatBoost model...") - cat_est = get_equivalent_estimator(est, lib='catboost') + cat_est = get_equivalent_estimator( + est, lib="catboost", n_classes=args.n_classes + ) tic = time() cat_est.fit(X_train, y_train, sample_weight=sample_weight_train) @@ -172,15 +186,26 @@ def one_run(n_samples): print("fit duration: {:.3f}s,".format(cat_fit_duration)) print("score duration: {:.3f}s,".format(cat_score_duration)) - return (sklearn_score, sklearn_fit_duration, sklearn_score_duration, - lightgbm_score, lightgbm_fit_duration, lightgbm_score_duration, - xgb_score, xgb_fit_duration, xgb_score_duration, - cat_score, cat_fit_duration, cat_score_duration) + return ( + sklearn_score, + sklearn_fit_duration, + sklearn_score_duration, + lightgbm_score, + lightgbm_fit_duration, + lightgbm_score_duration, + xgb_score, + xgb_fit_duration, + xgb_score_duration, + cat_score, + cat_fit_duration, + cat_score_duration, + ) n_samples_list = [1000, 10000, 100000, 500000, 1000000, 5000000, 10000000] -n_samples_list = [n_samples for n_samples in n_samples_list - if n_samples <= args.n_samples_max] +n_samples_list = [ + n_samples for n_samples in n_samples_list if n_samples <= args.n_samples_max +] sklearn_scores = [] sklearn_fit_durations = [] @@ -196,67 +221,70 @@ def one_run(n_samples): cat_score_durations = [] for n_samples in n_samples_list: - (sklearn_score, - sklearn_fit_duration, - sklearn_score_duration, - lightgbm_score, - lightgbm_fit_duration, - lightgbm_score_duration, - xgb_score, - xgb_fit_duration, - xgb_score_duration, - cat_score, - cat_fit_duration, - cat_score_duration) = one_run(n_samples) + ( + sklearn_score, + sklearn_fit_duration, + sklearn_score_duration, + lightgbm_score, + lightgbm_fit_duration, + lightgbm_score_duration, + xgb_score, + 
xgb_fit_duration, + xgb_score_duration, + cat_score, + cat_fit_duration, + cat_score_duration, + ) = one_run(n_samples) for scores, score in ( - (sklearn_scores, sklearn_score), - (sklearn_fit_durations, sklearn_fit_duration), - (sklearn_score_durations, sklearn_score_duration), - (lightgbm_scores, lightgbm_score), - (lightgbm_fit_durations, lightgbm_fit_duration), - (lightgbm_score_durations, lightgbm_score_duration), - (xgb_scores, xgb_score), - (xgb_fit_durations, xgb_fit_duration), - (xgb_score_durations, xgb_score_duration), - (cat_scores, cat_score), - (cat_fit_durations, cat_fit_duration), - (cat_score_durations, cat_score_duration)): + (sklearn_scores, sklearn_score), + (sklearn_fit_durations, sklearn_fit_duration), + (sklearn_score_durations, sklearn_score_duration), + (lightgbm_scores, lightgbm_score), + (lightgbm_fit_durations, lightgbm_fit_duration), + (lightgbm_score_durations, lightgbm_score_duration), + (xgb_scores, xgb_score), + (xgb_fit_durations, xgb_fit_duration), + (xgb_score_durations, xgb_score_duration), + (cat_scores, cat_score), + (cat_fit_durations, cat_fit_duration), + (cat_score_durations, cat_score_duration), + ): scores.append(score) fig, axs = plt.subplots(3, sharex=True) -axs[0].plot(n_samples_list, sklearn_scores, label='sklearn') -axs[1].plot(n_samples_list, sklearn_fit_durations, label='sklearn') -axs[2].plot(n_samples_list, sklearn_score_durations, label='sklearn') +axs[0].plot(n_samples_list, sklearn_scores, label="sklearn") +axs[1].plot(n_samples_list, sklearn_fit_durations, label="sklearn") +axs[2].plot(n_samples_list, sklearn_score_durations, label="sklearn") if args.lightgbm: - axs[0].plot(n_samples_list, lightgbm_scores, label='lightgbm') - axs[1].plot(n_samples_list, lightgbm_fit_durations, label='lightgbm') - axs[2].plot(n_samples_list, lightgbm_score_durations, label='lightgbm') + axs[0].plot(n_samples_list, lightgbm_scores, label="lightgbm") + axs[1].plot(n_samples_list, lightgbm_fit_durations, label="lightgbm") + axs[2].plot(n_samples_list, lightgbm_score_durations, label="lightgbm") if args.xgboost: - axs[0].plot(n_samples_list, xgb_scores, label='XGBoost') - axs[1].plot(n_samples_list, xgb_fit_durations, label='XGBoost') - axs[2].plot(n_samples_list, xgb_score_durations, label='XGBoost') + axs[0].plot(n_samples_list, xgb_scores, label="XGBoost") + axs[1].plot(n_samples_list, xgb_fit_durations, label="XGBoost") + axs[2].plot(n_samples_list, xgb_score_durations, label="XGBoost") if args.catboost: - axs[0].plot(n_samples_list, cat_scores, label='CatBoost') - axs[1].plot(n_samples_list, cat_fit_durations, label='CatBoost') - axs[2].plot(n_samples_list, cat_score_durations, label='CatBoost') + axs[0].plot(n_samples_list, cat_scores, label="CatBoost") + axs[1].plot(n_samples_list, cat_fit_durations, label="CatBoost") + axs[2].plot(n_samples_list, cat_score_durations, label="CatBoost") for ax in axs: - ax.set_xscale('log') - ax.legend(loc='best') - ax.set_xlabel('n_samples') + ax.set_xscale("log") + ax.legend(loc="best") + ax.set_xlabel("n_samples") -axs[0].set_title('scores') -axs[1].set_title('fit duration (s)') -axs[2].set_title('score duration (s)') +axs[0].set_title("scores") +axs[1].set_title("fit duration (s)") +axs[2].set_title("score duration (s)") title = args.problem -if args.problem == 'classification': - title += ' n_classes = {}'.format(args.n_classes) +if args.problem == "classification": + title += " n_classes = {}".format(args.n_classes) fig.suptitle(title) diff --git a/benchmarks/bench_hist_gradient_boosting_adult.py 
b/benchmarks/bench_hist_gradient_boosting_adult.py index 5b47fcb3a6678..4d5ce48cded81 100644 --- a/benchmarks/bench_hist_gradient_boosting_adult.py +++ b/benchmarks/bench_hist_gradient_boosting_adult.py @@ -1,23 +1,25 @@ import argparse from time import time -from sklearn.model_selection import train_test_split +import numpy as np +import pandas as pd + +from sklearn.compose import make_column_selector, make_column_transformer from sklearn.datasets import fetch_openml -from sklearn.metrics import accuracy_score, roc_auc_score -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.ensemble._hist_gradient_boosting.utils import ( - get_equivalent_estimator) - +from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator +from sklearn.metrics import accuracy_score, roc_auc_score +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import OrdinalEncoder parser = argparse.ArgumentParser() -parser.add_argument('--n-leaf-nodes', type=int, default=31) -parser.add_argument('--n-trees', type=int, default=100) -parser.add_argument('--lightgbm', action="store_true", default=False) -parser.add_argument('--learning-rate', type=float, default=.1) -parser.add_argument('--max-bins', type=int, default=255) -parser.add_argument('--no-predict', action="store_true", default=False) -parser.add_argument('--verbose', action="store_true", default=False) +parser.add_argument("--n-leaf-nodes", type=int, default=31) +parser.add_argument("--n-trees", type=int, default=100) +parser.add_argument("--lightgbm", action="store_true", default=False) +parser.add_argument("--learning-rate", type=float, default=0.1) +parser.add_argument("--max-bins", type=int, default=255) +parser.add_argument("--no-predict", action="store_true", default=False) +parser.add_argument("--verbose", action="store_true", default=False) args = parser.parse_args() n_leaf_nodes = args.n_leaf_nodes @@ -44,28 +46,37 @@ def predict(est, data_test, target_test): toc = time() roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1]) acc = accuracy_score(target_test, predicted_test) - print(f"predicted in {toc - tic:.3f}s, " - f"ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") + print(f"predicted in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc:.4f}") -data = fetch_openml(data_id=179, as_frame=False) # adult dataset +data = fetch_openml(data_id=179, as_frame=True) # adult dataset X, y = data.data, data.target +# Ordinal encode the categories to use the native support available in HGBDT +cat_columns = make_column_selector(dtype_include="category")(X) +preprocessing = make_column_transformer( + (OrdinalEncoder(), cat_columns), + remainder="passthrough", + verbose_feature_names_out=False, +) +X = pd.DataFrame( + preprocessing.fit_transform(X), + columns=preprocessing.get_feature_names_out(), +) + +n_classes = len(np.unique(y)) n_features = X.shape[1] -n_categorical_features = len(data.categories) +n_categorical_features = len(cat_columns) n_numerical_features = n_features - n_categorical_features print(f"Number of features: {n_features}") print(f"Number of categorical features: {n_categorical_features}") print(f"Number of numerical features: {n_numerical_features}") -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, - random_state=0) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) -# Note: no need to use an OrdinalEncoder because categorical features are 
-# already clean -is_categorical = [name in data.categories for name in data.feature_names] +is_categorical = [True] * n_categorical_features + [False] * n_numerical_features est = HistGradientBoostingClassifier( - loss='binary_crossentropy', + loss="log_loss", learning_rate=lr, max_iter=n_trees, max_bins=max_bins, @@ -73,18 +84,17 @@ def predict(est, data_test, target_test): categorical_features=is_categorical, early_stopping=False, random_state=0, - verbose=verbose + verbose=verbose, ) -fit(est, X_train, y_train, 'sklearn') +fit(est, X_train, y_train, "sklearn") predict(est, X_test, y_test) if args.lightgbm: - est = get_equivalent_estimator(est, lib='lightgbm') + est = get_equivalent_estimator(est, lib="lightgbm", n_classes=n_classes) est.set_params(max_cat_to_onehot=1) # dont use OHE - categorical_features = [f_idx - for (f_idx, is_cat) in enumerate(is_categorical) - if is_cat] - fit(est, X_train, y_train, 'lightgbm', - categorical_feature=categorical_features) + categorical_features = [ + f_idx for (f_idx, is_cat) in enumerate(is_categorical) if is_cat + ] + fit(est, X_train, y_train, "lightgbm", categorical_feature=categorical_features) predict(est, X_test, y_test) diff --git a/benchmarks/bench_hist_gradient_boosting_categorical_only.py b/benchmarks/bench_hist_gradient_boosting_categorical_only.py index 6c69b32eff26f..1085bbc49f4f8 100644 --- a/benchmarks/bench_hist_gradient_boosting_categorical_only.py +++ b/benchmarks/bench_hist_gradient_boosting_categorical_only.py @@ -1,25 +1,22 @@ import argparse from time import time -from sklearn.preprocessing import KBinsDiscretizer from sklearn.datasets import make_classification -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.ensemble._hist_gradient_boosting.utils import ( - get_equivalent_estimator) - +from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator +from sklearn.preprocessing import KBinsDiscretizer parser = argparse.ArgumentParser() -parser.add_argument('--n-leaf-nodes', type=int, default=31) -parser.add_argument('--n-trees', type=int, default=100) -parser.add_argument('--n-features', type=int, default=20) -parser.add_argument('--n-cats', type=int, default=20) -parser.add_argument('--n-samples', type=int, default=10_000) -parser.add_argument('--lightgbm', action="store_true", default=False) -parser.add_argument('--learning-rate', type=float, default=.1) -parser.add_argument('--max-bins', type=int, default=255) -parser.add_argument('--no-predict', action="store_true", default=False) -parser.add_argument('--verbose', action="store_true", default=False) +parser.add_argument("--n-leaf-nodes", type=int, default=31) +parser.add_argument("--n-trees", type=int, default=100) +parser.add_argument("--n-features", type=int, default=20) +parser.add_argument("--n-cats", type=int, default=20) +parser.add_argument("--n-samples", type=int, default=10_000) +parser.add_argument("--lightgbm", action="store_true", default=False) +parser.add_argument("--learning-rate", type=float, default=0.1) +parser.add_argument("--max-bins", type=int, default=255) +parser.add_argument("--no-predict", action="store_true", default=False) +parser.add_argument("--verbose", action="store_true", default=False) args = parser.parse_args() n_leaf_nodes = args.n_leaf_nodes @@ -51,17 +48,16 @@ def predict(est, data_test): print(f"predicted in {toc - tic:.3f}s") -X, y = make_classification(n_samples=n_samples, n_features=n_features, - random_state=0) +X, 
y = make_classification(n_samples=n_samples, n_features=n_features, random_state=0) -X = KBinsDiscretizer(n_bins=n_categories, encode='ordinal').fit_transform(X) +X = KBinsDiscretizer(n_bins=n_categories, encode="ordinal").fit_transform(X) print(f"Number of features: {n_features}") print(f"Number of samples: {n_samples}") is_categorical = [True] * n_features est = HistGradientBoostingClassifier( - loss='binary_crossentropy', + loss="log_loss", learning_rate=lr, max_iter=n_trees, max_bins=max_bins, @@ -69,16 +65,15 @@ def predict(est, data_test): categorical_features=is_categorical, early_stopping=False, random_state=0, - verbose=verbose + verbose=verbose, ) -fit(est, X, y, 'sklearn') +fit(est, X, y, "sklearn") predict(est, X) if args.lightgbm: - est = get_equivalent_estimator(est, lib='lightgbm') + est = get_equivalent_estimator(est, lib="lightgbm", n_classes=2) est.set_params(max_cat_to_onehot=1) # dont use OHE categorical_features = list(range(n_features)) - fit(est, X, y, 'lightgbm', - categorical_feature=categorical_features) + fit(est, X, y, "lightgbm", categorical_feature=categorical_features) predict(est, X) diff --git a/benchmarks/bench_hist_gradient_boosting_higgsboson.py b/benchmarks/bench_hist_gradient_boosting_higgsboson.py index 2c74bb8818343..ceab576bc0a52 100644 --- a/benchmarks/bench_hist_gradient_boosting_higgsboson.py +++ b/benchmarks/bench_hist_gradient_boosting_higgsboson.py @@ -1,49 +1,48 @@ -from urllib.request import urlretrieve +import argparse import os from gzip import GzipFile from time import time -import argparse +from urllib.request import urlretrieve import numpy as np import pandas as pd from joblib import Memory -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score, roc_auc_score -# To use this experimental feature, we need to explicitly ask for it: -from sklearn.experimental import enable_hist_gradient_boosting # noqa -from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.ensemble._hist_gradient_boosting.utils import ( - get_equivalent_estimator) +from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator +from sklearn.metrics import accuracy_score, roc_auc_score +from sklearn.model_selection import train_test_split parser = argparse.ArgumentParser() -parser.add_argument('--n-leaf-nodes', type=int, default=31) -parser.add_argument('--n-trees', type=int, default=10) -parser.add_argument('--lightgbm', action="store_true", default=False) -parser.add_argument('--xgboost', action="store_true", default=False) -parser.add_argument('--catboost', action="store_true", default=False) -parser.add_argument('--learning-rate', type=float, default=1.) 
-parser.add_argument('--subsample', type=int, default=None) -parser.add_argument('--max-bins', type=int, default=255) -parser.add_argument('--no-predict', action="store_true", default=False) -parser.add_argument('--cache-loc', type=str, default='/tmp') +parser.add_argument("--n-leaf-nodes", type=int, default=31) +parser.add_argument("--n-trees", type=int, default=10) +parser.add_argument("--lightgbm", action="store_true", default=False) +parser.add_argument("--xgboost", action="store_true", default=False) +parser.add_argument("--catboost", action="store_true", default=False) +parser.add_argument("--learning-rate", type=float, default=1.0) +parser.add_argument("--subsample", type=int, default=None) +parser.add_argument("--max-bins", type=int, default=255) +parser.add_argument("--no-predict", action="store_true", default=False) +parser.add_argument("--cache-loc", type=str, default="/tmp") +parser.add_argument("--no-interactions", type=bool, default=False) +parser.add_argument("--max-features", type=float, default=1.0) args = parser.parse_args() HERE = os.path.dirname(__file__) -URL = ("https://archive.ics.uci.edu/ml/machine-learning-databases/00280/" - "HIGGS.csv.gz") -m = Memory(location=args.cache_loc, mmap_mode='r') +URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz" +m = Memory(location=args.cache_loc, mmap_mode="r") n_leaf_nodes = args.n_leaf_nodes n_trees = args.n_trees subsample = args.subsample lr = args.learning_rate max_bins = args.max_bins +max_features = args.max_features @m.cache def load_data(): - filename = os.path.join(HERE, URL.rsplit('/', 1)[-1]) + filename = os.path.join(HERE, URL.rsplit("/", 1)[-1]) if not os.path.exists(filename): print(f"Downloading {URL} to {filename} (2.6 GB)...") urlretrieve(URL, filename) @@ -75,15 +74,16 @@ def predict(est, data_test, target_test): toc = time() roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1]) acc = accuracy_score(target_test, predicted_test) - print(f"predicted in {toc - tic:.3f}s, " - f"ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") + print(f"predicted in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc:.4f}") df = load_data() target = df.values[:, 0] data = np.ascontiguousarray(df.values[:, 1:]) data_train, data_test, target_train, target_test = train_test_split( - data, target, test_size=.2, random_state=0) + data, target, test_size=0.2, random_state=0 +) +n_classes = len(np.unique(target)) if subsample is not None: data_train, target_train = data_train[:subsample], target_train[:subsample] @@ -91,28 +91,37 @@ def predict(est, data_test, target_test): n_samples, n_features = data_train.shape print(f"Training set with {n_samples} records with {n_features} features.") -est = HistGradientBoostingClassifier(loss='binary_crossentropy', - learning_rate=lr, - max_iter=n_trees, - max_bins=max_bins, - max_leaf_nodes=n_leaf_nodes, - early_stopping=False, - random_state=0, - verbose=1) -fit(est, data_train, target_train, 'sklearn') +if args.no_interactions: + interaction_cst = [[i] for i in range(n_features)] +else: + interaction_cst = None + +est = HistGradientBoostingClassifier( + loss="log_loss", + learning_rate=lr, + max_iter=n_trees, + max_bins=max_bins, + max_leaf_nodes=n_leaf_nodes, + early_stopping=False, + random_state=0, + verbose=1, + interaction_cst=interaction_cst, + max_features=max_features, +) +fit(est, data_train, target_train, "sklearn") predict(est, data_test, target_test) if args.lightgbm: - est = get_equivalent_estimator(est, lib='lightgbm') - fit(est, data_train, 
target_train, 'lightgbm') + est = get_equivalent_estimator(est, lib="lightgbm", n_classes=n_classes) + fit(est, data_train, target_train, "lightgbm") predict(est, data_test, target_test) if args.xgboost: - est = get_equivalent_estimator(est, lib='xgboost') - fit(est, data_train, target_train, 'xgboost') + est = get_equivalent_estimator(est, lib="xgboost", n_classes=n_classes) + fit(est, data_train, target_train, "xgboost") predict(est, data_test, target_test) if args.catboost: - est = get_equivalent_estimator(est, lib='catboost') - fit(est, data_train, target_train, 'catboost') + est = get_equivalent_estimator(est, lib="catboost", n_classes=n_classes) + fit(est, data_train, target_train, "catboost") predict(est, data_test, target_test) diff --git a/benchmarks/bench_hist_gradient_boosting_threading.py b/benchmarks/bench_hist_gradient_boosting_threading.py index 3cc6afa3871c6..9acf65bdbaf6a 100644 --- a/benchmarks/bench_hist_gradient_boosting_threading.py +++ b/benchmarks/bench_hist_gradient_boosting_threading.py @@ -1,49 +1,59 @@ -from time import time import argparse import os from pprint import pprint +from time import time import numpy as np from threadpoolctl import threadpool_limits + import sklearn +from sklearn.datasets import make_classification, make_regression +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) +from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator from sklearn.model_selection import train_test_split -# To use this experimental feature, we need to explicitly ask for it: -from sklearn.experimental import enable_hist_gradient_boosting # noqa -from sklearn.ensemble import HistGradientBoostingRegressor -from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.datasets import make_classification -from sklearn.datasets import make_regression -from sklearn.ensemble._hist_gradient_boosting.utils import ( - get_equivalent_estimator) - parser = argparse.ArgumentParser() -parser.add_argument('--n-leaf-nodes', type=int, default=31) -parser.add_argument('--n-trees', type=int, default=10) -parser.add_argument('--lightgbm', action="store_true", default=False, - help='also benchmark lightgbm') -parser.add_argument('--xgboost', action="store_true", default=False, - help='also benchmark xgboost') -parser.add_argument('--catboost', action="store_true", default=False, - help='also benchmark catboost') -parser.add_argument('--learning-rate', type=float, default=.1) -parser.add_argument('--problem', type=str, default='classification', - choices=['classification', 'regression']) -parser.add_argument('--loss', type=str, default='default') -parser.add_argument('--missing-fraction', type=float, default=0) -parser.add_argument('--n-classes', type=int, default=2) -parser.add_argument('--n-samples', type=int, default=int(1e6)) -parser.add_argument('--n-features', type=int, default=100) -parser.add_argument('--max-bins', type=int, default=255) - -parser.add_argument('--print-params', action="store_true", default=False) -parser.add_argument('--random-sample-weights', action="store_true", - default=False, - help="generate and use random sample weights") -parser.add_argument('--plot', action="store_true", default=False, - help='show a plot results') -parser.add_argument('--plot-filename', default=None, - help='filename to save the figure to disk') +parser.add_argument("--n-leaf-nodes", type=int, default=31) +parser.add_argument("--n-trees", type=int, default=10) +parser.add_argument( + "--lightgbm", 
action="store_true", default=False, help="also benchmark lightgbm" +) +parser.add_argument( + "--xgboost", action="store_true", default=False, help="also benchmark xgboost" +) +parser.add_argument( + "--catboost", action="store_true", default=False, help="also benchmark catboost" +) +parser.add_argument("--learning-rate", type=float, default=0.1) +parser.add_argument( + "--problem", + type=str, + default="classification", + choices=["classification", "regression"], +) +parser.add_argument("--loss", type=str, default="default") +parser.add_argument("--missing-fraction", type=float, default=0) +parser.add_argument("--n-classes", type=int, default=2) +parser.add_argument("--n-samples", type=int, default=int(1e6)) +parser.add_argument("--n-features", type=int, default=100) +parser.add_argument("--max-bins", type=int, default=255) + +parser.add_argument("--print-params", action="store_true", default=False) +parser.add_argument( + "--random-sample-weights", + action="store_true", + default=False, + help="generate and use random sample weights", +) +parser.add_argument( + "--plot", action="store_true", default=False, help="show a plot results" +) +parser.add_argument( + "--plot-filename", default=None, help="filename to save the figure to disk" +) args = parser.parse_args() n_samples = args.n_samples @@ -53,30 +63,31 @@ max_bins = args.max_bins -print("Data size: %d samples train, %d samples test." - % (n_samples, n_samples)) +print("Data size: %d samples train, %d samples test." % (n_samples, n_samples)) print(f"n_features: {args.n_features}") def get_estimator_and_data(): - if args.problem == 'classification': - X, y = make_classification(args.n_samples * 2, - n_features=args.n_features, - n_classes=args.n_classes, - n_clusters_per_class=1, - n_informative=args.n_features // 2, - random_state=0) + if args.problem == "classification": + X, y = make_classification( + args.n_samples * 2, + n_features=args.n_features, + n_classes=args.n_classes, + n_clusters_per_class=1, + n_informative=args.n_features // 2, + random_state=0, + ) return X, y, HistGradientBoostingClassifier - elif args.problem == 'regression': - X, y = make_regression(args.n_samples_max * 2, - n_features=args.n_features, random_state=0) + elif args.problem == "regression": + X, y = make_regression( + args.n_samples_max * 2, n_features=args.n_features, random_state=0 + ) return X, y, HistGradientBoostingRegressor X, y, Estimator = get_estimator_and_data() if args.missing_fraction: - mask = np.random.binomial(1, args.missing_fraction, size=X.shape).astype( - bool) + mask = np.random.binomial(1, args.missing_fraction, size=X.shape).astype(bool) X[mask] = np.nan if args.random_sample_weights: @@ -85,12 +96,13 @@ def get_estimator_and_data(): sample_weight = None if sample_weight is not None: - (X_train_, X_test_, y_train_, y_test_, - sample_weight_train_, _) = train_test_split( - X, y, sample_weight, test_size=0.5, random_state=0) + (X_train_, X_test_, y_train_, y_test_, sample_weight_train_, _) = train_test_split( + X, y, sample_weight, test_size=0.5, random_state=0 + ) else: X_train_, X_test_, y_train_, y_test_ = train_test_split( - X, y, test_size=0.5, random_state=0) + X, y, test_size=0.5, random_state=0 + ) sample_weight_train_ = None @@ -104,15 +116,14 @@ def get_estimator_and_data(): verbose=0, ) loss = args.loss -if args.problem == 'classification': - if loss == 'default': +if args.problem == "classification": + if loss == "default": # loss='auto' does not work with get_equivalent_estimator() - loss = 'binary_crossentropy' if 
args.n_classes == 2 else \ - 'categorical_crossentropy' + loss = "log_loss" else: # regression - if loss == 'default': - loss = 'least_squares' + if loss == "default": + loss = "squared_error" sklearn_est.set_params(loss=loss) @@ -123,7 +134,9 @@ def get_estimator_and_data(): for libname in ["lightgbm", "xgboost", "catboost"]: if getattr(args, libname): print(libname) - est = get_equivalent_estimator(sklearn_est, lib=libname) + est = get_equivalent_estimator( + sklearn_est, lib=libname, n_classes=args.n_classes + ) pprint(est.get_params()) @@ -157,7 +170,9 @@ def one_run(n_threads, n_samples): lightgbm_score_duration = None if args.lightgbm: print("Fitting a LightGBM model...") - lightgbm_est = get_equivalent_estimator(est, lib='lightgbm') + lightgbm_est = get_equivalent_estimator( + est, lib="lightgbm", n_classes=args.n_classes + ) lightgbm_est.set_params(num_threads=n_threads) tic = time() @@ -175,7 +190,7 @@ def one_run(n_threads, n_samples): xgb_score_duration = None if args.xgboost: print("Fitting an XGBoost model...") - xgb_est = get_equivalent_estimator(est, lib='xgboost') + xgb_est = get_equivalent_estimator(est, lib="xgboost", n_classes=args.n_classes) xgb_est.set_params(nthread=n_threads) tic = time() @@ -193,7 +208,9 @@ def one_run(n_threads, n_samples): cat_score_duration = None if args.catboost: print("Fitting a CatBoost model...") - cat_est = get_equivalent_estimator(est, lib='catboost') + cat_est = get_equivalent_estimator( + est, lib="catboost", n_classes=args.n_classes + ) cat_est.set_params(thread_count=n_threads) tic = time() @@ -206,14 +223,24 @@ def one_run(n_threads, n_samples): print("fit duration: {:.3f}s,".format(cat_fit_duration)) print("score duration: {:.3f}s,".format(cat_score_duration)) - return (sklearn_score, sklearn_fit_duration, sklearn_score_duration, - lightgbm_score, lightgbm_fit_duration, lightgbm_score_duration, - xgb_score, xgb_fit_duration, xgb_score_duration, - cat_score, cat_fit_duration, cat_score_duration) + return ( + sklearn_score, + sklearn_fit_duration, + sklearn_score_duration, + lightgbm_score, + lightgbm_fit_duration, + lightgbm_score_duration, + xgb_score, + xgb_fit_duration, + xgb_score_duration, + cat_score, + cat_fit_duration, + cat_score_duration, + ) max_threads = os.cpu_count() -n_threads_list = [2 ** i for i in range(8) if (2 ** i) < max_threads] +n_threads_list = [2**i for i in range(8) if (2**i) < max_threads] n_threads_list.append(max_threads) sklearn_scores = [] @@ -243,28 +270,29 @@ def one_run(n_threads, n_samples): xgb_score_duration, cat_score, cat_fit_duration, - cat_score_duration + cat_score_duration, ) = one_run(n_threads, n_samples) for scores, score in ( - (sklearn_scores, sklearn_score), - (sklearn_fit_durations, sklearn_fit_duration), - (sklearn_score_durations, sklearn_score_duration), - (lightgbm_scores, lightgbm_score), - (lightgbm_fit_durations, lightgbm_fit_duration), - (lightgbm_score_durations, lightgbm_score_duration), - (xgb_scores, xgb_score), - (xgb_fit_durations, xgb_fit_duration), - (xgb_score_durations, xgb_score_duration), - (cat_scores, cat_score), - (cat_fit_durations, cat_fit_duration), - (cat_score_durations, cat_score_duration)): + (sklearn_scores, sklearn_score), + (sklearn_fit_durations, sklearn_fit_duration), + (sklearn_score_durations, sklearn_score_duration), + (lightgbm_scores, lightgbm_score), + (lightgbm_fit_durations, lightgbm_fit_duration), + (lightgbm_score_durations, lightgbm_score_duration), + (xgb_scores, xgb_score), + (xgb_fit_durations, xgb_fit_duration), + (xgb_score_durations, 
xgb_score_duration), + (cat_scores, cat_score), + (cat_fit_durations, cat_fit_duration), + (cat_score_durations, cat_score_duration), + ): scores.append(score) if args.plot or args.plot_filename: - import matplotlib.pyplot as plt import matplotlib + import matplotlib.pyplot as plt fig, axs = plt.subplots(2, figsize=(12, 12)) @@ -274,37 +302,40 @@ def one_run(n_threads, n_samples): if args.lightgbm: import lightgbm - label = f'LightGBM {lightgbm.__version__}' + + label = f"LightGBM {lightgbm.__version__}" axs[0].plot(n_threads_list, lightgbm_fit_durations, label=label) axs[1].plot(n_threads_list, lightgbm_score_durations, label=label) if args.xgboost: import xgboost - label = f'XGBoost {xgboost.__version__}' + + label = f"XGBoost {xgboost.__version__}" axs[0].plot(n_threads_list, xgb_fit_durations, label=label) axs[1].plot(n_threads_list, xgb_score_durations, label=label) if args.catboost: import catboost - label = f'CatBoost {catboost.__version__}' + + label = f"CatBoost {catboost.__version__}" axs[0].plot(n_threads_list, cat_fit_durations, label=label) axs[1].plot(n_threads_list, cat_score_durations, label=label) for ax in axs: - ax.set_xscale('log') - ax.set_xlabel('n_threads') - ax.set_ylabel('duration (s)') + ax.set_xscale("log") + ax.set_xlabel("n_threads") + ax.set_ylabel("duration (s)") ax.set_ylim(0, None) ax.set_xticks(n_threads_list) ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter()) - ax.legend(loc='best') + ax.legend(loc="best") - axs[0].set_title('fit duration (s)') - axs[1].set_title('score duration (s)') + axs[0].set_title("fit duration (s)") + axs[1].set_title("score duration (s)") title = args.problem - if args.problem == 'classification': - title += ' n_classes = {}'.format(args.n_classes) + if args.problem == "classification": + title += " n_classes = {}".format(args.n_classes) fig.suptitle(title) plt.tight_layout() diff --git a/benchmarks/bench_isolation_forest.py b/benchmarks/bench_isolation_forest.py index b673b5606473a..743911936dccc 100644 --- a/benchmarks/bench_isolation_forest.py +++ b/benchmarks/bench_isolation_forest.py @@ -17,12 +17,13 @@ """ from time import time -import numpy as np + import matplotlib.pyplot as plt +import numpy as np +from sklearn.datasets import fetch_covtype, fetch_kddcup99, fetch_openml from sklearn.ensemble import IsolationForest -from sklearn.metrics import roc_curve, auc -from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml +from sklearn.metrics import auc, roc_curve from sklearn.preprocessing import LabelBinarizer from sklearn.utils import shuffle as sh @@ -48,34 +49,34 @@ def print_outlier_ratio(y): with_decision_function_histograms = False # datasets available = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] -datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] +datasets = ["http", "smtp", "SA", "SF", "shuttle", "forestcover"] # Loop over all datasets for fitting and scoring the estimator: for dat in datasets: - # Loading and vectorizing the data: - print('====== %s ======' % dat) - print('--- Fetching data...') - if dat in ['http', 'smtp', 'SF', 'SA']: - dataset = fetch_kddcup99(subset=dat, shuffle=True, - percent10=True, random_state=random_state) + print("====== %s ======" % dat) + print("--- Fetching data...") + if dat in ["http", "smtp", "SF", "SA"]: + dataset = fetch_kddcup99( + subset=dat, shuffle=True, percent10=True, random_state=random_state + ) X = dataset.data y = dataset.target - if dat == 'shuttle': - dataset = fetch_openml('shuttle') + if dat == "shuttle": 
+ dataset = fetch_openml("shuttle", as_frame=False) X = dataset.data - y = dataset.target + y = dataset.target.astype(np.int64) X, y = sh(X, y, random_state=random_state) # we remove data with label 4 # normal data are then those of class 1 - s = (y != 4) + s = y != 4 X = X[s, :] y = y[s] y = (y != 1).astype(int) - print('----- ') + print("----- ") - if dat == 'forestcover': + if dat == "forestcover": dataset = fetch_covtype(shuffle=True, random_state=random_state) X = dataset.data y = dataset.target @@ -87,26 +88,26 @@ def print_outlier_ratio(y): y = (y != 2).astype(int) print_outlier_ratio(y) - print('--- Vectorizing data...') + print("--- Vectorizing data...") - if dat == 'SF': + if dat == "SF": lb = LabelBinarizer() x1 = lb.fit_transform(X[:, 1].astype(str)) X = np.c_[X[:, :1], x1, X[:, 2:]] - y = (y != b'normal.').astype(int) + y = (y != b"normal.").astype(int) print_outlier_ratio(y) - if dat == 'SA': + if dat == "SA": lb = LabelBinarizer() x1 = lb.fit_transform(X[:, 1].astype(str)) x2 = lb.fit_transform(X[:, 2].astype(str)) x3 = lb.fit_transform(X[:, 3].astype(str)) X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]] - y = (y != b'normal.').astype(int) + y = (y != b"normal.").astype(int) print_outlier_ratio(y) - if dat in ('http', 'smtp'): - y = (y != b'normal.').astype(int) + if dat in ("http", "smtp"): + y = (y != b"normal.").astype(int) print_outlier_ratio(y) n_samples, n_features = X.shape @@ -118,32 +119,36 @@ def print_outlier_ratio(y): y_train = y[:n_samples_train] y_test = y[n_samples_train:] - print('--- Fitting the IsolationForest estimator...') + print("--- Fitting the IsolationForest estimator...") model = IsolationForest(n_jobs=-1, random_state=random_state) tstart = time() model.fit(X_train) fit_time = time() - tstart tstart = time() - scoring = - model.decision_function(X_test) # the lower, the more abnormal + scoring = -model.decision_function(X_test) # the lower, the more abnormal print("--- Preparing the plot elements...") if with_decision_function_histograms: fig, ax = plt.subplots(3, sharex=True, sharey=True) bins = np.linspace(-0.5, 0.5, 200) - ax[0].hist(scoring, bins, color='black') - ax[0].set_title('Decision function for %s dataset' % dat) - ax[1].hist(scoring[y_test == 0], bins, color='b', label='normal data') + ax[0].hist(scoring, bins, color="black") + ax[0].set_title("Decision function for %s dataset" % dat) + ax[1].hist(scoring[y_test == 0], bins, color="b", label="normal data") ax[1].legend(loc="lower right") - ax[2].hist(scoring[y_test == 1], bins, color='r', label='outliers') + ax[2].hist(scoring[y_test == 1], bins, color="r", label="outliers") ax[2].legend(loc="lower right") # Show ROC Curves predict_time = time() - tstart fpr, tpr, thresholds = roc_curve(y_test, scoring) auc_score = auc(fpr, tpr) - label = ('%s (AUC: %0.3f, train_time= %0.2fs, ' - 'test_time= %0.2fs)' % (dat, auc_score, fit_time, predict_time)) + label = "%s (AUC: %0.3f, train_time= %0.2fs, test_time= %0.2fs)" % ( + dat, + auc_score, + fit_time, + predict_time, + ) # Print AUC score and train/test time: print(label) ax_roc.plot(fpr, tpr, lw=1, label=label) @@ -151,9 +156,9 @@ def print_outlier_ratio(y): ax_roc.set_xlim([-0.05, 1.05]) ax_roc.set_ylim([-0.05, 1.05]) -ax_roc.set_xlabel('False Positive Rate') -ax_roc.set_ylabel('True Positive Rate') -ax_roc.set_title('Receiver operating characteristic (ROC) curves') +ax_roc.set_xlabel("False Positive Rate") +ax_roc.set_ylabel("True Positive Rate") +ax_roc.set_title("Receiver operating characteristic (ROC) curves") ax_roc.legend(loc="lower right") 
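The benchmark above scores the test points with `-model.decision_function(X_test)` (lower decision values mean more abnormal) and summarizes the ranking with a ROC curve per dataset. A minimal sketch of that scoring convention on synthetic data (illustrative only, not part of the benchmark; sample sizes and the injected contamination are assumptions):

```python
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score

rng = np.random.RandomState(0)
X_train = 0.3 * rng.randn(500, 2)  # inliers only
X_test = np.r_[0.3 * rng.randn(450, 2), rng.uniform(-4, 4, size=(50, 2))]
y_test = np.r_[np.zeros(450), np.ones(50)]  # 1 marks the injected outliers

model = IsolationForest(random_state=0).fit(X_train)
scoring = -model.decision_function(X_test)  # the lower the raw score, the more abnormal
print(f"AUC: {roc_auc_score(y_test, scoring):.3f}")
```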
fig_roc.tight_layout() plt.show() diff --git a/benchmarks/bench_isolation_forest_predict.py b/benchmarks/bench_isolation_forest_predict.py new file mode 100644 index 0000000000000..f16e65cf19511 --- /dev/null +++ b/benchmarks/bench_isolation_forest_predict.py @@ -0,0 +1,213 @@ +""" +========================================== +IsolationForest prediction benchmark +========================================== +A test of IsolationForest on classical anomaly detection datasets. + +The benchmark is run as follows: +1. The dataset is randomly split into a training set and a test set, both +assumed to contain outliers. +2. Isolation Forest is trained on the training set fixed at 1000 samples. +3. The test samples are scored using the trained model at: + - 1000, 10000, 50000 samples + - 10, 100, 1000 features + - 0.01, 0.1, 0.5 contamination + - 1, 2, 3, 4 n_jobs + +We compare the prediction time at the very end. + +Here are instructions for running this benchmark to compare runtime against main branch: + +1. Build and run on a branch or main, e.g. for a branch named `pr`: + +```bash +python bench_isolation_forest_predict.py bench ~/bench_results pr +``` + +2. Plotting to compare two branches `pr` and `main`: + +```bash +python bench_isolation_forest_predict.py plot ~/bench_results pr main results_image.png +``` +""" + +import argparse +from collections import defaultdict +from pathlib import Path +from time import time + +import numpy as np +import pandas as pd +from joblib import parallel_config + +from sklearn.ensemble import IsolationForest + +print(__doc__) + + +def get_data( + n_samples_train, n_samples_test, n_features, contamination=0.1, random_state=0 +): + """Function based on code from: https://scikit-learn.org/stable/ + auto_examples/ensemble/plot_isolation_forest.html#sphx-glr-auto- + examples-ensemble-plot-isolation-forest-py + """ + rng = np.random.RandomState(random_state) + + X = 0.3 * rng.randn(n_samples_train, n_features) + X_train = np.r_[X + 2, X - 2] + + X = 0.3 * rng.randn(n_samples_test, n_features) + X_test = np.r_[X + 2, X - 2] + + n_outliers = int(np.floor(contamination * n_samples_test)) + X_outliers = rng.uniform(low=-4, high=4, size=(n_outliers, n_features)) + + outlier_idx = rng.choice(np.arange(0, n_samples_test), n_outliers, replace=False) + X_test[outlier_idx, :] = X_outliers + + return X_train, X_test + + +def plot(args): + import matplotlib.pyplot as plt + import seaborn as sns + + bench_results = Path(args.bench_results) + pr_name = args.pr_name + main_name = args.main_name + image_path = args.image_path + + results_path = Path(bench_results) + pr_path = results_path / f"{pr_name}.csv" + main_path = results_path / f"{main_name}.csv" + image_path = results_path / image_path + + df_pr = pd.read_csv(pr_path).assign(branch=pr_name) + df_main = pd.read_csv(main_path).assign(branch=main_name) + + # Merge the two datasets on the common columns + merged_data = pd.merge( + df_pr, + df_main, + on=["n_samples_test", "n_jobs"], + suffixes=("_pr", "_main"), + ) + + # Set up the plotting grid + sns.set(style="whitegrid", context="notebook", font_scale=1.5) + + # Create a figure with subplots + fig, axes = plt.subplots(1, 2, figsize=(18, 6), sharex=True, sharey=True) + + # Plot predict time as a function of n_samples_test with different n_jobs + print(merged_data["n_jobs"].unique()) + ax = axes[0] + sns.lineplot( + data=merged_data, + x="n_samples_test", + y="predict_time_pr", + hue="n_jobs", + style="n_jobs", + markers="o", + ax=ax, + legend="full", + ) + 
ax.set_title(f"Predict Time vs. n_samples_test - {pr_name} branch") + ax.set_ylabel("Predict Time (Seconds)") + ax.set_xlabel("n_samples_test") + + ax = axes[1] + sns.lineplot( + data=merged_data, + x="n_samples_test", + y="predict_time_main", + hue="n_jobs", + style="n_jobs", + markers="X", + dashes=True, + ax=ax, + legend=None, + ) + ax.set_title(f"Predict Time vs. n_samples_test - {main_name} branch") + ax.set_ylabel("Predict Time") + ax.set_xlabel("n_samples_test") + + # Adjust layout and display the plots + plt.tight_layout() + fig.savefig(image_path, bbox_inches="tight") + print(f"Saved image to {image_path}") + + +def bench(args): + results_dir = Path(args.bench_results) + branch = args.branch + random_state = 1 + + results = defaultdict(list) + + # Loop over all datasets for fitting and scoring the estimator: + n_samples_train = 1000 + for n_samples_test in [ + 1000, + 10000, + 50000, + ]: + for n_features in [10, 100, 1000]: + for contamination in [0.01, 0.1, 0.5]: + for n_jobs in [1, 2, 3, 4]: + X_train, X_test = get_data( + n_samples_train, + n_samples_test, + n_features, + contamination, + random_state, + ) + + print("--- Fitting the IsolationForest estimator...") + model = IsolationForest(n_jobs=-1, random_state=random_state) + tstart = time() + model.fit(X_train) + fit_time = time() - tstart + + # clearcache + for _ in range(1000): + 1 + 1 + with parallel_config("threading", n_jobs=n_jobs): + tstart = time() + model.decision_function(X_test) # the lower, the more abnormal + predict_time = time() - tstart + + results["predict_time"].append(predict_time) + results["fit_time"].append(fit_time) + results["n_samples_train"].append(n_samples_train) + results["n_samples_test"].append(n_samples_test) + results["n_features"].append(n_features) + results["contamination"].append(contamination) + results["n_jobs"].append(n_jobs) + + df = pd.DataFrame(results) + df.to_csv(results_dir / f"{branch}.csv", index=False) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + # parse arguments for benchmarking + subparsers = parser.add_subparsers() + bench_parser = subparsers.add_parser("bench") + bench_parser.add_argument("bench_results") + bench_parser.add_argument("branch") + bench_parser.set_defaults(func=bench) + + # parse arguments for plotting + plot_parser = subparsers.add_parser("plot") + plot_parser.add_argument("bench_results") + plot_parser.add_argument("pr_name") + plot_parser.add_argument("main_name") + plot_parser.add_argument("image_path") + plot_parser.set_defaults(func=plot) + + # enable the parser and run the relevant function + args = parser.parse_args() + args.func(args) diff --git a/benchmarks/bench_isotonic.py b/benchmarks/bench_isotonic.py index d1eacaa8d1758..be2ff6548cb92 100644 --- a/benchmarks/bench_isotonic.py +++ b/benchmarks/bench_isotonic.py @@ -10,18 +10,20 @@ This allows the scaling of the algorithm with the problem size to be visualized and understood. """ -import numpy as np + +import argparse import gc -from datetime import datetime -from sklearn.isotonic import isotonic_regression -from scipy.special import expit +from timeit import default_timer + import matplotlib.pyplot as plt -import argparse +import numpy as np +from scipy.special import expit + +from sklearn.isotonic import isotonic_regression def generate_perturbed_logarithm_dataset(size): - return (np.random.randint(-50, 50, size=size) + - 50. 
* np.log(1 + np.arange(size))) + return np.random.randint(-50, 50, size=size) + 50.0 * np.log(1 + np.arange(size)) def generate_logistic_dataset(size): @@ -31,15 +33,15 @@ def generate_logistic_dataset(size): def generate_pathological_dataset(size): # Triggers O(n^2) complexity on the original implementation. - return np.r_[np.arange(size), - np.arange(-(size - 1), size), - np.arange(-(size - 1), 1)] + return np.r_[ + np.arange(size), np.arange(-(size - 1), size), np.arange(-(size - 1), 1) + ] DATASET_GENERATORS = { - 'perturbed_logarithm': generate_perturbed_logarithm_dataset, - 'logistic': generate_logistic_dataset, - 'pathological': generate_pathological_dataset, + "perturbed_logarithm": generate_perturbed_logarithm_dataset, + "logistic": generate_logistic_dataset, + "pathological": generate_pathological_dataset, } @@ -50,39 +52,48 @@ def bench_isotonic_regression(Y): """ gc.collect() - tstart = datetime.now() + tstart = default_timer() isotonic_regression(Y) - return (datetime.now() - tstart).total_seconds() - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description="Isotonic Regression benchmark tool") - parser.add_argument('--seed', type=int, - help="RNG seed") - parser.add_argument('--iterations', type=int, required=True, - help="Number of iterations to average timings over " - "for each problem size") - parser.add_argument('--log_min_problem_size', type=int, required=True, - help="Base 10 logarithm of the minimum problem size") - parser.add_argument('--log_max_problem_size', type=int, required=True, - help="Base 10 logarithm of the maximum problem size") - parser.add_argument('--show_plot', action='store_true', - help="Plot timing output with matplotlib") - parser.add_argument('--dataset', choices=DATASET_GENERATORS.keys(), - required=True) + return default_timer() - tstart + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Isotonic Regression benchmark tool") + parser.add_argument("--seed", type=int, help="RNG seed") + parser.add_argument( + "--iterations", + type=int, + required=True, + help="Number of iterations to average timings over for each problem size", + ) + parser.add_argument( + "--log_min_problem_size", + type=int, + required=True, + help="Base 10 logarithm of the minimum problem size", + ) + parser.add_argument( + "--log_max_problem_size", + type=int, + required=True, + help="Base 10 logarithm of the maximum problem size", + ) + parser.add_argument( + "--show_plot", action="store_true", help="Plot timing output with matplotlib" + ) + parser.add_argument("--dataset", choices=DATASET_GENERATORS.keys(), required=True) args = parser.parse_args() np.random.seed(args.seed) timings = [] - for exponent in range(args.log_min_problem_size, - args.log_max_problem_size): - n = 10 ** exponent + for exponent in range(args.log_min_problem_size, args.log_max_problem_size): + n = 10**exponent Y = DATASET_GENERATORS[args.dataset](n) - time_per_iteration = \ - [bench_isotonic_regression(Y) for i in range(args.iterations)] + time_per_iteration = [ + bench_isotonic_regression(Y) for i in range(args.iterations) + ] timing = (n, np.mean(time_per_iteration)) timings.append(timing) @@ -93,8 +104,8 @@ def bench_isotonic_regression(Y): if args.show_plot: plt.plot(*zip(*timings)) plt.title("Average time taken running isotonic regression") - plt.xlabel('Number of observations') - plt.ylabel('Time (s)') - plt.axis('tight') + plt.xlabel("Number of observations") + plt.ylabel("Time (s)") + plt.axis("tight") plt.loglog() plt.show() diff --git 
a/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py b/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py new file mode 100644 index 0000000000000..a468f7b3e1abf --- /dev/null +++ b/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py @@ -0,0 +1,177 @@ +""" +============================================================= +Kernel PCA Solvers comparison benchmark: time vs n_components +============================================================= + +This benchmark shows that the approximate solvers provided in Kernel PCA can +help significantly improve its execution speed when an approximate solution +(small `n_components`) is acceptable. In many real-world datasets a few +hundreds of principal components are indeed sufficient enough to capture the +underlying distribution. + +Description: +------------ +A fixed number of training (default: 2000) and test (default: 1000) samples +with 2 features is generated using the `make_circles` helper method. + +KernelPCA models are trained on the training set with an increasing number of +principal components, between 1 and `max_n_compo` (default: 1999), with +`n_compo_grid_size` positions (default: 10). For each value of `n_components` +to try, KernelPCA models are trained for the various possible `eigen_solver` +values. The execution times are displayed in a plot at the end of the +experiment. + +What you can observe: +--------------------- +When the number of requested principal components is small, the dense solver +takes more time to complete, while the randomized method returns similar +results with shorter execution times. + +Going further: +-------------- +You can adjust `max_n_compo` and `n_compo_grid_size` if you wish to explore a +different range of values for `n_components`. + +You can also set `arpack_all=True` to activate arpack solver for large number +of components (this takes more time). 
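As a hedged usage sketch of the pattern this benchmark times: requesting a small `n_components` with the `"randomized"` eigensolver instead of the exact `"dense"` one (the dataset size and component count below are illustrative, not the benchmark's settings):

```python
from sklearn.datasets import make_circles
from sklearn.decomposition import KernelPCA

X, _ = make_circles(n_samples=3000, factor=0.3, noise=0.05, random_state=0)

# Approximate solver: typically much faster than eigen_solver="dense" when
# only a few components are requested, at the cost of an approximation.
X_kpca = KernelPCA(
    n_components=10, eigen_solver="randomized", random_state=0
).fit_transform(X)
print(X_kpca.shape)  # (3000, 10)
```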
+""" + +import time + +import matplotlib.pyplot as plt +import numpy as np +from numpy.testing import assert_array_almost_equal + +from sklearn.datasets import make_circles +from sklearn.decomposition import KernelPCA + +print(__doc__) + + +# 1- Design the Experiment +# ------------------------ +n_train, n_test = 2000, 1000 # the sample sizes to use +max_n_compo = 1999 # max n_components to try +n_compo_grid_size = 10 # nb of positions in the grid to try +# generate the grid +n_compo_range = [ + np.round(np.exp((x / (n_compo_grid_size - 1)) * np.log(max_n_compo))) + for x in range(0, n_compo_grid_size) +] + +n_iter = 3 # the number of times each experiment will be repeated +arpack_all = False # set to True if you wish to run arpack for all n_compo + + +# 2- Generate random data +# ----------------------- +n_features = 2 +X, y = make_circles( + n_samples=(n_train + n_test), factor=0.3, noise=0.05, random_state=0 +) +X_train, X_test = X[:n_train, :], X[n_train:, :] + + +# 3- Benchmark +# ------------ +# init +ref_time = np.empty((len(n_compo_range), n_iter)) * np.nan +a_time = np.empty((len(n_compo_range), n_iter)) * np.nan +r_time = np.empty((len(n_compo_range), n_iter)) * np.nan +# loop +for j, n_components in enumerate(n_compo_range): + n_components = int(n_components) + print("Performing kPCA with n_components = %i" % n_components) + + # A- reference (dense) + print(" - dense solver") + for i in range(n_iter): + start_time = time.perf_counter() + ref_pred = ( + KernelPCA(n_components, eigen_solver="dense").fit(X_train).transform(X_test) + ) + ref_time[j, i] = time.perf_counter() - start_time + + # B- arpack (for small number of components only, too slow otherwise) + if arpack_all or n_components < 100: + print(" - arpack solver") + for i in range(n_iter): + start_time = time.perf_counter() + a_pred = ( + KernelPCA(n_components, eigen_solver="arpack") + .fit(X_train) + .transform(X_test) + ) + a_time[j, i] = time.perf_counter() - start_time + # check that the result is still correct despite the approx + assert_array_almost_equal(np.abs(a_pred), np.abs(ref_pred)) + + # C- randomized + print(" - randomized solver") + for i in range(n_iter): + start_time = time.perf_counter() + r_pred = ( + KernelPCA(n_components, eigen_solver="randomized") + .fit(X_train) + .transform(X_test) + ) + r_time[j, i] = time.perf_counter() - start_time + # check that the result is still correct despite the approximation + assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred)) + +# Compute statistics for the 3 methods +avg_ref_time = ref_time.mean(axis=1) +std_ref_time = ref_time.std(axis=1) +avg_a_time = a_time.mean(axis=1) +std_a_time = a_time.std(axis=1) +avg_r_time = r_time.mean(axis=1) +std_r_time = r_time.std(axis=1) + + +# 4- Plots +# -------- +fig, ax = plt.subplots(figsize=(12, 8)) + +# Display 1 plot with error bars per method +ax.errorbar( + n_compo_range, + avg_ref_time, + yerr=std_ref_time, + marker="x", + linestyle="", + color="r", + label="full", +) +ax.errorbar( + n_compo_range, + avg_a_time, + yerr=std_a_time, + marker="x", + linestyle="", + color="g", + label="arpack", +) +ax.errorbar( + n_compo_range, + avg_r_time, + yerr=std_r_time, + marker="x", + linestyle="", + color="b", + label="randomized", +) +ax.legend(loc="upper left") + +# customize axes +ax.set_xscale("log") +ax.set_xlim(1, max(n_compo_range) * 1.1) +ax.set_ylabel("Execution time (s)") +ax.set_xlabel("n_components") + +ax.set_title( + "kPCA Execution time comparison on %i samples with %i " + "features, according to the choice of 
`eigen_solver`" + "" % (n_train, n_features) +) + +plt.show() diff --git a/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py b/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py new file mode 100644 index 0000000000000..cae74c6f442ff --- /dev/null +++ b/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py @@ -0,0 +1,183 @@ +""" +========================================================== +Kernel PCA Solvers comparison benchmark: time vs n_samples +========================================================== + +This benchmark shows that the approximate solvers provided in Kernel PCA can +help significantly improve its execution speed when an approximate solution +(small `n_components`) is acceptable. In many real-world datasets the number of +samples is very large, but a few hundreds of principal components are +sufficient enough to capture the underlying distribution. + +Description: +------------ +An increasing number of examples is used to train a KernelPCA, between +`min_n_samples` (default: 101) and `max_n_samples` (default: 4000) with +`n_samples_grid_size` positions (default: 4). Samples have 2 features, and are +generated using `make_circles`. For each training sample size, KernelPCA models +are trained for the various possible `eigen_solver` values. All of them are +trained to obtain `n_components` principal components (default: 100). The +execution times are displayed in a plot at the end of the experiment. + +What you can observe: +--------------------- +When the number of samples provided gets large, the dense solver takes a lot +of time to complete, while the randomized method returns similar results in +much shorter execution times. + +Going further: +-------------- +You can increase `max_n_samples` and `nb_n_samples_to_try` if you wish to +explore a wider range of values for `n_samples`. + +You can also set `include_arpack=True` to add this other solver in the +experiments (much slower). + +Finally you can have a look at the second example of this series, "Kernel PCA +Solvers comparison benchmark: time vs n_components", where this time the number +of examples is fixed, and the desired number of components varies. 
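Both kPCA benchmarks validate the approximate solvers against the dense reference by comparing `np.abs` of the transformed data, because each principal component is only determined up to a sign flip. A tiny standalone illustration of that check (not part of the script; the array values are arbitrary):

```python
import numpy as np
from numpy.testing import assert_array_almost_equal

ref_pred = np.array([[1.0, -2.0], [0.5, 3.0]])
r_pred = ref_pred * np.array([1.0, -1.0])  # flip the sign of the second component only
assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred))  # passes
```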
+""" + +# Author: Sylvain MARIE, Schneider Electric + +import time + +import matplotlib.pyplot as plt +import numpy as np +from numpy.testing import assert_array_almost_equal + +from sklearn.datasets import make_circles +from sklearn.decomposition import KernelPCA + +print(__doc__) + + +# 1- Design the Experiment +# ------------------------ +min_n_samples, max_n_samples = 101, 4000 # min and max n_samples to try +n_samples_grid_size = 4 # nb of positions in the grid to try +# generate the grid +n_samples_range = [ + min_n_samples + + np.floor((x / (n_samples_grid_size - 1)) * (max_n_samples - min_n_samples)) + for x in range(0, n_samples_grid_size) +] + +n_components = 100 # the number of principal components we want to use +n_iter = 3 # the number of times each experiment will be repeated +include_arpack = False # set this to True to include arpack solver (slower) + + +# 2- Generate random data +# ----------------------- +n_features = 2 +X, y = make_circles(n_samples=max_n_samples, factor=0.3, noise=0.05, random_state=0) + + +# 3- Benchmark +# ------------ +# init +ref_time = np.empty((len(n_samples_range), n_iter)) * np.nan +a_time = np.empty((len(n_samples_range), n_iter)) * np.nan +r_time = np.empty((len(n_samples_range), n_iter)) * np.nan + +# loop +for j, n_samples in enumerate(n_samples_range): + n_samples = int(n_samples) + print("Performing kPCA with n_samples = %i" % n_samples) + + X_train = X[:n_samples, :] + X_test = X_train + + # A- reference (dense) + print(" - dense") + for i in range(n_iter): + start_time = time.perf_counter() + ref_pred = ( + KernelPCA(n_components, eigen_solver="dense").fit(X_train).transform(X_test) + ) + ref_time[j, i] = time.perf_counter() - start_time + + # B- arpack + if include_arpack: + print(" - arpack") + for i in range(n_iter): + start_time = time.perf_counter() + a_pred = ( + KernelPCA(n_components, eigen_solver="arpack") + .fit(X_train) + .transform(X_test) + ) + a_time[j, i] = time.perf_counter() - start_time + # check that the result is still correct despite the approx + assert_array_almost_equal(np.abs(a_pred), np.abs(ref_pred)) + + # C- randomized + print(" - randomized") + for i in range(n_iter): + start_time = time.perf_counter() + r_pred = ( + KernelPCA(n_components, eigen_solver="randomized") + .fit(X_train) + .transform(X_test) + ) + r_time[j, i] = time.perf_counter() - start_time + # check that the result is still correct despite the approximation + assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred)) + +# Compute statistics for the 3 methods +avg_ref_time = ref_time.mean(axis=1) +std_ref_time = ref_time.std(axis=1) +avg_a_time = a_time.mean(axis=1) +std_a_time = a_time.std(axis=1) +avg_r_time = r_time.mean(axis=1) +std_r_time = r_time.std(axis=1) + + +# 4- Plots +# -------- +fig, ax = plt.subplots(figsize=(12, 8)) + +# Display 1 plot with error bars per method +ax.errorbar( + n_samples_range, + avg_ref_time, + yerr=std_ref_time, + marker="x", + linestyle="", + color="r", + label="full", +) +if include_arpack: + ax.errorbar( + n_samples_range, + avg_a_time, + yerr=std_a_time, + marker="x", + linestyle="", + color="g", + label="arpack", + ) +ax.errorbar( + n_samples_range, + avg_r_time, + yerr=std_r_time, + marker="x", + linestyle="", + color="b", + label="randomized", +) +ax.legend(loc="upper left") + +# customize axes +ax.set_xlim(min(n_samples_range) * 0.9, max(n_samples_range) * 1.1) +ax.set_ylabel("Execution time (s)") +ax.set_xlabel("n_samples") + +ax.set_title( + "Execution time comparison of kPCA with %i components on 
samples " + "with %i features, according to the choice of `eigen_solver`" + "" % (n_components, n_features) +) + +plt.show() diff --git a/benchmarks/bench_lasso.py b/benchmarks/bench_lasso.py index 4a2c8bbe6e248..9bae570505a75 100644 --- a/benchmarks/bench_lasso.py +++ b/benchmarks/bench_lasso.py @@ -11,8 +11,10 @@ In both cases, only 10% of the features are informative. """ + import gc from time import time + import numpy as np from sklearn.datasets import make_regression @@ -27,29 +29,30 @@ def compute_bench(alpha, n_samples, n_features, precompute): for ns in n_samples: for nf in n_features: it += 1 - print('==================') - print('Iteration %s of %s' % (it, max(len(n_samples), - len(n_features)))) - print('==================') + print("==================") + print("Iteration %s of %s" % (it, max(len(n_samples), len(n_features)))) + print("==================") n_informative = nf // 10 - X, Y, coef_ = make_regression(n_samples=ns, n_features=nf, - n_informative=n_informative, - noise=0.1, coef=True) + X, Y, coef_ = make_regression( + n_samples=ns, + n_features=nf, + n_informative=n_informative, + noise=0.1, + coef=True, + ) - X /= np.sqrt(np.sum(X ** 2, axis=0)) # Normalize data + X /= np.sqrt(np.sum(X**2, axis=0)) # Normalize data gc.collect() print("- benchmarking Lasso") - clf = Lasso(alpha=alpha, fit_intercept=False, - precompute=precompute) + clf = Lasso(alpha=alpha, fit_intercept=False, precompute=precompute) tstart = time() clf.fit(X, Y) lasso_results.append(time() - tstart) gc.collect() print("- benchmarking LassoLars") - clf = LassoLars(alpha=alpha, fit_intercept=False, - normalize=False, precompute=precompute) + clf = LassoLars(alpha=alpha, fit_intercept=False, precompute=precompute) tstart = time() clf.fit(X, Y) lars_lasso_results.append(time() - tstart) @@ -57,40 +60,40 @@ def compute_bench(alpha, n_samples, n_features, precompute): return lasso_results, lars_lasso_results -if __name__ == '__main__': - from sklearn.linear_model import Lasso, LassoLars +if __name__ == "__main__": import matplotlib.pyplot as plt + from sklearn.linear_model import Lasso, LassoLars + alpha = 0.01 # regularization parameter n_features = 10 list_n_samples = np.linspace(100, 1000000, 5).astype(int) - lasso_results, lars_lasso_results = compute_bench(alpha, list_n_samples, - [n_features], precompute=True) + lasso_results, lars_lasso_results = compute_bench( + alpha, list_n_samples, [n_features], precompute=True + ) - plt.figure('scikit-learn LASSO benchmark results') + plt.figure("scikit-learn LASSO benchmark results") plt.subplot(211) - plt.plot(list_n_samples, lasso_results, 'b-', - label='Lasso') - plt.plot(list_n_samples, lars_lasso_results, 'r-', - label='LassoLars') - plt.title('precomputed Gram matrix, %d features, alpha=%s' % (n_features, - alpha)) - plt.legend(loc='upper left') - plt.xlabel('number of samples') - plt.ylabel('Time (s)') - plt.axis('tight') + plt.plot(list_n_samples, lasso_results, "b-", label="Lasso") + plt.plot(list_n_samples, lars_lasso_results, "r-", label="LassoLars") + plt.title("precomputed Gram matrix, %d features, alpha=%s" % (n_features, alpha)) + plt.legend(loc="upper left") + plt.xlabel("number of samples") + plt.ylabel("Time (s)") + plt.axis("tight") n_samples = 2000 list_n_features = np.linspace(500, 3000, 5).astype(int) - lasso_results, lars_lasso_results = compute_bench(alpha, [n_samples], - list_n_features, precompute=False) + lasso_results, lars_lasso_results = compute_bench( + alpha, [n_samples], list_n_features, precompute=False + ) plt.subplot(212) - 
plt.plot(list_n_features, lasso_results, 'b-', label='Lasso') - plt.plot(list_n_features, lars_lasso_results, 'r-', label='LassoLars') - plt.title('%d samples, alpha=%s' % (n_samples, alpha)) - plt.legend(loc='upper left') - plt.xlabel('number of features') - plt.ylabel('Time (s)') - plt.axis('tight') + plt.plot(list_n_features, lasso_results, "b-", label="Lasso") + plt.plot(list_n_features, lars_lasso_results, "r-", label="LassoLars") + plt.title("%d samples, alpha=%s" % (n_samples, alpha)) + plt.legend(loc="upper left") + plt.xlabel("number of features") + plt.ylabel("Time (s)") + plt.axis("tight") plt.show() diff --git a/benchmarks/bench_lof.py b/benchmarks/bench_lof.py index 288caf212e7af..2c9732fab901f 100644 --- a/benchmarks/bench_lof.py +++ b/benchmarks/bench_lof.py @@ -18,11 +18,13 @@ """ from time import time -import numpy as np + import matplotlib.pyplot as plt +import numpy as np + +from sklearn.datasets import fetch_covtype, fetch_kddcup99, fetch_openml +from sklearn.metrics import auc, roc_curve from sklearn.neighbors import LocalOutlierFactor -from sklearn.metrics import roc_curve, auc -from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml from sklearn.preprocessing import LabelBinarizer print(__doc__) @@ -30,30 +32,31 @@ random_state = 2 # to control the random selection of anomalies in SA # datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] -datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] +datasets = ["http", "smtp", "SA", "SF", "shuttle", "forestcover"] plt.figure() for dataset_name in datasets: # loading and vectorization - print('loading data') - if dataset_name in ['http', 'smtp', 'SA', 'SF']: - dataset = fetch_kddcup99(subset=dataset_name, percent10=True, - random_state=random_state) + print("loading data") + if dataset_name in ["http", "smtp", "SA", "SF"]: + dataset = fetch_kddcup99( + subset=dataset_name, percent10=True, random_state=random_state + ) X = dataset.data y = dataset.target - if dataset_name == 'shuttle': - dataset = fetch_openml('shuttle') + if dataset_name == "shuttle": + dataset = fetch_openml("shuttle", as_frame=False) X = dataset.data - y = dataset.target + y = dataset.target.astype(np.int64) # we remove data with label 4 # normal data are then those of class 1 - s = (y != 4) + s = y != 4 X = X[s, :] y = y[s] y = (y != 1).astype(int) - if dataset_name == 'forestcover': + if dataset_name == "forestcover": dataset = fetch_covtype() X = dataset.data y = dataset.target @@ -64,28 +67,28 @@ y = y[s] y = (y != 2).astype(int) - print('vectorizing data') + print("vectorizing data") - if dataset_name == 'SF': + if dataset_name == "SF": lb = LabelBinarizer() x1 = lb.fit_transform(X[:, 1].astype(str)) X = np.c_[X[:, :1], x1, X[:, 2:]] - y = (y != b'normal.').astype(int) + y = (y != b"normal.").astype(int) - if dataset_name == 'SA': + if dataset_name == "SA": lb = LabelBinarizer() x1 = lb.fit_transform(X[:, 1].astype(str)) x2 = lb.fit_transform(X[:, 2].astype(str)) x3 = lb.fit_transform(X[:, 3].astype(str)) X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]] - y = (y != b'normal.').astype(int) + y = (y != b"normal.").astype(int) - if dataset_name == 'http' or dataset_name == 'smtp': - y = (y != b'normal.').astype(int) + if dataset_name == "http" or dataset_name == "smtp": + y = (y != b"normal.").astype(int) X = X.astype(float) - print('LocalOutlierFactor processing...') + print("LocalOutlierFactor processing...") model = LocalOutlierFactor(n_neighbors=20) tstart = time() model.fit(X) @@ -93,14 +96,18 @@ scoring = 
-model.negative_outlier_factor_ # the lower, the more normal fpr, tpr, thresholds = roc_curve(y, scoring) AUC = auc(fpr, tpr) - plt.plot(fpr, tpr, lw=1, - label=('ROC for %s (area = %0.3f, train-time: %0.2fs)' - % (dataset_name, AUC, fit_time))) + plt.plot( + fpr, + tpr, + lw=1, + label="ROC for %s (area = %0.3f, train-time: %0.2fs)" + % (dataset_name, AUC, fit_time), + ) plt.xlim([-0.05, 1.05]) plt.ylim([-0.05, 1.05]) -plt.xlabel('False Positive Rate') -plt.ylabel('True Positive Rate') -plt.title('Receiver operating characteristic') +plt.xlabel("False Positive Rate") +plt.ylabel("True Positive Rate") +plt.title("Receiver operating characteristic") plt.legend(loc="lower right") plt.show() diff --git a/benchmarks/bench_mnist.py b/benchmarks/bench_mnist.py index 1ff76028739c6..5745a6d1e3882 100644 --- a/benchmarks/bench_mnist.py +++ b/benchmarks/bench_mnist.py @@ -6,7 +6,7 @@ Benchmark on the MNIST dataset. The dataset comprises 70,000 samples and 784 features. Here, we consider the task of predicting 10 classes - digits from 0 to 9 from their raw images. By contrast to the -covertype dataset, the feature space is homogenous. +covertype dataset, the feature space is homogeneous. Example of output : [..] @@ -26,45 +26,41 @@ dummy 0.00s 0.01s 0.8973 """ -# Author: Issam H. Laradji -# Arnaud Joly -# License: BSD 3 clause +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause +import argparse import os from time import time -import argparse + import numpy as np from joblib import Memory -from sklearn.datasets import fetch_openml -from sklearn.datasets import get_data_home -from sklearn.ensemble import ExtraTreesClassifier -from sklearn.ensemble import RandomForestClassifier +from sklearn.datasets import fetch_openml, get_data_home from sklearn.dummy import DummyClassifier -from sklearn.kernel_approximation import Nystroem -from sklearn.kernel_approximation import RBFSampler +from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier +from sklearn.kernel_approximation import Nystroem, RBFSampler +from sklearn.linear_model import LogisticRegression from sklearn.metrics import zero_one_loss +from sklearn.neural_network import MLPClassifier from sklearn.pipeline import make_pipeline from sklearn.svm import LinearSVC from sklearn.tree import DecisionTreeClassifier from sklearn.utils import check_array -from sklearn.linear_model import LogisticRegression -from sklearn.neural_network import MLPClassifier # Memoize the data extraction and memory map the resulting # train / test splits in readonly mode -memory = Memory(os.path.join(get_data_home(), 'mnist_benchmark_data'), - mmap_mode='r') +memory = Memory(os.path.join(get_data_home(), "mnist_benchmark_data"), mmap_mode="r") @memory.cache -def load_data(dtype=np.float32, order='F'): +def load_data(dtype=np.float32, order="F"): """Load the data, then cache and memmap the train/test split""" ###################################################################### # Load dataset print("Loading dataset...") - data = fetch_openml('mnist_784') - X = check_array(data['data'], dtype=dtype, order=order) + data = fetch_openml("mnist_784", as_frame=True) + X = check_array(data["data"], dtype=dtype, order=order) y = data["target"] # Normalize features @@ -83,43 +79,76 @@ def load_data(dtype=np.float32, order='F'): ESTIMATORS = { "dummy": DummyClassifier(), - 'CART': DecisionTreeClassifier(), - 'ExtraTrees': ExtraTreesClassifier(), - 'RandomForest': RandomForestClassifier(), - 'Nystroem-SVM': make_pipeline( - 
Nystroem(gamma=0.015, n_components=1000), LinearSVC(C=100)), - 'SampledRBF-SVM': make_pipeline( - RBFSampler(gamma=0.015, n_components=1000), LinearSVC(C=100)), - 'LogisticRegression-SAG': LogisticRegression(solver='sag', tol=1e-1, - C=1e4), - 'LogisticRegression-SAGA': LogisticRegression(solver='saga', tol=1e-1, - C=1e4), - 'MultilayerPerceptron': MLPClassifier( - hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, - solver='sgd', learning_rate_init=0.2, momentum=0.9, verbose=1, - tol=1e-4, random_state=1), - 'MLP-adam': MLPClassifier( - hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, - solver='adam', learning_rate_init=0.001, verbose=1, - tol=1e-4, random_state=1) + "CART": DecisionTreeClassifier(), + "ExtraTrees": ExtraTreesClassifier(), + "RandomForest": RandomForestClassifier(), + "Nystroem-SVM": make_pipeline( + Nystroem(gamma=0.015, n_components=1000), LinearSVC(C=100) + ), + "SampledRBF-SVM": make_pipeline( + RBFSampler(gamma=0.015, n_components=1000), LinearSVC(C=100) + ), + "LogisticRegression-SAG": LogisticRegression(solver="sag", tol=1e-1, C=1e4), + "LogisticRegression-SAGA": LogisticRegression(solver="saga", tol=1e-1, C=1e4), + "MultilayerPerceptron": MLPClassifier( + hidden_layer_sizes=(100, 100), + max_iter=400, + alpha=1e-4, + solver="sgd", + learning_rate_init=0.2, + momentum=0.9, + verbose=1, + tol=1e-4, + random_state=1, + ), + "MLP-adam": MLPClassifier( + hidden_layer_sizes=(100, 100), + max_iter=400, + alpha=1e-4, + solver="adam", + learning_rate_init=0.001, + verbose=1, + tol=1e-4, + random_state=1, + ), } if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--classifiers', nargs="+", - choices=ESTIMATORS, type=str, - default=['ExtraTrees', 'Nystroem-SVM'], - help="list of classifiers to benchmark.") - parser.add_argument('--n-jobs', nargs="?", default=1, type=int, - help="Number of concurrently running workers for " - "models that support parallelism.") - parser.add_argument('--order', nargs="?", default="C", type=str, - choices=["F", "C"], - help="Allow to choose between fortran and C ordered " - "data") - parser.add_argument('--random-seed', nargs="?", default=0, type=int, - help="Common seed used by random number generator.") + parser.add_argument( + "--classifiers", + nargs="+", + choices=ESTIMATORS, + type=str, + default=["ExtraTrees", "Nystroem-SVM"], + help="list of classifiers to benchmark.", + ) + parser.add_argument( + "--n-jobs", + nargs="?", + default=1, + type=int, + help=( + "Number of concurrently running workers for " + "models that support parallelism." 
+ ), + ) + parser.add_argument( + "--order", + nargs="?", + default="C", + type=str, + choices=["F", "C"], + help="Allow to choose between fortran and C ordered data", + ) + parser.add_argument( + "--random-seed", + nargs="?", + default=0, + type=int, + help="Common seed used by random number generator.", + ) args = vars(parser.parse_args()) print(__doc__) @@ -132,10 +161,22 @@ def load_data(dtype=np.float32, order='F'): print("%s %d" % ("number of features:".ljust(25), X_train.shape[1])) print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size)) print("%s %s" % ("data type:".ljust(25), X_train.dtype)) - print("%s %d (size=%dMB)" % ("number of train samples:".ljust(25), - X_train.shape[0], int(X_train.nbytes / 1e6))) - print("%s %d (size=%dMB)" % ("number of test samples:".ljust(25), - X_test.shape[0], int(X_test.nbytes / 1e6))) + print( + "%s %d (size=%dMB)" + % ( + "number of train samples:".ljust(25), + X_train.shape[0], + int(X_train.nbytes / 1e6), + ) + ) + print( + "%s %d (size=%dMB)" + % ( + "number of test samples:".ljust(25), + X_test.shape[0], + int(X_test.nbytes / 1e6), + ) + ) print() print("Training Classifiers") @@ -146,9 +187,13 @@ def load_data(dtype=np.float32, order='F'): estimator = ESTIMATORS[name] estimator_params = estimator.get_params() - estimator.set_params(**{p: args["random_seed"] - for p in estimator_params - if p.endswith("random_state")}) + estimator.set_params( + **{ + p: args["random_seed"] + for p in estimator_params + if p.endswith("random_state") + } + ) if "n_jobs" in estimator_params: estimator.set_params(n_jobs=args["n_jobs"]) @@ -168,12 +213,17 @@ def load_data(dtype=np.float32, order='F'): print() print("Classification performance:") print("===========================") - print("{0: <24} {1: >10} {2: >11} {3: >12}" - "".format("Classifier ", "train-time", "test-time", "error-rate")) + print( + "{0: <24} {1: >10} {2: >11} {3: >12}".format( + "Classifier ", "train-time", "test-time", "error-rate" + ) + ) print("-" * 60) for name in sorted(args["classifiers"], key=error.get): - - print("{0: <23} {1: >10.2f}s {2: >10.2f}s {3: >12.4f}" - "".format(name, train_time[name], test_time[name], error[name])) + print( + "{0: <23} {1: >10.2f}s {2: >10.2f}s {3: >12.4f}".format( + name, train_time[name], test_time[name], error[name] + ) + ) print() diff --git a/benchmarks/bench_multilabel_metrics.py b/benchmarks/bench_multilabel_metrics.py index 36fc7cb3c47b8..1b8449a24da51 100755 --- a/benchmarks/bench_multilabel_metrics.py +++ b/benchmarks/bench_multilabel_metrics.py @@ -3,43 +3,50 @@ A comparison of multilabel target formats and metrics over them """ -from timeit import timeit -from functools import partial -import itertools import argparse +import itertools import sys +from functools import partial +from timeit import timeit import matplotlib.pyplot as plt -import scipy.sparse as sp import numpy as np +import scipy.sparse as sp from sklearn.datasets import make_multilabel_classification -from sklearn.metrics import (f1_score, accuracy_score, hamming_loss, - jaccard_similarity_score) +from sklearn.metrics import ( + accuracy_score, + f1_score, + hamming_loss, + jaccard_similarity_score, +) from sklearn.utils._testing import ignore_warnings - METRICS = { - 'f1': partial(f1_score, average='micro'), - 'f1-by-sample': partial(f1_score, average='samples'), - 'accuracy': accuracy_score, - 'hamming': hamming_loss, - 'jaccard': jaccard_similarity_score, + "f1": partial(f1_score, average="micro"), + "f1-by-sample": partial(f1_score, average="samples"), + 
"accuracy": accuracy_score, + "hamming": hamming_loss, + "jaccard": jaccard_similarity_score, } FORMATS = { - 'sequences': lambda y: [list(np.flatnonzero(s)) for s in y], - 'dense': lambda y: y, - 'csr': lambda y: sp.csr_matrix(y), - 'csc': lambda y: sp.csc_matrix(y), + "sequences": lambda y: [list(np.flatnonzero(s)) for s in y], + "dense": lambda y: y, + "csr": sp.csr_matrix, + "csc": sp.csc_matrix, } @ignore_warnings -def benchmark(metrics=tuple(v for k, v in sorted(METRICS.items())), - formats=tuple(v for k, v in sorted(FORMATS.items())), - samples=1000, classes=4, density=.2, - n_times=5): +def benchmark( + metrics=tuple(v for k, v in sorted(METRICS.items())), + formats=tuple(v for k, v in sorted(FORMATS.items())), + samples=1000, + classes=4, + density=0.2, + n_times=5, +): """Times metric calculations for a number of inputs Parameters @@ -73,16 +80,18 @@ def benchmark(metrics=tuple(v for k, v in sorted(METRICS.items())), classes = np.atleast_1d(classes) density = np.atleast_1d(density) formats = np.atleast_1d(formats) - out = np.zeros((len(metrics), len(formats), len(samples), len(classes), - len(density)), dtype=float) + out = np.zeros( + (len(metrics), len(formats), len(samples), len(classes), len(density)), + dtype=float, + ) it = itertools.product(samples, classes, density) for i, (s, c, d) in enumerate(it): - _, y_true = make_multilabel_classification(n_samples=s, n_features=1, - n_classes=c, n_labels=d * c, - random_state=42) - _, y_pred = make_multilabel_classification(n_samples=s, n_features=1, - n_classes=c, n_labels=d * c, - random_state=84) + _, y_true = make_multilabel_classification( + n_samples=s, n_features=1, n_classes=c, n_labels=d * c, random_state=42 + ) + _, y_pred = make_multilabel_classification( + n_samples=s, n_features=1, n_classes=c, n_labels=d * c, random_state=84 + ) for j, f in enumerate(formats): f_true = f(y_true) f_pred = f(y_pred) @@ -100,70 +109,95 @@ def _tabulate(results, metrics, formats): """ column_width = max(max(len(k) for k in formats) + 1, 8) first_width = max(len(k) for k in metrics) - head_fmt = ('{:<{fw}s}' + '{:>{cw}s}' * len(formats)) - row_fmt = ('{:<{fw}s}' + '{:>{cw}.3f}' * len(formats)) - print(head_fmt.format('Metric', *formats, - cw=column_width, fw=first_width)) + head_fmt = "{:<{fw}s}" + "{:>{cw}s}" * len(formats) + row_fmt = "{:<{fw}s}" + "{:>{cw}.3f}" * len(formats) + print(head_fmt.format("Metric", *formats, cw=column_width, fw=first_width)) for metric, row in zip(metrics, results[:, :, -1, -1, -1]): - print(row_fmt.format(metric, *row, - cw=column_width, fw=first_width)) - - -def _plot(results, metrics, formats, title, x_ticks, x_label, - format_markers=('x', '|', 'o', '+'), - metric_colors=('c', 'm', 'y', 'k', 'g', 'r', 'b')): + print(row_fmt.format(metric, *row, cw=column_width, fw=first_width)) + + +def _plot( + results, + metrics, + formats, + title, + x_ticks, + x_label, + format_markers=("x", "|", "o", "+"), + metric_colors=("c", "m", "y", "k", "g", "r", "b"), +): """ Plot the results by metric, format and some other variable given by x_label """ - fig = plt.figure('scikit-learn multilabel metrics benchmarks') + fig = plt.figure("scikit-learn multilabel metrics benchmarks") plt.title(title) ax = fig.add_subplot(111) for i, metric in enumerate(metrics): for j, format in enumerate(formats): - ax.plot(x_ticks, results[i, j].flat, - label='{}, {}'.format(metric, format), - marker=format_markers[j], - color=metric_colors[i % len(metric_colors)]) + ax.plot( + x_ticks, + results[i, j].flat, + label="{}, {}".format(metric, 
format), + marker=format_markers[j], + color=metric_colors[i % len(metric_colors)], + ) ax.set_xlabel(x_label) - ax.set_ylabel('Time (s)') + ax.set_ylabel("Time (s)") ax.legend() plt.show() if __name__ == "__main__": ap = argparse.ArgumentParser() - ap.add_argument('metrics', nargs='*', default=sorted(METRICS), - help='Specifies metrics to benchmark, defaults to all. ' - 'Choices are: {}'.format(sorted(METRICS))) - ap.add_argument('--formats', nargs='+', choices=sorted(FORMATS), - help='Specifies multilabel formats to benchmark ' - '(defaults to all).') - ap.add_argument('--samples', type=int, default=1000, - help='The number of samples to generate') - ap.add_argument('--classes', type=int, default=10, - help='The number of classes') - ap.add_argument('--density', type=float, default=.2, - help='The average density of labels per sample') - ap.add_argument('--plot', choices=['classes', 'density', 'samples'], - default=None, - help='Plot time with respect to this parameter varying ' - 'up to the specified value') - ap.add_argument('--n-steps', default=10, type=int, - help='Plot this many points for each metric') - ap.add_argument('--n-times', - default=5, type=int, - help="Time performance over n_times trials") + ap.add_argument( + "metrics", + nargs="*", + default=sorted(METRICS), + help="Specifies metrics to benchmark, defaults to all. Choices are: {}".format( + sorted(METRICS) + ), + ) + ap.add_argument( + "--formats", + nargs="+", + choices=sorted(FORMATS), + help="Specifies multilabel formats to benchmark (defaults to all).", + ) + ap.add_argument( + "--samples", type=int, default=1000, help="The number of samples to generate" + ) + ap.add_argument("--classes", type=int, default=10, help="The number of classes") + ap.add_argument( + "--density", + type=float, + default=0.2, + help="The average density of labels per sample", + ) + ap.add_argument( + "--plot", + choices=["classes", "density", "samples"], + default=None, + help=( + "Plot time with respect to this parameter varying up to the specified value" + ), + ) + ap.add_argument( + "--n-steps", default=10, type=int, help="Plot this many points for each metric" + ) + ap.add_argument( + "--n-times", default=5, type=int, help="Time performance over n_times trials" + ) args = ap.parse_args() if args.plot is not None: max_val = getattr(args, args.plot) - if args.plot in ('classes', 'samples'): + if args.plot in ("classes", "samples"): min_val = 2 else: min_val = 0 steps = np.linspace(min_val, max_val, num=args.n_steps + 1)[1:] - if args.plot in ('classes', 'samples'): + if args.plot in ("classes", "samples"): steps = np.unique(np.round(steps).astype(int)) setattr(args, args.plot, steps) @@ -172,17 +206,22 @@ def _plot(results, metrics, formats, title, x_ticks, x_label, if args.formats is None: args.formats = sorted(FORMATS) - results = benchmark([METRICS[k] for k in args.metrics], - [FORMATS[k] for k in args.formats], - args.samples, args.classes, args.density, - args.n_times) + results = benchmark( + [METRICS[k] for k in args.metrics], + [FORMATS[k] for k in args.formats], + args.samples, + args.classes, + args.density, + args.n_times, + ) _tabulate(results, args.metrics, args.formats) if args.plot is not None: - print('Displaying plot', file=sys.stderr) - title = ('Multilabel metrics with %s' % - ', '.join('{0}={1}'.format(field, getattr(args, field)) - for field in ['samples', 'classes', 'density'] - if args.plot != field)) + print("Displaying plot", file=sys.stderr) + title = "Multilabel metrics with %s" % ", ".join( + 
"{0}={1}".format(field, getattr(args, field)) + for field in ["samples", "classes", "density"] + if args.plot != field + ) _plot(results, args.metrics, args.formats, title, steps, args.plot) diff --git a/benchmarks/bench_online_ocsvm.py b/benchmarks/bench_online_ocsvm.py new file mode 100644 index 0000000000000..9f92150e079dd --- /dev/null +++ b/benchmarks/bench_online_ocsvm.py @@ -0,0 +1,294 @@ +""" +===================================== +SGDOneClassSVM benchmark +===================================== +This benchmark compares the :class:`SGDOneClassSVM` with :class:`OneClassSVM`. +The former is an online One-Class SVM implemented with a Stochastic Gradient +Descent (SGD). The latter is based on the LibSVM implementation. The +complexity of :class:`SGDOneClassSVM` is linear in the number of samples +whereas the one of :class:`OneClassSVM` is at best quadratic in the number of +samples. We here compare the performance in terms of AUC and training time on +classical anomaly detection datasets. + +The :class:`OneClassSVM` is applied with a Gaussian kernel and we therefore +use a kernel approximation prior to the application of :class:`SGDOneClassSVM`. +""" + +from time import time + +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +from scipy.interpolate import interp1d + +from sklearn.datasets import fetch_covtype, fetch_kddcup99 +from sklearn.kernel_approximation import Nystroem +from sklearn.linear_model import SGDOneClassSVM +from sklearn.metrics import auc, roc_curve +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import LabelBinarizer, StandardScaler +from sklearn.svm import OneClassSVM +from sklearn.utils import shuffle + +font = {"weight": "normal", "size": 15} + +matplotlib.rc("font", **font) + +print(__doc__) + + +def print_outlier_ratio(y): + """ + Helper function to show the distinct value count of element in the target. + Useful indicator for the datasets used in bench_isolation_forest.py. 
+ """ + uniq, cnt = np.unique(y, return_counts=True) + print("----- Target count values: ") + for u, c in zip(uniq, cnt): + print("------ %s -> %d occurrences" % (str(u), c)) + print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y))) + + +# for roc curve computation +n_axis = 1000 +x_axis = np.linspace(0, 1, n_axis) + +datasets = ["http", "smtp", "SA", "SF", "forestcover"] + +novelty_detection = False # if False, training set polluted by outliers + +random_states = [42] +nu = 0.05 + +results_libsvm = np.empty((len(datasets), n_axis + 5)) +results_online = np.empty((len(datasets), n_axis + 5)) + +for dat, dataset_name in enumerate(datasets): + print(dataset_name) + + # Loading datasets + if dataset_name in ["http", "smtp", "SA", "SF"]: + dataset = fetch_kddcup99( + subset=dataset_name, shuffle=False, percent10=False, random_state=88 + ) + X = dataset.data + y = dataset.target + + if dataset_name == "forestcover": + dataset = fetch_covtype(shuffle=False) + X = dataset.data + y = dataset.target + # normal data are those with attribute 2 + # abnormal those with attribute 4 + s = (y == 2) + (y == 4) + X = X[s, :] + y = y[s] + y = (y != 2).astype(int) + + # Vectorizing data + if dataset_name == "SF": + # Casting type of X (object) as string is needed for string categorical + # features to apply LabelBinarizer + lb = LabelBinarizer() + x1 = lb.fit_transform(X[:, 1].astype(str)) + X = np.c_[X[:, :1], x1, X[:, 2:]] + y = (y != b"normal.").astype(int) + + if dataset_name == "SA": + lb = LabelBinarizer() + # Casting type of X (object) as string is needed for string categorical + # features to apply LabelBinarizer + x1 = lb.fit_transform(X[:, 1].astype(str)) + x2 = lb.fit_transform(X[:, 2].astype(str)) + x3 = lb.fit_transform(X[:, 3].astype(str)) + X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]] + y = (y != b"normal.").astype(int) + + if dataset_name in ["http", "smtp"]: + y = (y != b"normal.").astype(int) + + print_outlier_ratio(y) + + n_samples, n_features = np.shape(X) + if dataset_name == "SA": # LibSVM too long with n_samples // 2 + n_samples_train = n_samples // 20 + else: + n_samples_train = n_samples // 2 + + n_samples_test = n_samples - n_samples_train + print("n_train: ", n_samples_train) + print("n_features: ", n_features) + + tpr_libsvm = np.zeros(n_axis) + tpr_online = np.zeros(n_axis) + fit_time_libsvm = 0 + fit_time_online = 0 + predict_time_libsvm = 0 + predict_time_online = 0 + + X = X.astype(float) + + gamma = 1 / n_features # OCSVM default parameter + + for random_state in random_states: + print("random state: %s" % random_state) + + X, y = shuffle(X, y, random_state=random_state) + X_train = X[:n_samples_train] + X_test = X[n_samples_train:] + y_train = y[:n_samples_train] + y_test = y[n_samples_train:] + + if novelty_detection: + X_train = X_train[y_train == 0] + y_train = y_train[y_train == 0] + + std = StandardScaler() + + print("----------- LibSVM OCSVM ------------") + ocsvm = OneClassSVM(kernel="rbf", gamma=gamma, nu=nu) + pipe_libsvm = make_pipeline(std, ocsvm) + + tstart = time() + pipe_libsvm.fit(X_train) + fit_time_libsvm += time() - tstart + + tstart = time() + # scoring such that the lower, the more normal + scoring = -pipe_libsvm.decision_function(X_test) + predict_time_libsvm += time() - tstart + fpr_libsvm_, tpr_libsvm_, _ = roc_curve(y_test, scoring) + + f_libsvm = interp1d(fpr_libsvm_, tpr_libsvm_) + tpr_libsvm += f_libsvm(x_axis) + + print("----------- Online OCSVM ------------") + nystroem = Nystroem(gamma=gamma, random_state=random_state) + online_ocsvm = 
SGDOneClassSVM(nu=nu, random_state=random_state) + pipe_online = make_pipeline(std, nystroem, online_ocsvm) + + tstart = time() + pipe_online.fit(X_train) + fit_time_online += time() - tstart + + tstart = time() + # scoring such that the lower, the more normal + scoring = -pipe_online.decision_function(X_test) + predict_time_online += time() - tstart + fpr_online_, tpr_online_, _ = roc_curve(y_test, scoring) + + f_online = interp1d(fpr_online_, tpr_online_) + tpr_online += f_online(x_axis) + + tpr_libsvm /= len(random_states) + tpr_libsvm[0] = 0.0 + fit_time_libsvm /= len(random_states) + predict_time_libsvm /= len(random_states) + auc_libsvm = auc(x_axis, tpr_libsvm) + + results_libsvm[dat] = [ + fit_time_libsvm, + predict_time_libsvm, + auc_libsvm, + n_samples_train, + n_features, + ] + list(tpr_libsvm) + + tpr_online /= len(random_states) + tpr_online[0] = 0.0 + fit_time_online /= len(random_states) + predict_time_online /= len(random_states) + auc_online = auc(x_axis, tpr_online) + + results_online[dat] = [ + fit_time_online, + predict_time_online, + auc_online, + n_samples_train, + n_features, + ] + list(tpr_libsvm) + + +# -------- Plotting bar charts ------------- +fit_time_libsvm_all = results_libsvm[:, 0] +predict_time_libsvm_all = results_libsvm[:, 1] +auc_libsvm_all = results_libsvm[:, 2] +n_train_all = results_libsvm[:, 3] +n_features_all = results_libsvm[:, 4] + +fit_time_online_all = results_online[:, 0] +predict_time_online_all = results_online[:, 1] +auc_online_all = results_online[:, 2] + + +width = 0.7 +ind = 2 * np.arange(len(datasets)) +x_tickslabels = [ + (name + "\n" + r"$n={:,d}$" + "\n" + r"$d={:d}$").format(int(n), int(d)) + for name, n, d in zip(datasets, n_train_all, n_features_all) +] + + +def autolabel_auc(rects, ax): + """Attach a text label above each bar displaying its height.""" + for rect in rects: + height = rect.get_height() + ax.text( + rect.get_x() + rect.get_width() / 2.0, + 1.05 * height, + "%.3f" % height, + ha="center", + va="bottom", + ) + + +def autolabel_time(rects, ax): + """Attach a text label above each bar displaying its height.""" + for rect in rects: + height = rect.get_height() + ax.text( + rect.get_x() + rect.get_width() / 2.0, + 1.05 * height, + "%.1f" % height, + ha="center", + va="bottom", + ) + + +fig, ax = plt.subplots(figsize=(15, 8)) +ax.set_ylabel("AUC") +ax.set_ylim((0, 1.3)) +rect_libsvm = ax.bar(ind, auc_libsvm_all, width=width, color="r") +rect_online = ax.bar(ind + width, auc_online_all, width=width, color="y") +ax.legend((rect_libsvm[0], rect_online[0]), ("LibSVM", "Online SVM")) +ax.set_xticks(ind + width / 2) +ax.set_xticklabels(x_tickslabels) +autolabel_auc(rect_libsvm, ax) +autolabel_auc(rect_online, ax) +plt.show() + + +fig, ax = plt.subplots(figsize=(15, 8)) +ax.set_ylabel("Training time (sec) - Log scale") +ax.set_yscale("log") +rect_libsvm = ax.bar(ind, fit_time_libsvm_all, color="r", width=width) +rect_online = ax.bar(ind + width, fit_time_online_all, color="y", width=width) +ax.legend((rect_libsvm[0], rect_online[0]), ("LibSVM", "Online SVM")) +ax.set_xticks(ind + width / 2) +ax.set_xticklabels(x_tickslabels) +autolabel_time(rect_libsvm, ax) +autolabel_time(rect_online, ax) +plt.show() + + +fig, ax = plt.subplots(figsize=(15, 8)) +ax.set_ylabel("Testing time (sec) - Log scale") +ax.set_yscale("log") +rect_libsvm = ax.bar(ind, predict_time_libsvm_all, color="r", width=width) +rect_online = ax.bar(ind + width, predict_time_online_all, color="y", width=width) +ax.legend((rect_libsvm[0], rect_online[0]), ("LibSVM", 
"Online SVM")) +ax.set_xticks(ind + width / 2) +ax.set_xticklabels(x_tickslabels) +autolabel_time(rect_libsvm, ax) +autolabel_time(rect_online, ax) +plt.show() diff --git a/benchmarks/bench_pca_solvers.py b/benchmarks/bench_pca_solvers.py new file mode 100644 index 0000000000000..337af3a42e900 --- /dev/null +++ b/benchmarks/bench_pca_solvers.py @@ -0,0 +1,165 @@ +# %% +# +# This benchmark compares the speed of PCA solvers on datasets of different +# sizes in order to determine the best solver to select by default via the +# "auto" heuristic. +# +# Note: we do not control for the accuracy of the solvers: we assume that all +# solvers yield transformed data with similar explained variance. This +# assumption is generally true, except for the randomized solver that might +# require more power iterations. +# +# We generate synthetic data with dimensions that are useful to plot: +# - time vs n_samples for a fixed n_features and, +# - time vs n_features for a fixed n_samples for a fixed n_features. +import itertools +from math import log10 +from time import perf_counter + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +from sklearn import config_context +from sklearn.decomposition import PCA + +REF_DIMS = [100, 1000, 10_000] +data_shapes = [] +for ref_dim in REF_DIMS: + data_shapes.extend([(ref_dim, 10**i) for i in range(1, 8 - int(log10(ref_dim)))]) + data_shapes.extend( + [(ref_dim, 3 * 10**i) for i in range(1, 8 - int(log10(ref_dim)))] + ) + data_shapes.extend([(10**i, ref_dim) for i in range(1, 8 - int(log10(ref_dim)))]) + data_shapes.extend( + [(3 * 10**i, ref_dim) for i in range(1, 8 - int(log10(ref_dim)))] + ) + +# Remove duplicates: +data_shapes = sorted(set(data_shapes)) + +print("Generating test datasets...") +rng = np.random.default_rng(0) +datasets = [rng.normal(size=shape) for shape in data_shapes] + + +# %% +def measure_one(data, n_components, solver, method_name="fit"): + print( + f"Benchmarking {solver=!r}, {n_components=}, {method_name=!r} on data with" + f" shape {data.shape}" + ) + pca = PCA(n_components=n_components, svd_solver=solver, random_state=0) + timings = [] + elapsed = 0 + method = getattr(pca, method_name) + with config_context(assume_finite=True): + while elapsed < 0.5: + tic = perf_counter() + method(data) + duration = perf_counter() - tic + timings.append(duration) + elapsed += duration + return np.median(timings) + + +SOLVERS = ["full", "covariance_eigh", "arpack", "randomized", "auto"] +measurements = [] +for data, n_components, method_name in itertools.product( + datasets, [2, 50], ["fit", "fit_transform"] +): + if n_components >= min(data.shape): + continue + for solver in SOLVERS: + if solver == "covariance_eigh" and data.shape[1] > 5000: + # Too much memory and too slow. + continue + if solver in ["arpack", "full"] and log10(data.size) > 7: + # Too slow, in particular for the full solver. 
+ continue + time = measure_one(data, n_components, solver, method_name=method_name) + measurements.append( + { + "n_components": n_components, + "n_samples": data.shape[0], + "n_features": data.shape[1], + "time": time, + "solver": solver, + "method_name": method_name, + } + ) +measurements = pd.DataFrame(measurements) +measurements.to_csv("bench_pca_solvers.csv", index=False) + +# %% +all_method_names = measurements["method_name"].unique() +all_n_components = measurements["n_components"].unique() + +for method_name in all_method_names: + fig, axes = plt.subplots( + figsize=(16, 16), + nrows=len(REF_DIMS), + ncols=len(all_n_components), + sharey=True, + constrained_layout=True, + ) + fig.suptitle(f"Benchmarks for PCA.{method_name}, varying n_samples", fontsize=16) + + for row_idx, ref_dim in enumerate(REF_DIMS): + for n_components, ax in zip(all_n_components, axes[row_idx]): + for solver in SOLVERS: + if solver == "auto": + style_kwargs = dict(linewidth=2, color="black", style="--") + else: + style_kwargs = dict(style="o-") + ax.set( + title=f"n_components={n_components}, n_features={ref_dim}", + ylabel="time (s)", + ) + measurements.query( + "n_components == @n_components and n_features == @ref_dim" + " and solver == @solver and method_name == @method_name" + ).plot.line( + x="n_samples", + y="time", + label=solver, + logx=True, + logy=True, + ax=ax, + **style_kwargs, + ) +# %% +for method_name in all_method_names: + fig, axes = plt.subplots( + figsize=(16, 16), + nrows=len(REF_DIMS), + ncols=len(all_n_components), + sharey=True, + ) + fig.suptitle(f"Benchmarks for PCA.{method_name}, varying n_features", fontsize=16) + + for row_idx, ref_dim in enumerate(REF_DIMS): + for n_components, ax in zip(all_n_components, axes[row_idx]): + for solver in SOLVERS: + if solver == "auto": + style_kwargs = dict(linewidth=2, color="black", style="--") + else: + style_kwargs = dict(style="o-") + ax.set( + title=f"n_components={n_components}, n_samples={ref_dim}", + ylabel="time (s)", + ) + measurements.query( + "n_components == @n_components and n_samples == @ref_dim " + " and solver == @solver and method_name == @method_name" + ).plot.line( + x="n_features", + y="time", + label=solver, + logx=True, + logy=True, + ax=ax, + **style_kwargs, + ) + +# %% diff --git a/benchmarks/bench_plot_fastkmeans.py b/benchmarks/bench_plot_fastkmeans.py index dd37f159a5c91..d5a2d10fbf22d 100644 --- a/benchmarks/bench_plot_fastkmeans.py +++ b/benchmarks/bench_plot_fastkmeans.py @@ -8,7 +8,6 @@ def compute_bench(samples_range, features_range): - it = 0 results = defaultdict(lambda: []) chunk = 100 @@ -17,29 +16,29 @@ def compute_bench(samples_range, features_range): for n_samples in samples_range: for n_features in features_range: it += 1 - print('==============================') - print('Iteration %03d of %03d' % (it, max_it)) - print('==============================') + print("==============================") + print("Iteration %03d of %03d" % (it, max_it)) + print("==============================") print() data = nr.randint(-50, 51, (n_samples, n_features)) - print('K-Means') + print("K-Means") tstart = time() - kmeans = KMeans(init='k-means++', n_clusters=10).fit(data) + kmeans = KMeans(init="k-means++", n_clusters=10).fit(data) delta = time() - tstart print("Speed: %0.3fs" % delta) print("Inertia: %0.5f" % kmeans.inertia_) print() - results['kmeans_speed'].append(delta) - results['kmeans_quality'].append(kmeans.inertia_) + results["kmeans_speed"].append(delta) + results["kmeans_quality"].append(kmeans.inertia_) - 
print('Fast K-Means') + print("Fast K-Means") # let's prepare the data in small chunks - mbkmeans = MiniBatchKMeans(init='k-means++', - n_clusters=10, - batch_size=chunk) + mbkmeans = MiniBatchKMeans( + init="k-means++", n_clusters=10, batch_size=chunk + ) tstart = time() mbkmeans.fit(data) delta = time() - tstart @@ -48,8 +47,8 @@ def compute_bench(samples_range, features_range): print() print() - results['MiniBatchKMeans Speed'].append(delta) - results['MiniBatchKMeans Quality'].append(mbkmeans.inertia_) + results["MiniBatchKMeans Speed"].append(delta) + results["MiniBatchKMeans Quality"].append(mbkmeans.inertia_) return results @@ -57,8 +56,18 @@ def compute_bench(samples_range, features_range): def compute_bench_2(chunks): results = defaultdict(lambda: []) n_features = 50000 - means = np.array([[1, 1], [-1, -1], [1, -1], [-1, 1], - [0.5, 0.5], [0.75, -0.5], [-1, 0.75], [1, 0]]) + means = np.array( + [ + [1, 1], + [-1, -1], + [1, -1], + [-1, 1], + [0.5, 0.5], + [0.75, -0.5], + [-1, 0.75], + [1, 0], + ] + ) X = np.empty((0, 2)) for i in range(8): X = np.r_[X, means[i] + 0.8 * np.random.randn(n_features, 2)] @@ -66,16 +75,14 @@ def compute_bench_2(chunks): it = 0 for chunk in chunks: it += 1 - print('==============================') - print('Iteration %03d of %03d' % (it, max_it)) - print('==============================') + print("==============================") + print("Iteration %03d of %03d" % (it, max_it)) + print("==============================") print() - print('Fast K-Means') + print("Fast K-Means") tstart = time() - mbkmeans = MiniBatchKMeans(init='k-means++', - n_clusters=8, - batch_size=chunk) + mbkmeans = MiniBatchKMeans(init="k-means++", n_clusters=8, batch_size=chunk) mbkmeans.fit(X) delta = time() - tstart @@ -83,15 +90,15 @@ def compute_bench_2(chunks): print("Inertia: %0.3fs" % mbkmeans.inertia_) print() - results['MiniBatchKMeans Speed'].append(delta) - results['MiniBatchKMeans Quality'].append(mbkmeans.inertia_) + results["MiniBatchKMeans Speed"].append(delta) + results["MiniBatchKMeans Quality"].append(mbkmeans.inertia_) return results -if __name__ == '__main__': - from mpl_toolkits.mplot3d import axes3d # register the 3d projection +if __name__ == "__main__": import matplotlib.pyplot as plt + from mpl_toolkits.mplot3d import axes3d # register the 3d projection # noqa: F401 samples_range = np.linspace(50, 150, 5).astype(int) features_range = np.linspace(150, 50000, 5).astype(int) @@ -100,37 +107,35 @@ def compute_bench_2(chunks): results = compute_bench(samples_range, features_range) results_2 = compute_bench_2(chunks) - max_time = max([max(i) for i in [t for (label, t) in results.items() - if "speed" in label]]) - max_inertia = max([max(i) for i in [ - t for (label, t) in results.items() - if "speed" not in label]]) - - fig = plt.figure('scikit-learn K-Means benchmark results') - for c, (label, timings) in zip('brcy', - sorted(results.items())): - if 'speed' in label: - ax = fig.add_subplot(2, 2, 1, projection='3d') + max_time = max( + [max(i) for i in [t for (label, t) in results.items() if "speed" in label]] + ) + max_inertia = max( + [max(i) for i in [t for (label, t) in results.items() if "speed" not in label]] + ) + + fig = plt.figure("scikit-learn K-Means benchmark results") + for c, (label, timings) in zip("brcy", sorted(results.items())): + if "speed" in label: + ax = fig.add_subplot(2, 2, 1, projection="3d") ax.set_zlim3d(0.0, max_time * 1.1) else: - ax = fig.add_subplot(2, 2, 2, projection='3d') + ax = fig.add_subplot(2, 2, 2, projection="3d") 
ax.set_zlim3d(0.0, max_inertia * 1.1) X, Y = np.meshgrid(samples_range, features_range) - Z = np.asarray(timings).reshape(samples_range.shape[0], - features_range.shape[0]) + Z = np.asarray(timings).reshape(samples_range.shape[0], features_range.shape[0]) ax.plot_surface(X, Y, Z.T, cstride=1, rstride=1, color=c, alpha=0.5) - ax.set_xlabel('n_samples') - ax.set_ylabel('n_features') + ax.set_xlabel("n_samples") + ax.set_ylabel("n_features") i = 0 - for c, (label, timings) in zip('br', - sorted(results_2.items())): + for c, (label, timings) in zip("br", sorted(results_2.items())): i += 1 ax = fig.add_subplot(2, 2, i + 2) y = np.asarray(timings) ax.plot(chunks, y, color=c, alpha=0.8) - ax.set_xlabel('Chunks') + ax.set_xlabel("Chunks") ax.set_ylabel(label) plt.show() diff --git a/benchmarks/bench_plot_hierarchical.py b/benchmarks/bench_plot_hierarchical.py index 72c3f36616ff4..861a0ea0b5296 100644 --- a/benchmarks/bench_plot_hierarchical.py +++ b/benchmarks/bench_plot_hierarchical.py @@ -8,7 +8,6 @@ def compute_bench(samples_range, features_range): - it = 0 results = defaultdict(lambda: []) @@ -16,20 +15,17 @@ def compute_bench(samples_range, features_range): for n_samples in samples_range: for n_features in features_range: it += 1 - print('==============================') - print('Iteration %03d of %03d' % (it, max_it)) - print('n_samples %05d; n_features %02d' % (n_samples, n_features)) - print('==============================') + print("==============================") + print("Iteration %03d of %03d" % (it, max_it)) + print("n_samples %05d; n_features %02d" % (n_samples, n_features)) + print("==============================") print() data = nr.randint(-50, 51, (n_samples, n_features)) for linkage in ("single", "average", "complete", "ward"): print(linkage.capitalize()) tstart = time() - AgglomerativeClustering( - linkage=linkage, - n_clusters=10 - ).fit(data) + AgglomerativeClustering(linkage=linkage, n_clusters=10).fit(data) delta = time() - tstart print("Speed: %0.3fs" % delta) @@ -40,7 +36,7 @@ def compute_bench(samples_range, features_range): return results -if __name__ == '__main__': +if __name__ == "__main__": import matplotlib.pyplot as plt samples_range = np.linspace(1000, 15000, 8).astype(int) @@ -50,36 +46,32 @@ def compute_bench(samples_range, features_range): max_time = max([max(i) for i in [t for (label, t) in results.items()]]) - colors = plt.get_cmap('tab10')(np.linspace(0, 1, 10))[:4] + colors = plt.get_cmap("tab10")(np.linspace(0, 1, 10))[:4] lines = {linkage: None for linkage in results.keys()} fig, axs = plt.subplots(2, 2, sharex=True, sharey=True) - fig.suptitle( - 'Scikit-learn agglomerative clustering benchmark results', - fontsize=16 - ) - for c, (label, timings) in zip(colors, - sorted(results.items())): + fig.suptitle("Scikit-learn agglomerative clustering benchmark results", fontsize=16) + for c, (label, timings) in zip(colors, sorted(results.items())): timing_by_samples = np.asarray(timings).reshape( - samples_range.shape[0], - features_range.shape[0] + samples_range.shape[0], features_range.shape[0] ) for n in range(timing_by_samples.shape[1]): ax = axs.flatten()[n] - lines[label], = ax.plot( - samples_range, - timing_by_samples[:, n], - color=c, - label=label + (lines[label],) = ax.plot( + samples_range, timing_by_samples[:, n], color=c, label=label ) - ax.set_title('n_features = %d' % features_range[n]) + ax.set_title("n_features = %d" % features_range[n]) if n >= 2: - ax.set_xlabel('n_samples') + ax.set_xlabel("n_samples") if n % 2 == 0: - ax.set_ylabel('time 
(s)') + ax.set_ylabel("time (s)") fig.subplots_adjust(right=0.8) - fig.legend([lines[link] for link in sorted(results.keys())], - sorted(results.keys()), loc="center right", fontsize=8) + fig.legend( + [lines[link] for link in sorted(results.keys())], + sorted(results.keys()), + loc="center right", + fontsize=8, + ) plt.show() diff --git a/benchmarks/bench_plot_incremental_pca.py b/benchmarks/bench_plot_incremental_pca.py index 8579abcae3bed..49b87c8c7060a 100644 --- a/benchmarks/bench_plot_incremental_pca.py +++ b/benchmarks/bench_plot_incremental_pca.py @@ -7,17 +7,19 @@ """ -import numpy as np import gc -from time import time from collections import defaultdict +from time import time + import matplotlib.pyplot as plt +import numpy as np + from sklearn.datasets import fetch_lfw_people -from sklearn.decomposition import IncrementalPCA, PCA +from sklearn.decomposition import PCA, IncrementalPCA def plot_results(X, y, label): - plt.plot(X, y, label=label, marker='o') + plt.plot(X, y, label=label, marker="o") def benchmark(estimator, data): @@ -29,60 +31,68 @@ def benchmark(estimator, data): data_t = estimator.transform(data) data_r = estimator.inverse_transform(data_t) reconstruction_error = np.mean(np.abs(data - data_r)) - return {'time': training_time, 'error': reconstruction_error} + return {"time": training_time, "error": reconstruction_error} def plot_feature_times(all_times, batch_size, all_components, data): plt.figure() - plot_results(all_components, all_times['pca'], label="PCA") - plot_results(all_components, all_times['ipca'], - label="IncrementalPCA, bsize=%i" % batch_size) + plot_results(all_components, all_times["pca"], label="PCA") + plot_results( + all_components, all_times["ipca"], label="IncrementalPCA, bsize=%i" % batch_size + ) plt.legend(loc="upper left") - plt.suptitle("Algorithm runtime vs. n_components\n \ - LFW, size %i x %i" % data.shape) + plt.suptitle( + "Algorithm runtime vs. n_components\n LFW, size %i x %i" + % data.shape + ) plt.xlabel("Number of components (out of max %i)" % data.shape[1]) plt.ylabel("Time (seconds)") def plot_feature_errors(all_errors, batch_size, all_components, data): plt.figure() - plot_results(all_components, all_errors['pca'], label="PCA") - plot_results(all_components, all_errors['ipca'], - label="IncrementalPCA, bsize=%i" % batch_size) + plot_results(all_components, all_errors["pca"], label="PCA") + plot_results( + all_components, + all_errors["ipca"], + label="IncrementalPCA, bsize=%i" % batch_size, + ) plt.legend(loc="lower left") - plt.suptitle("Algorithm error vs. n_components\n" - "LFW, size %i x %i" % data.shape) + plt.suptitle("Algorithm error vs. n_components\nLFW, size %i x %i" % data.shape) plt.xlabel("Number of components (out of max %i)" % data.shape[1]) plt.ylabel("Mean absolute error") def plot_batch_times(all_times, n_features, all_batch_sizes, data): plt.figure() - plot_results(all_batch_sizes, all_times['pca'], label="PCA") - plot_results(all_batch_sizes, all_times['ipca'], label="IncrementalPCA") + plot_results(all_batch_sizes, all_times["pca"], label="PCA") + plot_results(all_batch_sizes, all_times["ipca"], label="IncrementalPCA") plt.legend(loc="lower left") - plt.suptitle("Algorithm runtime vs. batch_size for n_components %i\n \ - LFW, size %i x %i" % ( - n_features, data.shape[0], data.shape[1])) + plt.suptitle( + "Algorithm runtime vs. 
batch_size for n_components %i\n LFW," + " size %i x %i" % (n_features, data.shape[0], data.shape[1]) + ) plt.xlabel("Batch size") plt.ylabel("Time (seconds)") def plot_batch_errors(all_errors, n_features, all_batch_sizes, data): plt.figure() - plot_results(all_batch_sizes, all_errors['pca'], label="PCA") - plot_results(all_batch_sizes, all_errors['ipca'], label="IncrementalPCA") + plot_results(all_batch_sizes, all_errors["pca"], label="PCA") + plot_results(all_batch_sizes, all_errors["ipca"], label="IncrementalPCA") plt.legend(loc="lower left") - plt.suptitle("Algorithm error vs. batch_size for n_components %i\n \ - LFW, size %i x %i" % ( - n_features, data.shape[0], data.shape[1])) + plt.suptitle( + "Algorithm error vs. batch_size for n_components %i\n LFW," + " size %i x %i" % (n_features, data.shape[0], data.shape[1]) + ) plt.xlabel("Batch size") plt.ylabel("Mean absolute error") def fixed_batch_size_comparison(data): - all_features = [i.astype(int) for i in np.linspace(data.shape[1] // 10, - data.shape[1], num=5)] + all_features = [ + i.astype(int) for i in np.linspace(data.shape[1] // 10, data.shape[1], num=5) + ] batch_size = 1000 # Compare runtimes and error for fixed batch size all_times = defaultdict(list) @@ -90,53 +100,52 @@ def fixed_batch_size_comparison(data): for n_components in all_features: pca = PCA(n_components=n_components) ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size) - results_dict = {k: benchmark(est, data) for k, est in [('pca', pca), - ('ipca', ipca)]} + results_dict = { + k: benchmark(est, data) for k, est in [("pca", pca), ("ipca", ipca)] + } for k in sorted(results_dict.keys()): - all_times[k].append(results_dict[k]['time']) - all_errors[k].append(results_dict[k]['error']) + all_times[k].append(results_dict[k]["time"]) + all_errors[k].append(results_dict[k]["error"]) plot_feature_times(all_times, batch_size, all_features, data) plot_feature_errors(all_errors, batch_size, all_features, data) def variable_batch_size_comparison(data): - batch_sizes = [i.astype(int) for i in np.linspace(data.shape[0] // 10, - data.shape[0], num=10)] + batch_sizes = [ + i.astype(int) for i in np.linspace(data.shape[0] // 10, data.shape[0], num=10) + ] - for n_components in [i.astype(int) for i in - np.linspace(data.shape[1] // 10, - data.shape[1], num=4)]: + for n_components in [ + i.astype(int) for i in np.linspace(data.shape[1] // 10, data.shape[1], num=4) + ]: all_times = defaultdict(list) all_errors = defaultdict(list) pca = PCA(n_components=n_components) - rpca = PCA(n_components=n_components, svd_solver='randomized', - random_state=1999) - results_dict = {k: benchmark(est, data) for k, est in [('pca', pca), - ('rpca', rpca)]} + rpca = PCA( + n_components=n_components, svd_solver="randomized", random_state=1999 + ) + results_dict = { + k: benchmark(est, data) for k, est in [("pca", pca), ("rpca", rpca)] + } # Create flat baselines to compare the variation over batch size - all_times['pca'].extend([results_dict['pca']['time']] * - len(batch_sizes)) - all_errors['pca'].extend([results_dict['pca']['error']] * - len(batch_sizes)) - all_times['rpca'].extend([results_dict['rpca']['time']] * - len(batch_sizes)) - all_errors['rpca'].extend([results_dict['rpca']['error']] * - len(batch_sizes)) + all_times["pca"].extend([results_dict["pca"]["time"]] * len(batch_sizes)) + all_errors["pca"].extend([results_dict["pca"]["error"]] * len(batch_sizes)) + all_times["rpca"].extend([results_dict["rpca"]["time"]] * len(batch_sizes)) + 
all_errors["rpca"].extend([results_dict["rpca"]["error"]] * len(batch_sizes)) for batch_size in batch_sizes: - ipca = IncrementalPCA(n_components=n_components, - batch_size=batch_size) - results_dict = {k: benchmark(est, data) for k, est in [('ipca', - ipca)]} - all_times['ipca'].append(results_dict['ipca']['time']) - all_errors['ipca'].append(results_dict['ipca']['error']) + ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size) + results_dict = {k: benchmark(est, data) for k, est in [("ipca", ipca)]} + all_times["ipca"].append(results_dict["ipca"]["time"]) + all_errors["ipca"].append(results_dict["ipca"]["error"]) plot_batch_times(all_times, n_components, batch_sizes, data) plot_batch_errors(all_errors, n_components, batch_sizes, data) -faces = fetch_lfw_people(resize=.2, min_faces_per_person=5) + +faces = fetch_lfw_people(resize=0.2, min_faces_per_person=5) # limit dataset to 5000 people (don't care who they are!) X = faces.data[:5000] n_samples, h, w = faces.images.shape diff --git a/benchmarks/bench_plot_lasso_path.py b/benchmarks/bench_plot_lasso_path.py index a1530bb5c06c4..9acc1b4b35952 100644 --- a/benchmarks/bench_plot_lasso_path.py +++ b/benchmarks/bench_plot_lasso_path.py @@ -2,20 +2,19 @@ The input data is mostly low rank but is a fat infinite tail. """ -from collections import defaultdict + import gc import sys +from collections import defaultdict from time import time import numpy as np -from sklearn.linear_model import lars_path, lars_path_gram -from sklearn.linear_model import lasso_path from sklearn.datasets import make_regression +from sklearn.linear_model import lars_path, lars_path_gram, lasso_path def compute_bench(samples_range, features_range): - it = 0 results = defaultdict(lambda: []) @@ -24,65 +23,65 @@ def compute_bench(samples_range, features_range): for n_samples in samples_range: for n_features in features_range: it += 1 - print('====================') - print('Iteration %03d of %03d' % (it, max_it)) - print('====================') + print("====================") + print("Iteration %03d of %03d" % (it, max_it)) + print("====================") dataset_kwargs = { - 'n_samples': n_samples, - 'n_features': n_features, - 'n_informative': n_features // 10, - 'effective_rank': min(n_samples, n_features) / 10, - #'effective_rank': None, - 'bias': 0.0, + "n_samples": n_samples, + "n_features": n_features, + "n_informative": n_features // 10, + "effective_rank": min(n_samples, n_features) / 10, + # 'effective_rank': None, + "bias": 0.0, } print("n_samples: %d" % n_samples) print("n_features: %d" % n_features) X, y = make_regression(**dataset_kwargs) gc.collect() - print("benchmarking lars_path (with Gram):", end='') + print("benchmarking lars_path (with Gram):", end="") sys.stdout.flush() tstart = time() G = np.dot(X.T, X) # precomputed Gram matrix Xy = np.dot(X.T, y) - lars_path_gram(Xy=Xy, Gram=G, n_samples=y.size, method='lasso') + lars_path_gram(Xy=Xy, Gram=G, n_samples=y.size, method="lasso") delta = time() - tstart print("%0.3fs" % delta) - results['lars_path (with Gram)'].append(delta) + results["lars_path (with Gram)"].append(delta) gc.collect() - print("benchmarking lars_path (without Gram):", end='') + print("benchmarking lars_path (without Gram):", end="") sys.stdout.flush() tstart = time() - lars_path(X, y, method='lasso') + lars_path(X, y, method="lasso") delta = time() - tstart print("%0.3fs" % delta) - results['lars_path (without Gram)'].append(delta) + results["lars_path (without Gram)"].append(delta) gc.collect() - print("benchmarking 
lasso_path (with Gram):", end='') + print("benchmarking lasso_path (with Gram):", end="") sys.stdout.flush() tstart = time() lasso_path(X, y, precompute=True) delta = time() - tstart print("%0.3fs" % delta) - results['lasso_path (with Gram)'].append(delta) + results["lasso_path (with Gram)"].append(delta) gc.collect() - print("benchmarking lasso_path (without Gram):", end='') + print("benchmarking lasso_path (without Gram):", end="") sys.stdout.flush() tstart = time() lasso_path(X, y, precompute=False) delta = time() - tstart print("%0.3fs" % delta) - results['lasso_path (without Gram)'].append(delta) + results["lasso_path (without Gram)"].append(delta) return results -if __name__ == '__main__': - from mpl_toolkits.mplot3d import axes3d # register the 3d projection +if __name__ == "__main__": import matplotlib.pyplot as plt + from mpl_toolkits.mplot3d import axes3d # register the 3d projection # noqa: F401 samples_range = np.linspace(10, 2000, 5).astype(int) features_range = np.linspace(10, 2000, 5).astype(int) @@ -90,13 +89,12 @@ def compute_bench(samples_range, features_range): max_time = max(max(t) for t in results.values()) - fig = plt.figure('scikit-learn Lasso path benchmark results') + fig = plt.figure("scikit-learn Lasso path benchmark results") i = 1 - for c, (label, timings) in zip('bcry', sorted(results.items())): - ax = fig.add_subplot(2, 2, i, projection='3d') + for c, (label, timings) in zip("bcry", sorted(results.items())): + ax = fig.add_subplot(2, 2, i, projection="3d") X, Y = np.meshgrid(samples_range, features_range) - Z = np.asarray(timings).reshape(samples_range.shape[0], - features_range.shape[0]) + Z = np.asarray(timings).reshape(samples_range.shape[0], features_range.shape[0]) # plot the actual surface ax.plot_surface(X, Y, Z.T, cstride=1, rstride=1, color=c, alpha=0.8) @@ -105,9 +103,9 @@ def compute_bench(samples_range, features_range): # support legends (yet?) 
# ax.plot([1], [1], [1], color=c, label=label) - ax.set_xlabel('n_samples') - ax.set_ylabel('n_features') - ax.set_zlabel('Time (s)') + ax.set_xlabel("n_samples") + ax.set_ylabel("n_features") + ax.set_zlabel("Time (s)") ax.set_zlim3d(0.0, max_time * 1.1) ax.set_title(label) # ax.legend() diff --git a/benchmarks/bench_plot_neighbors.py b/benchmarks/bench_plot_neighbors.py index 85a8586af024c..2cedb19fb23c4 100644 --- a/benchmarks/bench_plot_neighbors.py +++ b/benchmarks/bench_plot_neighbors.py @@ -1,20 +1,21 @@ """ Plot the scaling of the nearest neighbors algorithms with k, D, and N """ + from time import time -import numpy as np import matplotlib.pyplot as plt +import numpy as np from matplotlib import ticker -from sklearn import neighbors, datasets +from sklearn import datasets, neighbors -def get_data(N, D, dataset='dense'): - if dataset == 'dense': +def get_data(N, D, dataset="dense"): + if dataset == "dense": np.random.seed(0) return np.random.random((N, D)) - elif dataset == 'digits': + elif dataset == "digits": X, _ = datasets.load_digits(return_X_y=True) i = np.argsort(X[0])[::-1] X = X[:, i] @@ -23,129 +24,121 @@ def get_data(N, D, dataset='dense'): raise ValueError("invalid dataset: %s" % dataset) -def barplot_neighbors(Nrange=2 ** np.arange(1, 11), - Drange=2 ** np.arange(7), - krange=2 ** np.arange(10), - N=1000, - D=64, - k=5, - leaf_size=30, - dataset='digits'): - algorithms = ('kd_tree', 'brute', 'ball_tree') - fiducial_values = {'N': N, - 'D': D, - 'k': k} - - #------------------------------------------------------------ +def barplot_neighbors( + Nrange=2 ** np.arange(1, 11), + Drange=2 ** np.arange(7), + krange=2 ** np.arange(10), + N=1000, + D=64, + k=5, + leaf_size=30, + dataset="digits", +): + algorithms = ("kd_tree", "brute", "ball_tree") + fiducial_values = {"N": N, "D": D, "k": k} + + # ------------------------------------------------------------ # varying N - N_results_build = {alg: np.zeros(len(Nrange)) - for alg in algorithms} - N_results_query = {alg: np.zeros(len(Nrange)) - for alg in algorithms} + N_results_build = {alg: np.zeros(len(Nrange)) for alg in algorithms} + N_results_query = {alg: np.zeros(len(Nrange)) for alg in algorithms} for i, NN in enumerate(Nrange): print("N = %i (%i out of %i)" % (NN, i + 1, len(Nrange))) X = get_data(NN, D, dataset) for algorithm in algorithms: - nbrs = neighbors.NearestNeighbors(n_neighbors=min(NN, k), - algorithm=algorithm, - leaf_size=leaf_size) + nbrs = neighbors.NearestNeighbors( + n_neighbors=min(NN, k), algorithm=algorithm, leaf_size=leaf_size + ) t0 = time() nbrs.fit(X) t1 = time() nbrs.kneighbors(X) t2 = time() - N_results_build[algorithm][i] = (t1 - t0) - N_results_query[algorithm][i] = (t2 - t1) + N_results_build[algorithm][i] = t1 - t0 + N_results_query[algorithm][i] = t2 - t1 - #------------------------------------------------------------ + # ------------------------------------------------------------ # varying D - D_results_build = {alg: np.zeros(len(Drange)) - for alg in algorithms} - D_results_query = {alg: np.zeros(len(Drange)) - for alg in algorithms} + D_results_build = {alg: np.zeros(len(Drange)) for alg in algorithms} + D_results_query = {alg: np.zeros(len(Drange)) for alg in algorithms} for i, DD in enumerate(Drange): print("D = %i (%i out of %i)" % (DD, i + 1, len(Drange))) X = get_data(N, DD, dataset) for algorithm in algorithms: - nbrs = neighbors.NearestNeighbors(n_neighbors=k, - algorithm=algorithm, - leaf_size=leaf_size) + nbrs = neighbors.NearestNeighbors( + n_neighbors=k, algorithm=algorithm, 
leaf_size=leaf_size + ) t0 = time() nbrs.fit(X) t1 = time() nbrs.kneighbors(X) t2 = time() - D_results_build[algorithm][i] = (t1 - t0) - D_results_query[algorithm][i] = (t2 - t1) + D_results_build[algorithm][i] = t1 - t0 + D_results_query[algorithm][i] = t2 - t1 - #------------------------------------------------------------ + # ------------------------------------------------------------ # varying k - k_results_build = {alg: np.zeros(len(krange)) - for alg in algorithms} - k_results_query = {alg: np.zeros(len(krange)) - for alg in algorithms} + k_results_build = {alg: np.zeros(len(krange)) for alg in algorithms} + k_results_query = {alg: np.zeros(len(krange)) for alg in algorithms} X = get_data(N, DD, dataset) for i, kk in enumerate(krange): print("k = %i (%i out of %i)" % (kk, i + 1, len(krange))) for algorithm in algorithms: - nbrs = neighbors.NearestNeighbors(n_neighbors=kk, - algorithm=algorithm, - leaf_size=leaf_size) + nbrs = neighbors.NearestNeighbors( + n_neighbors=kk, algorithm=algorithm, leaf_size=leaf_size + ) t0 = time() nbrs.fit(X) t1 = time() nbrs.kneighbors(X) t2 = time() - k_results_build[algorithm][i] = (t1 - t0) - k_results_query[algorithm][i] = (t2 - t1) + k_results_build[algorithm][i] = t1 - t0 + k_results_query[algorithm][i] = t2 - t1 plt.figure(figsize=(8, 11)) - for (sbplt, vals, quantity, - build_time, query_time) in [(311, Nrange, 'N', - N_results_build, - N_results_query), - (312, Drange, 'D', - D_results_build, - D_results_query), - (313, krange, 'k', - k_results_build, - k_results_query)]: - ax = plt.subplot(sbplt, yscale='log') + for sbplt, vals, quantity, build_time, query_time in [ + (311, Nrange, "N", N_results_build, N_results_query), + (312, Drange, "D", D_results_build, D_results_query), + (313, krange, "k", k_results_build, k_results_query), + ]: + ax = plt.subplot(sbplt, yscale="log") plt.grid(True) tick_vals = [] tick_labels = [] - bottom = 10 ** np.min([min(np.floor(np.log10(build_time[alg]))) - for alg in algorithms]) + bottom = 10 ** np.min( + [min(np.floor(np.log10(build_time[alg]))) for alg in algorithms] + ) for i, alg in enumerate(algorithms): xvals = 0.1 + i * (1 + len(vals)) + np.arange(len(vals)) width = 0.8 - c_bar = plt.bar(xvals, build_time[alg] - bottom, - width, bottom, color='r') - q_bar = plt.bar(xvals, query_time[alg], - width, build_time[alg], color='b') + c_bar = plt.bar(xvals, build_time[alg] - bottom, width, bottom, color="r") + q_bar = plt.bar(xvals, query_time[alg], width, build_time[alg], color="b") tick_vals += list(xvals + 0.5 * width) - tick_labels += ['%i' % val for val in vals] + tick_labels += ["%i" % val for val in vals] - plt.text((i + 0.02) / len(algorithms), 0.98, alg, - transform=ax.transAxes, - ha='left', - va='top', - bbox=dict(facecolor='w', edgecolor='w', alpha=0.5)) + plt.text( + (i + 0.02) / len(algorithms), + 0.98, + alg, + transform=ax.transAxes, + ha="left", + va="top", + bbox=dict(facecolor="w", edgecolor="w", alpha=0.5), + ) - plt.ylabel('Time (s)') + plt.ylabel("Time (s)") ax.xaxis.set_major_locator(ticker.FixedLocator(tick_vals)) ax.xaxis.set_major_formatter(ticker.FixedFormatter(tick_labels)) @@ -154,32 +147,45 @@ def barplot_neighbors(Nrange=2 ** np.arange(1, 11), label.set_rotation(-90) label.set_fontsize(10) - title_string = 'Varying %s' % quantity + title_string = "Varying %s" % quantity - descr_string = '' + descr_string = "" - for s in 'NDk': + for s in "NDk": if s == quantity: pass else: - descr_string += '%s = %i, ' % (s, fiducial_values[s]) + descr_string += "%s = %i, " % (s, 
fiducial_values[s]) descr_string = descr_string[:-2] - plt.text(1.01, 0.5, title_string, - transform=ax.transAxes, rotation=-90, - ha='left', va='center', fontsize=20) - - plt.text(0.99, 0.5, descr_string, - transform=ax.transAxes, rotation=-90, - ha='right', va='center') + plt.text( + 1.01, + 0.5, + title_string, + transform=ax.transAxes, + rotation=-90, + ha="left", + va="center", + fontsize=20, + ) + + plt.text( + 0.99, + 0.5, + descr_string, + transform=ax.transAxes, + rotation=-90, + ha="right", + va="center", + ) plt.gcf().suptitle("%s data set" % dataset.capitalize(), fontsize=16) - plt.figlegend((c_bar, q_bar), ('construction', 'N-point query'), - 'upper right') + plt.figlegend((c_bar, q_bar), ("construction", "N-point query"), "upper right") + -if __name__ == '__main__': - barplot_neighbors(dataset='digits') - barplot_neighbors(dataset='dense') +if __name__ == "__main__": + barplot_neighbors(dataset="digits") + barplot_neighbors(dataset="dense") plt.show() diff --git a/benchmarks/bench_plot_nmf.py b/benchmarks/bench_plot_nmf.py index 48f1dd1891392..76d1a6de8286c 100644 --- a/benchmarks/bench_plot_nmf.py +++ b/benchmarks/bench_plot_nmf.py @@ -1,34 +1,30 @@ """ Benchmarks of Non-Negative Matrix Factorization """ -# Authors: Tom Dupre la Tour (benchmark) -# Chih-Jen Linn (original projected gradient NMF implementation) -# Anthony Di Franco (projected gradient, Python and NumPy port) -# License: BSD 3 clause -from time import time +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers import sys import warnings -import numbers +from time import time -import numpy as np import matplotlib.pyplot as plt -from joblib import Memory +import numpy as np import pandas +from joblib import Memory -from sklearn.utils._testing import ignore_warnings -from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.decomposition import NMF -from sklearn.decomposition._nmf import _initialize_nmf -from sklearn.decomposition._nmf import _beta_divergence -from sklearn.decomposition._nmf import _check_init +from sklearn.decomposition._nmf import _beta_divergence, _check_init, _initialize_nmf from sklearn.exceptions import ConvergenceWarning -from sklearn.utils.extmath import safe_sparse_dot, squared_norm +from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.utils import check_array +from sklearn.utils._testing import ignore_warnings +from sklearn.utils.extmath import safe_sparse_dot, squared_norm from sklearn.utils.validation import check_is_fitted, check_non_negative - -mem = Memory(cachedir='.', verbose=0) +mem = Memory(cachedir=".", verbose=0) ################### # Start of _PGNMF # @@ -41,13 +37,14 @@ def _norm(x): """Dot product-based Euclidean norm implementation - See: http://fseoane.net/blog/2011/computing-the-vector-norm/ + See: https://fa.bianp.net/blog/2011/computing-the-vector-norm/ """ return np.sqrt(squared_norm(x)) -def _nls_subproblem(X, W, H, tol, max_iter, alpha=0., l1_ratio=0., - sigma=0.01, beta=0.1): +def _nls_subproblem( + X, W, H, tol, max_iter, alpha=0.0, l1_ratio=0.0, sigma=0.01, beta=0.1 +): """Non-negative least square solver Solves a non-negative least squares subproblem using the projected gradient descent algorithm. 
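For reference, the `_nls_subproblem` helper shown above solves min_{H >= 0} 0.5 * ||X - W H||_F^2 by projected gradient descent: compute the gradient W^T W H - W^T X, take a step, and clip negative entries to zero. Below is a minimal standalone sketch (not part of this patch; the function name is illustrative), assuming dense NumPy arrays and a fixed step size in place of the sigma/beta backtracking line search used in the benchmark code:

    import numpy as np

    def nls_projected_gradient(X, W, H, step=1e-3, n_iter=200):
        # Solve min_{H >= 0} 0.5 * ||X - W @ H||_F**2 by projected gradient descent.
        WtX = W.T @ X
        WtW = W.T @ W
        for _ in range(n_iter):
            grad = WtW @ H - WtX                  # gradient of the quadratic term w.r.t. H
            H = np.maximum(H - step * grad, 0.0)  # gradient step, then project onto H >= 0
        return H

    # Example usage with random nonnegative data (X has shape (n_samples, n_features)):
    # rng = np.random.RandomState(0)
    # X = np.abs(rng.randn(50, 30)); W = np.abs(rng.randn(50, 5)); H0 = np.abs(rng.randn(5, 30))
    # H = nls_projected_gradient(X, W, H0)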
@@ -104,7 +101,7 @@ def _nls_subproblem(X, W, H, tol, max_iter, alpha=0., l1_ratio=0., gamma = 1 for n_iter in range(1, max_iter + 1): grad = np.dot(WtW, H) - WtX - if alpha > 0 and l1_ratio == 1.: + if alpha > 0 and l1_ratio == 1.0: grad += alpha elif alpha > 0: grad += alpha * (l1_ratio + (1 - l1_ratio) * H) @@ -142,18 +139,14 @@ def _nls_subproblem(X, W, H, tol, max_iter, alpha=0., l1_ratio=0., Hp = Hn if n_iter == max_iter: - warnings.warn("Iteration limit reached in nls subproblem.", - ConvergenceWarning) + warnings.warn("Iteration limit reached in nls subproblem.", ConvergenceWarning) return H, grad, n_iter -def _fit_projected_gradient(X, W, H, tol, max_iter, nls_max_iter, alpha, - l1_ratio): - gradW = (np.dot(W, np.dot(H, H.T)) - - safe_sparse_dot(X, H.T, dense_output=True)) - gradH = (np.dot(np.dot(W.T, W), H) - - safe_sparse_dot(W.T, X, dense_output=True)) +def _fit_projected_gradient(X, W, H, tol, max_iter, nls_max_iter, alpha, l1_ratio): + gradW = np.dot(W, np.dot(H, H.T)) - safe_sparse_dot(X, H.T, dense_output=True) + gradH = np.dot(np.dot(W.T, W), H) - safe_sparse_dot(W.T, X, dense_output=True) init_grad = squared_norm(gradW) + squared_norm(gradH.T) # max(0.001, tol) to force alternating minimizations of W and H @@ -165,28 +158,31 @@ def _fit_projected_gradient(X, W, H, tol, max_iter, nls_max_iter, alpha, proj_grad_W = squared_norm(gradW * np.logical_or(gradW < 0, W > 0)) proj_grad_H = squared_norm(gradH * np.logical_or(gradH < 0, H > 0)) - if (proj_grad_W + proj_grad_H) / init_grad < tol ** 2: + if (proj_grad_W + proj_grad_H) / init_grad < tol**2: break # update W - Wt, gradWt, iterW = _nls_subproblem(X.T, H.T, W.T, tolW, nls_max_iter, - alpha=alpha, l1_ratio=l1_ratio) + Wt, gradWt, iterW = _nls_subproblem( + X.T, H.T, W.T, tolW, nls_max_iter, alpha=alpha, l1_ratio=l1_ratio + ) W, gradW = Wt.T, gradWt.T if iterW == 1: tolW = 0.1 * tolW # update H - H, gradH, iterH = _nls_subproblem(X, W, H, tolH, nls_max_iter, - alpha=alpha, l1_ratio=l1_ratio) + H, gradH, iterH = _nls_subproblem( + X, W, H, tolH, nls_max_iter, alpha=alpha, l1_ratio=l1_ratio + ) if iterH == 1: tolH = 0.1 * tolH - H[H == 0] = 0 # fix up negative zeros + H[H == 0] = 0 # fix up negative zeros if n_iter == max_iter: - Wt, _, _ = _nls_subproblem(X.T, H.T, W.T, tolW, nls_max_iter, - alpha=alpha, l1_ratio=l1_ratio) + Wt, _, _ = _nls_subproblem( + X.T, H.T, W.T, tolW, nls_max_iter, alpha=alpha, l1_ratio=l1_ratio + ) W = Wt.T return W, H, n_iter @@ -199,13 +195,30 @@ class _PGNMF(NMF): It may change or disappear without notice. 
""" - def __init__(self, n_components=None, solver='pg', init=None, - tol=1e-4, max_iter=200, random_state=None, - alpha=0., l1_ratio=0., nls_max_iter=10): + + def __init__( + self, + n_components=None, + solver="pg", + init=None, + tol=1e-4, + max_iter=200, + random_state=None, + alpha=0.0, + l1_ratio=0.0, + nls_max_iter=10, + ): super().__init__( - n_components=n_components, init=init, solver=solver, tol=tol, - max_iter=max_iter, random_state=random_state, alpha=alpha, - l1_ratio=l1_ratio) + n_components=n_components, + init=init, + solver=solver, + tol=tol, + max_iter=max_iter, + random_state=random_state, + alpha_W=alpha, + alpha_H=alpha, + l1_ratio=l1_ratio, + ) self.nls_max_iter = nls_max_iter def fit(self, X, y=None, **params): @@ -228,7 +241,7 @@ def fit_transform(self, X, y=None, W=None, H=None): return W def _fit_transform(self, X, y=None, W=None, H=None, update_H=True): - X = check_array(X, accept_sparse=('csr', 'csc')) + X = check_array(X, accept_sparse=("csr", "csc")) check_non_negative(X, "NMF (input X)") n_samples, n_features = X.shape @@ -236,47 +249,67 @@ def _fit_transform(self, X, y=None, W=None, H=None, update_H=True): if n_components is None: n_components = n_features - if (not isinstance(n_components, numbers.Integral) or - n_components <= 0): - raise ValueError("Number of components must be a positive integer;" - " got (n_components=%r)" % n_components) - if (not isinstance(self.max_iter, numbers.Integral) or - self.max_iter < 0): - raise ValueError("Maximum number of iterations must be a positive " - "integer; got (max_iter=%r)" % self.max_iter) + if not isinstance(n_components, numbers.Integral) or n_components <= 0: + raise ValueError( + "Number of components must be a positive integer; got (n_components=%r)" + % n_components + ) + if not isinstance(self.max_iter, numbers.Integral) or self.max_iter < 0: + raise ValueError( + "Maximum number of iterations must be a positive " + "integer; got (max_iter=%r)" % self.max_iter + ) if not isinstance(self.tol, numbers.Number) or self.tol < 0: - raise ValueError("Tolerance for stopping criteria must be " - "positive; got (tol=%r)" % self.tol) + raise ValueError( + "Tolerance for stopping criteria must be positive; got (tol=%r)" + % self.tol + ) # check W and H, or initialize them - if self.init == 'custom' and update_H: + if self.init == "custom" and update_H: _check_init(H, (n_components, n_features), "NMF (input H)") _check_init(W, (n_samples, n_components), "NMF (input W)") elif not update_H: _check_init(H, (n_components, n_features), "NMF (input H)") W = np.zeros((n_samples, n_components)) else: - W, H = _initialize_nmf(X, n_components, init=self.init, - random_state=self.random_state) + W, H = _initialize_nmf( + X, n_components, init=self.init, random_state=self.random_state + ) if update_H: # fit_transform W, H, n_iter = _fit_projected_gradient( - X, W, H, self.tol, self.max_iter, self.nls_max_iter, - self.alpha, self.l1_ratio) + X, + W, + H, + self.tol, + self.max_iter, + self.nls_max_iter, + self.alpha, + self.l1_ratio, + ) else: # transform - Wt, _, n_iter = _nls_subproblem(X.T, H.T, W.T, self.tol, - self.nls_max_iter, - alpha=self.alpha, - l1_ratio=self.l1_ratio) + Wt, _, n_iter = _nls_subproblem( + X.T, + H.T, + W.T, + self.tol, + self.nls_max_iter, + alpha=self.alpha, + l1_ratio=self.l1_ratio, + ) W = Wt.T if n_iter == self.max_iter and self.tol > 0: - warnings.warn("Maximum number of iteration %d reached. Increase it" - " to improve convergence." 
% self.max_iter, - ConvergenceWarning) + warnings.warn( + "Maximum number of iteration %d reached. Increase it" + " to improve convergence." % self.max_iter, + ConvergenceWarning, + ) return W, H, n_iter + ################# # End of _PGNMF # ################# @@ -287,22 +320,27 @@ def plot_results(results_df, plot_name): return None plt.figure(figsize=(16, 6)) - colors = 'bgr' - markers = 'ovs' + colors = "bgr" + markers = "ovs" ax = plt.subplot(1, 3, 1) - for i, init in enumerate(np.unique(results_df['init'])): + for i, init in enumerate(np.unique(results_df["init"])): plt.subplot(1, 3, i + 1, sharex=ax, sharey=ax) - for j, method in enumerate(np.unique(results_df['method'])): - mask = np.logical_and(results_df['init'] == init, - results_df['method'] == method) + for j, method in enumerate(np.unique(results_df["method"])): + mask = np.logical_and( + results_df["init"] == init, results_df["method"] == method + ) selected_items = results_df[mask] - plt.plot(selected_items['time'], selected_items['loss'], - color=colors[j % len(colors)], ls='-', - marker=markers[j % len(markers)], - label=method) + plt.plot( + selected_items["time"], + selected_items["loss"], + color=colors[j % len(colors)], + ls="-", + marker=markers[j % len(markers)], + label=method, + ) - plt.legend(loc=0, fontsize='x-small') + plt.legend(loc=0, fontsize="x-small") plt.xlabel("Time (s)") plt.ylabel("loss") plt.title("%s" % init) @@ -312,9 +350,10 @@ def plot_results(results_df, plot_name): @ignore_warnings(category=ConvergenceWarning) # use joblib to cache the results. # X_shape is specified in arguments for avoiding hashing X -@mem.cache(ignore=['X', 'W0', 'H0']) -def bench_one(name, X, W0, H0, X_shape, clf_type, clf_params, init, - n_components, random_state): +@mem.cache(ignore=["X", "W0", "H0"]) +def bench_one( + name, X, W0, H0, X_shape, clf_type, clf_params, init, n_components, random_state +): W = W0.copy() H = H0.copy() @@ -334,22 +373,22 @@ def run_bench(X, clfs, plot_name, n_components, tol, alpha, l1_ratio): results = [] for name, clf_type, iter_range, clf_params in clfs: print("Training %s:" % name) - for rs, init in enumerate(('nndsvd', 'nndsvdar', 'random')): + for rs, init in enumerate(("nndsvd", "nndsvdar", "random")): print(" %s %s: " % (init, " " * (8 - len(init))), end="") W, H = _initialize_nmf(X, n_components, init, 1e-6, rs) for max_iter in iter_range: - clf_params['alpha'] = alpha - clf_params['l1_ratio'] = l1_ratio - clf_params['max_iter'] = max_iter - clf_params['tol'] = tol - clf_params['random_state'] = rs - clf_params['init'] = 'custom' - clf_params['n_components'] = n_components - - this_loss, duration = bench_one(name, X, W, H, X.shape, - clf_type, clf_params, - init, n_components, rs) + clf_params["alpha"] = alpha + clf_params["l1_ratio"] = l1_ratio + clf_params["max_iter"] = max_iter + clf_params["tol"] = tol + clf_params["random_state"] = rs + clf_params["init"] = "custom" + clf_params["n_components"] = n_components + + this_loss, duration = bench_one( + name, X, W, H, X.shape, clf_type, clf_params, init, n_components, rs + ) init_name = "init='%s'" % init results.append((name, this_loss, duration, init_name)) @@ -359,8 +398,7 @@ def run_bench(X, clfs, plot_name, n_components, tol, alpha, l1_ratio): print(" ") # Use a panda dataframe to organize the results - results_df = pandas.DataFrame(results, - columns="method loss time init".split()) + results_df = pandas.DataFrame(results, columns="method loss time init".split()) print("Total time = %0.3f sec\n" % (time() - start)) # plot the 
results @@ -372,9 +410,11 @@ def load_20news(): print("Loading 20 newsgroups dataset") print("-----------------------------") from sklearn.datasets import fetch_20newsgroups - dataset = fetch_20newsgroups(shuffle=True, random_state=1, - remove=('headers', 'footers', 'quotes')) - vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english') + + dataset = fetch_20newsgroups( + shuffle=True, random_state=1, remove=("headers", "footers", "quotes") + ) + vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words="english") tfidf = vectorizer.fit_transform(dataset.data) return tfidf @@ -383,20 +423,22 @@ def load_faces(): print("Loading Olivetti face dataset") print("-----------------------------") from sklearn.datasets import fetch_olivetti_faces + faces = fetch_olivetti_faces(shuffle=True) return faces.data def build_clfs(cd_iters, pg_iters, mu_iters): - clfs = [("Coordinate Descent", NMF, cd_iters, {'solver': 'cd'}), - ("Projected Gradient", _PGNMF, pg_iters, {'solver': 'pg'}), - ("Multiplicative Update", NMF, mu_iters, {'solver': 'mu'}), - ] + clfs = [ + ("Coordinate Descent", NMF, cd_iters, {"solver": "cd"}), + ("Projected Gradient", _PGNMF, pg_iters, {"solver": "pg"}), + ("Multiplicative Update", NMF, mu_iters, {"solver": "mu"}), + ] return clfs -if __name__ == '__main__': - alpha = 0. +if __name__ == "__main__": + alpha = 0.0 l1_ratio = 0.5 n_components = 10 tol = 1e-15 @@ -417,6 +459,14 @@ def build_clfs(cd_iters, pg_iters, mu_iters): mu_iters = np.arange(1, 30) clfs = build_clfs(cd_iters, pg_iters, mu_iters) X_faces = load_faces() - run_bench(X_faces, clfs, plot_name, n_components, tol, alpha, l1_ratio,) + run_bench( + X_faces, + clfs, + plot_name, + n_components, + tol, + alpha, + l1_ratio, + ) plt.show() diff --git a/benchmarks/bench_plot_omp_lars.py b/benchmarks/bench_plot_omp_lars.py index bd10183565847..8a4bc9b1a34fe 100644 --- a/benchmarks/bench_plot_omp_lars.py +++ b/benchmarks/bench_plot_omp_lars.py @@ -3,18 +3,18 @@ The input data is mostly low rank but is a fat infinite tail. 
""" + import gc import sys from time import time import numpy as np -from sklearn.linear_model import lars_path, lars_path_gram, orthogonal_mp from sklearn.datasets import make_sparse_coded_signal +from sklearn.linear_model import lars_path, lars_path_gram, orthogonal_mp def compute_bench(samples_range, features_range): - it = 0 results = dict() @@ -27,10 +27,10 @@ def compute_bench(samples_range, features_range): for i_s, n_samples in enumerate(samples_range): for i_f, n_features in enumerate(features_range): it += 1 - n_informative = n_features / 10 - print('====================') - print('Iteration %03d of %03d' % (it, max_it)) - print('====================') + n_informative = n_features // 10 + print("====================") + print("Iteration %03d of %03d" % (it, max_it)) + print("====================") # dataset_kwargs = { # 'n_train_samples': n_samples, # 'n_test_samples': 2, @@ -41,31 +41,30 @@ def compute_bench(samples_range, features_range): # 'bias': 0.0, # } dataset_kwargs = { - 'n_samples': 1, - 'n_components': n_features, - 'n_features': n_samples, - 'n_nonzero_coefs': n_informative, - 'random_state': 0 + "n_samples": 1, + "n_components": n_features, + "n_features": n_samples, + "n_nonzero_coefs": n_informative, + "random_state": 0, } print("n_samples: %d" % n_samples) print("n_features: %d" % n_features) y, X, _ = make_sparse_coded_signal(**dataset_kwargs) - X = np.asfortranarray(X) + X = np.asfortranarray(X.T) gc.collect() - print("benchmarking lars_path (with Gram):", end='') + print("benchmarking lars_path (with Gram):", end="") sys.stdout.flush() tstart = time() G = np.dot(X.T, X) # precomputed Gram matrix Xy = np.dot(X.T, y) - lars_path_gram(Xy=Xy, Gram=G, n_samples=y.size, - max_iter=n_informative) + lars_path_gram(Xy=Xy, Gram=G, n_samples=y.size, max_iter=n_informative) delta = time() - tstart print("%0.3fs" % delta) lars_gram[i_f, i_s] = delta gc.collect() - print("benchmarking lars_path (without Gram):", end='') + print("benchmarking lars_path (without Gram):", end="") sys.stdout.flush() tstart = time() lars_path(X, y, Gram=None, max_iter=n_informative) @@ -74,49 +73,48 @@ def compute_bench(samples_range, features_range): lars[i_f, i_s] = delta gc.collect() - print("benchmarking orthogonal_mp (with Gram):", end='') + print("benchmarking orthogonal_mp (with Gram):", end="") sys.stdout.flush() tstart = time() - orthogonal_mp(X, y, precompute=True, - n_nonzero_coefs=n_informative) + orthogonal_mp(X, y, precompute=True, n_nonzero_coefs=n_informative) delta = time() - tstart print("%0.3fs" % delta) omp_gram[i_f, i_s] = delta gc.collect() - print("benchmarking orthogonal_mp (without Gram):", end='') + print("benchmarking orthogonal_mp (without Gram):", end="") sys.stdout.flush() tstart = time() - orthogonal_mp(X, y, precompute=False, - n_nonzero_coefs=n_informative) + orthogonal_mp(X, y, precompute=False, n_nonzero_coefs=n_informative) delta = time() - tstart print("%0.3fs" % delta) omp[i_f, i_s] = delta - results['time(LARS) / time(OMP)\n (w/ Gram)'] = (lars_gram / omp_gram) - results['time(LARS) / time(OMP)\n (w/o Gram)'] = (lars / omp) + results["time(LARS) / time(OMP)\n (w/ Gram)"] = lars_gram / omp_gram + results["time(LARS) / time(OMP)\n (w/o Gram)"] = lars / omp return results -if __name__ == '__main__': +if __name__ == "__main__": samples_range = np.linspace(1000, 5000, 5).astype(int) features_range = np.linspace(1000, 5000, 5).astype(int) results = compute_bench(samples_range, features_range) max_time = max(np.max(t) for t in results.values()) import 
matplotlib.pyplot as plt - fig = plt.figure('scikit-learn OMP vs. LARS benchmark results') + + fig = plt.figure("scikit-learn OMP vs. LARS benchmark results") for i, (label, timings) in enumerate(sorted(results.items())): - ax = fig.add_subplot(1, 2, i+1) + ax = fig.add_subplot(1, 2, i + 1) vmax = max(1 - timings.min(), -1 + timings.max()) plt.matshow(timings, fignum=False, vmin=1 - vmax, vmax=1 + vmax) - ax.set_xticklabels([''] + [str(each) for each in samples_range]) - ax.set_yticklabels([''] + [str(each) for each in features_range]) - plt.xlabel('n_samples') - plt.ylabel('n_features') + ax.set_xticklabels([""] + [str(each) for each in samples_range]) + ax.set_yticklabels([""] + [str(each) for each in features_range]) + plt.xlabel("n_samples") + plt.ylabel("n_features") plt.title(label) plt.subplots_adjust(0.1, 0.08, 0.96, 0.98, 0.4, 0.63) ax = plt.axes([0.1, 0.08, 0.8, 0.06]) - plt.colorbar(cax=ax, orientation='horizontal') + plt.colorbar(cax=ax, orientation="horizontal") plt.show() diff --git a/benchmarks/bench_plot_parallel_pairwise.py b/benchmarks/bench_plot_parallel_pairwise.py index 0fed06929bebc..5b7cf81f8fce4 100644 --- a/benchmarks/bench_plot_parallel_pairwise.py +++ b/benchmarks/bench_plot_parallel_pairwise.py @@ -1,12 +1,13 @@ -# Author: Mathieu Blondel -# License: BSD 3 clause +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + import time import matplotlib.pyplot as plt +from sklearn.metrics.pairwise import pairwise_distances, pairwise_kernels from sklearn.utils import check_random_state -from sklearn.metrics.pairwise import pairwise_distances -from sklearn.metrics.pairwise import pairwise_kernels + def plot(func): random_state = check_random_state(0) @@ -25,12 +26,12 @@ def plot(func): func(X, n_jobs=-1) multi_core.append(time.time() - start) - plt.figure('scikit-learn parallel %s benchmark results' % func.__name__) + plt.figure("scikit-learn parallel %s benchmark results" % func.__name__) plt.plot(sample_sizes, one_core, label="one core") plt.plot(sample_sizes, multi_core, label="multi core") - plt.xlabel('n_samples') - plt.ylabel('Time (s)') - plt.title('Parallel %s' % func.__name__) + plt.xlabel("n_samples") + plt.ylabel("Time (s)") + plt.title("Parallel %s" % func.__name__) plt.legend() @@ -41,6 +42,7 @@ def euclidean_distances(X, n_jobs): def rbf_kernels(X, n_jobs): return pairwise_kernels(X, metric="rbf", n_jobs=n_jobs, gamma=0.1) + plot(euclidean_distances) plot(rbf_kernels) plt.show() diff --git a/benchmarks/bench_plot_polynomial_kernel_approximation.py b/benchmarks/bench_plot_polynomial_kernel_approximation.py index 2b7556f37320e..1e23e0a3c79ad 100644 --- a/benchmarks/bench_plot_polynomial_kernel_approximation.py +++ b/benchmarks/bench_plot_polynomial_kernel_approximation.py @@ -30,33 +30,34 @@ [1] Pham, N., & Pagh, R. (2013, August). Fast and scalable polynomial kernels via explicit feature maps. In Proceedings of the 19th ACM SIGKDD international conference on Knowledge discovery and data mining (pp. 239-247) -(http://chbrown.github.io/kdd-2013-usb/kdd/p239.pdf) +(https://chbrown.github.io/kdd-2013-usb/kdd/p239.pdf) [2] Charikar, M., Chen, K., & Farach-Colton, M. (2002, July). Finding frequent items in data streams. In International Colloquium on Automata, Languages, and Programming (pp. 693-703). Springer, Berlin, Heidelberg. 
-(http://www.vldb.org/pvldb/1/1454225.pdf) +(https://people.cs.rutgers.edu/~farach/pubs/FrequentStream.pdf) """ -# Author: Daniel Lopez-Sanchez -# License: BSD 3 clause + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause # Load data manipulation functions -from sklearn.datasets import load_digits -from sklearn.model_selection import train_test_split +# Will use this for timing results +from time import time # Some common libraries import matplotlib.pyplot as plt import numpy as np -# Will use this for timing results -from time import time - -# Import SVM classifiers and feature map approximation algorithms -from sklearn.svm import LinearSVC, SVC +from sklearn.datasets import load_digits from sklearn.kernel_approximation import Nystroem, PolynomialCountSketch +from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline +# Import SVM classifiers and feature map approximation algorithms +from sklearn.svm import SVC, LinearSVC + # Split data in train and test sets X, y = load_digits()["data"], load_digits()["target"] X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7) @@ -66,11 +67,11 @@ # Evaluate Linear SVM lsvm = LinearSVC().fit(X_train, y_train) -lsvm_score = 100*lsvm.score(X_test, y_test) +lsvm_score = 100 * lsvm.score(X_test, y_test) # Evaluate kernelized SVM -ksvm = SVC(kernel="poly", degree=2, gamma=1.).fit(X_train, y_train) -ksvm_score = 100*ksvm.score(X_test, y_test) +ksvm = SVC(kernel="poly", degree=2, gamma=1.0).fit(X_train, y_train) +ksvm_score = 100 * ksvm.score(X_test, y_test) # Evaluate PolynomialCountSketch + LinearSVM ps_svm_scores = [] @@ -80,11 +81,14 @@ for k in out_dims: score_avg = 0 for _ in range(n_runs): - ps_svm = Pipeline([("PS", PolynomialCountSketch(degree=2, - n_components=k)), - ("SVM", LinearSVC())]) + ps_svm = Pipeline( + [ + ("PS", PolynomialCountSketch(degree=2, n_components=k)), + ("SVM", LinearSVC()), + ] + ) score_avg += ps_svm.fit(X_train, y_train).score(X_test, y_test) - ps_svm_scores.append(100*score_avg/n_runs) + ps_svm_scores.append(100 * score_avg / n_runs) # Evaluate Nystroem + LinearSVM ny_svm_scores = [] @@ -93,23 +97,39 @@ for k in out_dims: score_avg = 0 for _ in range(n_runs): - ny_svm = Pipeline([("NY", Nystroem(kernel="poly", gamma=1., degree=2, - coef0=0, n_components=k)), - ("SVM", LinearSVC())]) + ny_svm = Pipeline( + [ + ( + "NY", + Nystroem( + kernel="poly", gamma=1.0, degree=2, coef0=0, n_components=k + ), + ), + ("SVM", LinearSVC()), + ] + ) score_avg += ny_svm.fit(X_train, y_train).score(X_test, y_test) - ny_svm_scores.append(100*score_avg/n_runs) + ny_svm_scores.append(100 * score_avg / n_runs) # Show results fig, ax = plt.subplots(figsize=(6, 4)) ax.set_title("Accuracy results") -ax.plot(out_dims, ps_svm_scores, label="PolynomialCountSketch + linear SVM", - c="orange") -ax.plot(out_dims, ny_svm_scores, label="Nystroem + linear SVM", - c="blue") -ax.plot([out_dims[0], out_dims[-1]], [lsvm_score, lsvm_score], - label="Linear SVM", c="black", dashes=[2, 2]) -ax.plot([out_dims[0], out_dims[-1]], [ksvm_score, ksvm_score], - label="Poly-kernel SVM", c="red", dashes=[2, 2]) +ax.plot(out_dims, ps_svm_scores, label="PolynomialCountSketch + linear SVM", c="orange") +ax.plot(out_dims, ny_svm_scores, label="Nystroem + linear SVM", c="blue") +ax.plot( + [out_dims[0], out_dims[-1]], + [lsvm_score, lsvm_score], + label="Linear SVM", + c="black", + dashes=[2, 2], +) +ax.plot( + [out_dims[0], out_dims[-1]], + [ksvm_score, ksvm_score], + label="Poly-kernel SVM", + 
c="red", + dashes=[2, 2], +) ax.legend() ax.set_xlabel("N_components for PolynomialCountSketch and Nystroem") ax.set_ylabel("Accuracy (%)") @@ -137,7 +157,7 @@ # This can take a while due to the inefficient training phase ny_svm_times = [] for k in out_dims: - ny = Nystroem(kernel="poly", gamma=1., degree=2, coef0=0, n_components=k) + ny = Nystroem(kernel="poly", gamma=1.0, degree=2, coef0=0, n_components=k) start = time() ny.fit_transform(fakeData, None) diff --git a/benchmarks/bench_plot_randomized_svd.py b/benchmarks/bench_plot_randomized_svd.py index f752eeb5e863b..e955be64cdee3 100644 --- a/benchmarks/bench_plot_randomized_svd.py +++ b/benchmarks/bench_plot_randomized_svd.py @@ -50,9 +50,10 @@ References ---------- -(1) Finding structure with randomness: Stochastic algorithms for constructing - approximate matrix decompositions - Halko, et al., 2009 https://arxiv.org/abs/0909.4061 +(1) :arxiv:`"Finding structure with randomness: + Stochastic algorithms for constructing approximate matrix decompositions." + <0909.4061>` + Halko, et al., (2009) (2) A randomized algorithm for the decomposition of matrices Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert @@ -62,31 +63,36 @@ A. Szlam et al. 2014 """ -# Author: Giorgio Patrini - -import numpy as np -import scipy as sp -import matplotlib.pyplot as plt +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause import gc +import os.path import pickle -from time import time from collections import defaultdict -import os.path +from time import time -from sklearn.utils._arpack import _init_arpack_v0 +import matplotlib.pyplot as plt +import numpy as np +import scipy as sp + +from sklearn.datasets import ( + fetch_20newsgroups_vectorized, + fetch_lfw_people, + fetch_olivetti_faces, + fetch_openml, + fetch_rcv1, + make_low_rank_matrix, + make_sparse_uncorrelated, +) from sklearn.utils import gen_batches -from sklearn.utils.validation import check_random_state +from sklearn.utils._arpack import _init_arpack_v0 from sklearn.utils.extmath import randomized_svd -from sklearn.datasets import make_low_rank_matrix, make_sparse_uncorrelated -from sklearn.datasets import (fetch_lfw_people, - fetch_openml, - fetch_20newsgroups_vectorized, - fetch_olivetti_faces, - fetch_rcv1) +from sklearn.utils.validation import check_random_state try: import fbpca + fbpca_available = True except ImportError: fbpca_available = False @@ -103,7 +109,7 @@ # Determine when to switch to batch computation for matrix norms, # in case the reconstructed (dense) matrix is too large -MAX_MEMORY = int(2e9) +MAX_MEMORY = int(4e9) # The following datasets can be downloaded manually from: # CIFAR 10: https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz @@ -111,15 +117,24 @@ CIFAR_FOLDER = "./cifar-10-batches-py/" SVHN_FOLDER = "./SVHN/" -datasets = ['low rank matrix', 'lfw_people', 'olivetti_faces', '20newsgroups', - 'mnist_784', 'CIFAR', 'a3a', 'SVHN', 'uncorrelated matrix'] +datasets = [ + "low rank matrix", + "lfw_people", + "olivetti_faces", + "20newsgroups", + "mnist_784", + "CIFAR", + "a3a", + "SVHN", + "uncorrelated matrix", +] -big_sparse_datasets = ['big sparse matrix', 'rcv1'] +big_sparse_datasets = ["big sparse matrix", "rcv1"] def unpickle(file_name): - with open(file_name, 'rb') as fo: - return pickle.load(fo, encoding='latin1')["data"] + with open(file_name, "rb") as fo: + return pickle.load(fo, encoding="latin1")["data"] def handle_missing_dataset(file_folder): @@ -131,41 +146,45 @@ def handle_missing_dataset(file_folder): def 
get_data(dataset_name): print("Getting dataset: %s" % dataset_name) - if dataset_name == 'lfw_people': + if dataset_name == "lfw_people": X = fetch_lfw_people().data - elif dataset_name == '20newsgroups': + elif dataset_name == "20newsgroups": X = fetch_20newsgroups_vectorized().data[:, :100000] - elif dataset_name == 'olivetti_faces': + elif dataset_name == "olivetti_faces": X = fetch_olivetti_faces().data - elif dataset_name == 'rcv1': + elif dataset_name == "rcv1": X = fetch_rcv1().data - elif dataset_name == 'CIFAR': - if handle_missing_dataset(CIFAR_FOLDER) == "skip": + elif dataset_name == "CIFAR": + if handle_missing_dataset(CIFAR_FOLDER) == 0: return - X1 = [unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1)) - for i in range(5)] + X1 = [unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1)) for i in range(5)] X = np.vstack(X1) del X1 - elif dataset_name == 'SVHN': + elif dataset_name == "SVHN": if handle_missing_dataset(SVHN_FOLDER) == 0: return - X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)['X'] + X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)["X"] X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])] X = np.vstack(X2) del X1 del X2 - elif dataset_name == 'low rank matrix': - X = make_low_rank_matrix(n_samples=500, n_features=int(1e4), - effective_rank=100, tail_strength=.5, - random_state=random_state) - elif dataset_name == 'uncorrelated matrix': - X, _ = make_sparse_uncorrelated(n_samples=500, n_features=10000, - random_state=random_state) - elif dataset_name == 'big sparse matrix': + elif dataset_name == "low rank matrix": + X = make_low_rank_matrix( + n_samples=500, + n_features=int(1e4), + effective_rank=100, + tail_strength=0.5, + random_state=random_state, + ) + elif dataset_name == "uncorrelated matrix": + X, _ = make_sparse_uncorrelated( + n_samples=500, n_features=10000, random_state=random_state + ) + elif dataset_name == "big sparse matrix": sparsity = int(1e6) size = int(1e6) small_size = int(1e4) - data = np.random.normal(0, 1, int(sparsity/10)) + data = np.random.normal(0, 1, int(sparsity / 10)) data = np.repeat(data, 10) row = np.random.uniform(0, small_size, sparsity) col = np.random.uniform(0, small_size, sparsity) @@ -180,16 +199,22 @@ def get_data(dataset_name): def plot_time_vs_s(time, norm, point_labels, title): plt.figure() - colors = ['g', 'b', 'y'] + colors = ["g", "b", "y"] for i, l in enumerate(sorted(norm.keys())): if l != "fbpca": - plt.plot(time[l], norm[l], label=l, marker='o', c=colors.pop()) + plt.plot(time[l], norm[l], label=l, marker="o", c=colors.pop()) else: - plt.plot(time[l], norm[l], label=l, marker='^', c='red') + plt.plot(time[l], norm[l], label=l, marker="^", c="red") for label, x, y in zip(point_labels, list(time[l]), list(norm[l])): - plt.annotate(label, xy=(x, y), xytext=(0, -20), - textcoords='offset points', ha='right', va='bottom') + plt.annotate( + label, + xy=(x, y), + xytext=(0, -20), + textcoords="offset points", + ha="right", + va="bottom", + ) plt.legend(loc="upper right") plt.suptitle(title) plt.ylabel("norm discrepancy") @@ -201,21 +226,33 @@ def scatter_time_vs_s(time, norm, point_labels, title): size = 100 for i, l in enumerate(sorted(norm.keys())): if l != "fbpca": - plt.scatter(time[l], norm[l], label=l, marker='o', c='b', s=size) + plt.scatter(time[l], norm[l], label=l, marker="o", c="b", s=size) for label, x, y in zip(point_labels, list(time[l]), list(norm[l])): - plt.annotate(label, xy=(x, y), xytext=(0, -80), - textcoords='offset points', ha='right', - arrowprops=dict(arrowstyle="->", 
- connectionstyle="arc3"), - va='bottom', size=11, rotation=90) + plt.annotate( + label, + xy=(x, y), + xytext=(0, -80), + textcoords="offset points", + ha="right", + arrowprops=dict(arrowstyle="->", connectionstyle="arc3"), + va="bottom", + size=11, + rotation=90, + ) else: - plt.scatter(time[l], norm[l], label=l, marker='^', c='red', s=size) + plt.scatter(time[l], norm[l], label=l, marker="^", c="red", s=size) for label, x, y in zip(point_labels, list(time[l]), list(norm[l])): - plt.annotate(label, xy=(x, y), xytext=(0, 30), - textcoords='offset points', ha='right', - arrowprops=dict(arrowstyle="->", - connectionstyle="arc3"), - va='bottom', size=11, rotation=90) + plt.annotate( + label, + xy=(x, y), + xytext=(0, 30), + textcoords="offset points", + ha="right", + arrowprops=dict(arrowstyle="->", connectionstyle="arc3"), + va="bottom", + size=11, + rotation=90, + ) plt.legend(loc="best") plt.suptitle(title) @@ -226,32 +263,40 @@ def scatter_time_vs_s(time, norm, point_labels, title): def plot_power_iter_vs_s(power_iter, s, title): plt.figure() for l in sorted(s.keys()): - plt.plot(power_iter, s[l], label=l, marker='o') - plt.legend(loc="lower right", prop={'size': 10}) + plt.plot(power_iter, s[l], label=l, marker="o") + plt.legend(loc="lower right", prop={"size": 10}) plt.suptitle(title) plt.ylabel("norm discrepancy") plt.xlabel("n_iter") -def svd_timing(X, n_comps, n_iter, n_oversamples, - power_iteration_normalizer='auto', method=None): +def svd_timing( + X, n_comps, n_iter, n_oversamples, power_iteration_normalizer="auto", method=None +): """ Measure time for decomposition """ print("... running SVD ...") - if method is not 'fbpca': + if method != "fbpca": gc.collect() t0 = time() - U, mu, V = randomized_svd(X, n_comps, n_oversamples, n_iter, - power_iteration_normalizer, - random_state=random_state, transpose=False) + U, mu, V = randomized_svd( + X, + n_comps, + n_oversamples=n_oversamples, + n_iter=n_iter, + power_iteration_normalizer=power_iteration_normalizer, + random_state=random_state, + transpose=False, + ) call_time = time() - t0 else: gc.collect() t0 = time() # There is a different convention for l here - U, mu, V = fbpca.pca(X, n_comps, raw=True, n_iter=n_iter, - l=n_oversamples+n_comps) + U, mu, V = fbpca.pca( + X, n_comps, raw=True, n_iter=n_iter, l=n_oversamples + n_comps + ) call_time = time() - t0 return U, mu, V, call_time @@ -270,10 +315,7 @@ def norm_diff(A, norm=2, msg=True, random_state=None): if norm == 2: # s = sp.linalg.norm(A, ord=2) # slow v0 = _init_arpack_v0(min(A.shape), random_state) - value = sp.sparse.linalg.svds(A, - k=1, - return_singular_vectors=False, - v0=v0) + value = sp.sparse.linalg.svds(A, k=1, return_singular_vectors=False, v0=v0) else: if sp.sparse.issparse(A): value = sp.sparse.linalg.norm(A, ord=norm) @@ -283,36 +325,42 @@ def norm_diff(A, norm=2, msg=True, random_state=None): def scalable_frobenius_norm_discrepancy(X, U, s, V): - # if the input is not too big, just call scipy - if X.shape[0] * X.shape[1] < MAX_MEMORY: + if not sp.sparse.issparse(X) or ( + X.shape[0] * X.shape[1] * X.dtype.itemsize < MAX_MEMORY + ): + # if the input is not sparse or sparse but not too big, + # U.dot(np.diag(s).dot(V)) will fit in RAM A = X - U.dot(np.diag(s).dot(V)) - return norm_diff(A, norm='fro') + return norm_diff(A, norm="fro") print("... 
computing fro norm by batches...") batch_size = 1000 Vhat = np.diag(s).dot(V) - cum_norm = .0 + cum_norm = 0.0 for batch in gen_batches(X.shape[0], batch_size): M = X[batch, :] - U[batch, :].dot(Vhat) - cum_norm += norm_diff(M, norm='fro', msg=False) + cum_norm += norm_diff(M, norm="fro", msg=False) return np.sqrt(cum_norm) def bench_a(X, dataset_name, power_iter, n_oversamples, n_comps): - all_time = defaultdict(list) if enable_spectral_norm: all_spectral = defaultdict(list) X_spectral_norm = norm_diff(X, norm=2, msg=False, random_state=0) all_frobenius = defaultdict(list) - X_fro_norm = norm_diff(X, norm='fro', msg=False) + X_fro_norm = norm_diff(X, norm="fro", msg=False) for pi in power_iter: - for pm in ['none', 'LU', 'QR']: + for pm in ["none", "LU", "QR"]: print("n_iter = %d on sklearn - %s" % (pi, pm)) - U, s, V, time = svd_timing(X, n_comps, n_iter=pi, - power_iteration_normalizer=pm, - n_oversamples=n_oversamples) + U, s, V, time = svd_timing( + X, + n_comps, + n_iter=pi, + power_iteration_normalizer=pm, + n_oversamples=n_oversamples, + ) label = "sklearn - %s" % pm all_time[label].append(time) if enable_spectral_norm: @@ -325,10 +373,14 @@ def bench_a(X, dataset_name, power_iter, n_oversamples, n_comps): if fbpca_available: print("n_iter = %d on fbca" % (pi)) - U, s, V, time = svd_timing(X, n_comps, n_iter=pi, - power_iteration_normalizer=pm, - n_oversamples=n_oversamples, - method='fbpca') + U, s, V, time = svd_timing( + X, + n_comps, + n_iter=pi, + power_iteration_normalizer=pm, + n_oversamples=n_oversamples, + method="fbpca", + ) label = "fbpca" all_time[label].append(time) if enable_spectral_norm: @@ -347,10 +399,13 @@ def bench_a(X, dataset_name, power_iter, n_oversamples, n_comps): def bench_b(power_list): - n_samples, n_features = 1000, 10000 - data_params = {'n_samples': n_samples, 'n_features': n_features, - 'tail_strength': .7, 'random_state': random_state} + data_params = { + "n_samples": n_samples, + "n_features": n_features, + "tail_strength": 0.7, + "random_state": random_state, + } dataset_name = "low rank matrix %d x %d" % (n_samples, n_features) ranks = [10, 50, 100] @@ -361,19 +416,23 @@ def bench_b(power_list): X = make_low_rank_matrix(effective_rank=rank, **data_params) if enable_spectral_norm: X_spectral_norm = norm_diff(X, norm=2, msg=False, random_state=0) - X_fro_norm = norm_diff(X, norm='fro', msg=False) + X_fro_norm = norm_diff(X, norm="fro", msg=False) - for n_comp in [int(rank/2), rank, rank*2]: + for n_comp in [int(rank / 2), rank, rank * 2]: label = "rank=%d, n_comp=%d" % (rank, n_comp) print(label) for pi in power_list: - U, s, V, _ = svd_timing(X, n_comp, n_iter=pi, n_oversamples=2, - power_iteration_normalizer='LU') + U, s, V, _ = svd_timing( + X, + n_comp, + n_iter=pi, + n_oversamples=2, + power_iteration_normalizer="LU", + ) if enable_spectral_norm: A = U.dot(np.diag(s).dot(V)) all_spectral[label].append( - norm_diff(X - A, norm=2, random_state=0) / - X_spectral_norm + norm_diff(X - A, norm=2, random_state=0) / X_spectral_norm ) f = scalable_frobenius_norm_discrepancy(X, U, s, V) all_frobenius[label].append(f / X_fro_norm) @@ -398,14 +457,12 @@ def bench_c(datasets, n_comps): if enable_spectral_norm: X_spectral_norm = norm_diff(X, norm=2, msg=False, random_state=0) - X_fro_norm = norm_diff(X, norm='fro', msg=False) + X_fro_norm = norm_diff(X, norm="fro", msg=False) n_comps = np.minimum(n_comps, np.min(X.shape)) label = "sklearn" - print("%s %d x %d - %s" % - (dataset_name, X.shape[0], X.shape[1], label)) - U, s, V, time = svd_timing(X, n_comps, 
n_iter=2, n_oversamples=10, - method=label) + print("%s %d x %d - %s" % (dataset_name, X.shape[0], X.shape[1], label)) + U, s, V, time = svd_timing(X, n_comps, n_iter=2, n_oversamples=10, method=label) all_time[label].append(time) if enable_spectral_norm: @@ -418,10 +475,10 @@ def bench_c(datasets, n_comps): if fbpca_available: label = "fbpca" - print("%s %d x %d - %s" % - (dataset_name, X.shape[0], X.shape[1], label)) - U, s, V, time = svd_timing(X, n_comps, n_iter=2, n_oversamples=2, - method=label) + print("%s %d x %d - %s" % (dataset_name, X.shape[0], X.shape[1], label)) + U, s, V, time = svd_timing( + X, n_comps, n_iter=2, n_oversamples=2, method=label + ) all_time[label].append(time) if enable_spectral_norm: A = U.dot(np.diag(s).dot(V)) @@ -441,20 +498,27 @@ def bench_c(datasets, n_comps): scatter_time_vs_s(all_time, all_frobenius, datasets, title) -if __name__ == '__main__': +if __name__ == "__main__": random_state = check_random_state(1234) - power_iter = np.linspace(0, 6, 7, dtype=int) + power_iter = np.arange(0, 6) n_comps = 50 for dataset_name in datasets: X = get_data(dataset_name) if X is None: continue - print(" >>>>>> Benching sklearn and fbpca on %s %d x %d" % - (dataset_name, X.shape[0], X.shape[1])) - bench_a(X, dataset_name, power_iter, n_oversamples=2, - n_comps=np.minimum(n_comps, np.min(X.shape))) + print( + " >>>>>> Benching sklearn and fbpca on %s %d x %d" + % (dataset_name, X.shape[0], X.shape[1]) + ) + bench_a( + X, + dataset_name, + power_iter, + n_oversamples=2, + n_comps=np.minimum(n_comps, np.min(X.shape)), + ) print(" >>>>>> Benching on simulated low rank matrix with variable rank") bench_b(power_iter) diff --git a/benchmarks/bench_plot_svd.py b/benchmarks/bench_plot_svd.py index e58fb48c3051c..f93920cae5305 100644 --- a/benchmarks/bench_plot_svd.py +++ b/benchmarks/bench_plot_svd.py @@ -2,18 +2,19 @@ The data is mostly low rank but is a fat infinite tail. 
""" + import gc -from time import time -import numpy as np from collections import defaultdict +from time import time +import numpy as np from scipy.linalg import svd -from sklearn.utils.extmath import randomized_svd + from sklearn.datasets import make_low_rank_matrix +from sklearn.utils.extmath import randomized_svd def compute_bench(samples_range, features_range, n_iter=3, rank=50): - it = 0 results = defaultdict(lambda: []) @@ -22,61 +23,58 @@ def compute_bench(samples_range, features_range, n_iter=3, rank=50): for n_samples in samples_range: for n_features in features_range: it += 1 - print('====================') - print('Iteration %03d of %03d' % (it, max_it)) - print('====================') - X = make_low_rank_matrix(n_samples, n_features, - effective_rank=rank, - tail_strength=0.2) + print("====================") + print("Iteration %03d of %03d" % (it, max_it)) + print("====================") + X = make_low_rank_matrix( + n_samples, n_features, effective_rank=rank, tail_strength=0.2 + ) gc.collect() print("benchmarking scipy svd: ") tstart = time() svd(X, full_matrices=False) - results['scipy svd'].append(time() - tstart) + results["scipy svd"].append(time() - tstart) gc.collect() print("benchmarking scikit-learn randomized_svd: n_iter=0") tstart = time() randomized_svd(X, rank, n_iter=0) - results['scikit-learn randomized_svd (n_iter=0)'].append( - time() - tstart) + results["scikit-learn randomized_svd (n_iter=0)"].append(time() - tstart) gc.collect() - print("benchmarking scikit-learn randomized_svd: n_iter=%d " - % n_iter) + print("benchmarking scikit-learn randomized_svd: n_iter=%d " % n_iter) tstart = time() randomized_svd(X, rank, n_iter=n_iter) - results['scikit-learn randomized_svd (n_iter=%d)' - % n_iter].append(time() - tstart) + results["scikit-learn randomized_svd (n_iter=%d)" % n_iter].append( + time() - tstart + ) return results -if __name__ == '__main__': - from mpl_toolkits.mplot3d import axes3d # register the 3d projection +if __name__ == "__main__": import matplotlib.pyplot as plt + from mpl_toolkits.mplot3d import axes3d # register the 3d projection # noqa: F401 samples_range = np.linspace(2, 1000, 4).astype(int) features_range = np.linspace(2, 1000, 4).astype(int) results = compute_bench(samples_range, features_range) - label = 'scikit-learn singular value decomposition benchmark results' + label = "scikit-learn singular value decomposition benchmark results" fig = plt.figure(label) - ax = fig.gca(projection='3d') - for c, (label, timings) in zip('rbg', sorted(results.items())): + ax = fig.gca(projection="3d") + for c, (label, timings) in zip("rbg", sorted(results.items())): X, Y = np.meshgrid(samples_range, features_range) - Z = np.asarray(timings).reshape(samples_range.shape[0], - features_range.shape[0]) + Z = np.asarray(timings).reshape(samples_range.shape[0], features_range.shape[0]) # plot the actual surface - ax.plot_surface(X, Y, Z, rstride=8, cstride=8, alpha=0.3, - color=c) + ax.plot_surface(X, Y, Z, rstride=8, cstride=8, alpha=0.3, color=c) # dummy point plot to stick the legend to since surface plot do not # support legends (yet?) 
ax.plot([1], [1], [1], color=c, label=label) - ax.set_xlabel('n_samples') - ax.set_ylabel('n_features') - ax.set_zlabel('Time (s)') + ax.set_xlabel("n_samples") + ax.set_ylabel("n_features") + ax.set_zlabel("Time (s)") ax.legend() plt.show() diff --git a/benchmarks/bench_plot_ward.py b/benchmarks/bench_plot_ward.py index 01fe4f8f025aa..fe5cee201dff4 100644 --- a/benchmarks/bench_plot_ward.py +++ b/benchmarks/bench_plot_ward.py @@ -4,18 +4,17 @@ import time +import matplotlib.pyplot as plt import numpy as np from scipy.cluster import hierarchy -import matplotlib.pyplot as plt from sklearn.cluster import AgglomerativeClustering -ward = AgglomerativeClustering(n_clusters=3, linkage='ward') +ward = AgglomerativeClustering(n_clusters=3, linkage="ward") -n_samples = np.logspace(.5, 3, 9) +n_samples = np.logspace(0.5, 3, 9) n_features = np.logspace(1, 3.5, 7) -N_samples, N_features = np.meshgrid(n_samples, - n_features) +N_samples, N_features = np.meshgrid(n_samples, n_features) scikits_time = np.zeros(N_samples.shape) scipy_time = np.zeros(N_samples.shape) @@ -32,12 +31,18 @@ ratio = scikits_time / scipy_time plt.figure("scikit-learn Ward's method benchmark results") -plt.imshow(np.log(ratio), aspect='auto', origin="lower") +plt.imshow(np.log(ratio), aspect="auto", origin="lower") plt.colorbar() -plt.contour(ratio, levels=[1, ], colors='k') +plt.contour( + ratio, + levels=[ + 1, + ], + colors="k", +) plt.yticks(range(len(n_features)), n_features.astype(int)) -plt.ylabel('N features') +plt.ylabel("N features") plt.xticks(range(len(n_samples)), n_samples.astype(int)) -plt.xlabel('N samples') +plt.xlabel("N samples") plt.title("Scikit's time, in units of scipy time (log)") plt.show() diff --git a/benchmarks/bench_random_projections.py b/benchmarks/bench_random_projections.py index fb301d2ed0b00..6551de690994b 100644 --- a/benchmarks/bench_random_projections.py +++ b/benchmarks/bench_random_projections.py @@ -6,19 +6,22 @@ Benchmarks for random projections. 
""" + +import collections import gc -import sys import optparse +import sys from datetime import datetime -import collections import numpy as np import scipy.sparse as sp from sklearn import clone -from sklearn.random_projection import (SparseRandomProjection, - GaussianRandomProjection, - johnson_lindenstrauss_min_dim) +from sklearn.random_projection import ( + GaussianRandomProjection, + SparseRandomProjection, + johnson_lindenstrauss_min_dim, +) def type_auto_or_float(val): @@ -36,27 +39,27 @@ def type_auto_or_int(val): def compute_time(t_start, delta): - mu_second = 0.0 + 10 ** 6 # number of microseconds in a second + mu_second = 0.0 + 10**6 # number of microseconds in a second return delta.seconds + delta.microseconds / mu_second -def bench_scikit_transformer(X, transfomer): +def bench_scikit_transformer(X, transformer): gc.collect() - clf = clone(transfomer) + clf = clone(transformer) # start time t_start = datetime.now() clf.fit(X) - delta = (datetime.now() - t_start) + delta = datetime.now() - t_start # stop time time_to_fit = compute_time(t_start, delta) # start time t_start = datetime.now() clf.transform(X) - delta = (datetime.now() - t_start) + delta = datetime.now() - t_start # stop time time_to_transform = compute_time(t_start, delta) @@ -65,21 +68,30 @@ def bench_scikit_transformer(X, transfomer): # Make some random data with uniformly located non zero entries with # Gaussian distributed values -def make_sparse_random_data(n_samples, n_features, n_nonzeros, - random_state=None): +def make_sparse_random_data(n_samples, n_features, n_nonzeros, random_state=None): rng = np.random.RandomState(random_state) data_coo = sp.coo_matrix( - (rng.randn(n_nonzeros), - (rng.randint(n_samples, size=n_nonzeros), - rng.randint(n_features, size=n_nonzeros))), - shape=(n_samples, n_features)) + ( + rng.randn(n_nonzeros), + ( + rng.randint(n_samples, size=n_nonzeros), + rng.randint(n_features, size=n_nonzeros), + ), + ), + shape=(n_samples, n_features), + ) return data_coo.toarray(), data_coo.tocsr() def print_row(clf_type, time_fit, time_transform): - print("%s | %s | %s" % (clf_type.ljust(30), - ("%.4fs" % time_fit).center(12), - ("%.4fs" % time_transform).center(12))) + print( + "%s | %s | %s" + % ( + clf_type.ljust(30), + ("%.4fs" % time_fit).center(12), + ("%.4fs" % time_transform).center(12), + ) + ) if __name__ == "__main__": @@ -87,53 +99,89 @@ def print_row(clf_type, time_fit, time_transform): # Option parser ########################################################################### op = optparse.OptionParser() - op.add_option("--n-times", - dest="n_times", default=5, type=int, - help="Benchmark results are average over n_times experiments") - - op.add_option("--n-features", - dest="n_features", default=10 ** 4, type=int, - help="Number of features in the benchmarks") - - op.add_option("--n-components", - dest="n_components", default="auto", - help="Size of the random subspace." - " ('auto' or int > 0)") - - op.add_option("--ratio-nonzeros", - dest="ratio_nonzeros", default=10 ** -3, type=float, - help="Number of features in the benchmarks") - - op.add_option("--n-samples", - dest="n_samples", default=500, type=int, - help="Number of samples in the benchmarks") - - op.add_option("--random-seed", - dest="random_seed", default=13, type=int, - help="Seed used by the random number generators.") - - op.add_option("--density", - dest="density", default=1 / 3, - help="Density used by the sparse random projection." 
- " ('auto' or float (0.0, 1.0]") - - op.add_option("--eps", - dest="eps", default=0.5, type=float, - help="See the documentation of the underlying transformers.") - - op.add_option("--transformers", - dest="selected_transformers", - default='GaussianRandomProjection,SparseRandomProjection', - type=str, - help="Comma-separated list of transformer to benchmark. " - "Default: %default. Available: " - "GaussianRandomProjection,SparseRandomProjection") - - op.add_option("--dense", - dest="dense", - default=False, - action="store_true", - help="Set input space as a dense matrix.") + op.add_option( + "--n-times", + dest="n_times", + default=5, + type=int, + help="Benchmark results are average over n_times experiments", + ) + + op.add_option( + "--n-features", + dest="n_features", + default=10**4, + type=int, + help="Number of features in the benchmarks", + ) + + op.add_option( + "--n-components", + dest="n_components", + default="auto", + help="Size of the random subspace. ('auto' or int > 0)", + ) + + op.add_option( + "--ratio-nonzeros", + dest="ratio_nonzeros", + default=10**-3, + type=float, + help="Number of features in the benchmarks", + ) + + op.add_option( + "--n-samples", + dest="n_samples", + default=500, + type=int, + help="Number of samples in the benchmarks", + ) + + op.add_option( + "--random-seed", + dest="random_seed", + default=13, + type=int, + help="Seed used by the random number generators.", + ) + + op.add_option( + "--density", + dest="density", + default=1 / 3, + help=( + "Density used by the sparse random projection. ('auto' or float (0.0, 1.0]" + ), + ) + + op.add_option( + "--eps", + dest="eps", + default=0.5, + type=float, + help="See the documentation of the underlying transformers.", + ) + + op.add_option( + "--transformers", + dest="selected_transformers", + default="GaussianRandomProjection,SparseRandomProjection", + type=str, + help=( + "Comma-separated list of transformer to benchmark. " + "Default: %default. 
Available: " + "GaussianRandomProjection,SparseRandomProjection" + ), + ) + + op.add_option( + "--dense", + dest="dense", + default=False, + action="store_true", + help="Set input space as a dense matrix.", + ) (opts, args) = op.parse_args() if len(args) > 0: @@ -141,27 +189,28 @@ def print_row(clf_type, time_fit, time_transform): sys.exit(1) opts.n_components = type_auto_or_int(opts.n_components) opts.density = type_auto_or_float(opts.density) - selected_transformers = opts.selected_transformers.split(',') + selected_transformers = opts.selected_transformers.split(",") ########################################################################### # Generate dataset ########################################################################### n_nonzeros = int(opts.ratio_nonzeros * opts.n_features) - print('Dataset statics') + print("Dataset statistics") print("===========================") - print('n_samples \t= %s' % opts.n_samples) - print('n_features \t= %s' % opts.n_features) + print("n_samples \t= %s" % opts.n_samples) + print("n_features \t= %s" % opts.n_features) if opts.n_components == "auto": - print('n_components \t= %s (auto)' % - johnson_lindenstrauss_min_dim(n_samples=opts.n_samples, - eps=opts.eps)) + print( + "n_components \t= %s (auto)" + % johnson_lindenstrauss_min_dim(n_samples=opts.n_samples, eps=opts.eps) + ) else: - print('n_components \t= %s' % opts.n_components) - print('n_elements \t= %s' % (opts.n_features * opts.n_samples)) - print('n_nonzeros \t= %s per feature' % n_nonzeros) - print('ratio_nonzeros \t= %s' % opts.ratio_nonzeros) - print('') + print("n_components \t= %s" % opts.n_components) + print("n_elements \t= %s" % (opts.n_features * opts.n_samples)) + print("n_nonzeros \t= %s per feature" % n_nonzeros) + print("ratio_nonzeros \t= %s" % opts.ratio_nonzeros) + print("") ########################################################################### # Set transformer input @@ -172,10 +221,11 @@ def print_row(clf_type, time_fit, time_transform): # Set GaussianRandomProjection input gaussian_matrix_params = { "n_components": opts.n_components, - "random_state": opts.random_seed + "random_state": opts.random_seed, } - transformers["GaussianRandomProjection"] = \ - GaussianRandomProjection(**gaussian_matrix_params) + transformers["GaussianRandomProjection"] = GaussianRandomProjection( + **gaussian_matrix_params + ) ########################################################################### # Set SparseRandomProjection input @@ -186,8 +236,9 @@ def print_row(clf_type, time_fit, time_transform): "eps": opts.eps, } - transformers["SparseRandomProjection"] = \ - SparseRandomProjection(**sparse_matrix_params) + transformers["SparseRandomProjection"] = SparseRandomProjection( + **sparse_matrix_params + ) ########################################################################### # Perform benchmark @@ -195,13 +246,12 @@ def print_row(clf_type, time_fit, time_transform): time_fit = collections.defaultdict(list) time_transform = collections.defaultdict(list) - print('Benchmarks') + print("Benchmarks") print("===========================") print("Generate dataset benchmarks... 
", end="") - X_dense, X_sparse = make_sparse_random_data(opts.n_samples, - opts.n_features, - n_nonzeros, - random_state=opts.random_seed) + X_dense, X_sparse = make_sparse_random_data( + opts.n_samples, opts.n_features, n_nonzeros, random_state=opts.random_seed + ) X = X_dense if opts.dense else X_sparse print("done") @@ -210,8 +260,9 @@ def print_row(clf_type, time_fit, time_transform): for iteration in range(opts.n_times): print("\titer %s..." % iteration, end="") - time_to_fit, time_to_transform = bench_scikit_transformer(X_dense, - transformers[name]) + time_to_fit, time_to_transform = bench_scikit_transformer( + X_dense, transformers[name] + ) time_fit[name].append(time_to_fit) time_transform[name].append(time_to_transform) print("done") @@ -224,27 +275,30 @@ def print_row(clf_type, time_fit, time_transform): print("Script arguments") print("===========================") arguments = vars(opts) - print("%s \t | %s " % ("Arguments".ljust(16), - "Value".center(12),)) + print( + "%s \t | %s " + % ( + "Arguments".ljust(16), + "Value".center(12), + ) + ) print(25 * "-" + ("|" + "-" * 14) * 1) for key, value in arguments.items(): - print("%s \t | %s " % (str(key).ljust(16), - str(value).strip().center(12))) + print("%s \t | %s " % (str(key).ljust(16), str(value).strip().center(12))) print("") print("Transformer performance:") print("===========================") print("Results are averaged over %s repetition(s)." % opts.n_times) print("") - print("%s | %s | %s" % ("Transformer".ljust(30), - "fit".center(12), - "transform".center(12))) + print( + "%s | %s | %s" + % ("Transformer".ljust(30), "fit".center(12), "transform".center(12)) + ) print(31 * "-" + ("|" + "-" * 14) * 2) for name in sorted(selected_transformers): - print_row(name, - np.mean(time_fit[name]), - np.mean(time_transform[name])) + print_row(name, np.mean(time_fit[name]), np.mean(time_transform[name])) print("") print("") diff --git a/benchmarks/bench_rcv1_logreg_convergence.py b/benchmarks/bench_rcv1_logreg_convergence.py index 051496c4483a2..27e730736a3de 100644 --- a/benchmarks/bench_rcv1_logreg_convergence.py +++ b/benchmarks/bench_rcv1_logreg_convergence.py @@ -1,16 +1,15 @@ -# Authors: Tom Dupre la Tour -# Olivier Grisel -# -# License: BSD 3 clause +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause -import matplotlib.pyplot as plt -from joblib import Memory -import numpy as np import gc import time -from sklearn.linear_model import (LogisticRegression, SGDClassifier) +import matplotlib.pyplot as plt +import numpy as np +from joblib import Memory + from sklearn.datasets import fetch_rcv1 +from sklearn.linear_model import LogisticRegression, SGDClassifier from sklearn.linear_model._sag import get_auto_step_size try: @@ -18,16 +17,16 @@ except ImportError: lightning_clf = None -m = Memory(cachedir='.', verbose=0) +m = Memory(cachedir=".", verbose=0) # compute logistic loss def get_loss(w, intercept, myX, myy, C): n_samples = myX.shape[0] w = w.ravel() - p = np.mean(np.log(1. + np.exp(-myy * (myX.dot(w) + intercept)))) - print("%f + %f" % (p, w.dot(w) / 2. / C / n_samples)) - p += w.dot(w) / 2. 
/ C / n_samples + p = np.mean(np.log(1.0 + np.exp(-myy * (myX.dot(w) + intercept)))) + print("%f + %f" % (p, w.dot(w) / 2.0 / C / n_samples)) + p += w.dot(w) / 2.0 / C / n_samples return p @@ -39,7 +38,7 @@ def bench_one(name, clf_type, clf_params, n_iter): clf = clf_type(**clf_params) try: clf.set_params(max_iter=n_iter, random_state=42) - except: + except Exception: clf.set_params(n_iter=n_iter, random_state=42) st = time.time() @@ -48,13 +47,13 @@ def bench_one(name, clf_type, clf_params, n_iter): try: C = 1.0 / clf.alpha / n_samples - except: + except Exception: C = clf.C try: intercept = clf.intercept_ - except: - intercept = 0. + except Exception: + intercept = 0.0 train_loss = get_loss(clf.coef_, intercept, X, y, C) train_score = clf.score(X, y) @@ -65,8 +64,15 @@ def bench_one(name, clf_type, clf_params, n_iter): def bench(clfs): - for (name, clf, iter_range, train_losses, train_scores, - test_scores, durations) in clfs: + for ( + name, + clf, + iter_range, + train_losses, + train_scores, + test_scores, + durations, + ) in clfs: print("training %s" % name) clf_type = type(clf) clf_params = clf.get_params() @@ -75,7 +81,8 @@ def bench(clfs): gc.collect() train_loss, train_score, test_score, duration = bench_one( - name, clf_type, clf_params, n_iter) + name, clf_type, clf_params, n_iter + ) train_losses.append(train_loss) train_scores.append(train_score) @@ -94,8 +101,8 @@ def bench(clfs): def plot_train_losses(clfs): plt.figure() - for (name, _, _, train_losses, _, _, durations) in clfs: - plt.plot(durations, train_losses, '-o', label=name) + for name, _, _, train_losses, _, _, durations in clfs: + plt.plot(durations, train_losses, "-o", label=name) plt.legend(loc=0) plt.xlabel("seconds") plt.ylabel("train loss") @@ -103,8 +110,8 @@ def plot_train_losses(clfs): def plot_train_scores(clfs): plt.figure() - for (name, _, _, _, train_scores, _, durations) in clfs: - plt.plot(durations, train_scores, '-o', label=name) + for name, _, _, _, train_scores, _, durations in clfs: + plt.plot(durations, train_scores, "-o", label=name) plt.legend(loc=0) plt.xlabel("seconds") plt.ylabel("train score") @@ -113,8 +120,8 @@ def plot_train_scores(clfs): def plot_test_scores(clfs): plt.figure() - for (name, _, _, _, _, test_scores, durations) in clfs: - plt.plot(durations, test_scores, '-o', label=name) + for name, _, _, _, _, test_scores, durations in clfs: + plt.plot(durations, test_scores, "-o", label=name) plt.legend(loc=0) plt.xlabel("seconds") plt.ylabel("test score") @@ -124,16 +131,16 @@ def plot_test_scores(clfs): def plot_dloss(clfs): plt.figure() pobj_final = [] - for (name, _, _, train_losses, _, _, durations) in clfs: + for name, _, _, train_losses, _, _, durations in clfs: pobj_final.append(train_losses[-1]) indices = np.argsort(pobj_final) pobj_best = pobj_final[indices[0]] - for (name, _, _, train_losses, _, _, durations) in clfs: + for name, _, _, train_losses, _, _, durations in clfs: log_pobj = np.log(abs(np.array(train_losses) - pobj_best)) / np.log(10) - plt.plot(durations, log_pobj, '-o', label=name) + plt.plot(durations, log_pobj, "-o", label=name) plt.legend(loc=0) plt.xlabel("seconds") plt.ylabel("log(best - train_loss)") @@ -141,19 +148,20 @@ def plot_dloss(clfs): def get_max_squared_sum(X): """Get the maximum row-wise sum of squares""" - return np.sum(X ** 2, axis=1).max() + return np.sum(X**2, axis=1).max() + rcv1 = fetch_rcv1() X = rcv1.data n_samples, n_features = X.shape # consider the binary classification problem 'CCAT' vs the rest -ccat_idx = 
rcv1.target_names.tolist().index('CCAT') +ccat_idx = rcv1.target_names.tolist().index("CCAT") y = rcv1.target.tocsc()[:, ccat_idx].toarray().ravel().astype(np.float64) y[y == 0] = -1 # parameters -C = 1. +C = 1.0 fit_intercept = True tol = 1.0e-14 @@ -166,51 +174,116 @@ def get_max_squared_sum(X): sag_iter_range = list(range(1, 37, 3)) clfs = [ - ("LR-liblinear", - LogisticRegression(C=C, tol=tol, - solver="liblinear", fit_intercept=fit_intercept, - intercept_scaling=1), - liblinear_iter_range, [], [], [], []), - ("LR-liblinear-dual", - LogisticRegression(C=C, tol=tol, dual=True, - solver="liblinear", fit_intercept=fit_intercept, - intercept_scaling=1), - liblinear_dual_iter_range, [], [], [], []), - ("LR-SAG", - LogisticRegression(C=C, tol=tol, - solver="sag", fit_intercept=fit_intercept), - sag_iter_range, [], [], [], []), - ("LR-newton-cg", - LogisticRegression(C=C, tol=tol, solver="newton-cg", - fit_intercept=fit_intercept), - newton_iter_range, [], [], [], []), - ("LR-lbfgs", - LogisticRegression(C=C, tol=tol, - solver="lbfgs", fit_intercept=fit_intercept), - lbfgs_iter_range, [], [], [], []), - ("SGD", - SGDClassifier(alpha=1.0 / C / n_samples, penalty='l2', loss='log', - fit_intercept=fit_intercept, verbose=0), - sgd_iter_range, [], [], [], [])] + ( + "LR-liblinear", + LogisticRegression( + C=C, + tol=tol, + solver="liblinear", + fit_intercept=fit_intercept, + intercept_scaling=1, + ), + liblinear_iter_range, + [], + [], + [], + [], + ), + ( + "LR-liblinear-dual", + LogisticRegression( + C=C, + tol=tol, + dual=True, + solver="liblinear", + fit_intercept=fit_intercept, + intercept_scaling=1, + ), + liblinear_dual_iter_range, + [], + [], + [], + [], + ), + ( + "LR-SAG", + LogisticRegression(C=C, tol=tol, solver="sag", fit_intercept=fit_intercept), + sag_iter_range, + [], + [], + [], + [], + ), + ( + "LR-newton-cg", + LogisticRegression( + C=C, tol=tol, solver="newton-cg", fit_intercept=fit_intercept + ), + newton_iter_range, + [], + [], + [], + [], + ), + ( + "LR-lbfgs", + LogisticRegression(C=C, tol=tol, solver="lbfgs", fit_intercept=fit_intercept), + lbfgs_iter_range, + [], + [], + [], + [], + ), + ( + "SGD", + SGDClassifier( + alpha=1.0 / C / n_samples, + penalty="l2", + loss="log_loss", + fit_intercept=fit_intercept, + verbose=0, + ), + sgd_iter_range, + [], + [], + [], + [], + ), +] if lightning_clf is not None and not fit_intercept: - alpha = 1. / C / n_samples + alpha = 1.0 / C / n_samples # compute the same step_size than in LR-sag max_squared_sum = get_max_squared_sum(X) - step_size = get_auto_step_size(max_squared_sum, alpha, "log", - fit_intercept) + step_size = get_auto_step_size(max_squared_sum, alpha, "log", fit_intercept) clfs.append( - ("Lightning-SVRG", - lightning_clf.SVRGClassifier(alpha=alpha, eta=step_size, - tol=tol, loss="log"), - sag_iter_range, [], [], [], [])) + ( + "Lightning-SVRG", + lightning_clf.SVRGClassifier( + alpha=alpha, eta=step_size, tol=tol, loss="log" + ), + sag_iter_range, + [], + [], + [], + [], + ) + ) clfs.append( - ("Lightning-SAG", - lightning_clf.SAGClassifier(alpha=alpha, eta=step_size, - tol=tol, loss="log"), - sag_iter_range, [], [], [], [])) + ( + "Lightning-SAG", + lightning_clf.SAGClassifier( + alpha=alpha, eta=step_size, tol=tol, loss="log" + ), + sag_iter_range, + [], + [], + [], + [], + ) + ) # We keep only 200 features, to have a dense dataset, # and compare to lightning SAG, which seems incorrect in the sparse case. 
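A note on the parametrization used throughout the benchmark above: `get_loss` evaluates the per-sample objective mean(log(1 + exp(-y * (X @ w + b)))) + ||w||^2 / (2 * C * n_samples), and the script converts between LogisticRegression's `C` and SGDClassifier's `alpha` via alpha = 1 / (C * n_samples). The following standalone sketch is not part of the patch; the toy dataset, tolerances and iteration counts are illustrative assumptions. It only checks that the two parametrizations land on comparable values of that objective.

    # Standalone illustration, not part of the patch above.
    # Verifies that LogisticRegression(C=C) and SGDClassifier(alpha=1/(C*n))
    # optimize (approximately) the same per-sample objective that get_loss()
    # evaluates in bench_rcv1_logreg_convergence.py.
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression, SGDClassifier

    X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
    y = 2.0 * y - 1.0  # the loss below expects labels in {-1, +1}
    C = 1.0
    n_samples = X.shape[0]

    def regularized_log_loss(w, intercept, X, y, C):
        # Same formula as get_loss() in the benchmark script.
        w = w.ravel()
        data_term = np.mean(np.log(1.0 + np.exp(-y * (X.dot(w) + intercept))))
        return data_term + w.dot(w) / (2.0 * C * n_samples)

    lr = LogisticRegression(C=C, tol=1e-10, max_iter=1000).fit(X, y)
    sgd = SGDClassifier(loss="log_loss", penalty="l2", alpha=1.0 / (C * n_samples),
                        tol=1e-10, max_iter=1000, random_state=0).fit(X, y)

    print(regularized_log_loss(lr.coef_, lr.intercept_, X, y, C))
    print(regularized_log_loss(sgd.coef_, sgd.intercept_, X, y, C))  # close, but noisier
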
diff --git a/benchmarks/bench_saga.py b/benchmarks/bench_saga.py index 492527d7e4c67..97d4ba7b4b75b 100644 --- a/benchmarks/bench_saga.py +++ b/benchmarks/bench_saga.py @@ -3,45 +3,61 @@ Benchmarks of sklearn SAGA vs lightning SAGA vs Liblinear. Shows the gain in using multinomial logistic regression in term of learning time. """ + import json -import time import os +import time -from joblib import Parallel -from sklearn.utils.fixes import delayed import matplotlib.pyplot as plt import numpy as np -from sklearn.datasets import fetch_rcv1, load_iris, load_digits, \ - fetch_20newsgroups_vectorized +from sklearn.datasets import ( + fetch_20newsgroups_vectorized, + fetch_rcv1, + load_digits, + load_iris, +) from sklearn.linear_model import LogisticRegression from sklearn.metrics import log_loss from sklearn.model_selection import train_test_split +from sklearn.multiclass import OneVsRestClassifier from sklearn.preprocessing import LabelBinarizer, LabelEncoder from sklearn.utils.extmath import safe_sparse_dot, softmax - - -def fit_single(solver, X, y, penalty='l2', single_target=True, C=1, - max_iter=10, skip_slow=False, dtype=np.float64): - if skip_slow and solver == 'lightning' and penalty == 'l1': - print('skip_slowping l1 logistic regression with solver lightning.') +from sklearn.utils.parallel import Parallel, delayed + + +def fit_single( + solver, + X, + y, + penalty="l2", + single_target=True, + C=1, + max_iter=10, + skip_slow=False, + dtype=np.float64, +): + if skip_slow and solver == "lightning" and penalty == "l1": + print("skip_slowping l1 logistic regression with solver lightning.") return - print('Solving %s logistic regression with penalty %s, solver %s.' - % ('binary' if single_target else 'multinomial', - penalty, solver)) + print( + "Solving %s logistic regression with penalty %s, solver %s." + % ("binary" if single_target else "multinomial", penalty, solver) + ) - if solver == 'lightning': + if solver == "lightning": from lightning.classification import SAGAClassifier - if single_target or solver not in ['sag', 'saga']: - multi_class = 'ovr' + if single_target or solver not in ["sag", "saga"]: + multi_class = "ovr" else: - multi_class = 'multinomial' + multi_class = "multinomial" X = X.astype(dtype) y = y.astype(dtype) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, - stratify=y) + X_train, X_test, y_train, y_test = train_test_split( + X, y, random_state=42, stratify=y + ) n_samples = X_train.shape[0] n_classes = np.unique(y_train).shape[0] test_scores = [1] @@ -49,32 +65,46 @@ def fit_single(solver, X, y, penalty='l2', single_target=True, C=1, accuracies = [1 / n_classes] times = [0] - if penalty == 'l2': - alpha = 1. / (C * n_samples) + if penalty == "l2": + alpha = 1.0 / (C * n_samples) beta = 0 lightning_penalty = None else: - alpha = 0. - beta = 1. 
/ (C * n_samples) - lightning_penalty = 'l1' + alpha = 0.0 + beta = 1.0 / (C * n_samples) + lightning_penalty = "l1" for this_max_iter in range(1, max_iter + 1, 2): - print('[%s, %s, %s] Max iter: %s' % - ('binary' if single_target else 'multinomial', - penalty, solver, this_max_iter)) - if solver == 'lightning': - lr = SAGAClassifier(loss='log', alpha=alpha, beta=beta, - penalty=lightning_penalty, - tol=-1, max_iter=this_max_iter) + print( + "[%s, %s, %s] Max iter: %s" + % ( + "binary" if single_target else "multinomial", + penalty, + solver, + this_max_iter, + ) + ) + if solver == "lightning": + lr = SAGAClassifier( + loss="log", + alpha=alpha, + beta=beta, + penalty=lightning_penalty, + tol=-1, + max_iter=this_max_iter, + ) else: - lr = LogisticRegression(solver=solver, - multi_class=multi_class, - C=C, - penalty=penalty, - fit_intercept=False, tol=0, - max_iter=this_max_iter, - random_state=42, - ) + lr = LogisticRegression( + solver=solver, + C=C, + penalty=penalty, + fit_intercept=False, + tol=0, + max_iter=this_max_iter, + random_state=42, + ) + if multi_class == "ovr": + lr = OneVsRestClassifier(lr) # Makes cpu cache even for all fit calls X_train.max() @@ -84,15 +114,18 @@ def fit_single(solver, X, y, penalty='l2', single_target=True, C=1, train_time = time.clock() - t0 scores = [] - for (X, y) in [(X_train, y_train), (X_test, y_test)]: + for X, y in [(X_train, y_train), (X_test, y_test)]: try: y_pred = lr.predict_proba(X) except NotImplementedError: # Lightning predict_proba is not implemented for n_classes > 2 y_pred = _predict_proba(lr, X) + if isinstance(lr, OneVsRestClassifier): + coef = np.concatenate([est.coef_ for est in lr.estimators_]) + else: + coef = lr.coef_ score = log_loss(y, y_pred, normalize=False) / n_samples - score += (0.5 * alpha * np.sum(lr.coef_ ** 2) + - beta * np.sum(np.abs(lr.coef_))) + score += 0.5 * alpha * np.sum(coef**2) + beta * np.sum(np.abs(coef)) scores.append(score) train_score, test_score = tuple(scores) @@ -106,21 +139,29 @@ def fit_single(solver, X, y, penalty='l2', single_target=True, C=1, def _predict_proba(lr, X): + """Predict proba for lightning for n_classes >=3.""" pred = safe_sparse_dot(X, lr.coef_.T) if hasattr(lr, "intercept_"): pred += lr.intercept_ return softmax(pred) -def exp(solvers, penalty, single_target, - n_samples=30000, max_iter=20, - dataset='rcv1', n_jobs=1, skip_slow=False): +def exp( + solvers, + penalty, + single_target, + n_samples=30000, + max_iter=20, + dataset="rcv1", + n_jobs=1, + skip_slow=False, +): dtypes_mapping = { "float64": np.float64, "float32": np.float32, } - if dataset == 'rcv1': + if dataset == "rcv1": rcv1 = fetch_rcv1() lbin = LabelBinarizer() @@ -137,17 +178,17 @@ def exp(solvers, penalty, single_target, y_n[y <= 16] = 0 y = y_n - elif dataset == 'digits': + elif dataset == "digits": X, y = load_digits(return_X_y=True) if single_target: y_n = y.copy() y_n[y < 5] = 1 y_n[y >= 5] = 0 y = y_n - elif dataset == 'iris': + elif dataset == "iris": iris = load_iris() X, y = iris.data, iris.target - elif dataset == '20newspaper': + elif dataset == "20newspaper": ng = fetch_20newsgroups_vectorized() X = ng.data y = ng.target @@ -161,44 +202,55 @@ def exp(solvers, penalty, single_target, y = y[:n_samples] out = Parallel(n_jobs=n_jobs, mmap_mode=None)( - delayed(fit_single)(solver, X, y, - penalty=penalty, single_target=single_target, - dtype=dtype, - C=1, max_iter=max_iter, skip_slow=skip_slow) + delayed(fit_single)( + solver, + X, + y, + penalty=penalty, + single_target=single_target, + dtype=dtype, + C=1, + 
max_iter=max_iter, + skip_slow=skip_slow, + ) for solver in solvers - for dtype in dtypes_mapping.values()) + for dtype in dtypes_mapping.values() + ) res = [] idx = 0 for dtype_name in dtypes_mapping.keys(): for solver in solvers: - if not (skip_slow and - solver == 'lightning' and - penalty == 'l1'): + if not (skip_slow and solver == "lightning" and penalty == "l1"): lr, times, train_scores, test_scores, accuracies = out[idx] - this_res = dict(solver=solver, penalty=penalty, - dtype=dtype_name, - single_target=single_target, - times=times, train_scores=train_scores, - test_scores=test_scores, - accuracies=accuracies) + this_res = dict( + solver=solver, + penalty=penalty, + dtype=dtype_name, + single_target=single_target, + times=times, + train_scores=train_scores, + test_scores=test_scores, + accuracies=accuracies, + ) res.append(this_res) idx += 1 - with open('bench_saga.json', 'w+') as f: + with open("bench_saga.json", "w+") as f: json.dump(res, f) def plot(outname=None): import pandas as pd - with open('bench_saga.json', 'r') as f: + + with open("bench_saga.json", "r") as f: f = json.load(f) res = pd.DataFrame(f) - res.set_index(['single_target'], inplace=True) + res.set_index(["single_target"], inplace=True) - grouped = res.groupby(level=['single_target']) + grouped = res.groupby(level=["single_target"]) - colors = {'saga': 'C0', 'liblinear': 'C1', 'lightning': 'C2'} + colors = {"saga": "C0", "liblinear": "C1", "lightning": "C2"} linestyles = {"float32": "--", "float64": "-"} alpha = {"float64": 0.5, "float32": 1} @@ -207,93 +259,122 @@ def plot(outname=None): fig, axes = plt.subplots(figsize=(12, 4), ncols=4) ax = axes[0] - for scores, times, solver, dtype in zip(group['train_scores'], - group['times'], - group['solver'], - group["dtype"]): - ax.plot(times, scores, label="%s - %s" % (solver, dtype), - color=colors[solver], - alpha=alpha[dtype], - marker=".", - linestyle=linestyles[dtype]) - ax.axvline(times[-1], color=colors[solver], - alpha=alpha[dtype], - linestyle=linestyles[dtype]) - ax.set_xlabel('Time (s)') - ax.set_ylabel('Training objective (relative to min)') - ax.set_yscale('log') + for scores, times, solver, dtype in zip( + group["train_scores"], group["times"], group["solver"], group["dtype"] + ): + ax.plot( + times, + scores, + label="%s - %s" % (solver, dtype), + color=colors[solver], + alpha=alpha[dtype], + marker=".", + linestyle=linestyles[dtype], + ) + ax.axvline( + times[-1], + color=colors[solver], + alpha=alpha[dtype], + linestyle=linestyles[dtype], + ) + ax.set_xlabel("Time (s)") + ax.set_ylabel("Training objective (relative to min)") + ax.set_yscale("log") ax = axes[1] - for scores, times, solver, dtype in zip(group['test_scores'], - group['times'], - group['solver'], - group["dtype"]): - ax.plot(times, scores, label=solver, color=colors[solver], - linestyle=linestyles[dtype], - marker=".", - alpha=alpha[dtype]) - ax.axvline(times[-1], color=colors[solver], - alpha=alpha[dtype], - linestyle=linestyles[dtype]) - - ax.set_xlabel('Time (s)') - ax.set_ylabel('Test objective (relative to min)') - ax.set_yscale('log') + for scores, times, solver, dtype in zip( + group["test_scores"], group["times"], group["solver"], group["dtype"] + ): + ax.plot( + times, + scores, + label=solver, + color=colors[solver], + linestyle=linestyles[dtype], + marker=".", + alpha=alpha[dtype], + ) + ax.axvline( + times[-1], + color=colors[solver], + alpha=alpha[dtype], + linestyle=linestyles[dtype], + ) + + ax.set_xlabel("Time (s)") + ax.set_ylabel("Test objective (relative to min)") + 
ax.set_yscale("log") ax = axes[2] - for accuracy, times, solver, dtype in zip(group['accuracies'], - group['times'], - group['solver'], - group["dtype"]): - ax.plot(times, accuracy, label="%s - %s" % (solver, dtype), - alpha=alpha[dtype], - marker=".", - color=colors[solver], linestyle=linestyles[dtype]) - ax.axvline(times[-1], color=colors[solver], - alpha=alpha[dtype], - linestyle=linestyles[dtype]) - - ax.set_xlabel('Time (s)') - ax.set_ylabel('Test accuracy') + for accuracy, times, solver, dtype in zip( + group["accuracies"], group["times"], group["solver"], group["dtype"] + ): + ax.plot( + times, + accuracy, + label="%s - %s" % (solver, dtype), + alpha=alpha[dtype], + marker=".", + color=colors[solver], + linestyle=linestyles[dtype], + ) + ax.axvline( + times[-1], + color=colors[solver], + alpha=alpha[dtype], + linestyle=linestyles[dtype], + ) + + ax.set_xlabel("Time (s)") + ax.set_ylabel("Test accuracy") ax.legend() - name = 'single_target' if single_target else 'multi_target' - name += '_%s' % penalty + name = "single_target" if single_target else "multi_target" + name += "_%s" % penalty plt.suptitle(name) if outname is None: - outname = name + '.png' + outname = name + ".png" fig.tight_layout() fig.subplots_adjust(top=0.9) ax = axes[3] - for scores, times, solver, dtype in zip(group['train_scores'], - group['times'], - group['solver'], - group["dtype"]): - ax.plot(np.arange(len(scores)), - scores, label="%s - %s" % (solver, dtype), - marker=".", - alpha=alpha[dtype], - color=colors[solver], linestyle=linestyles[dtype]) + for scores, times, solver, dtype in zip( + group["train_scores"], group["times"], group["solver"], group["dtype"] + ): + ax.plot( + np.arange(len(scores)), + scores, + label="%s - %s" % (solver, dtype), + marker=".", + alpha=alpha[dtype], + color=colors[solver], + linestyle=linestyles[dtype], + ) ax.set_yscale("log") - ax.set_xlabel('# iterations') - ax.set_ylabel('Objective function') + ax.set_xlabel("# iterations") + ax.set_ylabel("Objective function") ax.legend() plt.savefig(outname) -if __name__ == '__main__': - solvers = ['saga', 'liblinear', 'lightning'] - penalties = ['l1', 'l2'] +if __name__ == "__main__": + solvers = ["saga", "liblinear", "lightning"] + penalties = ["l1", "l2"] n_samples = [100000, 300000, 500000, 800000, None] single_target = True for penalty in penalties: for n_sample in n_samples: - exp(solvers, penalty, single_target, - n_samples=n_sample, n_jobs=1, - dataset='rcv1', max_iter=10) + exp( + solvers, + penalty, + single_target, + n_samples=n_sample, + n_jobs=1, + dataset="rcv1", + max_iter=10, + ) if n_sample is not None: outname = "figures/saga_%s_%d.png" % (penalty, n_sample) else: diff --git a/benchmarks/bench_sample_without_replacement.py b/benchmarks/bench_sample_without_replacement.py index fcd41640843e7..39cf1a11ffed6 100644 --- a/benchmarks/bench_sample_without_replacement.py +++ b/benchmarks/bench_sample_without_replacement.py @@ -2,21 +2,22 @@ Benchmarks for sampling without replacement of integer. 
""" + import gc -import sys +import operator import optparse +import random +import sys from datetime import datetime -import operator import matplotlib.pyplot as plt import numpy as np -import random from sklearn.utils.random import sample_without_replacement def compute_time(t_start, delta): - mu_second = 0.0 + 10 ** 6 # number of microseconds in a second + mu_second = 0.0 + 10**6 # number of microseconds in a second return delta.seconds + delta.microseconds / mu_second @@ -26,38 +27,57 @@ def bench_sample(sampling, n_population, n_samples): # start time t_start = datetime.now() sampling(n_population, n_samples) - delta = (datetime.now() - t_start) + delta = datetime.now() - t_start # stop time time = compute_time(t_start, delta) return time + if __name__ == "__main__": ########################################################################### # Option parser ########################################################################### op = optparse.OptionParser() - op.add_option("--n-times", - dest="n_times", default=5, type=int, - help="Benchmark results are average over n_times experiments") - - op.add_option("--n-population", - dest="n_population", default=100000, type=int, - help="Size of the population to sample from.") - - op.add_option("--n-step", - dest="n_steps", default=5, type=int, - help="Number of step interval between 0 and n_population.") - - default_algorithms = "custom-tracking-selection,custom-auto," \ - "custom-reservoir-sampling,custom-pool,"\ - "python-core-sample,numpy-permutation" - - op.add_option("--algorithm", - dest="selected_algorithm", - default=default_algorithms, - type=str, - help="Comma-separated list of transformer to benchmark. " - "Default: %default. \nAvailable: %default") + op.add_option( + "--n-times", + dest="n_times", + default=5, + type=int, + help="Benchmark results are average over n_times experiments", + ) + + op.add_option( + "--n-population", + dest="n_population", + default=100000, + type=int, + help="Size of the population to sample from.", + ) + + op.add_option( + "--n-step", + dest="n_steps", + default=5, + type=int, + help="Number of step interval between 0 and n_population.", + ) + + default_algorithms = ( + "custom-tracking-selection,custom-auto," + "custom-reservoir-sampling,custom-pool," + "python-core-sample,numpy-permutation" + ) + + op.add_option( + "--algorithm", + dest="selected_algorithm", + default=default_algorithms, + type=str, + help=( + "Comma-separated list of transformer to benchmark. " + "Default: %default. \nAvailable: %default" + ), + ) # op.add_option("--random-seed", # dest="random_seed", default=13, type=int, @@ -68,11 +88,13 @@ def bench_sample(sampling, n_population, n_samples): op.error("this script takes no arguments.") sys.exit(1) - selected_algorithm = opts.selected_algorithm.split(',') + selected_algorithm = opts.selected_algorithm.split(",") for key in selected_algorithm: - if key not in default_algorithms.split(','): - raise ValueError("Unknown sampling algorithm \"%s\" not in (%s)." - % (key, default_algorithms)) + if key not in default_algorithms.split(","): + raise ValueError( + 'Unknown sampling algorithm "%s" not in (%s).' 
+ % (key, default_algorithms) + ) ########################################################################### # List sampling algorithm @@ -84,66 +106,73 @@ def bench_sample(sampling, n_population, n_samples): ########################################################################### # Set Python core input - sampling_algorithm["python-core-sample"] = \ - lambda n_population, n_sample: \ - random.sample(range(n_population), n_sample) + sampling_algorithm["python-core-sample"] = ( + lambda n_population, n_sample: random.sample(range(n_population), n_sample) + ) ########################################################################### # Set custom automatic method selection - sampling_algorithm["custom-auto"] = \ - lambda n_population, n_samples, random_state=None: \ - sample_without_replacement(n_population, n_samples, method="auto", - random_state=random_state) + sampling_algorithm["custom-auto"] = ( + lambda n_population, n_samples, random_state=None: sample_without_replacement( + n_population, n_samples, method="auto", random_state=random_state + ) + ) ########################################################################### # Set custom tracking based method - sampling_algorithm["custom-tracking-selection"] = \ - lambda n_population, n_samples, random_state=None: \ - sample_without_replacement(n_population, - n_samples, - method="tracking_selection", - random_state=random_state) + sampling_algorithm["custom-tracking-selection"] = ( + lambda n_population, n_samples, random_state=None: sample_without_replacement( + n_population, + n_samples, + method="tracking_selection", + random_state=random_state, + ) + ) ########################################################################### # Set custom reservoir based method - sampling_algorithm["custom-reservoir-sampling"] = \ - lambda n_population, n_samples, random_state=None: \ - sample_without_replacement(n_population, - n_samples, - method="reservoir_sampling", - random_state=random_state) + sampling_algorithm["custom-reservoir-sampling"] = ( + lambda n_population, n_samples, random_state=None: sample_without_replacement( + n_population, + n_samples, + method="reservoir_sampling", + random_state=random_state, + ) + ) ########################################################################### # Set custom reservoir based method - sampling_algorithm["custom-pool"] = \ - lambda n_population, n_samples, random_state=None: \ - sample_without_replacement(n_population, - n_samples, - method="pool", - random_state=random_state) + sampling_algorithm["custom-pool"] = ( + lambda n_population, n_samples, random_state=None: sample_without_replacement( + n_population, n_samples, method="pool", random_state=random_state + ) + ) ########################################################################### # Numpy permutation based - sampling_algorithm["numpy-permutation"] = \ - lambda n_population, n_sample: \ - np.random.permutation(n_population)[:n_sample] + sampling_algorithm["numpy-permutation"] = ( + lambda n_population, n_sample: np.random.permutation(n_population)[:n_sample] + ) ########################################################################### # Remove unspecified algorithm - sampling_algorithm = {key: value - for key, value in sampling_algorithm.items() - if key in selected_algorithm} + sampling_algorithm = { + key: value + for key, value in sampling_algorithm.items() + if key in selected_algorithm + } ########################################################################### # Perform benchmark 
########################################################################### time = {} - n_samples = np.linspace(start=0, stop=opts.n_population, - num=opts.n_steps).astype(int) + n_samples = np.linspace(start=0, stop=opts.n_population, num=opts.n_steps).astype( + int + ) ratio = n_samples / opts.n_population - print('Benchmarks') + print("Benchmarks") print("===========================") for name in sorted(sampling_algorithm): @@ -152,9 +181,9 @@ def bench_sample(sampling, n_population, n_samples): for step in range(opts.n_steps): for it in range(opts.n_times): - time[name][step, it] = bench_sample(sampling_algorithm[name], - opts.n_population, - n_samples[step]) + time[name][step, it] = bench_sample( + sampling_algorithm[name], opts.n_population, n_samples[step] + ) print("done") @@ -168,12 +197,16 @@ def bench_sample(sampling, n_population, n_samples): print("Script arguments") print("===========================") arguments = vars(opts) - print("%s \t | %s " % ("Arguments".ljust(16), - "Value".center(12),)) + print( + "%s \t | %s " + % ( + "Arguments".ljust(16), + "Value".center(12), + ) + ) print(25 * "-" + ("|" + "-" * 14) * 1) for key, value in arguments.items(): - print("%s \t | %s " % (str(key).ljust(16), - str(value).strip().center(12))) + print("%s \t | %s " % (str(key).ljust(16), str(value).strip().center(12))) print("") print("Sampling algorithm performance:") @@ -181,15 +214,14 @@ def bench_sample(sampling, n_population, n_samples): print("Results are averaged over %s repetition(s)." % opts.n_times) print("") - fig = plt.figure('scikit-learn sample w/o replacement benchmark results') - plt.title("n_population = %s, n_times = %s" % - (opts.n_population, opts.n_times)) + fig = plt.figure("scikit-learn sample w/o replacement benchmark results") + fig.suptitle("n_population = %s, n_times = %s" % (opts.n_population, opts.n_times)) ax = fig.add_subplot(111) for name in sampling_algorithm: ax.plot(ratio, time[name], label=name) - ax.set_xlabel('ratio of n_sample / n_population') - ax.set_ylabel('Time (s)') + ax.set_xlabel("ratio of n_sample / n_population") + ax.set_ylabel("Time (s)") ax.legend() # Sort legend labels diff --git a/benchmarks/bench_sgd_regression.py b/benchmarks/bench_sgd_regression.py index 1f5c6320b03e5..bd00615e3d5f9 100644 --- a/benchmarks/bench_sgd_regression.py +++ b/benchmarks/bench_sgd_regression.py @@ -1,16 +1,15 @@ -# Author: Peter Prettenhofer -# License: BSD 3 clause - -import numpy as np -import matplotlib.pyplot as plt +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause import gc - from time import time -from sklearn.linear_model import Ridge, SGDRegressor, ElasticNet -from sklearn.metrics import mean_squared_error +import matplotlib.pyplot as plt +import numpy as np + from sklearn.datasets import make_regression +from sklearn.linear_model import ElasticNet, Ridge, SGDRegressor +from sklearn.metrics import mean_squared_error """ Benchmark for SGD regression @@ -35,8 +34,11 @@ for i, n_train in enumerate(list_n_samples): for j, n_features in enumerate(list_n_features): X, y, coef = make_regression( - n_samples=n_train + n_test, n_features=n_features, - noise=noise, coef=True) + n_samples=n_train + n_test, + n_features=n_features, + noise=noise, + coef=True, + ) X_train = X[:n_train] y_train = y[:n_train] @@ -70,34 +72,43 @@ clf = ElasticNet(alpha=alpha, l1_ratio=0.5, fit_intercept=False) tstart = time() clf.fit(X_train, y_train) - elnet_results[i, j, 0] = mean_squared_error(clf.predict(X_test), - y_test) + elnet_results[i, j, 0] 
= mean_squared_error(clf.predict(X_test), y_test) elnet_results[i, j, 1] = time() - tstart gc.collect() print("- benchmarking SGD") - clf = SGDRegressor(alpha=alpha / n_train, fit_intercept=False, - max_iter=max_iter, learning_rate="invscaling", - eta0=.01, power_t=0.25, tol=1e-3) + clf = SGDRegressor( + alpha=alpha / n_train, + fit_intercept=False, + max_iter=max_iter, + learning_rate="invscaling", + eta0=0.01, + power_t=0.25, + tol=1e-3, + ) tstart = time() clf.fit(X_train, y_train) - sgd_results[i, j, 0] = mean_squared_error(clf.predict(X_test), - y_test) + sgd_results[i, j, 0] = mean_squared_error(clf.predict(X_test), y_test) sgd_results[i, j, 1] = time() - tstart gc.collect() print("max_iter", max_iter) print("- benchmarking A-SGD") - clf = SGDRegressor(alpha=alpha / n_train, fit_intercept=False, - max_iter=max_iter, learning_rate="invscaling", - eta0=.002, power_t=0.05, tol=1e-3, - average=(max_iter * n_train // 2)) + clf = SGDRegressor( + alpha=alpha / n_train, + fit_intercept=False, + max_iter=max_iter, + learning_rate="invscaling", + eta0=0.002, + power_t=0.05, + tol=1e-3, + average=(max_iter * n_train // 2), + ) tstart = time() clf.fit(X_train, y_train) - asgd_results[i, j, 0] = mean_squared_error(clf.predict(X_test), - y_test) + asgd_results[i, j, 0] = mean_squared_error(clf.predict(X_test), y_test) asgd_results[i, j, 1] = time() - tstart gc.collect() @@ -105,25 +116,19 @@ clf = Ridge(alpha=alpha, fit_intercept=False) tstart = time() clf.fit(X_train, y_train) - ridge_results[i, j, 0] = mean_squared_error(clf.predict(X_test), - y_test) + ridge_results[i, j, 0] = mean_squared_error(clf.predict(X_test), y_test) ridge_results[i, j, 1] = time() - tstart # Plot results i = 0 m = len(list_n_features) - plt.figure('scikit-learn SGD regression benchmark results', - figsize=(5 * 2, 4 * m)) + plt.figure("scikit-learn SGD regression benchmark results", figsize=(5 * 2, 4 * m)) for j in range(m): plt.subplot(m, 2, i + 1) - plt.plot(list_n_samples, np.sqrt(elnet_results[:, j, 0]), - label="ElasticNet") - plt.plot(list_n_samples, np.sqrt(sgd_results[:, j, 0]), - label="SGDRegressor") - plt.plot(list_n_samples, np.sqrt(asgd_results[:, j, 0]), - label="A-SGDRegressor") - plt.plot(list_n_samples, np.sqrt(ridge_results[:, j, 0]), - label="Ridge") + plt.plot(list_n_samples, np.sqrt(elnet_results[:, j, 0]), label="ElasticNet") + plt.plot(list_n_samples, np.sqrt(sgd_results[:, j, 0]), label="SGDRegressor") + plt.plot(list_n_samples, np.sqrt(asgd_results[:, j, 0]), label="A-SGDRegressor") + plt.plot(list_n_samples, np.sqrt(ridge_results[:, j, 0]), label="Ridge") plt.legend(prop={"size": 10}) plt.xlabel("n_train") plt.ylabel("RMSE") @@ -131,20 +136,16 @@ i += 1 plt.subplot(m, 2, i + 1) - plt.plot(list_n_samples, np.sqrt(elnet_results[:, j, 1]), - label="ElasticNet") - plt.plot(list_n_samples, np.sqrt(sgd_results[:, j, 1]), - label="SGDRegressor") - plt.plot(list_n_samples, np.sqrt(asgd_results[:, j, 1]), - label="A-SGDRegressor") - plt.plot(list_n_samples, np.sqrt(ridge_results[:, j, 1]), - label="Ridge") + plt.plot(list_n_samples, np.sqrt(elnet_results[:, j, 1]), label="ElasticNet") + plt.plot(list_n_samples, np.sqrt(sgd_results[:, j, 1]), label="SGDRegressor") + plt.plot(list_n_samples, np.sqrt(asgd_results[:, j, 1]), label="A-SGDRegressor") + plt.plot(list_n_samples, np.sqrt(ridge_results[:, j, 1]), label="Ridge") plt.legend(prop={"size": 10}) plt.xlabel("n_train") plt.ylabel("Time [sec]") plt.title("Training time - %d features" % list_n_features[j]) i += 1 - plt.subplots_adjust(hspace=.30) + 
plt.subplots_adjust(hspace=0.30) plt.show() diff --git a/benchmarks/bench_sparsify.py b/benchmarks/bench_sparsify.py index be1f3bffe0181..1832ca40c6ddb 100644 --- a/benchmarks/bench_sparsify.py +++ b/benchmarks/bench_sparsify.py @@ -43,8 +43,9 @@ 60 300 381409 1271.4 97.1 clf.predict(X_test_sparse) """ -from scipy.sparse.csr import csr_matrix import numpy as np +from scipy.sparse import csr_matrix + from sklearn.linear_model import SGDRegressor from sklearn.metrics import r2_score @@ -54,16 +55,17 @@ def sparsity_ratio(X): return np.count_nonzero(X) / float(n_samples * n_features) + n_samples, n_features = 5000, 300 X = np.random.randn(n_samples, n_features) inds = np.arange(n_samples) np.random.shuffle(inds) -X[inds[int(n_features / 1.2):]] = 0 # sparsify input +X[inds[int(n_features / 1.2) :]] = 0 # sparsify input print("input data sparsity: %f" % sparsity_ratio(X)) coef = 3 * np.random.randn(n_features) inds = np.arange(n_features) np.random.shuffle(inds) -coef[inds[n_features // 2:]] = 0 # sparsify coef +coef[inds[n_features // 2 :]] = 0 # sparsify coef print("true coef sparsity: %f" % sparsity_ratio(coef)) y = np.dot(X, coef) @@ -72,13 +74,12 @@ def sparsity_ratio(X): # Split data in train set and test set n_samples = X.shape[0] -X_train, y_train = X[:n_samples // 2], y[:n_samples // 2] -X_test, y_test = X[n_samples // 2:], y[n_samples // 2:] +X_train, y_train = X[: n_samples // 2], y[: n_samples // 2] +X_test, y_test = X[n_samples // 2 :], y[n_samples // 2 :] print("test data sparsity: %f" % sparsity_ratio(X_test)) ############################################################################### -clf = SGDRegressor(penalty='l1', alpha=.2, max_iter=2000, - tol=None) +clf = SGDRegressor(penalty="l1", alpha=0.2, max_iter=2000, tol=None) clf.fit(X_train, y_train) print("model sparsity: %f" % sparsity_ratio(clf.coef_)) @@ -98,8 +99,9 @@ def score(y_test, y_pred, case): r2 = r2_score(y_test, y_pred) print("r^2 on test data (%s) : %f" % (case, r2)) -score(y_test, clf.predict(X_test), 'dense model') + +score(y_test, clf.predict(X_test), "dense model") benchmark_dense_predict() clf.sparsify() -score(y_test, clf.predict(X_test), 'sparse model') +score(y_test, clf.predict(X_test), "sparse model") benchmark_sparse_predict() diff --git a/benchmarks/bench_text_vectorizers.py b/benchmarks/bench_text_vectorizers.py index 96dbc04312291..2eab7071544f9 100644 --- a/benchmarks/bench_text_vectorizers.py +++ b/benchmarks/bench_text_vectorizers.py @@ -8,16 +8,20 @@ * psutil (optional, but recommended) """ -import timeit + import itertools +import timeit import numpy as np import pandas as pd from memory_profiler import memory_usage from sklearn.datasets import fetch_20newsgroups -from sklearn.feature_extraction.text import (CountVectorizer, TfidfVectorizer, - HashingVectorizer) +from sklearn.feature_extraction.text import ( + CountVectorizer, + HashingVectorizer, + TfidfVectorizer, +) n_repeat = 3 @@ -26,47 +30,45 @@ def run_vectorizer(Vectorizer, X, **params): def f(): vect = Vectorizer(**params) vect.fit_transform(X) + return f -text = fetch_20newsgroups(subset='train').data[:1000] +text = fetch_20newsgroups(subset="train").data[:1000] -print("="*80 + '\n#' + " Text vectorizers benchmark" + '\n' + '='*80 + '\n') -print("Using a subset of the 20 newsgroups dataset ({} documents)." 
- .format(len(text))) +print("=" * 80 + "\n#" + " Text vectorizers benchmark" + "\n" + "=" * 80 + "\n") +print("Using a subset of the 20 newsgroups dataset ({} documents).".format(len(text))) print("This benchmarks runs in ~1 min ...") res = [] for Vectorizer, (analyzer, ngram_range) in itertools.product( - [CountVectorizer, TfidfVectorizer, HashingVectorizer], - [('word', (1, 1)), - ('word', (1, 2)), - ('char', (4, 4)), - ('char_wb', (4, 4)) - ]): - - bench = {'vectorizer': Vectorizer.__name__} - params = {'analyzer': analyzer, 'ngram_range': ngram_range} + [CountVectorizer, TfidfVectorizer, HashingVectorizer], + [("word", (1, 1)), ("word", (1, 2)), ("char", (4, 4)), ("char_wb", (4, 4))], +): + bench = {"vectorizer": Vectorizer.__name__} + params = {"analyzer": analyzer, "ngram_range": ngram_range} bench.update(params) - dt = timeit.repeat(run_vectorizer(Vectorizer, text, **params), - number=1, - repeat=n_repeat) - bench['time'] = "{:.3f} (+-{:.3f})".format(np.mean(dt), np.std(dt)) + dt = timeit.repeat( + run_vectorizer(Vectorizer, text, **params), number=1, repeat=n_repeat + ) + bench["time"] = "{:.3f} (+-{:.3f})".format(np.mean(dt), np.std(dt)) mem_usage = memory_usage(run_vectorizer(Vectorizer, text, **params)) - bench['memory'] = "{:.1f}".format(np.max(mem_usage)) + bench["memory"] = "{:.1f}".format(np.max(mem_usage)) res.append(bench) -df = pd.DataFrame(res).set_index(['analyzer', 'ngram_range', 'vectorizer']) +df = pd.DataFrame(res).set_index(["analyzer", "ngram_range", "vectorizer"]) -print('\n========== Run time performance (sec) ===========\n') -print('Computing the mean and the standard deviation ' - 'of the run time over {} runs...\n'.format(n_repeat)) -print(df['time'].unstack(level=-1)) +print("\n========== Run time performance (sec) ===========\n") +print( + "Computing the mean and the standard deviation " + "of the run time over {} runs...\n".format(n_repeat) +) +print(df["time"].unstack(level=-1)) -print('\n=============== Memory usage (MB) ===============\n') -print(df['memory'].unstack(level=-1)) +print("\n=============== Memory usage (MB) ===============\n") +print(df["memory"].unstack(level=-1)) diff --git a/benchmarks/bench_tree.py b/benchmarks/bench_tree.py index 8a0af26d4c221..c522bcb39e994 100644 --- a/benchmarks/bench_tree.py +++ b/benchmarks/bench_tree.py @@ -13,16 +13,18 @@ training set, classify a sample and plot the time taken as a function of the number of dimensions. 
""" -import numpy as np -import matplotlib.pyplot as plt + import gc from datetime import datetime +import matplotlib.pyplot as plt +import numpy as np + # to store the results scikit_classifier_results = [] scikit_regressor_results = [] -mu_second = 0.0 + 10 ** 6 # number of microseconds in a second +mu_second = 0.0 + 10**6 # number of microseconds in a second def bench_scikit_tree_classifier(X, Y): @@ -36,11 +38,10 @@ def bench_scikit_tree_classifier(X, Y): tstart = datetime.now() clf = DecisionTreeClassifier() clf.fit(X, Y).predict(X) - delta = (datetime.now() - tstart) + delta = datetime.now() - tstart # stop time - scikit_classifier_results.append( - delta.seconds + delta.microseconds / mu_second) + scikit_classifier_results.append(delta.seconds + delta.microseconds / mu_second) def bench_scikit_tree_regressor(X, Y): @@ -54,18 +55,16 @@ def bench_scikit_tree_regressor(X, Y): tstart = datetime.now() clf = DecisionTreeRegressor() clf.fit(X, Y).predict(X) - delta = (datetime.now() - tstart) + delta = datetime.now() - tstart # stop time - scikit_regressor_results.append( - delta.seconds + delta.microseconds / mu_second) - + scikit_regressor_results.append(delta.seconds + delta.microseconds / mu_second) -if __name__ == '__main__': - print('============================================') - print('Warning: this is going to take a looong time') - print('============================================') +if __name__ == "__main__": + print("============================================") + print("Warning: this is going to take a looong time") + print("============================================") n = 10 step = 10000 @@ -73,9 +72,9 @@ def bench_scikit_tree_regressor(X, Y): dim = 10 n_classes = 10 for i in range(n): - print('============================================') - print('Entering iteration %s of %s' % (i, n)) - print('============================================') + print("============================================") + print("Entering iteration %s of %s" % (i, n)) + print("============================================") n_samples += step X = np.random.randn(n_samples, dim) Y = np.random.randint(0, n_classes, (n_samples,)) @@ -84,14 +83,14 @@ def bench_scikit_tree_regressor(X, Y): bench_scikit_tree_regressor(X, Y) xx = range(0, n * step, step) - plt.figure('scikit-learn tree benchmark results') + plt.figure("scikit-learn tree benchmark results") plt.subplot(211) - plt.title('Learning with varying number of samples') - plt.plot(xx, scikit_classifier_results, 'g-', label='classification') - plt.plot(xx, scikit_regressor_results, 'r-', label='regression') - plt.legend(loc='upper left') - plt.xlabel('number of samples') - plt.ylabel('Time (s)') + plt.title("Learning with varying number of samples") + plt.plot(xx, scikit_classifier_results, "g-", label="classification") + plt.plot(xx, scikit_regressor_results, "r-", label="regression") + plt.legend(loc="upper left") + plt.xlabel("number of samples") + plt.ylabel("Time (s)") scikit_classifier_results = [] scikit_regressor_results = [] @@ -102,9 +101,9 @@ def bench_scikit_tree_regressor(X, Y): dim = start_dim for i in range(0, n): - print('============================================') - print('Entering iteration %s of %s' % (i, n)) - print('============================================') + print("============================================") + print("Entering iteration %s of %s" % (i, n)) + print("============================================") dim += step X = np.random.randn(100, dim) Y = np.random.randint(0, n_classes, (100,)) @@ -114,11 +113,11 
@@ def bench_scikit_tree_regressor(X, Y): xx = np.arange(start_dim, start_dim + n * step, step) plt.subplot(212) - plt.title('Learning in high dimensional spaces') - plt.plot(xx, scikit_classifier_results, 'g-', label='classification') - plt.plot(xx, scikit_regressor_results, 'r-', label='regression') - plt.legend(loc='upper left') - plt.xlabel('number of dimensions') - plt.ylabel('Time (s)') - plt.axis('tight') + plt.title("Learning in high dimensional spaces") + plt.plot(xx, scikit_classifier_results, "g-", label="classification") + plt.plot(xx, scikit_regressor_results, "r-", label="regression") + plt.legend(loc="upper left") + plt.xlabel("number of dimensions") + plt.ylabel("Time (s)") + plt.axis("tight") plt.show() diff --git a/benchmarks/bench_tsne_mnist.py b/benchmarks/bench_tsne_mnist.py index 1f1dc5143d177..8649c7a46b629 100644 --- a/benchmarks/bench_tsne_mnist.py +++ b/benchmarks/bench_tsne_mnist.py @@ -5,20 +5,21 @@ """ -# License: BSD 3 clause +# SPDX-License-Identifier: BSD-3-Clause +import argparse +import json import os import os.path as op from time import time + import numpy as np -import json -import argparse from joblib import Memory from sklearn.datasets import fetch_openml +from sklearn.decomposition import PCA from sklearn.manifold import TSNE from sklearn.neighbors import NearestNeighbors -from sklearn.decomposition import PCA from sklearn.utils import check_array from sklearn.utils import shuffle as _shuffle from sklearn.utils._openmp_helpers import _openmp_effective_n_threads @@ -28,17 +29,16 @@ os.mkdir(LOG_DIR) -memory = Memory(os.path.join(LOG_DIR, 'mnist_tsne_benchmark_data'), - mmap_mode='r') +memory = Memory(os.path.join(LOG_DIR, "mnist_tsne_benchmark_data"), mmap_mode="r") @memory.cache -def load_data(dtype=np.float32, order='C', shuffle=True, seed=0): +def load_data(dtype=np.float32, order="C", shuffle=True, seed=0): """Load the data, then cache and memmap the train/test split""" print("Loading dataset...") - data = fetch_openml('mnist_784') + data = fetch_openml("mnist_784", as_frame=True) - X = check_array(data['data'], dtype=dtype, order=order) + X = check_array(data["data"], dtype=dtype, order=order) y = data["target"] if shuffle: @@ -63,27 +63,43 @@ def tsne_fit_transform(model, data): def sanitize(filename): - return filename.replace("/", '-').replace(" ", "_") + return filename.replace("/", "-").replace(" ", "_") if __name__ == "__main__": - parser = argparse.ArgumentParser('Benchmark for t-SNE') - parser.add_argument('--order', type=str, default='C', - help='Order of the input data') - parser.add_argument('--perplexity', type=float, default=30) - parser.add_argument('--bhtsne', action='store_true', - help="if set and the reference bhtsne code is " - "correctly installed, run it in the benchmark.") - parser.add_argument('--all', action='store_true', - help="if set, run the benchmark with the whole MNIST." - "dataset. 
Note that it will take up to 1 hour.") - parser.add_argument('--profile', action='store_true', - help="if set, run the benchmark with a memory " - "profiler.") - parser.add_argument('--verbose', type=int, default=0) - parser.add_argument('--pca-components', type=int, default=50, - help="Number of principal components for " - "preprocessing.") + parser = argparse.ArgumentParser("Benchmark for t-SNE") + parser.add_argument( + "--order", type=str, default="C", help="Order of the input data" + ) + parser.add_argument("--perplexity", type=float, default=30) + parser.add_argument( + "--bhtsne", + action="store_true", + help=( + "if set and the reference bhtsne code is " + "correctly installed, run it in the benchmark." + ), + ) + parser.add_argument( + "--all", + action="store_true", + help=( + "if set, run the benchmark with the whole MNIST." + "dataset. Note that it will take up to 1 hour." + ), + ) + parser.add_argument( + "--profile", + action="store_true", + help="if set, run the benchmark with a memory profiler.", + ) + parser.add_argument("--verbose", type=int, default=0) + parser.add_argument( + "--pca-components", + type=int, + default=50, + help="Number of principal components for preprocessing.", + ) args = parser.parse_args() print("Used number of threads: {}".format(_openmp_effective_n_threads())) @@ -92,22 +108,30 @@ def sanitize(filename): if args.pca_components > 0: t0 = time() X = PCA(n_components=args.pca_components).fit_transform(X) - print("PCA preprocessing down to {} dimensions took {:0.3f}s" - .format(args.pca_components, time() - t0)) + print( + "PCA preprocessing down to {} dimensions took {:0.3f}s".format( + args.pca_components, time() - t0 + ) + ) methods = [] # Put TSNE in methods - tsne = TSNE(n_components=2, init='pca', perplexity=args.perplexity, - verbose=args.verbose, n_iter=1000) - methods.append(("sklearn TSNE", - lambda data: tsne_fit_transform(tsne, data))) + tsne = TSNE( + n_components=2, + init="pca", + perplexity=args.perplexity, + verbose=args.verbose, + n_iter=1000, + ) + methods.append(("sklearn TSNE", lambda data: tsne_fit_transform(tsne, data))) if args.bhtsne: try: from bhtsne.bhtsne import run_bh_tsne except ImportError as e: - raise ImportError("""\ + raise ImportError( + """\ If you want comparison with the reference implementation, build the binary from source (https://github.com/lvdmaaten/bhtsne) in the folder benchmarks/bhtsne and add an empty `__init__.py` file in the folder: @@ -117,24 +141,34 @@ def sanitize(filename): $ g++ sptree.cpp tsne.cpp tsne_main.cpp -o bh_tsne -O2 $ touch __init__.py $ cd .. -""") from e +""" + ) from e def bhtsne(X): """Wrapper for the reference lvdmaaten/bhtsne implementation.""" # PCA preprocessing is done elsewhere in the benchmark script n_iter = -1 # TODO find a way to report the number of iterations - return run_bh_tsne(X, use_pca=False, perplexity=args.perplexity, - verbose=args.verbose > 0), n_iter + return ( + run_bh_tsne( + X, + use_pca=False, + perplexity=args.perplexity, + verbose=args.verbose > 0, + ), + n_iter, + ) + methods.append(("lvdmaaten/bhtsne", bhtsne)) if args.profile: - try: from memory_profiler import profile except ImportError as e: - raise ImportError("To run the benchmark with `--profile`, you " - "need to install `memory_profiler`. Please " - "run `pip install memory_profiler`.") from e + raise ImportError( + "To run the benchmark with `--profile`, you " + "need to install `memory_profiler`. Please " + "run `pip install memory_profiler`." 
+ ) from e methods = [(n, profile(m)) for n, m in methods] data_size = [100, 500, 1000, 5000, 10000] @@ -143,7 +177,7 @@ def bhtsne(X): results = [] basename = os.path.basename(os.path.splitext(__file__)[0]) - log_filename = os.path.join(LOG_DIR, basename + '.json') + log_filename = os.path.join(LOG_DIR, basename + ".json") for n in data_size: X_train = X[:n] y_train = y[:n] @@ -151,19 +185,24 @@ def bhtsne(X): for name, method in methods: print("Fitting {} on {} samples...".format(name, n)) t0 = time() - np.save(os.path.join(LOG_DIR, 'mnist_{}_{}.npy' - .format('original', n)), X_train) - np.save(os.path.join(LOG_DIR, 'mnist_{}_{}.npy' - .format('original_labels', n)), y_train) + np.save( + os.path.join(LOG_DIR, "mnist_{}_{}.npy".format("original", n)), X_train + ) + np.save( + os.path.join(LOG_DIR, "mnist_{}_{}.npy".format("original_labels", n)), + y_train, + ) X_embedded, n_iter = method(X_train) duration = time() - t0 precision_5 = nn_accuracy(X_train, X_embedded) - print("Fitting {} on {} samples took {:.3f}s in {:d} iterations, " - "nn accuracy: {:0.3f}".format( - name, n, duration, n_iter, precision_5)) + print( + "Fitting {} on {} samples took {:.3f}s in {:d} iterations, " + "nn accuracy: {:0.3f}".format(name, n, duration, n_iter, precision_5) + ) results.append(dict(method=name, duration=duration, n_samples=n)) - with open(log_filename, 'w', encoding='utf-8') as f: + with open(log_filename, "w", encoding="utf-8") as f: json.dump(results, f) method_name = sanitize(name) - np.save(op.join(LOG_DIR, 'mnist_{}_{}.npy'.format(method_name, n)), - X_embedded) + np.save( + op.join(LOG_DIR, "mnist_{}_{}.npy".format(method_name, n)), X_embedded + ) diff --git a/benchmarks/plot_tsne_mnist.py b/benchmarks/plot_tsne_mnist.py index 0ffd32b3de779..fff71eed0a26c 100644 --- a/benchmarks/plot_tsne_mnist.py +++ b/benchmarks/plot_tsne_mnist.py @@ -1,23 +1,26 @@ -import matplotlib.pyplot as plt -import numpy as np -import os.path as op - import argparse +import os.path as op +import matplotlib.pyplot as plt +import numpy as np LOG_DIR = "mnist_tsne_output" if __name__ == "__main__": - parser = argparse.ArgumentParser('Plot benchmark results for t-SNE') + parser = argparse.ArgumentParser("Plot benchmark results for t-SNE") parser.add_argument( - '--labels', type=str, - default=op.join(LOG_DIR, 'mnist_original_labels_10000.npy'), - help='1D integer numpy array for labels') + "--labels", + type=str, + default=op.join(LOG_DIR, "mnist_original_labels_10000.npy"), + help="1D integer numpy array for labels", + ) parser.add_argument( - '--embedding', type=str, - default=op.join(LOG_DIR, 'mnist_sklearn_TSNE_10000.npy'), - help='2D float numpy array for embedded data') + "--embedding", + type=str, + default=op.join(LOG_DIR, "mnist_sklearn_TSNE_10000.npy"), + help="2D float numpy array for embedded data", + ) args = parser.parse_args() X = np.load(args.embedding) @@ -26,5 +29,5 @@ for i in np.unique(y): mask = y == i plt.scatter(X[mask, 0], X[mask, 1], alpha=0.2, label=int(i)) - plt.legend(loc='best') + plt.legend(loc="best") plt.show() diff --git a/build_tools/azure/combine_coverage_reports.sh b/build_tools/azure/combine_coverage_reports.sh new file mode 100755 index 0000000000000..c3b90fdd4fcdb --- /dev/null +++ b/build_tools/azure/combine_coverage_reports.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -e + +# Defines the show_installed_libraries and activate_environment functions. 
+source build_tools/shared.sh + +activate_environment + +# Combine all coverage files generated by subprocesses workers such +# such as pytest-xdist and joblib/loky: +pushd $TEST_DIR +coverage combine --append +coverage xml +popd + +# Copy the combined coverage file to the root of the repository: +cp $TEST_DIR/coverage.xml $BUILD_REPOSITORY_LOCALPATH diff --git a/build_tools/azure/debian_32bit_lock.txt b/build_tools/azure/debian_32bit_lock.txt new file mode 100644 index 0000000000000..c7b8cbceccacb --- /dev/null +++ b/build_tools/azure/debian_32bit_lock.txt @@ -0,0 +1,37 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --output-file=build_tools/azure/debian_32bit_lock.txt build_tools/azure/debian_32bit_requirements.txt +# +coverage[toml]==7.8.2 + # via pytest-cov +cython==3.1.1 + # via -r build_tools/azure/debian_32bit_requirements.txt +iniconfig==2.1.0 + # via pytest +joblib==1.5.1 + # via -r build_tools/azure/debian_32bit_requirements.txt +meson==1.8.1 + # via meson-python +meson-python==0.18.0 + # via -r build_tools/azure/debian_32bit_requirements.txt +ninja==1.11.1.4 + # via -r build_tools/azure/debian_32bit_requirements.txt +packaging==25.0 + # via + # meson-python + # pyproject-metadata + # pytest +pluggy==1.6.0 + # via pytest +pyproject-metadata==0.9.1 + # via meson-python +pytest==8.3.5 + # via + # -r build_tools/azure/debian_32bit_requirements.txt + # pytest-cov +pytest-cov==6.1.1 + # via -r build_tools/azure/debian_32bit_requirements.txt +threadpoolctl==3.6.0 + # via -r build_tools/azure/debian_32bit_requirements.txt diff --git a/build_tools/azure/debian_32bit_requirements.txt b/build_tools/azure/debian_32bit_requirements.txt new file mode 100644 index 0000000000000..6dcf67d11c58d --- /dev/null +++ b/build_tools/azure/debian_32bit_requirements.txt @@ -0,0 +1,10 @@ +# DO NOT EDIT: this file is generated from the specification found in the +# following script to centralize the configuration for CI builds: +# build_tools/update_environments_and_lock_files.py +cython +joblib +threadpoolctl +pytest +pytest-cov +ninja +meson-python diff --git a/build_tools/azure/get_commit_message.py b/build_tools/azure/get_commit_message.py new file mode 100644 index 0000000000000..0b1246b8d2724 --- /dev/null +++ b/build_tools/azure/get_commit_message.py @@ -0,0 +1,65 @@ +import argparse +import os +import subprocess + + +def get_commit_message(): + """Retrieve the commit message.""" + build_source_version_message = os.environ["BUILD_SOURCEVERSIONMESSAGE"] + + if os.environ["BUILD_REASON"] == "PullRequest": + # By default pull requests use refs/pull/PULL_ID/merge as the source branch + # which has a "Merge ID into ID" as a commit message. The latest commit + # message is the second to last commit + commit_id = build_source_version_message.split()[1] + git_cmd = ["git", "log", commit_id, "-1", "--pretty=%B"] + commit_message = subprocess.run( + git_cmd, capture_output=True, text=True + ).stdout.strip() + else: + commit_message = build_source_version_message + + # Sanitize the commit message to avoid introducing a vulnerability: a PR + # submitter could include the "##vso" special marker in their commit + # message to attempt to obfuscate the injection of arbitrary commands in + # the Azure pipeline. 
+ # + # This can be a problem if the PR reviewers do not pay close enough + # attention to the full commit message prior to clicking the merge button + # and as a result make the inject code run in a protected branch with + # elevated access to CI secrets. On a protected branch, Azure + # already sanitizes `BUILD_SOURCEVERSIONMESSAGE`, but the message + # will still be sanitized here out of precaution. + commit_message = commit_message.replace("##vso", "..vso") + + return commit_message + + +def parsed_args(): + parser = argparse.ArgumentParser( + description=( + "Show commit message that triggered the build in Azure DevOps pipeline" + ) + ) + parser.add_argument( + "--only-show-message", + action="store_true", + default=False, + help=( + "Only print commit message. Useful for direct use in scripts rather than" + " setting output variable of the Azure job" + ), + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parsed_args() + commit_message = get_commit_message() + + if args.only_show_message: + print(commit_message) + else: + # set the environment variable to be propagated to other steps + print(f"##vso[task.setvariable variable=message;isOutput=true]{commit_message}") + print(f"commit message: {commit_message}") # helps debugging diff --git a/build_tools/azure/get_selected_tests.py b/build_tools/azure/get_selected_tests.py new file mode 100644 index 0000000000000..f453748f843c4 --- /dev/null +++ b/build_tools/azure/get_selected_tests.py @@ -0,0 +1,34 @@ +from get_commit_message import get_commit_message + + +def get_selected_tests(): + """Parse the commit message to check if pytest should run only specific tests. + + If so, selected tests will be run with SKLEARN_TESTS_GLOBAL_RANDOM_SEED="all". + + The commit message must take the form: + [all random seeds] + <test_name_1> + <test_name_2> + ... + """ + commit_message = get_commit_message() + + if "[all random seeds]" in commit_message: + selected_tests = commit_message.split("[all random seeds]")[1].strip() + selected_tests = selected_tests.replace("\n", " or ") + else: + selected_tests = "" + + return selected_tests + + +if __name__ == "__main__": + # set the environment variable to be propagated to other steps + selected_tests = get_selected_tests() + + if selected_tests: + print(f"##vso[task.setvariable variable=SELECTED_TESTS]'{selected_tests}'") + print(f"selected tests: {selected_tests}") # helps debugging + else: + print("no selected tests") diff --git a/build_tools/azure/install.cmd b/build_tools/azure/install.cmd deleted file mode 100644 index caf28261dcc9f..0000000000000 --- a/build_tools/azure/install.cmd +++ /dev/null @@ -1,45 +0,0 @@ -@rem https://github.com/numba/numba/blob/master/buildscripts/incremental/setup_conda_environment.cmd -@rem The cmd /C hack circumvents a regression where conda installs a conda.bat -@rem script in non-root environments. 
-set CONDA_INSTALL=cmd /C conda install -q -y -set PIP_INSTALL=pip install -q - -@echo on - -IF "%PYTHON_ARCH%"=="64" ( - @rem Deactivate any environment - call deactivate - @rem Clean up any left-over from a previous build - conda remove --all -q -y -n %VIRTUALENV% - conda create -n %VIRTUALENV% -q -y python=%PYTHON_VERSION% numpy scipy cython matplotlib wheel pillow joblib - - call activate %VIRTUALENV% - - pip install threadpoolctl - - IF "%PYTEST_VERSION%"=="*" ( - pip install pytest - ) else ( - pip install pytest==%PYTEST_VERSION% - ) -) else ( - pip install numpy scipy cython pytest wheel pillow joblib threadpoolctl -) - -IF "%PYTEST_XDIST%" == "true" ( - pip install pytest-xdist -) - -if "%COVERAGE%" == "true" ( - pip install coverage codecov pytest-cov -) -python --version -pip --version - -@rem Install the build and runtime dependencies of the project. -python setup.py bdist_wheel bdist_wininst -b doc\logos\scikit-learn-logo.bmp - -@rem Install the generated wheel package to test it -pip install --pre --no-index --find-links dist\ scikit-learn - -if %errorlevel% neq 0 exit /b %errorlevel% diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index fbe0c90a473ab..9ae67f8db5e29 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -3,167 +3,136 @@ set -e set -x +# defines the get_dep and show_installed_libraries functions +source build_tools/shared.sh + UNAMESTR=`uname` +CCACHE_LINKS_DIR="/tmp/ccache" -make_conda() { - TO_INSTALL="$@" - conda create -n $VIRTUALENV --yes $TO_INSTALL - source activate $VIRTUALENV +setup_ccache() { + CCACHE_BIN=`which ccache || echo ""` + if [[ "${CCACHE_BIN}" == "" ]]; then + echo "ccache not found, skipping..." + elif [[ -d "${CCACHE_LINKS_DIR}" ]]; then + echo "ccache already configured, skipping..." 
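+        # (The else branch below makes ccache "masquerade" as the compilers:
+        # symlinks named gcc, g++, clang, etc. all point at the ccache binary
+        # and their directory is prepended to PATH, so every compiler call is
+        # transparently routed through the cache.)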
+ else + echo "Setting up ccache with CCACHE_DIR=${CCACHE_DIR}" + mkdir ${CCACHE_LINKS_DIR} + which ccache + for name in gcc g++ cc c++ clang clang++ i686-linux-gnu-gcc i686-linux-gnu-c++ x86_64-linux-gnu-gcc x86_64-linux-gnu-c++ x86_64-apple-darwin13.4.0-clang x86_64-apple-darwin13.4.0-clang++; do + ln -s ${CCACHE_BIN} "${CCACHE_LINKS_DIR}/${name}" + done + export PATH="${CCACHE_LINKS_DIR}:${PATH}" + ccache -M 256M + + # Zeroing statistics so that ccache statistics are shown only for this build + ccache -z + fi } -setup_ccache() { - echo "Setting up ccache" - mkdir /tmp/ccache/ - which ccache - for name in gcc g++ cc c++ x86_64-linux-gnu-gcc x86_64-linux-gnu-c++; do - ln -s $(which ccache) "/tmp/ccache/${name}" +pre_python_environment_install() { + if [[ "$DISTRIB" == "ubuntu" ]]; then + sudo apt-get update + sudo apt-get install python3-scipy python3-matplotlib \ + libatlas3-base libatlas-base-dev python3-virtualenv ccache + + elif [[ "$DISTRIB" == "debian-32" ]]; then + apt-get update + apt-get install -y python3-dev python3-numpy python3-scipy \ + python3-matplotlib libopenblas-dev \ + python3-virtualenv python3-pandas ccache git + fi +} + +check_packages_dev_version() { + for package in $@; do + package_version=$(python -c "import $package; print($package.__version__)") + if [[ $package_version =~ "^[.0-9]+$" ]]; then + echo "$package is not a development version: $package_version" + exit 1 + fi done - export PATH="/tmp/ccache/:${PATH}" - ccache -M 256M } -# imports get_dep -source build_tools/shared.sh +python_environment_install_and_activate() { + if [[ "$DISTRIB" == "conda"* ]]; then + create_conda_environment_from_lock_file $VIRTUALENV $LOCK_FILE + activate_environment -if [[ "$DISTRIB" == "conda" ]]; then + elif [[ "$DISTRIB" == "ubuntu" || "$DISTRIB" == "debian-32" ]]; then + python3 -m virtualenv --system-site-packages --python=python3 $VIRTUALENV + activate_environment + pip install -r "${LOCK_FILE}" - if [[ "$CONDA_CHANNEL" != "" ]]; then - TO_INSTALL="-c $CONDA_CHANNEL" - else - TO_INSTALL="" fi - TO_INSTALL="$TO_INSTALL python=$PYTHON_VERSION ccache pip blas[build=$BLAS]" - - TO_INSTALL="$TO_INSTALL $(get_dep numpy $NUMPY_VERSION)" - TO_INSTALL="$TO_INSTALL $(get_dep scipy $SCIPY_VERSION)" - TO_INSTALL="$TO_INSTALL $(get_dep cython $CYTHON_VERSION)" - TO_INSTALL="$TO_INSTALL $(get_dep joblib $JOBLIB_VERSION)" - TO_INSTALL="$TO_INSTALL $(get_dep pandas $PANDAS_VERSION)" - TO_INSTALL="$TO_INSTALL $(get_dep pyamg $PYAMG_VERSION)" - TO_INSTALL="$TO_INSTALL $(get_dep Pillow $PILLOW_VERSION)" - TO_INSTALL="$TO_INSTALL $(get_dep matplotlib $MATPLOTLIB_VERSION)" - - if [[ "$UNAMESTR" == "Darwin" ]]; then - if [[ "$SKLEARN_TEST_NO_OPENMP" != "true" ]]; then - # on macOS, install an OpenMP-enabled clang/llvm from conda-forge. 
- # TODO: Remove !=1.1.0 when the following is fixed: - # sklearn/svm/_libsvm.cpython-38-darwin.so, - # 2): Symbol not found: _svm_check_parameter error - TO_INSTALL="$TO_INSTALL compilers>=1.0.4,!=1.1.0 llvm-openmp" - fi - fi - make_conda $TO_INSTALL - setup_ccache + # Install additional packages on top of the lock-file in specific cases + if [[ "$DISTRIB" == "conda-pip-scipy-dev" ]]; then + echo "Installing development dependency wheels" + dev_anaconda_url=https://pypi.anaconda.org/scientific-python-nightly-wheels/simple + dev_packages="numpy scipy pandas Cython" + pip install --pre --upgrade --timeout=60 --extra-index $dev_anaconda_url $dev_packages --only-binary :all: -elif [[ "$DISTRIB" == "ubuntu" ]]; then - sudo add-apt-repository --remove ppa:ubuntu-toolchain-r/test - sudo apt-get update - sudo apt-get install python3-scipy python3-matplotlib libatlas3-base libatlas-base-dev python3-virtualenv ccache - python3 -m virtualenv --system-site-packages --python=python3 $VIRTUALENV - source $VIRTUALENV/bin/activate - setup_ccache - python -m pip install $(get_dep cython $CYTHON_VERSION) \ - $(get_dep joblib $JOBLIB_VERSION) + check_packages_dev_version $dev_packages -elif [[ "$DISTRIB" == "ubuntu-32" ]]; then - apt-get update - apt-get install -y python3-dev python3-scipy python3-matplotlib libatlas3-base libatlas-base-dev python3-virtualenv python3-pandas ccache + echo "Installing joblib from latest sources" + pip install https://github.com/joblib/joblib/archive/master.zip + echo "Installing pillow from latest sources" + pip install https://github.com/python-pillow/Pillow/archive/main.zip + fi +} - python3 -m virtualenv --system-site-packages --python=python3 $VIRTUALENV - source $VIRTUALENV/bin/activate - setup_ccache - python -m pip install $(get_dep cython $CYTHON_VERSION) \ - $(get_dep joblib $JOBLIB_VERSION) - -elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then - # Since conda main channel usually lacks behind on the latest releases, - # we use pypi to test against the latest releases of the dependencies. - # conda is still used as a convenient way to install Python and pip. - make_conda "ccache python=$PYTHON_VERSION" +scikit_learn_install() { setup_ccache - python -m pip install -U pip - - # Do not build scikit-image from source because it is an optional dependency - python -m pip install --only-binary :all: scikit-image || true - - python -m pip install pandas matplotlib pyamg - # do not install dependencies for lightgbm since it requires scikit-learn - # and install a version less than 3.0.0 until the issue #18316 is solved. 
- python -m pip install "lightgbm<3.0.0" --no-deps -elif [[ "$DISTRIB" == "conda-pip-scipy-dev" ]]; then - make_conda "ccache python=$PYTHON_VERSION" - python -m pip install -U pip - echo "Installing numpy and scipy master wheels" - dev_anaconda_url=https://pypi.anaconda.org/scipy-wheels-nightly/simple - pip install --pre --upgrade --timeout=60 --extra-index $dev_anaconda_url numpy pandas - - # issue with metadata in scipy dev builds https://github.com/scipy/scipy/issues/13196 - # --use-deprecated=legacy-resolver needs to be included - pip install --pre --upgrade --timeout=60 --extra-index $dev_anaconda_url scipy --use-deprecated=legacy-resolver - pip install --pre cython - setup_ccache - echo "Installing joblib master" - pip install https://github.com/joblib/joblib/archive/master.zip - echo "Installing pillow master" - pip install https://github.com/python-pillow/Pillow/archive/master.zip -fi - -python -m pip install $(get_dep threadpoolctl $THREADPOOLCTL_VERSION) \ - $(get_dep pytest $PYTEST_VERSION) \ - $(get_dep pytest-xdist $PYTEST_XDIST_VERSION) - -if [[ "$COVERAGE" == "true" ]]; then - python -m pip install codecov pytest-cov -fi - -if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then - python -m pip install pytest-xdist -fi - -if [[ "$TEST_DOCSTRINGS" == "true" ]]; then - # numpydoc requires sphinx - python -m pip install sphinx - python -m pip install numpydoc -fi - -python --version -python -c "import numpy; print('numpy %s' % numpy.__version__)" -python -c "import scipy; print('scipy %s' % scipy.__version__)" -python -c "\ -try: - import pandas - print('pandas %s' % pandas.__version__) -except ImportError: - print('pandas not installed') -" -# Set parallelism to 3 to overlap IO bound tasks with CPU bound tasks on CI -# workers with 2 cores when building the compiled extensions of scikit-learn. -export SKLEARN_BUILD_PARALLEL=3 - -python -m pip list -if [[ "$DISTRIB" == "conda-pip-latest" ]]; then - # Check that pip can automatically build scikit-learn with the build - # dependencies specified in pyproject.toml using an isolated build - # environment: - pip install --verbose --editable . -else - if [[ "$BUILD_WITH_ICC" == "true" ]]; then - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" - sudo apt-get update - sudo apt-get install intel-oneapi-compiler-dpcpp-cpp-and-cpp-classic - source /opt/intel/oneapi/setvars.sh + show_installed_libraries + + if [[ "$UNAMESTR" == "Darwin" && "$SKLEARN_TEST_NO_OPENMP" == "true" ]]; then + # Without openmp, we use the system clang. Here we use /usr/bin/ar + # instead because llvm-ar errors + export AR=/usr/bin/ar + # Make sure omp.h is not present in the conda environment, so that + # using an unprotected "cimport openmp" will make this build fail. At + # the time of writing (2023-01-13), on OSX, blas (mkl or openblas) + # brings in openmp so that you end up having the omp.h include inside + # the conda environment. + find $CONDA_PREFIX -name omp.h -delete -print + # meson >= 1.5 detects OpenMP installed with brew and OpenMP may be installed + # with brew in CI runner. OpenMP was installed with brew in macOS-12 CI + # runners which doesn't seem to be the case in macOS-13 runners anymore, + # but we keep the next line just to be safe ... 
+ brew uninstall --ignore-dependencies --force libomp + fi + + if [[ "$UNAMESTR" == "Linux" ]]; then + # FIXME: temporary fix to link against system libraries on linux + # https://github.com/scikit-learn/scikit-learn/issues/20640 + export LDFLAGS="$LDFLAGS -Wl,--sysroot=/" + fi - # The "build_clib" command is implicitly used to build "libsvm-skl". - # To compile with a different compiler, we also need to specify the - # compiler for this command - python setup.py build_ext --compiler=intelem -i build_clib --compiler=intelem + if [[ "$PIP_BUILD_ISOLATION" == "true" ]]; then + # Check that pip can automatically build scikit-learn with the build + # dependencies specified in pyproject.toml using an isolated build + # environment: + pip install --verbose . + else + if [[ "$UNAMESTR" == "MINGW64"* ]]; then + # Needed on Windows CI to compile with Visual Studio compiler + # otherwise Meson detects a MINGW64 platform and use MINGW64 + # toolchain + ADDITIONAL_PIP_OPTIONS='-Csetup-args=--vsenv' + fi + # Use the pre-installed build dependencies and build directly in the + # current environment. + pip install --verbose --no-build-isolation --editable . $ADDITIONAL_PIP_OPTIONS fi - # Use the pre-installed build dependencies and build directly in the - # current environment. - python setup.py develop -fi -ccache -s + + ccache -s || echo "ccache not installed, skipping ccache statistics" +} + +main() { + pre_python_environment_install + python_environment_install_and_activate + scikit_learn_install +} + +main diff --git a/build_tools/azure/install_setup_conda.sh b/build_tools/azure/install_setup_conda.sh new file mode 100755 index 0000000000000..d09a02cda5a9f --- /dev/null +++ b/build_tools/azure/install_setup_conda.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +set -e +set -x + +if [[ -z "${CONDA}" ]]; then + # In some runners (macOS-13 and macOS-14 in October 2024) conda is not + # installed so we install it ourselves + MINIFORGE_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" + wget ${MINIFORGE_URL} -O miniforge.sh + bash miniforge.sh -b -u -p $HOME/miniforge3 + CONDA="$HOME/miniforge3" +else + # In most runners (in October 2024) conda is installed, + # but in a system folder and we want it user writable + sudo chown -R $USER $CONDA +fi + +# Add conda to the PATH so that it can be used in further Azure CI steps. +# Need set +x for ##vso Azure magic otherwise it may add a quote in the PATH. 
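+# (With xtrace enabled, bash also prints the command being run, quoting
+# included, and the agent may pick up that traced copy of the "##vso" line as
+# well, which is how a stray quote can end up in the PATH entry.)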
+# For more details, see https://github.com/microsoft/azure-pipelines-tasks/issues/10331 +set +x +echo "##vso[task.prependpath]$CONDA/bin" +set -x diff --git a/build_tools/azure/posix-32.yml b/build_tools/azure/posix-32.yml deleted file mode 100644 index 5e4689a2505e5..0000000000000 --- a/build_tools/azure/posix-32.yml +++ /dev/null @@ -1,84 +0,0 @@ -parameters: - name: '' - vmImage: '' - matrix: [] - dependsOn: [] - condition: ne(variables['Build.Reason'], 'Schedule') - -jobs: -- job: ${{ parameters.name }} - dependsOn: ${{ parameters.dependsOn }} - condition: ${{ parameters.condition }} - pool: - vmImage: ${{ parameters.vmImage }} - variables: - TEST_DIR: '$(Agent.WorkFolder)/tmp_folder' - JUNITXML: 'test-data.xml' - OMP_NUM_THREADS: '2' - OPENBLAS_NUM_THREADS: '2' - SKLEARN_SKIP_NETWORK_TESTS: '1' - NUMPY_VERSION: 'latest' - SCIPY_VERSION: 'latest' - CYTHON_VERSION: 'latest' - JOBLIB_VERSION: 'latest' - PANDAS_VERSION: 'latest' - PYAMG_VERSION: 'latest' - PILLOW_VERSION: 'latest' - MATPLOTLIB_VERSION: 'latest' - PYTEST_VERSION: 'latest' - PYTEST_XDIST_VERSION: 'latest' - THREADPOOLCTL_VERSION: 'latest' - COVERAGE: 'false' - TEST_DOCSTRINGS: 'false' - strategy: - matrix: - ${{ insert }}: ${{ parameters.matrix }} - - steps: - # Container is detached and sleeping, allowing steps to run commmands - # in the container. The TEST_DIR is mapped allowing the host to access - # the JUNITXML file - - script: > - docker container run --rm - --volume $TEST_DIR:/temp_dir - --volume $PWD:/io - -w /io - --detach - --name skcontainer - -e DISTRIB=ubuntu-32 - -e TEST_DIR=/temp_dir - -e JUNITXML=$JUNITXML - -e VIRTUALENV=testvenv - -e NUMPY_VERSION=$NUMPY_VERSION - -e SCIPY_VERSION=$SCIPY_VERSION - -e CYTHON_VERSION=$CYTHON_VERSION - -e JOBLIB_VERSION=$JOBLIB_VERSION - -e PANDAS_VERSION=$PANDAS_VERSION - -e PYAMG_VERSION=$PYAMG_VERSION - -e PILLOW_VERSION=$PILLOW_VERSION - -e MATPLOTLIB_VERSION=$MATPLOTLIB_VERSION - -e PYTEST_VERSION=$PYTEST_VERSION - -e PYTEST_XDIST_VERSION=$PYTEST_XDIST_VERSION - -e THREADPOOLCTL_VERSION=$THREADPOOLCTL_VERSION - -e OMP_NUM_THREADS=$OMP_NUM_THREADS - -e OPENBLAS_NUM_THREADS=$OPENBLAS_NUM_THREADS - -e SKLEARN_SKIP_NETWORK_TESTS=$SKLEARN_SKIP_NETWORK_TESTS - i386/ubuntu:18.04 - sleep 1000000 - displayName: 'Start container' - - script: > - docker exec skcontainer ./build_tools/azure/install.sh - displayName: 'Install' - - script: > - docker exec skcontainer ./build_tools/azure/test_script.sh - displayName: 'Test Library' - - task: PublishTestResults@2 - inputs: - testResultsFiles: '$(TEST_DIR)/$(JUNITXML)' - testRunTitle: ${{ format('{0}-$(Agent.JobName)', parameters.name) }} - displayName: 'Publish Test Results' - condition: succeededOrFailed() - - script: > - docker container stop skcontainer - displayName: 'Stop container' - condition: always() diff --git a/build_tools/azure/posix-all-parallel.yml b/build_tools/azure/posix-all-parallel.yml new file mode 100644 index 0000000000000..45d2b4569110f --- /dev/null +++ b/build_tools/azure/posix-all-parallel.yml @@ -0,0 +1,50 @@ +# This configuration allows enables a job based on `posix.yml` to have two modes: +# +# 1. When `[azure parallel]` *is not* in the commit message, then this job will +# run first. If this job succeeds, then all dependent jobs can run. +# 2. When `[azure parallel]` *is* in the commit message, then this job will +# run with name `{{ parameters.name }}_Parallel` along with all other jobs. 
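+#
+# A hypothetical call site in azure-pipelines.yml could look roughly like the
+# following (job, image and matrix names are illustrative only, not the actual
+# pipeline configuration):
+#
+#   - template: build_tools/azure/posix-all-parallel.yml
+#     parameters:
+#       name: Linux
+#       vmImage: ubuntu-22.04
+#       commitMessage: dependencies.git_commit.outputs['commit.message']
+#       matrix:
+#         some_conda_build:
+#           DISTRIB: conda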
+# +# To enable this template, all dependent jobs should check if this job succeeded +# or skipped by using: +# dependsOn: in(dependencies[{{ parameters.name }}]['result'], 'Succeeded', 'Skipped') + +parameters: + name: '' + vmImage: '' + matrix: [] + dependsOn: [] + condition: '' + commitMessage: '' + +jobs: + +# When [azure parallel] *is not* in the commit message, this job will run +# first. +- template: posix.yml + parameters: + name: ${{ parameters.name }} + vmImage: ${{ parameters.vmImage }} + matrix: ${{ parameters.matrix }} + dependsOn: ${{ parameters.dependsOn }} + condition: | + and( + ${{ parameters.condition }}, + not(contains(${{ parameters.commitMessage }}, '[azure parallel]')) + ) + +# When [azure parallel] *is* in the commit message, this job and dependent +# jobs will run in parallel. Implementation-wise, the job above is skipped and +# this job, named ${{ parameters.name }}_Parallel, will run in parallel with +# the other jobs. +- template: posix.yml + parameters: + name: ${{ parameters.name }}_Parallel + vmImage: ${{ parameters.vmImage }} + matrix: ${{ parameters.matrix }} + dependsOn: ${{ parameters.dependsOn }} + condition: | + and( + ${{ parameters.condition }}, + contains(${{ parameters.commitMessage }}, '[azure parallel]') + ) diff --git a/build_tools/azure/posix-docker.yml b/build_tools/azure/posix-docker.yml new file mode 100644 index 0000000000000..49b0eb5f0f356 --- /dev/null +++ b/build_tools/azure/posix-docker.yml @@ -0,0 +1,134 @@ +parameters: + name: '' + vmImage: '' + matrix: [] + dependsOn: [] + condition: ne(variables['Build.Reason'], 'Schedule') + +jobs: +- job: ${{ parameters.name }} + dependsOn: ${{ parameters.dependsOn }} + condition: ${{ parameters.condition }} + timeoutInMinutes: 120 + pool: + vmImage: ${{ parameters.vmImage }} + variables: + VIRTUALENV: 'testvenv' + TEST_DIR: '$(Agent.WorkFolder)/tmp_folder' + JUNITXML: 'test-data.xml' + SKLEARN_SKIP_NETWORK_TESTS: '1' + PYTEST_XDIST_VERSION: 'latest' + COVERAGE: 'false' + # Set in azure-pipelines.yml + DISTRIB: '' + DOCKER_CONTAINER: '' + CREATE_ISSUE_ON_TRACKER: 'true' + CCACHE_DIR: $(Pipeline.Workspace)/ccache + CCACHE_COMPRESS: '1' + strategy: + matrix: + ${{ insert }}: ${{ parameters.matrix }} + + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.9' + addToPath: false + name: pyTools + displayName: Select python version to run CI python scripts + - bash: $(pyTools.pythonLocation)/bin/python build_tools/azure/get_selected_tests.py + displayName: Check selected tests for all random seeds + condition: eq(variables['Build.Reason'], 'PullRequest') + - task: Cache@2 + inputs: + key: '"ccache-v1" | "$(Agent.JobName)" | "$(Build.BuildNumber)"' + restoreKeys: | + "ccache-v1" | "$(Agent.JobName)" + path: $(CCACHE_DIR) + displayName: ccache + continueOnError: true + - script: > + mkdir -p $CCACHE_DIR + # Container is detached and sleeping, allowing steps to run commands + # in the container. 
The TEST_DIR is mapped allowing the host to access + # the JUNITXML file + - script: > + docker container run --rm + --volume $TEST_DIR:/temp_dir + --volume $BUILD_REPOSITORY_LOCALPATH:/repo_localpath + --volume $PWD:/io + --volume $CCACHE_DIR:/ccache + -w /io + --detach + --name skcontainer + -e BUILD_SOURCESDIRECTORY=/io + -e TEST_DIR=/temp_dir + -e CCACHE_DIR=/ccache + -e BUILD_REPOSITORY_LOCALPATH=/repo_localpath + -e COVERAGE + -e DISTRIB + -e LOCK_FILE + -e JUNITXML + -e VIRTUALENV + -e PYTEST_XDIST_VERSION + -e SKLEARN_SKIP_NETWORK_TESTS + -e SELECTED_TESTS + -e CCACHE_COMPRESS + -e BUILD_SOURCEVERSIONMESSAGE + -e BUILD_REASON + $DOCKER_CONTAINER + sleep 1000000 + displayName: 'Start container' + - script: > + docker exec skcontainer ./build_tools/azure/install.sh + displayName: 'Install' + - script: > + docker exec skcontainer ./build_tools/azure/test_script.sh + displayName: 'Test Library' + - script: > + docker exec skcontainer ./build_tools/azure/combine_coverage_reports.sh + condition: and(succeeded(), eq(variables['COVERAGE'], 'true'), + eq(variables['SELECTED_TESTS'], '')) + displayName: 'Combine coverage' + - task: PublishTestResults@2 + inputs: + testResultsFiles: '$(TEST_DIR)/$(JUNITXML)' + testRunTitle: ${{ format('{0}-$(Agent.JobName)', parameters.name) }} + displayName: 'Publish Test Results' + condition: succeededOrFailed() + - script: > + docker container stop skcontainer + displayName: 'Stop container' + condition: always() + - bash: | + set -ex + if [[ $(BOT_GITHUB_TOKEN) == "" ]]; then + echo "GitHub Token is not set. Issue tracker will not be updated." + exit + fi + + LINK_TO_RUN="https://dev.azure.com/$BUILD_REPOSITORY_NAME/_build/results?buildId=$BUILD_BUILDID&view=logs&j=$SYSTEM_JOBID" + CI_NAME="$SYSTEM_JOBIDENTIFIER" + ISSUE_REPO="$BUILD_REPOSITORY_NAME" + + $(pyTools.pythonLocation)/bin/pip install defusedxml PyGithub + $(pyTools.pythonLocation)/bin/python maint_tools/update_tracking_issue.py \ + $(BOT_GITHUB_TOKEN) \ + $CI_NAME \ + $ISSUE_REPO \ + $LINK_TO_RUN \ + --junit-file $JUNIT_FILE \ + --auto-close false + displayName: 'Update issue tracker' + env: + JUNIT_FILE: $(TEST_DIR)/$(JUNITXML) + condition: and(succeededOrFailed(), eq(variables['CREATE_ISSUE_ON_TRACKER'], 'true'), + eq(variables['Build.Reason'], 'Schedule')) + - bash: bash build_tools/azure/upload_codecov.sh + condition: and(succeeded(), eq(variables['COVERAGE'], 'true'), + eq(variables['SELECTED_TESTS'], '')) + displayName: 'Upload To Codecov' + retryCountOnTaskFailure: 5 + env: + CODECOV_TOKEN: $(CODECOV_TOKEN) + JUNIT_FILE: $(TEST_DIR)/$(JUNITXML) diff --git a/build_tools/azure/posix.yml b/build_tools/azure/posix.yml index ae5726aab0b65..e0f504ba540db 100644 --- a/build_tools/azure/posix.yml +++ b/build_tools/azure/posix.yml @@ -9,46 +9,44 @@ jobs: - job: ${{ parameters.name }} dependsOn: ${{ parameters.dependsOn }} condition: ${{ parameters.condition }} + timeoutInMinutes: 120 pool: vmImage: ${{ parameters.vmImage }} variables: TEST_DIR: '$(Agent.WorkFolder)/tmp_folder' VIRTUALENV: 'testvenv' JUNITXML: 'test-data.xml' - OMP_NUM_THREADS: '2' - OPENBLAS_NUM_THREADS: '2' SKLEARN_SKIP_NETWORK_TESTS: '1' CCACHE_DIR: $(Pipeline.Workspace)/ccache CCACHE_COMPRESS: '1' - NUMPY_VERSION: 'latest' - SCIPY_VERSION: 'latest' - CYTHON_VERSION: 'latest' - JOBLIB_VERSION: 'latest' - PANDAS_VERSION: 'latest' - PYAMG_VERSION: 'latest' - PILLOW_VERSION: 'latest' - MATPLOTLIB_VERSION: 'latest' - PYTEST_VERSION: 'latest' PYTEST_XDIST_VERSION: 'latest' - THREADPOOLCTL_VERSION: 'latest' COVERAGE: 'true' - 
TEST_DOCSTRINGS: 'false' + CREATE_ISSUE_ON_TRACKER: 'true' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} steps: - - bash: echo "##vso[task.prependpath]$CONDA/bin" - displayName: Add conda to PATH - condition: startsWith(variables['DISTRIB'], 'conda') - - bash: sudo chown -R $USER $CONDA - displayName: Take ownership of conda installation + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.9' + addToPath: false + name: pyTools + displayName: Select python version to run CI python scripts + - bash: $(pyTools.pythonLocation)/bin/python build_tools/azure/get_selected_tests.py + displayName: Check selected tests for all random seeds + condition: eq(variables['Build.Reason'], 'PullRequest') + - bash: build_tools/azure/install_setup_conda.sh + displayName: Install conda if necessary and set it up condition: startsWith(variables['DISTRIB'], 'conda') - task: Cache@2 inputs: - key: '"$(Agent.JobName)"' + key: '"ccache-v1" | "$(Agent.JobName)" | "$(Build.BuildNumber)"' + restoreKeys: | + "ccache-v1" | "$(Agent.JobName)" path: $(CCACHE_DIR) displayName: ccache + continueOnError: true - script: | build_tools/azure/install.sh displayName: 'Install' @@ -58,19 +56,54 @@ jobs: - script: | build_tools/azure/test_docs.sh displayName: 'Test Docs' + condition: and(succeeded(), eq(variables['SELECTED_TESTS'], '')) - script: | build_tools/azure/test_pytest_soft_dependency.sh displayName: 'Test Soft Dependency' - condition: eq(variables['CHECK_PYTEST_SOFT_DEPENDENCY'], 'true') + condition: and(succeeded(), + eq(variables['CHECK_PYTEST_SOFT_DEPENDENCY'], 'true'), + eq(variables['SELECTED_TESTS'], '')) + - script: | + build_tools/azure/combine_coverage_reports.sh + condition: and(succeeded(), eq(variables['COVERAGE'], 'true'), + eq(variables['SELECTED_TESTS'], '')) + displayName: 'Combine coverage' - task: PublishTestResults@2 inputs: testResultsFiles: '$(TEST_DIR)/$(JUNITXML)' testRunTitle: ${{ format('{0}-$(Agent.JobName)', parameters.name) }} displayName: 'Publish Test Results' condition: succeededOrFailed() + - bash: | + set -ex + if [[ $(BOT_GITHUB_TOKEN) == "" ]]; then + echo "GitHub Token is not set. Issue tracker will not be updated." + exit + fi + + LINK_TO_RUN="https://dev.azure.com/$BUILD_REPOSITORY_NAME/_build/results?buildId=$BUILD_BUILDID&view=logs&j=$SYSTEM_JOBID" + CI_NAME="$SYSTEM_JOBIDENTIFIER" + ISSUE_REPO="$BUILD_REPOSITORY_NAME" + + $(pyTools.pythonLocation)/bin/pip install defusedxml PyGithub + $(pyTools.pythonLocation)/bin/python maint_tools/update_tracking_issue.py \ + $(BOT_GITHUB_TOKEN) \ + $CI_NAME \ + $ISSUE_REPO \ + $LINK_TO_RUN \ + --junit-file $JUNIT_FILE \ + --auto-close false + displayName: 'Update issue tracker' + env: + JUNIT_FILE: $(TEST_DIR)/$(JUNITXML) + condition: and(succeededOrFailed(), eq(variables['CREATE_ISSUE_ON_TRACKER'], 'true'), + eq(variables['Build.Reason'], 'Schedule')) - script: | build_tools/azure/upload_codecov.sh - condition: and(succeeded(), eq(variables['COVERAGE'], 'true')) + condition: and(succeeded(), eq(variables['COVERAGE'], 'true'), + eq(variables['SELECTED_TESTS'], '')) displayName: 'Upload To Codecov' + retryCountOnTaskFailure: 5 env: CODECOV_TOKEN: $(CODECOV_TOKEN) + JUNIT_FILE: $(TEST_DIR)/$(JUNITXML) diff --git a/build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock b/build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock new file mode 100644 index 0000000000000..e99219a40736d --- /dev/null +++ b/build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock @@ -0,0 +1,246 @@ +# Generated by conda-lock. 
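+# This lock file pins every conda package of the CI environment to an exact
+# build via its download URL. It is not meant to be edited by hand: it is
+# regenerated with conda-lock from the matching *_environment.yml (see
+# build_tools/update_environments_and_lock_files.py).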
+# platform: linux-64 +# input_hash: f524d159a11a0a80ead3448f16255169f24edde269f6b81e8e28453bc4f7fc53 +@EXPLICIT +https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2#0c96522c6bdaed4b1566d11387caaf45 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2#34893075a5c9e55cdafac56607368fc6 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2#4d59c254e01d9cde7957100457e2d5fb +https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda#49023d73832ef61042f6a237cb2687e7 +https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-headers-1.21.0-ha770c72_0.conda#11b1bed92c943d3b741e8a1e1a815ed1 +https://conda.anaconda.org/conda-forge/linux-64/mkl-include-2024.2.2-ha957f24_16.conda#42b0d14354b5910a9f41e29289914f6b +https://conda.anaconda.org/conda-forge/linux-64/nlohmann_json-3.12.0-h3f2d84a_0.conda#d76872d096d063e226482c99337209dc +https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-7_cp313.conda#e84b44e6300f1703cb25d29120c5b1d8 +https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda#4222072737ccff51314b5ece9c7d6f5a +https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.4.26-hbd8a1cb_0.conda#95db94f75ba080a22eb623590993167b +https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-0.tar.bz2#f766549260d6815b0c52253f1fb1bb29 +https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_4.conda#01f8d123c96816249efd255a31ad7712 +https://conda.anaconda.org/conda-forge/linux-64/libglvnd-1.7.0-ha4b6fd6_2.conda#434ca7e50e40f4918ab701e3facd59a0 +https://conda.anaconda.org/conda-forge/linux-64/llvm-openmp-20.1.6-h024ca30_0.conda#e4ece7ed81e43ae97a3b58ac4230c3c5 +https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-3_kmp_llvm.conda#ee5c2118262e30b972bc0b4db8ef0ba5 +https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2#fee5683a3f04bd15cbd8318b096a27ab +https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda#c151d5eb730e9b7480e6d48c0fc44048 +https://conda.anaconda.org/conda-forge/linux-64/libopengl-1.7.0-ha4b6fd6_2.conda#7df50d44d4a14d6c31a2c54f2cd92157 +https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_2.conda#ea8ac52380885ed41c1baa8f1d6d2b93 +https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.14-hb9d3cd8_0.conda#76df83c2a9035c54df5d04ff81bcc02d +https://conda.anaconda.org/conda-forge/linux-64/aws-c-common-0.12.3-hb9d3cd8_0.conda#8448031a22c697fac3ed98d69e8a9160 +https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.5-hb9d3cd8_0.conda#f7f0d6cc2dc986d42ac2689ec88192be +https://conda.anaconda.org/conda-forge/linux-64/libbrotlicommon-1.1.0-hb9d3cd8_2.conda#41b599ed2b02abcfdd84302bff174b23 +https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.24-h86f0d12_0.conda#64f0c503da58ec25ebd359e4d990afa8 +https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.0-h5888daf_0.conda#db0bfbe7dd197b68ad5f30333bae6ce0 +https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda#ede4673863426c0883c0063d853bbd85 +https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_2.conda#ddca86c7040dd0e73b2b69bd7833d225 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.1.0-hcea5267_2.conda#01de444988ed960031dbe84cf4f9b1fc +https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h4ce23a2_1.conda#e796ff8ddc598affdf7c173d6145f087 
+https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.0-hb9d3cd8_0.conda#9fa334557db9f63da6c9285fd2a48638 +https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_1.conda#a76fd702c93cd2dfd89eff30a5fd45a8 +https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda#c7e925f37e3b40d893459e625f6a53f1 +https://conda.anaconda.org/conda-forge/linux-64/libntlm-1.8-hb9d3cd8_0.conda#7c7927b404672409d9917d49bff5f2d6 +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.1.0-h8f9b012_2.conda#1cb1c67961f6dd257eae9e9691b341aa +https://conda.anaconda.org/conda-forge/linux-64/libutf8proc-2.10.0-h202a827_0.conda#0f98f3e95272d118f7931b6bef69bfe5 +https://conda.anaconda.org/conda-forge/linux-64/libuv-1.51.0-hb9d3cd8_0.conda#1349c022c92c5efd3fd705a79a5804d8 +https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.5.0-h851e524_0.conda#63f790534398730f59e1b899c3644d4a +https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda#edb0dca6bc32e4f4789199455a1dbeb8 +https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda#47e340acb35de30501a76c7c799c41d7 +https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.0-h7b32b05_1.conda#de356753cfdbffcde5bb1e86e3aa6cd0 +https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-hb9d3cd8_1002.conda#b3c17d95b5a10c6e64a21fa17573e70e +https://conda.anaconda.org/conda-forge/linux-64/xorg-libice-1.1.2-hb9d3cd8_0.conda#fb901ff28063514abb6046c9ec2c4a45 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb9d3cd8_0.conda#f6ebe2cb3f82ba6c057dde5d9debe4f7 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb9d3cd8_0.conda#8035c64cb77ed555e3f150b7b3972480 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-cal-0.9.1-h5e3027f_0.conda#da0b556585013ad26b3c052b61205f74 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-compression-0.3.1-hafb2847_5.conda#e96cc668c0f9478f5771b37d57f90386 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-sdkutils-0.2.4-hafb2847_0.conda#65853df44b7e4029d978c50be888ed89 +https://conda.anaconda.org/conda-forge/linux-64/aws-checksums-0.2.7-hafb2847_1.conda#6d28d50637fac4f081a0903b4b33d56d +https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda#62ee74e96c5ebb0af99386de58cf9553 +https://conda.anaconda.org/conda-forge/linux-64/double-conversion-3.3.1-h5888daf_0.conda#bfd56492d8346d669010eccafe0ba058 +https://conda.anaconda.org/conda-forge/linux-64/gflags-2.2.2-h5888daf_1005.conda#d411fc29e338efb48c5fd4576d71d881 +https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2#30186d27e2c9fa62b45fb1476b7200e3 +https://conda.anaconda.org/conda-forge/linux-64/lerc-4.0.0-h0aef613_1.conda#9344155d33912347b37f0ae6c410a835 +https://conda.anaconda.org/conda-forge/linux-64/libabseil-20250127.1-cxx17_hbbce691_0.conda#00290e549c5c8a32cc271020acc9ec6b +https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.1.0-hb9d3cd8_2.conda#9566f0bd264fbd463002e759b8a82401 +https://conda.anaconda.org/conda-forge/linux-64/libbrotlienc-1.1.0-hb9d3cd8_2.conda#06f70867945ea6a84d35836af780f1de +https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda#c277e0a4d549b03ac1e9d6cbbe3d017b +https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda#172bf1cd1ff8629f2b1179945ed45055 +https://conda.anaconda.org/conda-forge/linux-64/libevent-2.1.12-hf998b51_1.conda#a1cfcc585f0c42bf8d5546bb1dfb668d 
+https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.1.0-h69a702a_2.conda#f92e6e0a3c0c0c85561ef61aa59d555d +https://conda.anaconda.org/conda-forge/linux-64/libpciaccess-0.18-hd590300_0.conda#48f4330bfcd959c3cfb704d424903c82 +https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.47-h943b412_0.conda#55199e2ae2c3651f6f9b2a447b47bdc9 +https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.0-hee588c1_0.conda#71888e92098d0f8c41b09a671ad289bc +https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda#eecce068c7e4eddeb169591baac20ac4 +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.1.0-h4852527_2.conda#9d2072af184b5caa29492bf2344597bb +https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b +https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.17.0-h8a09558_0.conda#92ed62436b625154323d40d5f2f11dd7 +https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.10.0-h5888daf_1.conda#9de5350a85c4a20c685259b889aa6393 +https://conda.anaconda.org/conda-forge/linux-64/ninja-1.12.1-hff21bea_1.conda#2322531904f27501ee19847b87ba7c64 +https://conda.anaconda.org/conda-forge/linux-64/pixman-0.46.0-h29eaf8c_0.conda#d2f1c87d4416d1e7344cf92b1aaee1c4 +https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda#283b96675859b20a825f8fa30f311446 +https://conda.anaconda.org/conda-forge/linux-64/s2n-1.5.19-h763c568_0.conda#1e0cc52ba44d40408e5fb7b4dbdcde8f +https://conda.anaconda.org/conda-forge/linux-64/sleef-3.8-h1b44611_0.conda#aec4dba5d4c2924730088753f6fa164b +https://conda.anaconda.org/conda-forge/linux-64/snappy-1.2.1-h8bd8927_1.conda#3b3e64af585eadfb52bb90b553db5edf +https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda#a0116df4f4ed05c303811a837d5b39d8 +https://conda.anaconda.org/conda-forge/linux-64/wayland-1.23.1-h3e06ad9_1.conda#a37843723437ba75f42c9270ffe800b1 +https://conda.anaconda.org/conda-forge/linux-64/zlib-1.3.1-hb9d3cd8_2.conda#c9f075ab2f33b3bbee9e62d4ad0a6cd8 +https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb8e6e7a_2.conda#6432cb5d4ac0046c3ac0a8a0f95842f9 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-io-0.19.1-h93b6419_1.conda#86689a4e294ac0e78b046f5989157f7a +https://conda.anaconda.org/conda-forge/linux-64/brotli-bin-1.1.0-hb9d3cd8_2.conda#c63b5e52939e795ba8d26e35d767a843 +https://conda.anaconda.org/conda-forge/linux-64/glog-0.7.1-hbabe93e_0.conda#ff862eebdfeb2fd048ae9dc92510baca +https://conda.anaconda.org/conda-forge/linux-64/gmp-6.3.0-hac33072_2.conda#c94a5994ef49749880a8139cf9afcbe1 +https://conda.anaconda.org/conda-forge/linux-64/graphite2-1.3.13-h59595ed_1003.conda#f87c7b7c2cb45f323ffbce941c78ab7c +https://conda.anaconda.org/conda-forge/linux-64/icu-75.1-he02047a_0.conda#8b189310083baabfb622af68fd9d3ae3 +https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.3-h659f571_0.conda#3f43953b7d3fb3aaa1d0d0723d91e368 +https://conda.anaconda.org/conda-forge/linux-64/libcrc32c-1.1.2-h9c3ff4c_0.tar.bz2#c965a5aa0d5c1c37ffc62dff36e28400 +https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.124-hb9d3cd8_0.conda#8bc89311041d7fcb510238cf0848ccae +https://conda.anaconda.org/conda-forge/linux-64/libfreetype6-2.13.3-h48d6fc4_1.conda#3c255be50a506c50765a93a6644f32fe +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-15.1.0-h69a702a_2.conda#a483a87b71e974bb75d1b9413d4436dd +https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.64.0-h161d5f1_0.conda#19e57602824042dfd0446292ef90488b 
+https://conda.anaconda.org/conda-forge/linux-64/libprotobuf-5.29.3-h501fc15_1.conda#edb86556cf4a0c133e7932a1597ff236 +https://conda.anaconda.org/conda-forge/linux-64/libre2-11-2024.07.02-hba17884_3.conda#545e93a513c10603327c76c15485e946 +https://conda.anaconda.org/conda-forge/linux-64/libthrift-0.21.0-h0e7cc3e_0.conda#dcb95c0a98ba9ff737f7ae482aef7833 +https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.7.0-hf01ce69_5.conda#e79a094918988bb1807462cd42c83962 +https://conda.anaconda.org/conda-forge/linux-64/pcre2-10.45-hc749103_0.conda#b90bece58b4c2bf25969b70f3be42d25 +https://conda.anaconda.org/conda-forge/linux-64/python-3.13.3-hf636f53_101_cp313.conda#10622e12d649154af0bd76bcf33a7c5c +https://conda.anaconda.org/conda-forge/linux-64/qhull-2020.2-h434a139_5.conda#353823361b1d27eb3960efb076dfcaf6 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-0.4.1-hb711507_2.conda#8637c3e5821654d0edf97e2b0404b443 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-keysyms-0.4.1-hb711507_0.conda#ad748ccca349aec3e91743e08b5e2b50 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-renderutil-0.3.10-hb711507_0.conda#0e0cbe0564d03a99afd5fd7b362feecd +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-wm-0.4.2-hb711507_0.conda#608e0ef8256b81d04456e8d211eee3e8 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libsm-1.2.6-he73a12e_0.conda#1c74ff8c35dcadf952a16f752ca5aa49 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.12-h4f16b4b_0.conda#db038ce880f100acc74dba10302b5630 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-event-stream-0.5.4-haaa725d_10.conda#ed15f12bd23f3861d61e3d71c0e639ee +https://conda.anaconda.org/conda-forge/linux-64/aws-c-http-0.10.1-hd7992d4_3.conda#8ee52f649777534cd21fc6905b83316d +https://conda.anaconda.org/conda-forge/linux-64/brotli-1.1.0-hb9d3cd8_2.conda#98514fe74548d768907ce7a13f680e8f +https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda#962b9857ee8e7018c22f2776ffa0b2d7 +https://conda.anaconda.org/conda-forge/noarch/cpython-3.13.3-py313hd8ed1ab_101.conda#904a822cbd380adafb9070debf8579a8 +https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_1.conda#44600c4667a319d67dbe0681fc0bc833 +https://conda.anaconda.org/conda-forge/linux-64/cyrus-sasl-2.1.27-h54b06d7_7.conda#dce22f70b4e5a407ce88f2be046f4ceb +https://conda.anaconda.org/conda-forge/linux-64/cython-3.1.1-py313h5dec8f5_1.conda#f114755cdd37627732b1884b7b15d4b5 +https://conda.anaconda.org/conda-forge/noarch/execnet-2.1.1-pyhd8ed1ab_1.conda#a71efeae2c160f6789900ba2631a2c90 +https://conda.anaconda.org/conda-forge/noarch/filelock-3.18.0-pyhd8ed1ab_0.conda#4547b39256e296bb758166893e909a7c +https://conda.anaconda.org/conda-forge/noarch/fsspec-2025.5.1-pyhd8ed1ab_0.conda#2d2c9ef879a7e64e2dc657b09272c2b6 +https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_1.conda#6837f3eff7dcea42ecd714ce1ac2b108 +https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.4.7-py313h33d0bda_0.conda#9862d13a5e466273d5a4738cffcb8d6c +https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.17-h717163a_0.conda#000e85703f0fd9594c81710dd5066471 +https://conda.anaconda.org/conda-forge/linux-64/libcups-2.3.3-h4637d8d_4.conda#d4529f4dff3057982a7617c7ac58fde3 +https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.14.0-h332b0f4_0.conda#d1738cf06503218acee63669029fd8e8 +https://conda.anaconda.org/conda-forge/linux-64/libfreetype-2.13.3-ha770c72_1.conda#51f5be229d83ecd401fb369ab96ae669 
+https://conda.anaconda.org/conda-forge/linux-64/libglib-2.84.2-h3618099_0.conda#072ab14a02164b7c0c089055368ff776 +https://conda.anaconda.org/conda-forge/linux-64/libglx-1.7.0-ha4b6fd6_2.conda#c8013e438185f33b13814c5c488acd5c +https://conda.anaconda.org/conda-forge/linux-64/libhiredis-1.0.2-h2cc385e_0.tar.bz2#b34907d3a81a3cd8095ee83d174c074a +https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.13.8-h4bc477f_0.conda#14dbe05b929e329dbaa6f2d0aa19466d +https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py313h8060acc_1.conda#21b62c55924f01b6eef6827167b46acb +https://conda.anaconda.org/conda-forge/noarch/meson-1.8.1-pyhe01879c_0.conda#f3cccd9a6ce5331ae33f69ade5529162 +https://conda.anaconda.org/conda-forge/linux-64/mpfr-4.2.1-h90cbb55_3.conda#2eeb50cab6652538eee8fc0bc3340c81 +https://conda.anaconda.org/conda-forge/noarch/mpmath-1.3.0-pyhd8ed1ab_1.conda#3585aa87c43ab15b167b574cd73b057b +https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyh9f0ad1d_0.tar.bz2#2ba8498c1018c1e9c61eb99b973dfe19 +https://conda.anaconda.org/conda-forge/noarch/networkx-3.4.2-pyh267e887_2.conda#fd40bf7f7f4bc4b647dc8512053d9873 +https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.3-h5fbd93e_0.conda#9e5816bc95d285c115a3ebc2f8563564 +https://conda.anaconda.org/conda-forge/linux-64/orc-2.1.2-h17f744e_0.conda#ef7f9897a244b2023a066c22a1089ce4 +https://conda.anaconda.org/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda#58335b26c38bf4a20f399384c33cbcf9 +https://conda.anaconda.org/conda-forge/noarch/pip-25.1.1-pyh145f28c_0.conda#01384ff1639c6330a0924791413b8714 +https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhd8ed1ab_0.conda#7da7ccd349dbf6487a7778579d2bb971 +https://conda.anaconda.org/conda-forge/noarch/pybind11-global-2.13.6-pyh217bc35_3.conda#730a5284e26d6bdb73332dafb26aec82 +https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.2.3-pyhd8ed1ab_1.conda#513d3c262ee49b54a8fec85c5bc99764 +https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2025.2-pyhd8ed1ab_0.conda#88476ae6ebd24f39261e0854ac244f33 +https://conda.anaconda.org/conda-forge/noarch/pytz-2025.2-pyhd8ed1ab_0.conda#bc8e3267d44011051f2eb14d22fb0960 +https://conda.anaconda.org/conda-forge/linux-64/re2-2024.07.02-h9925aae_3.conda#6f445fb139c356f903746b2b91bbe786 +https://conda.anaconda.org/conda-forge/noarch/setuptools-75.8.2-pyhff2d567_0.conda#9bddfdbf4e061821a1a443f93223be61 +https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhd8ed1ab_0.conda#a451d576819089b0d672f18768be0f65 +https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-3.6.0-pyhecae5ae_0.conda#9d64911b31d57ca443e9f1e36b04385f +https://conda.anaconda.org/conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_1.conda#b0dd904de08b7db706167240bf37b164 +https://conda.anaconda.org/conda-forge/noarch/tomli-2.2.1-pyhd8ed1ab_1.conda#ac944244f1fed2eb49bae07193ae8215 +https://conda.anaconda.org/conda-forge/linux-64/tornado-6.5.1-py313h536fd9c_0.conda#e9434a5155db25c38ade26f71a2f5a48 +https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.13.2-pyh29332c3_0.conda#83fc6ae00127671e301c9f44254c31b8 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-image-0.4.0-hb711507_2.conda#a0901183f08b6c7107aab109733a3c91 +https://conda.anaconda.org/conda-forge/linux-64/xkeyboard-config-2.44-hb9d3cd8_0.conda#7c91bfc90672888259675ad2ad28af9c +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxext-1.3.6-hb9d3cd8_0.conda#febbab7d15033c913d53c7a2c102309d 
+https://conda.anaconda.org/conda-forge/linux-64/xorg-libxfixes-6.0.1-hb9d3cd8_0.conda#4bdb303603e9821baf5fe5fdff1dc8f8 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrender-0.9.12-hb9d3cd8_0.conda#96d57aba173e878a2089d5638016dc5e +https://conda.anaconda.org/conda-forge/linux-64/aws-c-auth-0.9.0-h3318fae_10.conda#0966b2b633190f1f24a92ddb25559ff8 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-mqtt-0.13.0-h7b3935a_4.conda#d60f0cefef7f0dd076d8335781a676fd +https://conda.anaconda.org/conda-forge/linux-64/azure-core-cpp-1.14.0-h5cfcd09_0.conda#0a8838771cc2e985cd295e01ae83baf1 +https://conda.anaconda.org/conda-forge/linux-64/ccache-4.11.3-h80c52d3_0.conda#eb517c6a2b960c3ccb6f1db1005f063a +https://conda.anaconda.org/conda-forge/linux-64/coverage-7.8.2-py313h8060acc_0.conda#b278629953bd3424060870fca744de4a +https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h3c4dab8_0.conda#679616eb5ad4e521c83da4650860aba7 +https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.0-pyhd8ed1ab_0.conda#72e42d28960d875c7654614f8b50939a +https://conda.anaconda.org/conda-forge/linux-64/fonttools-4.58.1-py313h8060acc_0.conda#f03a1dc39346922cb5cf2ee190ac9b95 +https://conda.anaconda.org/conda-forge/linux-64/freetype-2.13.3-ha770c72_1.conda#9ccd736d31e0c6e41f54e704e5312811 +https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda#446bd6c8cb26050d528881df495ce646 +https://conda.anaconda.org/conda-forge/noarch/joblib-1.5.1-pyhd8ed1ab_0.conda#fb1c14694de51a476ce8636d92b6f42c +https://conda.anaconda.org/conda-forge/linux-64/libgl-1.7.0-ha4b6fd6_2.conda#928b8be80851f5d8ffb016f9c81dae7a +https://conda.anaconda.org/conda-forge/linux-64/libgrpc-1.71.0-h8e591d7_1.conda#c3cfd72cbb14113abee7bbd86f44ad69 +https://conda.anaconda.org/conda-forge/linux-64/libhwloc-2.11.2-default_h0d58e46_1001.conda#804ca9e91bcaea0824a341d55b1684f2 +https://conda.anaconda.org/conda-forge/linux-64/libllvm20-20.1.6-he9d0ab4_0.conda#bf8ccdd2c1c1a54a3fa25bb61f26460e +https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.10.0-h65c71a3_0.conda#fedf6bfe5d21d21d2b1785ec00a8889a +https://conda.anaconda.org/conda-forge/linux-64/libxslt-1.1.39-h76b75d6_0.conda#e71f31f8cfb0a91439f2086fc8aa0461 +https://conda.anaconda.org/conda-forge/linux-64/mpc-1.3.1-h24ddda3_1.conda#aa14b9a5196a6d8dd364164b7ce56acf +https://conda.anaconda.org/conda-forge/linux-64/openldap-2.6.10-he970967_0.conda#2e5bf4f1da39c0b32778561c3c4e5878 +https://conda.anaconda.org/conda-forge/linux-64/pillow-11.2.1-py313h8db990d_0.conda#91b00afee98d72d29dc3d1c1ab0008d7 +https://conda.anaconda.org/conda-forge/linux-64/prometheus-cpp-1.3.0-ha5d0236_0.conda#a83f6a2fdc079e643237887a37460668 +https://conda.anaconda.org/conda-forge/noarch/pybind11-2.13.6-pyhc790b64_3.conda#1594696beebf1ecb6d29a1136f859a74 +https://conda.anaconda.org/conda-forge/noarch/pyproject-metadata-0.9.1-pyhd8ed1ab_0.conda#22ae7c6ea81e0c8661ef32168dda929b +https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhff2d567_1.conda#5ba79d7c71f03c678c8ead841f347d6e +https://conda.anaconda.org/conda-forge/noarch/python-gil-3.13.3-h4df99d1_101.conda#82c2641f2f0f513f7d2d1b847a2588e3 +https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.13.2-h0e9735f_0.conda#568ed1300869dca0ba09fb750cda5dbb +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-cursor-0.1.5-hb9d3cd8_0.conda#eb44b3b6deb1cab08d72cb61686fe64c +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxcomposite-0.4.6-hb9d3cd8_2.conda#d3c295b50f092ab525ffe3c2aa4b7413 
+https://conda.anaconda.org/conda-forge/linux-64/xorg-libxcursor-1.2.3-hb9d3cd8_0.conda#2ccd714aa2242315acaf0a67faea780b +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdamage-1.1.6-hb9d3cd8_0.conda#b5fcc7172d22516e1f965490e65e33a4 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxi-1.8.2-hb9d3cd8_0.conda#17dcc85db3c7886650b8908b183d6876 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrandr-1.5.4-hb9d3cd8_0.conda#2de7f99d6581a4a7adbff607b5c278ca +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxxf86vm-1.1.6-hb9d3cd8_0.conda#5efa5fa6243a622445fdfd72aee15efa +https://conda.anaconda.org/conda-forge/noarch/_python_abi3_support-1.0-hd8ed1ab_2.conda#aaa2a381ccc56eac91d63b6c1240312f +https://conda.anaconda.org/conda-forge/linux-64/aws-c-s3-0.8.0-h365f71b_1.conda#eff3d619b784eb000e62a6fd7797d6a5 +https://conda.anaconda.org/conda-forge/linux-64/azure-identity-cpp-1.10.0-h113e628_0.conda#73f73f60854f325a55f1d31459f2ab73 +https://conda.anaconda.org/conda-forge/linux-64/azure-storage-common-cpp-12.8.0-h736e048_1.conda#13de36be8de3ae3f05ba127631599213 +https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.15.0-h7e30c49_1.conda#8f5b0b297b59e1ac160ad4beec99dbee +https://conda.anaconda.org/conda-forge/linux-64/gmpy2-2.2.1-py313h11186cd_0.conda#54d020e0eaacf1e99bfb2410b9aa2e5e +https://conda.anaconda.org/conda-forge/linux-64/libclang-cpp20.1-20.1.6-default_h1df26ce_0.conda#99ead3b974685e44df8b1e3953503cfc +https://conda.anaconda.org/conda-forge/linux-64/libclang13-20.1.6-default_he06ed0a_0.conda#cc6c469d9d7fc0ac106cef5f45d973a9 +https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-2.36.0-hc4361e1_1.conda#ae36e6296a8dd8e8a9a8375965bf6398 +https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-1.21.0-hd1b1c89_0.conda#4b25cd8720fd8d5319206e4f899f2707 +https://conda.anaconda.org/conda-forge/linux-64/libpq-17.5-h27ae623_0.conda#6458be24f09e1b034902ab44fe9de908 +https://conda.anaconda.org/conda-forge/noarch/meson-python-0.18.0-pyh70fd9c4_0.conda#576c04b9d9f8e45285fb4d9452c26133 +https://conda.anaconda.org/conda-forge/linux-64/optree-0.16.0-py313h33d0bda_0.conda#5c211bb056e1a3263a163ba21e3fbf73 +https://conda.anaconda.org/conda-forge/noarch/pytest-8.3.5-pyhd8ed1ab_0.conda#c3c9316209dec74a705a36797970c6be +https://conda.anaconda.org/conda-forge/linux-64/tbb-2021.13.0-hceb3a55_1.conda#ba7726b8df7b9d34ea80e82b097a4893 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxtst-1.2.5-hb9d3cd8_3.conda#7bbe9a0cc0df0ac5f5a8ad6d6a11af2f +https://conda.anaconda.org/conda-forge/linux-64/aws-crt-cpp-0.32.6-h96107e3_3.conda#c8e99a9c0b60e7695de83512e53b9eb3 +https://conda.anaconda.org/conda-forge/linux-64/azure-storage-blobs-cpp-12.13.0-h3cf044e_1.conda#7eb66060455c7a47d9dcdbfa9f46579b +https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-h3394656_0.conda#09262e66b19567aff4f592fb53b28760 +https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-2.36.0-h0121fbd_1.conda#a0f7588c1f0a26d550e7bae4fb49427a +https://conda.anaconda.org/conda-forge/linux-64/mkl-2024.2.2-ha957f24_16.conda#1459379c79dda834673426504d52b319 +https://conda.anaconda.org/conda-forge/noarch/pytest-cov-6.1.1-pyhd8ed1ab_0.conda#1e35d8f975bc0e984a19819aa91c440a +https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.7.0-pyhd8ed1ab_0.conda#15353a2a0ea6dfefaa52fc5ab5b98f41 +https://conda.anaconda.org/conda-forge/noarch/sympy-1.14.0-pyh2585a3b_105.conda#8c09fac3785696e1c477156192d64b91 
+https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.11.510-hf282fd2_9.conda#1583a30af14c72e76034ca6e04abe64c +https://conda.anaconda.org/conda-forge/linux-64/azure-storage-files-datalake-cpp-12.12.0-ha633028_1.conda#7c1980f89dd41b097549782121a73490 +https://conda.anaconda.org/conda-forge/linux-64/harfbuzz-11.2.1-h3beb420_0.conda#0e6e192d4b3d95708ad192d957cf3163 +https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-31_hfdb39a5_mkl.conda#bdf4a57254e8248222cb631db4393ff1 +https://conda.anaconda.org/conda-forge/linux-64/mkl-devel-2024.2.2-ha770c72_16.conda#140891ea14285fc634353b31e9e40a95 +https://conda.anaconda.org/conda-forge/linux-64/libarrow-20.0.0-h6b3f9de_5_cpu.conda#2f4c442e11ca9695ff779ffa709e0026 +https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-31_h372d94f_mkl.conda#2a06a6c16b45bd3d10002927ca204b67 +https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-31_hc41d3b0_mkl.conda#10d012ddd7cc1c7ff9093d4974a34e53 +https://conda.anaconda.org/conda-forge/linux-64/qt6-main-6.9.0-h0384650_3.conda#8aa69e15597a205fd6f81781fe62c232 +https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-20.0.0-hcb10f89_5_cpu.conda#2d29a8510e16df15bfb6c0b149c78c84 +https://conda.anaconda.org/conda-forge/linux-64/liblapacke-3.9.0-31_hbc6e62b_mkl.conda#562026e418363dc346ad5a9e18cce73c +https://conda.anaconda.org/conda-forge/linux-64/libparquet-20.0.0-h081d1f1_5_cpu.conda#6f60536e2136749dc1cb29da302c13e3 +https://conda.anaconda.org/conda-forge/linux-64/libtorch-2.7.0-cpu_mkl_hf6ddc5a_100.conda#6bdda0b10852c6d03b030bab7ec251f0 +https://conda.anaconda.org/conda-forge/linux-64/numpy-2.2.6-py313h17eae1a_0.conda#7a2d2f9adecd86ed5c29c2115354f615 +https://conda.anaconda.org/conda-forge/linux-64/pyarrow-core-20.0.0-py313he5f92c8_0_cpu.conda#2afdef63d9fbc2cd0e52f8e8f3472404 +https://conda.anaconda.org/conda-forge/linux-64/pyside6-6.9.0-py313h5f61773_0.conda#f51f25ec8fcbf777f8b186bb5deeed40 +https://conda.anaconda.org/conda-forge/noarch/array-api-strict-2.3.1-pyhd8ed1ab_0.conda#11107d0aeb8c590a34fee0894909816b +https://conda.anaconda.org/conda-forge/linux-64/blas-devel-3.9.0-31_hcf00494_mkl.conda#368c93bde87a67d24a74de15bf4c49fd +https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.2-py313h33d0bda_0.conda#5dc81fffe102f63045225007a33d6199 +https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-20.0.0-hcb10f89_5_cpu.conda#d07f972af0508d56382c74846a386185 +https://conda.anaconda.org/conda-forge/linux-64/pandas-2.2.3-py313ha87cce1_3.conda#6248b529e537b1d4cb5ab3ef7f537795 +https://conda.anaconda.org/conda-forge/linux-64/polars-default-1.30.0-py39hfac2b71_0.conda#cd33cf1e631b4d766858c90e333b4832 +https://conda.anaconda.org/conda-forge/linux-64/pytorch-2.7.0-cpu_mkl_py313_hea9ba1b_100.conda#3c2ce6a304aa827f1e3cc21f7df9190d +https://conda.anaconda.org/conda-forge/linux-64/scipy-1.15.2-py313h86fcf2b_0.conda#ca68acd9febc86448eeed68d0c6c8643 +https://conda.anaconda.org/conda-forge/noarch/scipy-doctest-1.8.0-pyhe01879c_0.conda#5bc3f4bc1e027aa4ba6fdad1a84b5d3c +https://conda.anaconda.org/conda-forge/linux-64/blas-2.131-mkl.conda#9bb865b7e01104255ca54e61a58ded15 +https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-20.0.0-h1bed206_5_cpu.conda#dbfd38071ac2e09f7761e9c8129c18c4 +https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.10.3-py313h129903b_0.conda#4f8816d006b1c155ec416bcf7ff6cee2 +https://conda.anaconda.org/conda-forge/linux-64/polars-1.30.0-default_h1443d73_0.conda#19698b29e8544d2dd615699826037039 
+https://conda.anaconda.org/conda-forge/linux-64/pyamg-5.2.1-py313hf0ab243_1.conda#4c769bf3858f424cb2ecf952175ec600 +https://conda.anaconda.org/conda-forge/linux-64/pytorch-cpu-2.7.0-cpu_mkl_hc60beec_100.conda#20b3051f55ad823a27818dfa46a41c8f +https://conda.anaconda.org/conda-forge/linux-64/matplotlib-3.10.3-py313h78bf25f_0.conda#cc9324e614a297fdf23439d887d3513d +https://conda.anaconda.org/conda-forge/linux-64/pyarrow-20.0.0-py313h78bf25f_0.conda#6b8d388845ce750fe2ad8436669182f3 diff --git a/build_tools/azure/pylatest_conda_forge_mkl_linux-64_environment.yml b/build_tools/azure/pylatest_conda_forge_mkl_linux-64_environment.yml new file mode 100644 index 0000000000000..e804bf1ce8e31 --- /dev/null +++ b/build_tools/azure/pylatest_conda_forge_mkl_linux-64_environment.yml @@ -0,0 +1,31 @@ +# DO NOT EDIT: this file is generated from the specification found in the +# following script to centralize the configuration for CI builds: +# build_tools/update_environments_and_lock_files.py +channels: + - conda-forge +dependencies: + - python + - numpy + - blas[build=mkl] + - scipy + - cython + - joblib + - threadpoolctl + - matplotlib + - pandas + - pyamg + - pytest + - pytest-xdist + - pillow + - pip + - ninja + - meson-python + - pytest-cov + - coverage + - ccache + - pytorch + - pytorch-cpu + - polars + - pyarrow + - array-api-strict + - scipy-doctest diff --git a/build_tools/azure/pylatest_conda_forge_mkl_osx-64_conda.lock b/build_tools/azure/pylatest_conda_forge_mkl_osx-64_conda.lock new file mode 100644 index 0000000000000..edb27c0d3475d --- /dev/null +++ b/build_tools/azure/pylatest_conda_forge_mkl_osx-64_conda.lock @@ -0,0 +1,133 @@ +# Generated by conda-lock. +# platform: osx-64 +# input_hash: cee22335ff0a429180f2d8eeb31943f2646e3e653f1197f57ba6e39fc9659b05 +@EXPLICIT +https://conda.anaconda.org/conda-forge/noarch/libgfortran-devel_osx-64-13.3.0-h297be85_105.conda#c4967f8e797d0ffef3c5650fcdc2cdb5 +https://conda.anaconda.org/conda-forge/osx-64/mkl-include-2023.2.0-h6bab518_50500.conda#835abb8ded5e26f23ea6996259c7972e +https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-7_cp313.conda#e84b44e6300f1703cb25d29120c5b1d8 +https://conda.anaconda.org/conda-forge/osx-64/tbb-2021.10.0-h1c7c39f_2.conda#73434bcf87082942e938352afae9b0fa +https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda#4222072737ccff51314b5ece9c7d6f5a +https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-hfdf4475_7.conda#7ed4301d437b59045be7e051a0308211 +https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.4.26-hbd8a1cb_0.conda#95db94f75ba080a22eb623590993167b +https://conda.anaconda.org/conda-forge/osx-64/icu-75.1-h120a0e1_0.conda#d68d48a3060eb5abdc1cdc8e2a3a5966 +https://conda.anaconda.org/conda-forge/osx-64/libbrotlicommon-1.1.0-h00291cd_2.conda#58f2c4bdd56c46cc7451596e4ae68e0b +https://conda.anaconda.org/conda-forge/osx-64/libcxx-20.1.6-hf95d169_0.conda#460934df319a215557816480e9ea78cf +https://conda.anaconda.org/conda-forge/osx-64/libdeflate-1.24-hcc1b750_0.conda#f0a46c359722a3e84deb05cd4072d153 +https://conda.anaconda.org/conda-forge/osx-64/libexpat-2.7.0-h240833e_0.conda#026d0a1056ba2a3dbbea6d4b08188676 +https://conda.anaconda.org/conda-forge/osx-64/libffi-3.4.6-h281671d_1.conda#4ca9ea59839a9ca8df84170fab4ceb41 +https://conda.anaconda.org/conda-forge/osx-64/libiconv-1.18-h4b5e92a_1.conda#6283140d7b2b55b6b095af939b71b13f +https://conda.anaconda.org/conda-forge/osx-64/libjpeg-turbo-3.1.0-h6e16a3a_0.conda#87537967e6de2f885a9fcebd42b7cb10 
+https://conda.anaconda.org/conda-forge/osx-64/liblzma-5.8.1-hd471939_1.conda#f87e8821e0e38a4140a7ed4f52530053 +https://conda.anaconda.org/conda-forge/osx-64/libmpdec-4.0.0-h6e16a3a_0.conda#18b81186a6adb43f000ad19ed7b70381 +https://conda.anaconda.org/conda-forge/osx-64/libwebp-base-1.5.0-h6cf52b4_0.conda#5e0cefc99a231ac46ba21e27ae44689f +https://conda.anaconda.org/conda-forge/osx-64/libzlib-1.3.1-hd23fc13_2.conda#003a54a4e32b02f7355b50a837e699da +https://conda.anaconda.org/conda-forge/osx-64/llvm-openmp-20.1.6-ha54dae1_0.conda#c55751d61e1f8be539e0e4beffad3e5a +https://conda.anaconda.org/conda-forge/osx-64/ncurses-6.5-h0622a9a_3.conda#ced34dd9929f491ca6dab6a2927aff25 +https://conda.anaconda.org/conda-forge/osx-64/pthread-stubs-0.4-h00291cd_1002.conda#8bcf980d2c6b17094961198284b8e862 +https://conda.anaconda.org/conda-forge/osx-64/xorg-libxau-1.0.12-h6e16a3a_0.conda#4cf40e60b444d56512a64f39d12c20bd +https://conda.anaconda.org/conda-forge/osx-64/xorg-libxdmcp-1.1.5-h00291cd_0.conda#9f438e1b6f4e73fd9e6d78bfe7c36743 +https://conda.anaconda.org/conda-forge/osx-64/gmp-6.3.0-hf036a51_2.conda#427101d13f19c4974552a4e5b072eef1 +https://conda.anaconda.org/conda-forge/osx-64/isl-0.26-imath32_h2e86a7b_101.conda#d06222822a9144918333346f145b68c6 +https://conda.anaconda.org/conda-forge/osx-64/lerc-4.0.0-hcca01a6_1.conda#21f765ced1a0ef4070df53cb425e1967 +https://conda.anaconda.org/conda-forge/osx-64/libbrotlidec-1.1.0-h00291cd_2.conda#34709a1f5df44e054c4a12ab536c5459 +https://conda.anaconda.org/conda-forge/osx-64/libbrotlienc-1.1.0-h00291cd_2.conda#691f0dcb36f1ae67f5c489f20ae987ea +https://conda.anaconda.org/conda-forge/osx-64/libcxx-devel-18.1.8-h7c275be_8.conda#a9513c41f070a9e2d5c370ba5d6c0c00 +https://conda.anaconda.org/conda-forge/osx-64/libgfortran5-14.2.0-h58528f3_105.conda#94560312ff3c78225bed62ab59854c31 +https://conda.anaconda.org/conda-forge/osx-64/libpng-1.6.47-h3c4a55f_0.conda#8461ab86d2cdb76d6e971aab225be73f +https://conda.anaconda.org/conda-forge/osx-64/libsqlite-3.50.0-hdb6dae5_0.conda#caf16742f7e16475603cd9981ef36195 +https://conda.anaconda.org/conda-forge/osx-64/libxcb-1.17.0-hf1f96e2_0.conda#bbeca862892e2898bdb45792a61c4afc +https://conda.anaconda.org/conda-forge/osx-64/libxml2-2.14.3-h8c082e5_0.conda#f886f309637a6ff2ff858b38b7395aa1 +https://conda.anaconda.org/conda-forge/osx-64/mkl-2023.2.0-h54c2260_50500.conda#0a342ccdc79e4fcd359245ac51941e7b +https://conda.anaconda.org/conda-forge/osx-64/ninja-1.12.1-hd6aca1a_1.conda#1cf196736676270fa876001901e4e1db +https://conda.anaconda.org/conda-forge/osx-64/openssl-3.5.0-hc426f3f_1.conda#919faa07b9647beb99a0e7404596a465 +https://conda.anaconda.org/conda-forge/osx-64/qhull-2020.2-h3c5361c_5.conda#dd1ea9ff27c93db7c01a7b7656bd4ad4 +https://conda.anaconda.org/conda-forge/osx-64/readline-8.2-h7cca4af_2.conda#342570f8e02f2f022147a7f841475784 +https://conda.anaconda.org/conda-forge/osx-64/tapi-1300.6.5-h390ca13_0.conda#c6ee25eb54accb3f1c8fc39203acfaf1 +https://conda.anaconda.org/conda-forge/osx-64/tk-8.6.13-hf689a15_2.conda#9864891a6946c2fe037c02fca7392ab4 +https://conda.anaconda.org/conda-forge/osx-64/zlib-1.3.1-hd23fc13_2.conda#c989e0295dcbdc08106fe5d9e935f0b9 +https://conda.anaconda.org/conda-forge/osx-64/zstd-1.5.7-h8210216_2.conda#cd60a4a5a8d6a476b30d8aa4bb49251a +https://conda.anaconda.org/conda-forge/osx-64/brotli-bin-1.1.0-h00291cd_2.conda#049933ecbf552479a12c7917f0a4ce59 +https://conda.anaconda.org/conda-forge/osx-64/libblas-3.9.0-20_osx64_mkl.conda#160fdc97a51d66d51dc782fb67d35205 
+https://conda.anaconda.org/conda-forge/osx-64/libfreetype6-2.13.3-h40dfd5c_1.conda#c76e6f421a0e95c282142f820835e186 +https://conda.anaconda.org/conda-forge/osx-64/libgfortran-14.2.0-hef36b68_105.conda#6b27baf030f5d6603713c7e72d3f6b9a +https://conda.anaconda.org/conda-forge/osx-64/libllvm18-18.1.8-default_h3571c67_5.conda#01dd8559b569ad39b64fef0a61ded1e9 +https://conda.anaconda.org/conda-forge/osx-64/libtiff-4.7.0-h1167cee_5.conda#fc84af14a09e779f1d37ab1d16d5c4e2 +https://conda.anaconda.org/conda-forge/osx-64/mkl-devel-2023.2.0-h694c41f_50500.conda#1b4d0235ef253a1e19459351badf4f9f +https://conda.anaconda.org/conda-forge/osx-64/mpfr-4.2.1-haed47dc_3.conda#d511e58aaaabfc23136880d9956fa7a6 +https://conda.anaconda.org/conda-forge/osx-64/python-3.13.3-h534c281_101_cp313.conda#ebcc7c42561d8d8b01477020b63218c0 +https://conda.anaconda.org/conda-forge/osx-64/sigtool-0.1.3-h88f4db0_0.tar.bz2#fbfb84b9de9a6939cb165c02c69b1865 +https://conda.anaconda.org/conda-forge/osx-64/brotli-1.1.0-h00291cd_2.conda#2db0c38a7f2321c5bdaf32b181e832c7 +https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda#962b9857ee8e7018c22f2776ffa0b2d7 +https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_1.conda#44600c4667a319d67dbe0681fc0bc833 +https://conda.anaconda.org/conda-forge/osx-64/cython-3.1.1-py313h9efc8c2_1.conda#b94bca8fec5fbaa69375656928e05c1d +https://conda.anaconda.org/conda-forge/noarch/execnet-2.1.1-pyhd8ed1ab_1.conda#a71efeae2c160f6789900ba2631a2c90 +https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_1.conda#6837f3eff7dcea42ecd714ce1ac2b108 +https://conda.anaconda.org/conda-forge/osx-64/kiwisolver-1.4.7-py313h0c4e38b_0.conda#c37fceab459e104e77bb5456e219fc37 +https://conda.anaconda.org/conda-forge/osx-64/lcms2-2.17-h72f5680_0.conda#bf210d0c63f2afb9e414a858b79f0eaa +https://conda.anaconda.org/conda-forge/osx-64/ld64_osx-64-951.9-h33512f0_6.conda#6cd120f5c9dae65b858e1fad2b7959a0 +https://conda.anaconda.org/conda-forge/osx-64/libcblas-3.9.0-20_osx64_mkl.conda#51089a4865eb4aec2bc5c7468bd07f9f +https://conda.anaconda.org/conda-forge/osx-64/libclang-cpp18.1-18.1.8-default_h3571c67_10.conda#bf6753267e6f848f369c5bc2373dddd6 +https://conda.anaconda.org/conda-forge/osx-64/libfreetype-2.13.3-h694c41f_1.conda#07c8d3fbbe907f32014b121834b36dd5 +https://conda.anaconda.org/conda-forge/osx-64/libhiredis-1.0.2-h2beb688_0.tar.bz2#524282b2c46c9dedf051b3bc2ae05494 +https://conda.anaconda.org/conda-forge/osx-64/liblapack-3.9.0-20_osx64_mkl.conda#58f08e12ad487fac4a08f90ff0b87aec +https://conda.anaconda.org/conda-forge/osx-64/llvm-tools-18-18.1.8-default_h3571c67_5.conda#4391981e855468ced32ca1940b3d7613 +https://conda.anaconda.org/conda-forge/noarch/meson-1.8.1-pyhe01879c_0.conda#f3cccd9a6ce5331ae33f69ade5529162 +https://conda.anaconda.org/conda-forge/osx-64/mpc-1.3.1-h9d8efa1_1.conda#0520855aaae268ea413d6bc913f1384c +https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyh9f0ad1d_0.tar.bz2#2ba8498c1018c1e9c61eb99b973dfe19 +https://conda.anaconda.org/conda-forge/osx-64/openjpeg-2.5.3-h7fd6d84_0.conda#025c711177fc3309228ca1a32374458d +https://conda.anaconda.org/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda#58335b26c38bf4a20f399384c33cbcf9 +https://conda.anaconda.org/conda-forge/noarch/pip-25.1.1-pyh145f28c_0.conda#01384ff1639c6330a0924791413b8714 +https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhd8ed1ab_0.conda#7da7ccd349dbf6487a7778579d2bb971 
+https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.2.3-pyhd8ed1ab_1.conda#513d3c262ee49b54a8fec85c5bc99764 +https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2025.2-pyhd8ed1ab_0.conda#88476ae6ebd24f39261e0854ac244f33 +https://conda.anaconda.org/conda-forge/noarch/pytz-2025.2-pyhd8ed1ab_0.conda#bc8e3267d44011051f2eb14d22fb0960 +https://conda.anaconda.org/conda-forge/noarch/setuptools-80.9.0-pyhff2d567_0.conda#4de79c071274a53dcaf2a8c749d1499e +https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhd8ed1ab_0.conda#a451d576819089b0d672f18768be0f65 +https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-3.6.0-pyhecae5ae_0.conda#9d64911b31d57ca443e9f1e36b04385f +https://conda.anaconda.org/conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_1.conda#b0dd904de08b7db706167240bf37b164 +https://conda.anaconda.org/conda-forge/noarch/tomli-2.2.1-pyhd8ed1ab_1.conda#ac944244f1fed2eb49bae07193ae8215 +https://conda.anaconda.org/conda-forge/osx-64/tornado-6.5.1-py313h63b0ddb_0.conda#7554d07cbe64f41c73a403e99bccf3c6 +https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.13.2-pyh29332c3_0.conda#83fc6ae00127671e301c9f44254c31b8 +https://conda.anaconda.org/conda-forge/osx-64/ccache-4.11.3-h33566b8_0.conda#b65cad834bd6c1f660c101cca09430bf +https://conda.anaconda.org/conda-forge/osx-64/clang-18-18.1.8-default_h3571c67_10.conda#62e1cd0882dad47d6a6878ad037f7b9d +https://conda.anaconda.org/conda-forge/osx-64/coverage-7.8.2-py313h717bdf5_0.conda#73eb83ea3d00f06bf78e242cca5e8e44 +https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.0-pyhd8ed1ab_0.conda#72e42d28960d875c7654614f8b50939a +https://conda.anaconda.org/conda-forge/osx-64/fonttools-4.58.1-py313h717bdf5_0.conda#b08a911c3cc2cf9ff4f48f4e06bbc2dd +https://conda.anaconda.org/conda-forge/osx-64/freetype-2.13.3-h694c41f_1.conda#126dba1baf5030cb6f34533718924577 +https://conda.anaconda.org/conda-forge/osx-64/gfortran_impl_osx-64-13.3.0-hbf5bf67_105.conda#f56a107c8d1253346d01785ecece7977 +https://conda.anaconda.org/conda-forge/noarch/joblib-1.5.1-pyhd8ed1ab_0.conda#fb1c14694de51a476ce8636d92b6f42c +https://conda.anaconda.org/conda-forge/osx-64/ld64-951.9-h4e51db5_6.conda#45bf526d53b1bc95bc0b932a91a41576 +https://conda.anaconda.org/conda-forge/osx-64/liblapacke-3.9.0-20_osx64_mkl.conda#124ae8e384268a8da66f1d64114a1eda +https://conda.anaconda.org/conda-forge/osx-64/llvm-tools-18.1.8-default_h3571c67_5.conda#cc07ff74d2547da1f1452c42b67bafd6 +https://conda.anaconda.org/conda-forge/osx-64/numpy-2.2.6-py313hc518a0f_0.conda#7b80c7ace05b1b9d7ec6f55130776988 +https://conda.anaconda.org/conda-forge/osx-64/pillow-11.2.1-py313h0c4f865_0.conda#b4647eda8779d0e5d25cc8c9b124b303 +https://conda.anaconda.org/conda-forge/noarch/pyproject-metadata-0.9.1-pyhd8ed1ab_0.conda#22ae7c6ea81e0c8661ef32168dda929b +https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhff2d567_1.conda#5ba79d7c71f03c678c8ead841f347d6e +https://conda.anaconda.org/conda-forge/osx-64/blas-devel-3.9.0-20_osx64_mkl.conda#cc3260179093918b801e373c6e888e02 +https://conda.anaconda.org/conda-forge/osx-64/cctools_osx-64-1010.6-hd19c6af_6.conda#4694e9e497454a8ce5b9fb61e50d9c5d +https://conda.anaconda.org/conda-forge/osx-64/clang-18.1.8-default_h576c50e_10.conda#350a10c62423982b0c80a043b9921c00 +https://conda.anaconda.org/conda-forge/osx-64/contourpy-1.3.2-py313ha0b1807_0.conda#2c2d1f840df1c512b34e0537ef928169 +https://conda.anaconda.org/conda-forge/noarch/meson-python-0.18.0-pyh70fd9c4_0.conda#576c04b9d9f8e45285fb4d9452c26133 
+https://conda.anaconda.org/conda-forge/osx-64/pandas-2.2.3-py313h2e7108f_3.conda#5c37fc7549913fc4895d7d2e097091ed +https://conda.anaconda.org/conda-forge/noarch/pytest-8.3.5-pyhd8ed1ab_0.conda#c3c9316209dec74a705a36797970c6be +https://conda.anaconda.org/conda-forge/osx-64/scipy-1.15.2-py313h7e69c36_0.conda#53c23f87aedf2d139d54c88894c8a07f +https://conda.anaconda.org/conda-forge/osx-64/blas-2.120-mkl.conda#b041a7677a412f3d925d8208936cb1e2 +https://conda.anaconda.org/conda-forge/osx-64/cctools-1010.6-ha66f10e_6.conda#a126dcde2752751ac781b67238f7fac4 +https://conda.anaconda.org/conda-forge/osx-64/clangxx-18.1.8-default_heb2e8d1_10.conda#c39251c90faf5ba495d9f9ef88d7563e +https://conda.anaconda.org/conda-forge/osx-64/matplotlib-base-3.10.3-py313he981572_0.conda#91c22969c0974f2f23470d517774d457 +https://conda.anaconda.org/conda-forge/osx-64/pyamg-5.2.1-py313h0322a6a_1.conda#4bda5182eeaef3d2017a2ec625802e1a +https://conda.anaconda.org/conda-forge/noarch/pytest-cov-6.1.1-pyhd8ed1ab_0.conda#1e35d8f975bc0e984a19819aa91c440a +https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.7.0-pyhd8ed1ab_0.conda#15353a2a0ea6dfefaa52fc5ab5b98f41 +https://conda.anaconda.org/conda-forge/noarch/compiler-rt_osx-64-18.1.8-hf2b8a54_1.conda#76f906e6bdc58976c5593f650290ae20 +https://conda.anaconda.org/conda-forge/osx-64/matplotlib-3.10.3-py313habf4b1d_0.conda#c1043254f405998ece984e5f66a10943 +https://conda.anaconda.org/conda-forge/osx-64/compiler-rt-18.1.8-h1020d70_1.conda#bc1714a1e73be18e411cff30dc1fe011 +https://conda.anaconda.org/conda-forge/osx-64/clang_impl_osx-64-18.1.8-h6a44ed1_25.conda#bfc995f8ab9e8c22ebf365844da3383d +https://conda.anaconda.org/conda-forge/osx-64/clang_osx-64-18.1.8-h7e5c614_25.conda#1fea06d9ced6b87fe63384443bc2efaf +https://conda.anaconda.org/conda-forge/osx-64/c-compiler-1.9.0-h09a7c41_0.conda#ab45badcb5d035d3bddfdbdd96e00967 +https://conda.anaconda.org/conda-forge/osx-64/clangxx_impl_osx-64-18.1.8-h4b7810f_25.conda#c03c94381d9ffbec45c98b800e7d3e86 +https://conda.anaconda.org/conda-forge/osx-64/gfortran_osx-64-13.3.0-h3223c34_1.conda#a6eeb1519091ac3239b88ee3914d6cb6 +https://conda.anaconda.org/conda-forge/osx-64/clangxx_osx-64-18.1.8-h7e5c614_25.conda#2e5c84e93a3519d77a0d8d9b3ea664fd +https://conda.anaconda.org/conda-forge/osx-64/gfortran-13.3.0-hcc3c99d_1.conda#e1177b9b139c6cf43250427819f2f07b +https://conda.anaconda.org/conda-forge/osx-64/cxx-compiler-1.9.0-h20888b2_0.conda#cd17d9bf9780b0db4ed31fb9958b167f +https://conda.anaconda.org/conda-forge/osx-64/fortran-compiler-1.9.0-h02557f8_0.conda#2cf645572d7ae534926093b6e9f3bdff +https://conda.anaconda.org/conda-forge/osx-64/compilers-1.9.0-h694c41f_0.conda#b84884262dcd1c2f56a9e1961fdd3326 diff --git a/build_tools/azure/pylatest_conda_forge_mkl_osx-64_environment.yml b/build_tools/azure/pylatest_conda_forge_mkl_osx-64_environment.yml new file mode 100644 index 0000000000000..ad177e4ed391b --- /dev/null +++ b/build_tools/azure/pylatest_conda_forge_mkl_osx-64_environment.yml @@ -0,0 +1,27 @@ +# DO NOT EDIT: this file is generated from the specification found in the +# following script to centralize the configuration for CI builds: +# build_tools/update_environments_and_lock_files.py +channels: + - conda-forge +dependencies: + - python + - numpy + - blas[build=mkl] + - scipy + - cython + - joblib + - threadpoolctl + - matplotlib + - pandas + - pyamg + - pytest + - pytest-xdist + - pillow + - pip + - ninja + - meson-python + - pytest-cov + - coverage + - ccache + - compilers + - llvm-openmp diff --git 
a/build_tools/azure/pylatest_conda_mkl_no_openmp_environment.yml b/build_tools/azure/pylatest_conda_mkl_no_openmp_environment.yml new file mode 100644 index 0000000000000..0c2eec344c26b --- /dev/null +++ b/build_tools/azure/pylatest_conda_mkl_no_openmp_environment.yml @@ -0,0 +1,28 @@ +# DO NOT EDIT: this file is generated from the specification found in the +# following script to centralize the configuration for CI builds: +# build_tools/update_environments_and_lock_files.py +channels: + - defaults +dependencies: + - python + - numpy + - blas[build=mkl] + - scipy<1.12 + - joblib + - matplotlib + - pandas + - pyamg + - pytest + - pytest-xdist + - pillow + - pip + - ninja + - pytest-cov + - coverage + - ccache + - pip + - pip: + - cython + - threadpoolctl + - meson-python + - meson diff --git a/build_tools/azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock b/build_tools/azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock new file mode 100644 index 0000000000000..7be311177e65f --- /dev/null +++ b/build_tools/azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock @@ -0,0 +1,82 @@ +# Generated by conda-lock. +# platform: osx-64 +# input_hash: cc639ea0beeaceb46e2ad729ba559d5d5e746b8f6ff522bc718109af6265069c +@EXPLICIT +https://repo.anaconda.com/pkgs/main/osx-64/blas-1.0-mkl.conda#cb2c87e85ac8e0ceae776d26d4214c8a +https://repo.anaconda.com/pkgs/main/osx-64/bzip2-1.0.8-h6c40b1e_6.conda#96224786021d0765ce05818fa3c59bdb +https://repo.anaconda.com/pkgs/main/osx-64/ca-certificates-2025.2.25-hecd8cb5_0.conda#12ab77db61795036e15a5b14929ad4a1 +https://repo.anaconda.com/pkgs/main/osx-64/jpeg-9e-h46256e1_3.conda#b1d9769eac428e11f5f922531a1da2e0 +https://repo.anaconda.com/pkgs/main/osx-64/libcxx-14.0.6-h9765a3e_0.conda#387757bb354ae9042370452cd0fb5627 +https://repo.anaconda.com/pkgs/main/osx-64/libdeflate-1.22-h46256e1_0.conda#7612fb79e5e76fcd16655c7d026f4a66 +https://repo.anaconda.com/pkgs/main/osx-64/libffi-3.4.4-hecd8cb5_1.conda#eb7f09ada4d95f1a26f483f1009d9286 +https://repo.anaconda.com/pkgs/main/osx-64/libwebp-base-1.3.2-h46256e1_1.conda#399c11b50e6e7a6969aca9a84ea416b7 +https://repo.anaconda.com/pkgs/main/osx-64/llvm-openmp-14.0.6-h0dcd299_0.conda#b5804d32b87dc61ca94561ade33d5f2d +https://repo.anaconda.com/pkgs/main/osx-64/ncurses-6.4-hcec6c5f_0.conda#0214d1ee980e217fabc695f1e40662aa +https://repo.anaconda.com/pkgs/main/noarch/tzdata-2025b-h04d1e81_0.conda#1d027393db3427ab22a02aa44a56f143 +https://repo.anaconda.com/pkgs/main/osx-64/xz-5.6.4-h46256e1_1.conda#ce989a528575ad332a650bb7c7f7e5d5 +https://repo.anaconda.com/pkgs/main/osx-64/zlib-1.2.13-h4b97444_1.conda#38e35f7c817fac0973034bfce6706ec2 +https://repo.anaconda.com/pkgs/main/osx-64/ccache-3.7.9-hf120daa_0.conda#a01515a32e721c51d631283f991bc8ea +https://repo.anaconda.com/pkgs/main/osx-64/expat-2.7.1-h6d0c2b6_0.conda#6cdc93776b7551083854e7f106a62720 +https://repo.anaconda.com/pkgs/main/osx-64/intel-openmp-2023.1.0-ha357a0b_43548.conda#ba8a89ffe593eb88e4c01334753c40c3 +https://repo.anaconda.com/pkgs/main/osx-64/lerc-4.0.0-h6d0c2b6_0.conda#824f87854c58df1525557c8639ce7f93 +https://repo.anaconda.com/pkgs/main/osx-64/libgfortran5-11.3.0-h9dfd629_28.conda#1fa1a27ee100b1918c3021dbfa3895a3 +https://repo.anaconda.com/pkgs/main/osx-64/libpng-1.6.39-h6c40b1e_0.conda#a3c824835f53ad27aeb86d2b55e47804 +https://repo.anaconda.com/pkgs/main/osx-64/lz4-c-1.9.4-hcec6c5f_1.conda#aee0efbb45220e1985533dbff48551f8 +https://repo.anaconda.com/pkgs/main/osx-64/ninja-base-1.12.1-h1962661_0.conda#9c0a94a811e88f182519d9309cf5f634 
+https://repo.anaconda.com/pkgs/main/osx-64/openssl-3.0.16-h184c1cd_0.conda#8e3c130ef85c3260d535153b4d0fd63a +https://repo.anaconda.com/pkgs/main/osx-64/readline-8.2-hca72f7f_0.conda#971667436260e523f6f7355fdfa238bf +https://repo.anaconda.com/pkgs/main/osx-64/tbb-2021.8.0-ha357a0b_0.conda#fb48530a3eea681c11dafb95b3387c0f +https://repo.anaconda.com/pkgs/main/osx-64/tk-8.6.14-h4d00af3_0.conda#a2c03940c2ae54614301ec82e6a98d75 +https://repo.anaconda.com/pkgs/main/osx-64/freetype-2.13.3-h02243ff_0.conda#acf5e48106235eb200eecb79119c7ffc +https://repo.anaconda.com/pkgs/main/osx-64/libgfortran-5.0.0-11_3_0_hecd8cb5_28.conda#2eb13b680803f1064e53873ae0aaafb3 +https://repo.anaconda.com/pkgs/main/osx-64/mkl-2023.1.0-h8e150cf_43560.conda#85d0f3431dd5c6ae44f8725fdd3d3e59 +https://repo.anaconda.com/pkgs/main/osx-64/sqlite-3.45.3-h6c40b1e_0.conda#2edf909b937b3aad48322c9cb2e8f1a0 +https://repo.anaconda.com/pkgs/main/osx-64/zstd-1.5.6-h138b38a_0.conda#f4d15d7d0054d39e6a24fe8d7d1e37c5 +https://repo.anaconda.com/pkgs/main/osx-64/libtiff-4.7.0-h2dfa3ea_0.conda#82a118ce0139e2bf6f7a99c4cfbd4749 +https://repo.anaconda.com/pkgs/main/osx-64/python-3.12.9-hcd54a6c_0.conda#1bf9af06f3e476df1f72e8674a9224df +https://repo.anaconda.com/pkgs/main/osx-64/brotli-python-1.0.9-py312h6d0c2b6_9.conda#425936421fe402074163ac3ffe33a060 +https://repo.anaconda.com/pkgs/main/osx-64/coverage-7.6.9-py312h46256e1_0.conda#f8c1547bbf522a600ee795901240a7b0 +https://repo.anaconda.com/pkgs/main/noarch/cycler-0.11.0-pyhd3eb1b0_0.conda#f5e365d2cdb66d547eb8c3ab93843aab +https://repo.anaconda.com/pkgs/main/noarch/execnet-2.1.1-pyhd3eb1b0_0.conda#b3cb797432ee4657d5907b91a5dc65ad +https://repo.anaconda.com/pkgs/main/noarch/iniconfig-1.1.1-pyhd3eb1b0_0.tar.bz2#e40edff2c5708f342cef43c7f280c507 +https://repo.anaconda.com/pkgs/main/osx-64/joblib-1.4.2-py312hecd8cb5_0.conda#8ab03dfa447b4e0bfa0bd3d25930f3b6 +https://repo.anaconda.com/pkgs/main/osx-64/kiwisolver-1.4.8-py312h6d0c2b6_0.conda#060d4498fcc967a640829cb7e55c95f2 +https://repo.anaconda.com/pkgs/main/osx-64/lcms2-2.16-h31d93a5_1.conda#42450b66e91caf9ab0672a599e2a7bd0 +https://repo.anaconda.com/pkgs/main/osx-64/mkl-service-2.4.0-py312h46256e1_2.conda#04297cb766cabf38613ed6eb4eec85c3 +https://repo.anaconda.com/pkgs/main/osx-64/ninja-1.12.1-hecd8cb5_0.conda#ee3b660616ef0fbcbd0096a67c11c94b +https://repo.anaconda.com/pkgs/main/osx-64/openjpeg-2.5.2-h2d09ccc_1.conda#0f2e221843154b436b5982c695df627b +https://repo.anaconda.com/pkgs/main/osx-64/packaging-24.2-py312hecd8cb5_0.conda#76512e47c9c37443444ef0624769f620 +https://repo.anaconda.com/pkgs/main/osx-64/pluggy-1.5.0-py312hecd8cb5_0.conda#ca381e438f1dbd7986ac0fa0da70c9d8 +https://repo.anaconda.com/pkgs/main/osx-64/pyparsing-3.2.0-py312hecd8cb5_0.conda#e4086daaaed13f68cc8d5b9da7db73cc +https://repo.anaconda.com/pkgs/main/noarch/python-tzdata-2025.2-pyhd3eb1b0_0.conda#5ac858f05dbf9d3cdb04d53516901247 +https://repo.anaconda.com/pkgs/main/osx-64/pytz-2024.1-py312hecd8cb5_0.conda#2b28ec0e0d07f5c0c701f75200b1e8b6 +https://repo.anaconda.com/pkgs/main/osx-64/setuptools-78.1.1-py312hecd8cb5_0.conda#76b66b96a1564cb76011408c1eb8df3e +https://repo.anaconda.com/pkgs/main/osx-64/six-1.17.0-py312hecd8cb5_0.conda#aadd782bc06426887ae0835eedd98ceb +https://repo.anaconda.com/pkgs/main/noarch/toml-0.10.2-pyhd3eb1b0_0.conda#cda05f5f6d8509529d1a2743288d197a +https://repo.anaconda.com/pkgs/main/osx-64/tornado-6.5-py312h46256e1_0.conda#7e82973ed53e71854971e7b3922fad24 
+https://repo.anaconda.com/pkgs/main/osx-64/unicodedata2-15.1.0-py312h46256e1_1.conda#4a7fd1dec7277c8ab71aa11aa08df86b +https://repo.anaconda.com/pkgs/main/osx-64/wheel-0.45.1-py312hecd8cb5_0.conda#fafb8687668467d8624d2ddd0909bce9 +https://repo.anaconda.com/pkgs/main/osx-64/fonttools-4.55.3-py312h46256e1_0.conda#f7680dd6b8b1c2f8aab17cf6630c6deb +https://repo.anaconda.com/pkgs/main/osx-64/numpy-base-1.26.4-py312h6f81483_0.conda#87f73efbf26ab2e2ea7c32481a71bd47 +https://repo.anaconda.com/pkgs/main/osx-64/pillow-11.1.0-py312h935ef2f_1.conda#c2f7a3f027cc93a3626d50b765b75dc5 +https://repo.anaconda.com/pkgs/main/noarch/pip-25.1-pyhc872135_2.conda#2778327d2a700153fefe0e69438b18e1 +https://repo.anaconda.com/pkgs/main/osx-64/pytest-8.3.4-py312hecd8cb5_0.conda#b15ee02022967632dfa1672669228bee +https://repo.anaconda.com/pkgs/main/osx-64/python-dateutil-2.9.0post0-py312hecd8cb5_2.conda#1047dde28f78127dd9f6121e882926dd +https://repo.anaconda.com/pkgs/main/osx-64/pytest-cov-6.0.0-py312hecd8cb5_0.conda#db697e319a4d1145363246a51eef0352 +https://repo.anaconda.com/pkgs/main/osx-64/pytest-xdist-3.6.1-py312hecd8cb5_0.conda#38df9520774ee82bf143218f1271f936 +https://repo.anaconda.com/pkgs/main/osx-64/bottleneck-1.4.2-py312ha2b695f_0.conda#7efb63b6a5b33829a3b2c7a3efcf53ce +https://repo.anaconda.com/pkgs/main/osx-64/contourpy-1.3.1-py312h1962661_0.conda#41499d3a415721b0514f0cccb8288cb1 +https://repo.anaconda.com/pkgs/main/osx-64/matplotlib-3.10.0-py312hecd8cb5_0.conda#2977e81a7775be7963daf49df981b6e0 +https://repo.anaconda.com/pkgs/main/osx-64/matplotlib-base-3.10.0-py312h919b35b_0.conda#afc11bf311f5921ca4674ebac9592cf8 +https://repo.anaconda.com/pkgs/main/osx-64/mkl_fft-1.3.8-py312h6c40b1e_0.conda#d59d01b940493f2b6a84aac922fd0c76 +https://repo.anaconda.com/pkgs/main/osx-64/mkl_random-1.2.4-py312ha357a0b_0.conda#c1ea9c8eee79a5af3399f3c31be0e9c6 +https://repo.anaconda.com/pkgs/main/osx-64/numpy-1.26.4-py312hac873b0_0.conda#3150bac1e382156f82a153229e1ebd06 +https://repo.anaconda.com/pkgs/main/osx-64/numexpr-2.8.7-py312hac873b0_0.conda#6303ba071636ef57fddf69eb6f440ec1 +https://repo.anaconda.com/pkgs/main/osx-64/scipy-1.11.4-py312h81688c2_0.conda#7d57b4c21a9261f97fa511e0940c5d93 +https://repo.anaconda.com/pkgs/main/osx-64/pandas-2.2.3-py312h6d0c2b6_0.conda#84ce5b8ec4a986d13a5df17811f556a2 +https://repo.anaconda.com/pkgs/main/osx-64/pyamg-5.2.1-py312h1962661_0.conda#58881950d4ce74c9302b56961f97a43c +# pip cython @ https://files.pythonhosted.org/packages/78/06/83ff82381319ff68ae46f9dd3024b1d5101997e81a8e955811525b6f934b/cython-3.1.1-cp312-cp312-macosx_10_13_x86_64.whl#sha256=9d7dc0e4d0cd491fac679a61e9ede348c64ca449f99a284f9a01851aa1dbc7f6 +# pip meson @ https://files.pythonhosted.org/packages/46/77/726b14be352aa6911e206ca7c4d95c5be49660604dfee0bfed0fc75823e5/meson-1.8.1-py3-none-any.whl#sha256=374bbf71247e629475fc10b0bd2ef66fc418c2d8f4890572f74de0f97d0d42da +# pip threadpoolctl @ https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl#sha256=43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb +# pip pyproject-metadata @ https://files.pythonhosted.org/packages/7e/b1/8e63033b259e0a4e40dd1ec4a9fee17718016845048b43a36ec67d62e6fe/pyproject_metadata-0.9.1-py3-none-any.whl#sha256=ee5efde548c3ed9b75a354fc319d5afd25e9585fa918a34f62f904cc731973ad +# pip meson-python @ 
https://files.pythonhosted.org/packages/28/58/66db620a8a7ccb32633de9f403fe49f1b63c68ca94e5c340ec5cceeb9821/meson_python-0.18.0-py3-none-any.whl#sha256=3b0fe051551cc238f5febb873247c0949cd60ded556efa130aa57021804868e2 diff --git a/build_tools/azure/pylatest_free_threaded_environment.yml b/build_tools/azure/pylatest_free_threaded_environment.yml new file mode 100644 index 0000000000000..8980bfce4adaf --- /dev/null +++ b/build_tools/azure/pylatest_free_threaded_environment.yml @@ -0,0 +1,18 @@ +# DO NOT EDIT: this file is generated from the specification found in the +# following script to centralize the configuration for CI builds: +# build_tools/update_environments_and_lock_files.py +channels: + - conda-forge +dependencies: + - python-freethreading + - numpy + - scipy + - cython + - joblib + - threadpoolctl + - pytest + - pytest-xdist + - ninja + - meson-python + - ccache + - pip diff --git a/build_tools/azure/pylatest_free_threaded_linux-64_conda.lock b/build_tools/azure/pylatest_free_threaded_linux-64_conda.lock new file mode 100644 index 0000000000000..40254398d3bb7 --- /dev/null +++ b/build_tools/azure/pylatest_free_threaded_linux-64_conda.lock @@ -0,0 +1,61 @@ +# Generated by conda-lock. +# platform: linux-64 +# input_hash: b76364b5635e8c36a0fc0777955b5664a336ba94ac96f3ade7aad842ab7e15c5 +@EXPLICIT +https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 +https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-7_cp313t.conda#df81edcc11a1176315e8226acab83eec +https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda#4222072737ccff51314b5ece9c7d6f5a +https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.4.26-hbd8a1cb_0.conda#95db94f75ba080a22eb623590993167b +https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_4.conda#01f8d123c96816249efd255a31ad7712 +https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_2.conda#fbe7d535ff9d3a168c148e07358cd5b1 +https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d +https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_2.conda#ea8ac52380885ed41c1baa8f1d6d2b93 +https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.0-h5888daf_0.conda#db0bfbe7dd197b68ad5f30333bae6ce0 +https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda#ede4673863426c0883c0063d853bbd85 +https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_2.conda#ddca86c7040dd0e73b2b69bd7833d225 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.1.0-hcea5267_2.conda#01de444988ed960031dbe84cf4f9b1fc +https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_1.conda#a76fd702c93cd2dfd89eff30a5fd45a8 +https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda#c7e925f37e3b40d893459e625f6a53f1 +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.1.0-h8f9b012_2.conda#1cb1c67961f6dd257eae9e9691b341aa +https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda#edb0dca6bc32e4f4789199455a1dbeb8 +https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda#47e340acb35de30501a76c7c799c41d7 +https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.0-h7b32b05_1.conda#de356753cfdbffcde5bb1e86e3aa6cd0 +https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda#62ee74e96c5ebb0af99386de58cf9553 
+https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.1.0-h69a702a_2.conda#f92e6e0a3c0c0c85561ef61aa59d555d +https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.0-hee588c1_0.conda#71888e92098d0f8c41b09a671ad289bc +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.1.0-h4852527_2.conda#9d2072af184b5caa29492bf2344597bb +https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b +https://conda.anaconda.org/conda-forge/linux-64/ninja-1.12.1-hff21bea_1.conda#2322531904f27501ee19847b87ba7c64 +https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda#283b96675859b20a825f8fa30f311446 +https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda#a0116df4f4ed05c303811a837d5b39d8 +https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb8e6e7a_2.conda#6432cb5d4ac0046c3ac0a8a0f95842f9 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-15.1.0-h69a702a_2.conda#a483a87b71e974bb75d1b9413d4436dd +https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.29-pthreads_h94d23a6_0.conda#0a4d0252248ef9a0f88f2ba8b8a08e12 +https://conda.anaconda.org/conda-forge/linux-64/python-3.13.3-h4724d56_1_cp313t.conda#8193603fe48ace3d8801cfb246f44491 +https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda#962b9857ee8e7018c22f2776ffa0b2d7 +https://conda.anaconda.org/conda-forge/noarch/cpython-3.13.3-py313hd8ed1ab_1.conda#6ba9ba47b91b7758cb963d0f0eaf3422 +https://conda.anaconda.org/conda-forge/noarch/cython-3.1.1-pyh2c78169_101.conda#f085516359786c4fea51bf05227ef3d2 +https://conda.anaconda.org/conda-forge/noarch/execnet-2.1.1-pyhd8ed1ab_1.conda#a71efeae2c160f6789900ba2631a2c90 +https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_1.conda#6837f3eff7dcea42ecd714ce1ac2b108 +https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-31_h59b9bed_openblas.conda#728dbebd0f7a20337218beacffd37916 +https://conda.anaconda.org/conda-forge/linux-64/libhiredis-1.0.2-h2cc385e_0.tar.bz2#b34907d3a81a3cd8095ee83d174c074a +https://conda.anaconda.org/conda-forge/noarch/meson-1.8.1-pyhe01879c_0.conda#f3cccd9a6ce5331ae33f69ade5529162 +https://conda.anaconda.org/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda#58335b26c38bf4a20f399384c33cbcf9 +https://conda.anaconda.org/conda-forge/noarch/pip-25.1.1-pyh145f28c_0.conda#01384ff1639c6330a0924791413b8714 +https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhd8ed1ab_0.conda#7da7ccd349dbf6487a7778579d2bb971 +https://conda.anaconda.org/conda-forge/noarch/setuptools-80.9.0-pyhff2d567_0.conda#4de79c071274a53dcaf2a8c749d1499e +https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-3.6.0-pyhecae5ae_0.conda#9d64911b31d57ca443e9f1e36b04385f +https://conda.anaconda.org/conda-forge/noarch/tomli-2.2.1-pyhd8ed1ab_1.conda#ac944244f1fed2eb49bae07193ae8215 +https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.13.2-pyh29332c3_0.conda#83fc6ae00127671e301c9f44254c31b8 +https://conda.anaconda.org/conda-forge/linux-64/ccache-4.11.3-h80c52d3_0.conda#eb517c6a2b960c3ccb6f1db1005f063a +https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.0-pyhd8ed1ab_0.conda#72e42d28960d875c7654614f8b50939a +https://conda.anaconda.org/conda-forge/noarch/joblib-1.5.1-pyhd8ed1ab_0.conda#fb1c14694de51a476ce8636d92b6f42c +https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-31_he106b2a_openblas.conda#abb32c727da370c481a1c206f5159ce9 
+https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-31_h7ac8fdf_openblas.conda#452b98eafe050ecff932f0ec832dd03f +https://conda.anaconda.org/conda-forge/noarch/pyproject-metadata-0.9.1-pyhd8ed1ab_0.conda#22ae7c6ea81e0c8661ef32168dda929b +https://conda.anaconda.org/conda-forge/noarch/python-freethreading-3.13.3-h92d6c8b_1.conda#4fa25290aec662a01642ba4b3c0ff5c1 +https://conda.anaconda.org/conda-forge/noarch/meson-python-0.18.0-pyh70fd9c4_0.conda#576c04b9d9f8e45285fb4d9452c26133 +https://conda.anaconda.org/conda-forge/linux-64/numpy-2.2.6-py313h103f029_0.conda#7ae0a483b2cbbdf15d8429eb38f74a9e +https://conda.anaconda.org/conda-forge/noarch/pytest-8.3.5-pyhd8ed1ab_0.conda#c3c9316209dec74a705a36797970c6be +https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.7.0-pyhd8ed1ab_0.conda#15353a2a0ea6dfefaa52fc5ab5b98f41 +https://conda.anaconda.org/conda-forge/linux-64/scipy-1.15.2-py313h7f7b39c_0.conda#65f0c403e4324062633e648933f20a2e diff --git a/build_tools/azure/pylatest_pip_openblas_pandas_environment.yml b/build_tools/azure/pylatest_pip_openblas_pandas_environment.yml new file mode 100644 index 0000000000000..6c3da4bb863b4 --- /dev/null +++ b/build_tools/azure/pylatest_pip_openblas_pandas_environment.yml @@ -0,0 +1,31 @@ +# DO NOT EDIT: this file is generated from the specification found in the +# following script to centralize the configuration for CI builds: +# build_tools/update_environments_and_lock_files.py +channels: + - defaults +dependencies: + - python + - ccache + - pip + - pip: + - numpy + - scipy + - cython + - joblib + - threadpoolctl + - matplotlib + - pandas + - pyamg + - pytest + - pytest-xdist + - pillow + - ninja + - meson-python + - pytest-cov + - coverage + - sphinx + - numpydoc + - lightgbm + - scikit-image + - array-api-strict + - scipy-doctest diff --git a/build_tools/azure/pylatest_pip_openblas_pandas_linux-64_conda.lock b/build_tools/azure/pylatest_pip_openblas_pandas_linux-64_conda.lock new file mode 100644 index 0000000000000..9861391b52e67 --- /dev/null +++ b/build_tools/azure/pylatest_pip_openblas_pandas_linux-64_conda.lock @@ -0,0 +1,91 @@ +# Generated by conda-lock. 
+# platform: linux-64 +# input_hash: 50f16a0198b6eb575a737fee25051b52a644d72f5fca26bd661651a85fcb6a07 +@EXPLICIT +https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda#c3473ff8bdb3d124ed5ff11ec380d6f9 +https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2025.2.25-h06a4308_0.conda#495015d24da8ad929e3ae2d18571016d +https://repo.anaconda.com/pkgs/main/linux-64/ld_impl_linux-64-2.40-h12ee557_0.conda#ee672b5f635340734f58d618b7bca024 +https://repo.anaconda.com/pkgs/main/linux-64/python_abi-3.13-0_cp313.conda#d4009c49dd2b54ffded7f1365b5f6505 +https://repo.anaconda.com/pkgs/main/noarch/tzdata-2025b-h04d1e81_0.conda#1d027393db3427ab22a02aa44a56f143 +https://repo.anaconda.com/pkgs/main/linux-64/libgomp-11.2.0-h1234567_1.conda#b372c0eea9b60732fdae4b817a63c8cd +https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-11.2.0-h1234567_1.conda#57623d10a70e09e1d048c2b2b6f4e2dd +https://repo.anaconda.com/pkgs/main/linux-64/_openmp_mutex-5.1-1_gnu.conda#71d281e9c2192cb3fa425655a8defb85 +https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-11.2.0-h1234567_1.conda#a87728dabf3151fb9cfa990bd2eb0464 +https://repo.anaconda.com/pkgs/main/linux-64/bzip2-1.0.8-h5eee18b_6.conda#f21a3ff51c1b271977f53ce956a69297 +https://repo.anaconda.com/pkgs/main/linux-64/expat-2.7.1-h6a678d5_0.conda#269942a9f3f943e2e5d8a2516a861f7c +https://repo.anaconda.com/pkgs/main/linux-64/libffi-3.4.4-h6a678d5_1.conda#70646cc713f0c43926cfdcfe9b695fe0 +https://repo.anaconda.com/pkgs/main/linux-64/libmpdec-4.0.0-h5eee18b_0.conda#feb10f42b1a7b523acbf85461be41a3e +https://repo.anaconda.com/pkgs/main/linux-64/libuuid-1.41.5-h5eee18b_0.conda#4a6a2354414c9080327274aa514e5299 +https://repo.anaconda.com/pkgs/main/linux-64/ncurses-6.4-h6a678d5_0.conda#5558eec6e2191741a92f832ea826251c +https://repo.anaconda.com/pkgs/main/linux-64/openssl-3.0.16-h5eee18b_0.conda#5875526739afa058cfa84da1fa7a2ef4 +https://repo.anaconda.com/pkgs/main/linux-64/xz-5.6.4-h5eee18b_1.conda#3581505fa450962d631bd82b8616350e +https://repo.anaconda.com/pkgs/main/linux-64/zlib-1.2.13-h5eee18b_1.conda#92e42d8310108b0a440fb2e60b2b2a25 +https://repo.anaconda.com/pkgs/main/linux-64/ccache-3.7.9-hfe4627d_0.conda#bef6fc681c273bb7bd0c67d1a591365e +https://repo.anaconda.com/pkgs/main/linux-64/readline-8.2-h5eee18b_0.conda#be42180685cce6e6b0329201d9f48efb +https://repo.anaconda.com/pkgs/main/linux-64/tk-8.6.14-h39e8969_0.conda#78dbc5e3c69143ebc037fc5d5b22e597 +https://repo.anaconda.com/pkgs/main/linux-64/sqlite-3.45.3-h5eee18b_0.conda#acf93d6aceb74d6110e20b44cc45939e +https://repo.anaconda.com/pkgs/main/linux-64/python-3.13.2-hf623796_100_cp313.conda#bf836f30ac4c16fd3d71c1aaa25da08c +https://repo.anaconda.com/pkgs/main/linux-64/setuptools-78.1.1-py313h06a4308_0.conda#8f8e1c1e3af9d2d371aaa0ee8316ae7c +https://repo.anaconda.com/pkgs/main/linux-64/wheel-0.45.1-py313h06a4308_0.conda#29057e876eedce0e37c2388c138a19f9 +https://repo.anaconda.com/pkgs/main/noarch/pip-25.1-pyhc872135_2.conda#2778327d2a700153fefe0e69438b18e1 +# pip alabaster @ https://files.pythonhosted.org/packages/7e/b3/6b4067be973ae96ba0d615946e314c5ae35f9f993eca561b356540bb0c2b/alabaster-1.0.0-py3-none-any.whl#sha256=fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b +# pip babel @ https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl#sha256=4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2 +# pip certifi @ 
https://files.pythonhosted.org/packages/4a/7e/3db2bd1b1f9e95f7cddca6d6e75e2f2bd9f51b1246e546d88addca0106bd/certifi-2025.4.26-py3-none-any.whl#sha256=30350364dfe371162649852c63336a15c70c6510c2ad5015b21c2345311805f3 +# pip charset-normalizer @ https://files.pythonhosted.org/packages/e2/28/ffc026b26f441fc67bd21ab7f03b313ab3fe46714a14b516f931abe1a2d8/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c +# pip coverage @ https://files.pythonhosted.org/packages/89/60/f5f50f61b6332451520e6cdc2401700c48310c64bc2dd34027a47d6ab4ca/coverage-7.8.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=dc67994df9bcd7e0150a47ef41278b9e0a0ea187caba72414b71dc590b99a108 +# pip cycler @ https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl#sha256=85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30 +# pip cython @ https://files.pythonhosted.org/packages/ca/90/9fe8b93fa239b4871252274892c232415f53d5af0859c4a6ac9b1cbf9950/cython-3.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=7da069ca769903c5dee56c5f7ab47b2b7b91030eee48912630db5f4f3ec5954a +# pip docutils @ https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl#sha256=dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2 +# pip execnet @ https://files.pythonhosted.org/packages/43/09/2aea36ff60d16dd8879bdb2f5b3ee0ba8d08cbbdcdfe870e695ce3784385/execnet-2.1.1-py3-none-any.whl#sha256=26dee51f1b80cebd6d0ca8e74dd8745419761d3bef34163928cbebbdc4749fdc +# pip fonttools @ https://files.pythonhosted.org/packages/c1/46/8b46469c6edac393de1c380c7ec61922d5440f25605dfca7849e5ffff295/fonttools-4.58.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=9b8860f8cd48b345bd1df1d7be650f600f69ee971ffe338c5bd5bcb6bdb3b92c +# pip idna @ https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl#sha256=946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3 +# pip imagesize @ https://files.pythonhosted.org/packages/ff/62/85c4c919272577931d407be5ba5d71c20f0b616d31a0befe0ae45bb79abd/imagesize-1.4.1-py2.py3-none-any.whl#sha256=0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b +# pip iniconfig @ https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl#sha256=9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760 +# pip joblib @ https://files.pythonhosted.org/packages/7d/4f/1195bbac8e0c2acc5f740661631d8d750dc38d4a32b23ee5df3cde6f4e0d/joblib-1.5.1-py3-none-any.whl#sha256=4719a31f054c7d766948dcd83e9613686b27114f190f717cec7eaa2084f8a74a +# pip kiwisolver @ https://files.pythonhosted.org/packages/8f/e9/6a7d025d8da8c4931522922cd706105aa32b3291d1add8c5427cdcd66e63/kiwisolver-1.4.8-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=a5ce1e481a74b44dd5e92ff03ea0cb371ae7a0268318e202be06c8f04f4f1246 +# pip markupsafe @ https://files.pythonhosted.org/packages/0c/91/96cf928db8236f1bfab6ce15ad070dfdd02ed88261c2afafd4b43575e9e9/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396 +# pip meson @ 
https://files.pythonhosted.org/packages/46/77/726b14be352aa6911e206ca7c4d95c5be49660604dfee0bfed0fc75823e5/meson-1.8.1-py3-none-any.whl#sha256=374bbf71247e629475fc10b0bd2ef66fc418c2d8f4890572f74de0f97d0d42da +# pip networkx @ https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl#sha256=0030d386a9a06dee3565298b4a734b68589749a544acbb6c412dc9e2489ec6ec +# pip ninja @ https://files.pythonhosted.org/packages/eb/7a/455d2877fe6cf99886849c7f9755d897df32eaf3a0fba47b56e615f880f7/ninja-1.11.1.4-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl#sha256=096487995473320de7f65d622c3f1d16c3ad174797602218ca8c967f51ec38a0 +# pip numpy @ https://files.pythonhosted.org/packages/19/49/4df9123aafa7b539317bf6d342cb6d227e49f7a35b99c287a6109b13dd93/numpy-2.2.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=1bc23a79bfabc5d056d106f9befb8d50c31ced2fbc70eedb8155aec74a45798f +# pip packaging @ https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl#sha256=29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484 +# pip pillow @ https://files.pythonhosted.org/packages/13/eb/2552ecebc0b887f539111c2cd241f538b8ff5891b8903dfe672e997529be/pillow-11.2.1-cp313-cp313-manylinux_2_28_x86_64.whl#sha256=ad275964d52e2243430472fc5d2c2334b4fc3ff9c16cb0a19254e25efa03a155 +# pip pluggy @ https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl#sha256=e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746 +# pip pygments @ https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl#sha256=9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c +# pip pyparsing @ https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl#sha256=a749938e02d6fd0b59b356ca504a24982314bb090c383e3cf201c95ef7e2bfcf +# pip pytz @ https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl#sha256=5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00 +# pip roman-numerals-py @ https://files.pythonhosted.org/packages/53/97/d2cbbaa10c9b826af0e10fdf836e1bf344d9f0abb873ebc34d1f49642d3f/roman_numerals_py-3.1.0-py3-none-any.whl#sha256=9da2ad2fb670bcf24e81070ceb3be72f6c11c440d73bd579fbeca1e9f330954c +# pip six @ https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl#sha256=4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274 +# pip snowballstemmer @ https://files.pythonhosted.org/packages/c8/78/3565d011c61f5a43488987ee32b6f3f656e7f107ac2782dd57bdd7d91d9a/snowballstemmer-3.0.1-py3-none-any.whl#sha256=6cd7b3897da8d6c9ffb968a6781fa6532dce9c3618a4b127d920dab764a19064 +# pip sphinxcontrib-applehelp @ https://files.pythonhosted.org/packages/5d/85/9ebeae2f76e9e77b952f4b274c27238156eae7979c5421fba91a28f4970d/sphinxcontrib_applehelp-2.0.0-py3-none-any.whl#sha256=4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5 +# pip sphinxcontrib-devhelp @ 
https://files.pythonhosted.org/packages/35/7a/987e583882f985fe4d7323774889ec58049171828b58c2217e7f79cdf44e/sphinxcontrib_devhelp-2.0.0-py3-none-any.whl#sha256=aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2 +# pip sphinxcontrib-htmlhelp @ https://files.pythonhosted.org/packages/0a/7b/18a8c0bcec9182c05a0b3ec2a776bba4ead82750a55ff798e8d406dae604/sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl#sha256=166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8 +# pip sphinxcontrib-jsmath @ https://files.pythonhosted.org/packages/c2/42/4c8646762ee83602e3fb3fbe774c2fac12f317deb0b5dbeeedd2d3ba4b77/sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl#sha256=2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178 +# pip sphinxcontrib-qthelp @ https://files.pythonhosted.org/packages/27/83/859ecdd180cacc13b1f7e857abf8582a64552ea7a061057a6c716e790fce/sphinxcontrib_qthelp-2.0.0-py3-none-any.whl#sha256=b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb +# pip sphinxcontrib-serializinghtml @ https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl#sha256=6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331 +# pip tabulate @ https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl#sha256=024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f +# pip threadpoolctl @ https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl#sha256=43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb +# pip tzdata @ https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl#sha256=1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 +# pip urllib3 @ https://files.pythonhosted.org/packages/6b/11/cc635220681e93a0183390e26485430ca2c7b5f9d33b15c74c2861cb8091/urllib3-2.4.0-py3-none-any.whl#sha256=4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813 +# pip array-api-strict @ https://files.pythonhosted.org/packages/fe/c7/a97e26083985b49a7a54006364348cf1c26e5523850b8522a39b02b19715/array_api_strict-2.3.1-py3-none-any.whl#sha256=0ca6988be1c82d2f05b6cd44bc7e14cb390555d1455deb50f431d6d0cf468ded +# pip contourpy @ https://files.pythonhosted.org/packages/c8/65/5245ce8c548a8422236c13ffcdcdada6a2a812c361e9e0c70548bb40b661/contourpy-1.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=434f0adf84911c924519d2b08fc10491dd282b20bdd3fa8f60fd816ea0b48841 +# pip imageio @ https://files.pythonhosted.org/packages/cb/bd/b394387b598ed84d8d0fa90611a90bee0adc2021820ad5729f7ced74a8e2/imageio-2.37.0-py3-none-any.whl#sha256=11efa15b87bc7871b61590326b2d635439acc321cf7f8ce996f812543ce10eed +# pip jinja2 @ https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl#sha256=85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67 +# pip lazy-loader @ https://files.pythonhosted.org/packages/83/60/d497a310bde3f01cb805196ac61b7ad6dc5dcf8dce66634dc34364b20b4f/lazy_loader-0.4-py3-none-any.whl#sha256=342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc +# pip pyproject-metadata @ 
https://files.pythonhosted.org/packages/7e/b1/8e63033b259e0a4e40dd1ec4a9fee17718016845048b43a36ec67d62e6fe/pyproject_metadata-0.9.1-py3-none-any.whl#sha256=ee5efde548c3ed9b75a354fc319d5afd25e9585fa918a34f62f904cc731973ad +# pip pytest @ https://files.pythonhosted.org/packages/30/3d/64ad57c803f1fa1e963a7946b6e0fea4a70df53c1a7fed304586539c2bac/pytest-8.3.5-py3-none-any.whl#sha256=c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820 +# pip python-dateutil @ https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl#sha256=a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427 +# pip requests @ https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl#sha256=70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 +# pip scipy @ https://files.pythonhosted.org/packages/b5/09/c5b6734a50ad4882432b6bb7c02baf757f5b2f256041da5df242e2d7e6b6/scipy-1.15.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=c9deabd6d547aee2c9a81dee6cc96c6d7e9a9b1953f74850c179f91fdc729cb7 +# pip tifffile @ https://files.pythonhosted.org/packages/4d/77/7f7dfcf2d847c1c1c63a2d4157c480eb4c74e4aa56e844008795ff01f86d/tifffile-2025.6.1-py3-none-any.whl#sha256=ff7163f1aaea519b769a2ac77c43be69e7d83e5b5d5d6a676497399de50535e5 +# pip lightgbm @ https://files.pythonhosted.org/packages/42/86/dabda8fbcb1b00bcfb0003c3776e8ade1aa7b413dff0a2c08f457dace22f/lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl#sha256=cb19b5afea55b5b61cbb2131095f50538bd608a00655f23ad5d25ae3e3bf1c8d +# pip matplotlib @ https://files.pythonhosted.org/packages/f5/64/41c4367bcaecbc03ef0d2a3ecee58a7065d0a36ae1aa817fe573a2da66d4/matplotlib-3.10.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=a80fcccbef63302c0efd78042ea3c2436104c5b1a4d3ae20f864593696364ac7 +# pip meson-python @ https://files.pythonhosted.org/packages/28/58/66db620a8a7ccb32633de9f403fe49f1b63c68ca94e5c340ec5cceeb9821/meson_python-0.18.0-py3-none-any.whl#sha256=3b0fe051551cc238f5febb873247c0949cd60ded556efa130aa57021804868e2 +# pip pandas @ https://files.pythonhosted.org/packages/e8/31/aa8da88ca0eadbabd0a639788a6da13bb2ff6edbbb9f29aa786450a30a91/pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24 +# pip pyamg @ https://files.pythonhosted.org/packages/cd/a7/0df731cbfb09e73979a1a032fc7bc5be0eba617d798b998a0f887afe8ade/pyamg-5.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=6999b351ab969c79faacb81faa74c0fa9682feeff3954979212872a3ee40c298 +# pip pytest-cov @ https://files.pythonhosted.org/packages/28/d0/def53b4a790cfb21483016430ed828f64830dd981ebe1089971cd10cab25/pytest_cov-6.1.1-py3-none-any.whl#sha256=bddf29ed2d0ab6f4df17b4c55b0a657287db8684af9c42ea546b21b1041b3dde +# pip pytest-xdist @ https://files.pythonhosted.org/packages/0d/b2/0e802fde6f1c5b2f7ae7e9ad42b83fd4ecebac18a8a8c2f2f14e39dce6e1/pytest_xdist-3.7.0-py3-none-any.whl#sha256=7d3fbd255998265052435eb9daa4e99b62e6fb9cfb6efd1f858d4d8c0c7f0ca0 +# pip scikit-image @ https://files.pythonhosted.org/packages/cd/9b/c3da56a145f52cd61a68b8465d6a29d9503bc45bc993bb45e84371c97d94/scikit_image-0.25.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=b8abd3c805ce6944b941cfed0406d88faeb19bab3ed3d4b50187af55cf24d147 +# pip scipy-doctest @ 
https://files.pythonhosted.org/packages/c9/13/cd25d1875f3804b73fd4a4ae00e2c76e274e1e0608d79148cac251b644b1/scipy_doctest-1.8.0-py3-none-any.whl#sha256=5863208368c35486e143ce3283ab2f517a0d6b0c63d0d5f19f38a823fc82016f +# pip sphinx @ https://files.pythonhosted.org/packages/31/53/136e9eca6e0b9dc0e1962e2c908fbea2e5ac000c2a2fbd9a35797958c48b/sphinx-8.2.3-py3-none-any.whl#sha256=4405915165f13521d875a8c29c8970800a0141c14cc5416a38feca4ea5d9b9c3 +# pip numpydoc @ https://files.pythonhosted.org/packages/6c/45/56d99ba9366476cd8548527667f01869279cedb9e66b28eb4dfb27701679/numpydoc-1.8.0-py3-none-any.whl#sha256=72024c7fd5e17375dec3608a27c03303e8ad00c81292667955c6fea7a3ccf541 diff --git a/build_tools/azure/pylatest_pip_scipy_dev_environment.yml b/build_tools/azure/pylatest_pip_scipy_dev_environment.yml new file mode 100644 index 0000000000000..01709b79e3720 --- /dev/null +++ b/build_tools/azure/pylatest_pip_scipy_dev_environment.yml @@ -0,0 +1,22 @@ +# DO NOT EDIT: this file is generated from the specification found in the +# following script to centralize the configuration for CI builds: +# build_tools/update_environments_and_lock_files.py +channels: + - defaults +dependencies: + - python + - ccache + - pip + - pip: + - threadpoolctl + - pytest + - pytest-xdist + - pip + - ninja + - meson-python + - pytest-cov + - coverage + - pooch + - sphinx + - numpydoc + - python-dateutil diff --git a/build_tools/azure/pylatest_pip_scipy_dev_linux-64_conda.lock b/build_tools/azure/pylatest_pip_scipy_dev_linux-64_conda.lock new file mode 100644 index 0000000000000..a8fac4ea35b6c --- /dev/null +++ b/build_tools/azure/pylatest_pip_scipy_dev_linux-64_conda.lock @@ -0,0 +1,70 @@ +# Generated by conda-lock. +# platform: linux-64 +# input_hash: 7555819e95d879c5a5147e6431581e17ffc5d77e8a43b19c8a911821378d2521 +@EXPLICIT +https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda#c3473ff8bdb3d124ed5ff11ec380d6f9 +https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2025.2.25-h06a4308_0.conda#495015d24da8ad929e3ae2d18571016d +https://repo.anaconda.com/pkgs/main/linux-64/ld_impl_linux-64-2.40-h12ee557_0.conda#ee672b5f635340734f58d618b7bca024 +https://repo.anaconda.com/pkgs/main/linux-64/python_abi-3.13-0_cp313.conda#d4009c49dd2b54ffded7f1365b5f6505 +https://repo.anaconda.com/pkgs/main/noarch/tzdata-2025b-h04d1e81_0.conda#1d027393db3427ab22a02aa44a56f143 +https://repo.anaconda.com/pkgs/main/linux-64/libgomp-11.2.0-h1234567_1.conda#b372c0eea9b60732fdae4b817a63c8cd +https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-11.2.0-h1234567_1.conda#57623d10a70e09e1d048c2b2b6f4e2dd +https://repo.anaconda.com/pkgs/main/linux-64/_openmp_mutex-5.1-1_gnu.conda#71d281e9c2192cb3fa425655a8defb85 +https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-11.2.0-h1234567_1.conda#a87728dabf3151fb9cfa990bd2eb0464 +https://repo.anaconda.com/pkgs/main/linux-64/bzip2-1.0.8-h5eee18b_6.conda#f21a3ff51c1b271977f53ce956a69297 +https://repo.anaconda.com/pkgs/main/linux-64/expat-2.7.1-h6a678d5_0.conda#269942a9f3f943e2e5d8a2516a861f7c +https://repo.anaconda.com/pkgs/main/linux-64/libffi-3.4.4-h6a678d5_1.conda#70646cc713f0c43926cfdcfe9b695fe0 +https://repo.anaconda.com/pkgs/main/linux-64/libmpdec-4.0.0-h5eee18b_0.conda#feb10f42b1a7b523acbf85461be41a3e +https://repo.anaconda.com/pkgs/main/linux-64/libuuid-1.41.5-h5eee18b_0.conda#4a6a2354414c9080327274aa514e5299 +https://repo.anaconda.com/pkgs/main/linux-64/ncurses-6.4-h6a678d5_0.conda#5558eec6e2191741a92f832ea826251c 
+https://repo.anaconda.com/pkgs/main/linux-64/openssl-3.0.16-h5eee18b_0.conda#5875526739afa058cfa84da1fa7a2ef4 +https://repo.anaconda.com/pkgs/main/linux-64/xz-5.6.4-h5eee18b_1.conda#3581505fa450962d631bd82b8616350e +https://repo.anaconda.com/pkgs/main/linux-64/zlib-1.2.13-h5eee18b_1.conda#92e42d8310108b0a440fb2e60b2b2a25 +https://repo.anaconda.com/pkgs/main/linux-64/ccache-3.7.9-hfe4627d_0.conda#bef6fc681c273bb7bd0c67d1a591365e +https://repo.anaconda.com/pkgs/main/linux-64/readline-8.2-h5eee18b_0.conda#be42180685cce6e6b0329201d9f48efb +https://repo.anaconda.com/pkgs/main/linux-64/tk-8.6.14-h39e8969_0.conda#78dbc5e3c69143ebc037fc5d5b22e597 +https://repo.anaconda.com/pkgs/main/linux-64/sqlite-3.45.3-h5eee18b_0.conda#acf93d6aceb74d6110e20b44cc45939e +https://repo.anaconda.com/pkgs/main/linux-64/python-3.13.2-hf623796_100_cp313.conda#bf836f30ac4c16fd3d71c1aaa25da08c +https://repo.anaconda.com/pkgs/main/linux-64/setuptools-78.1.1-py313h06a4308_0.conda#8f8e1c1e3af9d2d371aaa0ee8316ae7c +https://repo.anaconda.com/pkgs/main/linux-64/wheel-0.45.1-py313h06a4308_0.conda#29057e876eedce0e37c2388c138a19f9 +https://repo.anaconda.com/pkgs/main/noarch/pip-25.1-pyhc872135_2.conda#2778327d2a700153fefe0e69438b18e1 +# pip alabaster @ https://files.pythonhosted.org/packages/7e/b3/6b4067be973ae96ba0d615946e314c5ae35f9f993eca561b356540bb0c2b/alabaster-1.0.0-py3-none-any.whl#sha256=fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b +# pip babel @ https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl#sha256=4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2 +# pip certifi @ https://files.pythonhosted.org/packages/4a/7e/3db2bd1b1f9e95f7cddca6d6e75e2f2bd9f51b1246e546d88addca0106bd/certifi-2025.4.26-py3-none-any.whl#sha256=30350364dfe371162649852c63336a15c70c6510c2ad5015b21c2345311805f3 +# pip charset-normalizer @ https://files.pythonhosted.org/packages/e2/28/ffc026b26f441fc67bd21ab7f03b313ab3fe46714a14b516f931abe1a2d8/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c +# pip coverage @ https://files.pythonhosted.org/packages/89/60/f5f50f61b6332451520e6cdc2401700c48310c64bc2dd34027a47d6ab4ca/coverage-7.8.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=dc67994df9bcd7e0150a47ef41278b9e0a0ea187caba72414b71dc590b99a108 +# pip docutils @ https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl#sha256=dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2 +# pip execnet @ https://files.pythonhosted.org/packages/43/09/2aea36ff60d16dd8879bdb2f5b3ee0ba8d08cbbdcdfe870e695ce3784385/execnet-2.1.1-py3-none-any.whl#sha256=26dee51f1b80cebd6d0ca8e74dd8745419761d3bef34163928cbebbdc4749fdc +# pip idna @ https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl#sha256=946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3 +# pip imagesize @ https://files.pythonhosted.org/packages/ff/62/85c4c919272577931d407be5ba5d71c20f0b616d31a0befe0ae45bb79abd/imagesize-1.4.1-py2.py3-none-any.whl#sha256=0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b +# pip iniconfig @ 
https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl#sha256=9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760 +# pip markupsafe @ https://files.pythonhosted.org/packages/0c/91/96cf928db8236f1bfab6ce15ad070dfdd02ed88261c2afafd4b43575e9e9/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396 +# pip meson @ https://files.pythonhosted.org/packages/46/77/726b14be352aa6911e206ca7c4d95c5be49660604dfee0bfed0fc75823e5/meson-1.8.1-py3-none-any.whl#sha256=374bbf71247e629475fc10b0bd2ef66fc418c2d8f4890572f74de0f97d0d42da +# pip ninja @ https://files.pythonhosted.org/packages/eb/7a/455d2877fe6cf99886849c7f9755d897df32eaf3a0fba47b56e615f880f7/ninja-1.11.1.4-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl#sha256=096487995473320de7f65d622c3f1d16c3ad174797602218ca8c967f51ec38a0 +# pip packaging @ https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl#sha256=29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484 +# pip platformdirs @ https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl#sha256=ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4 +# pip pluggy @ https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl#sha256=e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746 +# pip pygments @ https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl#sha256=9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c +# pip roman-numerals-py @ https://files.pythonhosted.org/packages/53/97/d2cbbaa10c9b826af0e10fdf836e1bf344d9f0abb873ebc34d1f49642d3f/roman_numerals_py-3.1.0-py3-none-any.whl#sha256=9da2ad2fb670bcf24e81070ceb3be72f6c11c440d73bd579fbeca1e9f330954c +# pip six @ https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl#sha256=4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274 +# pip snowballstemmer @ https://files.pythonhosted.org/packages/c8/78/3565d011c61f5a43488987ee32b6f3f656e7f107ac2782dd57bdd7d91d9a/snowballstemmer-3.0.1-py3-none-any.whl#sha256=6cd7b3897da8d6c9ffb968a6781fa6532dce9c3618a4b127d920dab764a19064 +# pip sphinxcontrib-applehelp @ https://files.pythonhosted.org/packages/5d/85/9ebeae2f76e9e77b952f4b274c27238156eae7979c5421fba91a28f4970d/sphinxcontrib_applehelp-2.0.0-py3-none-any.whl#sha256=4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5 +# pip sphinxcontrib-devhelp @ https://files.pythonhosted.org/packages/35/7a/987e583882f985fe4d7323774889ec58049171828b58c2217e7f79cdf44e/sphinxcontrib_devhelp-2.0.0-py3-none-any.whl#sha256=aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2 +# pip sphinxcontrib-htmlhelp @ https://files.pythonhosted.org/packages/0a/7b/18a8c0bcec9182c05a0b3ec2a776bba4ead82750a55ff798e8d406dae604/sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl#sha256=166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8 +# pip sphinxcontrib-jsmath @ 
https://files.pythonhosted.org/packages/c2/42/4c8646762ee83602e3fb3fbe774c2fac12f317deb0b5dbeeedd2d3ba4b77/sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl#sha256=2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178 +# pip sphinxcontrib-qthelp @ https://files.pythonhosted.org/packages/27/83/859ecdd180cacc13b1f7e857abf8582a64552ea7a061057a6c716e790fce/sphinxcontrib_qthelp-2.0.0-py3-none-any.whl#sha256=b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb +# pip sphinxcontrib-serializinghtml @ https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl#sha256=6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331 +# pip tabulate @ https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl#sha256=024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f +# pip threadpoolctl @ https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl#sha256=43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb +# pip urllib3 @ https://files.pythonhosted.org/packages/6b/11/cc635220681e93a0183390e26485430ca2c7b5f9d33b15c74c2861cb8091/urllib3-2.4.0-py3-none-any.whl#sha256=4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813 +# pip jinja2 @ https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl#sha256=85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67 +# pip pyproject-metadata @ https://files.pythonhosted.org/packages/7e/b1/8e63033b259e0a4e40dd1ec4a9fee17718016845048b43a36ec67d62e6fe/pyproject_metadata-0.9.1-py3-none-any.whl#sha256=ee5efde548c3ed9b75a354fc319d5afd25e9585fa918a34f62f904cc731973ad +# pip pytest @ https://files.pythonhosted.org/packages/30/3d/64ad57c803f1fa1e963a7946b6e0fea4a70df53c1a7fed304586539c2bac/pytest-8.3.5-py3-none-any.whl#sha256=c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820 +# pip python-dateutil @ https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl#sha256=a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427 +# pip requests @ https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl#sha256=70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 +# pip meson-python @ https://files.pythonhosted.org/packages/28/58/66db620a8a7ccb32633de9f403fe49f1b63c68ca94e5c340ec5cceeb9821/meson_python-0.18.0-py3-none-any.whl#sha256=3b0fe051551cc238f5febb873247c0949cd60ded556efa130aa57021804868e2 +# pip pooch @ https://files.pythonhosted.org/packages/a8/87/77cc11c7a9ea9fd05503def69e3d18605852cd0d4b0d3b8f15bbeb3ef1d1/pooch-1.8.2-py3-none-any.whl#sha256=3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 +# pip pytest-cov @ https://files.pythonhosted.org/packages/28/d0/def53b4a790cfb21483016430ed828f64830dd981ebe1089971cd10cab25/pytest_cov-6.1.1-py3-none-any.whl#sha256=bddf29ed2d0ab6f4df17b4c55b0a657287db8684af9c42ea546b21b1041b3dde +# pip pytest-xdist @ 
https://files.pythonhosted.org/packages/0d/b2/0e802fde6f1c5b2f7ae7e9ad42b83fd4ecebac18a8a8c2f2f14e39dce6e1/pytest_xdist-3.7.0-py3-none-any.whl#sha256=7d3fbd255998265052435eb9daa4e99b62e6fb9cfb6efd1f858d4d8c0c7f0ca0 +# pip sphinx @ https://files.pythonhosted.org/packages/31/53/136e9eca6e0b9dc0e1962e2c908fbea2e5ac000c2a2fbd9a35797958c48b/sphinx-8.2.3-py3-none-any.whl#sha256=4405915165f13521d875a8c29c8970800a0141c14cc5416a38feca4ea5d9b9c3 +# pip numpydoc @ https://files.pythonhosted.org/packages/6c/45/56d99ba9366476cd8548527667f01869279cedb9e66b28eb4dfb27701679/numpydoc-1.8.0-py3-none-any.whl#sha256=72024c7fd5e17375dec3608a27c03303e8ad00c81292667955c6fea7a3ccf541 diff --git a/build_tools/azure/pymin_conda_forge_mkl_environment.yml b/build_tools/azure/pymin_conda_forge_mkl_environment.yml new file mode 100644 index 0000000000000..fe6ce91950e4a --- /dev/null +++ b/build_tools/azure/pymin_conda_forge_mkl_environment.yml @@ -0,0 +1,24 @@ +# DO NOT EDIT: this file is generated from the specification found in the +# following script to centralize the configuration for CI builds: +# build_tools/update_environments_and_lock_files.py +channels: + - conda-forge +dependencies: + - python=3.10 + - numpy + - blas[build=mkl] + - scipy + - cython + - joblib + - threadpoolctl + - matplotlib + - pytest + - pytest-xdist + - pillow + - pip + - ninja + - meson-python + - pytest-cov + - coverage + - wheel + - pip diff --git a/build_tools/azure/pymin_conda_forge_mkl_win-64_conda.lock b/build_tools/azure/pymin_conda_forge_mkl_win-64_conda.lock new file mode 100644 index 0000000000000..9e7e414a90156 --- /dev/null +++ b/build_tools/azure/pymin_conda_forge_mkl_win-64_conda.lock @@ -0,0 +1,118 @@ +# Generated by conda-lock. +# platform: win-64 +# input_hash: cc5e2a711eb32773dc46fe159e1c3fe14f4fd07565fc8d3dedf2d748d4f2f694 +@EXPLICIT +https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2#0c96522c6bdaed4b1566d11387caaf45 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2#34893075a5c9e55cdafac56607368fc6 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2#4d59c254e01d9cde7957100457e2d5fb +https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda#49023d73832ef61042f6a237cb2687e7 +https://conda.anaconda.org/conda-forge/win-64/intel-openmp-2024.2.1-h57928b3_1083.conda#2d89243bfb53652c182a7c73182cce4f +https://conda.anaconda.org/conda-forge/win-64/mkl-include-2024.2.2-h66d3029_15.conda#e2f516189b44b6e042199d13e7015361 +https://conda.anaconda.org/conda-forge/noarch/python_abi-3.10-7_cp310.conda#44e871cba2b162368476a84b8d040b6c +https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda#4222072737ccff51314b5ece9c7d6f5a +https://conda.anaconda.org/conda-forge/win-64/ucrt-10.0.22621.0-h57928b3_1.conda#6797b005cd0f439c4c5c9ac565783700 +https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.4.26-h4c7d964_0.conda#23c7fd5062b48d8294fc7f61bf157fba +https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-0.tar.bz2#f766549260d6815b0c52253f1fb1bb29 +https://conda.anaconda.org/conda-forge/win-64/libwinpthread-12.0.0.r4.gg4f2fc60ca-h57928b3_9.conda#08bfa5da6e242025304b206d152479ef +https://conda.anaconda.org/conda-forge/win-64/vc14_runtime-14.42.34438-hfd919c2_26.conda#91651a36d31aa20c7ba36299fb7068f4 +https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2#fee5683a3f04bd15cbd8318b096a27ab 
+https://conda.anaconda.org/conda-forge/win-64/libgomp-15.1.0-h1383e82_2.conda#5fbacaa9b41e294a6966602205b99747 +https://conda.anaconda.org/conda-forge/win-64/vc-14.3-h2b53caa_26.conda#d3f0381e38093bde620a8d85f266ae55 +https://conda.anaconda.org/conda-forge/win-64/_openmp_mutex-4.5-2_gnu.conda#37e16618af5c4851a3f3d66dd0e11141 +https://conda.anaconda.org/conda-forge/win-64/bzip2-1.0.8-h2466b09_7.conda#276e7ffe9ffe39688abc665ef0f45596 +https://conda.anaconda.org/conda-forge/win-64/double-conversion-3.3.1-he0c23c2_0.conda#e9a1402439c18a4e3c7a52e4246e9e1c +https://conda.anaconda.org/conda-forge/win-64/graphite2-1.3.13-h63175ca_1003.conda#3194499ee7d1a67404a87d0eefdd92c6 +https://conda.anaconda.org/conda-forge/win-64/icu-75.1-he0c23c2_0.conda#8579b6bb8d18be7c0b27fb08adeeeb40 +https://conda.anaconda.org/conda-forge/win-64/lerc-4.0.0-h6470a55_1.conda#c1b81da6d29a14b542da14a36c9fbf3f +https://conda.anaconda.org/conda-forge/win-64/libbrotlicommon-1.1.0-h2466b09_2.conda#f7dc9a8f21d74eab46456df301da2972 +https://conda.anaconda.org/conda-forge/win-64/libdeflate-1.24-h76ddb4d_0.conda#08d988e266c6ae77e03d164b83786dc4 +https://conda.anaconda.org/conda-forge/win-64/libexpat-2.7.0-he0c23c2_0.conda#b6f5352fdb525662f4169a0431d2dd7a +https://conda.anaconda.org/conda-forge/win-64/libffi-3.4.6-h537db12_1.conda#85d8fa5e55ed8f93f874b3b23ed54ec6 +https://conda.anaconda.org/conda-forge/win-64/libiconv-1.18-h135ad9c_1.conda#21fc5dba2cbcd8e5e26ff976a312122c +https://conda.anaconda.org/conda-forge/win-64/libjpeg-turbo-3.1.0-h2466b09_0.conda#7c51d27540389de84852daa1cdb9c63c +https://conda.anaconda.org/conda-forge/win-64/liblzma-5.8.1-h2466b09_1.conda#14a1042c163181e143a7522dfb8ad6ab +https://conda.anaconda.org/conda-forge/win-64/libsqlite-3.50.0-h67fdade_0.conda#92b11b0b2120d563caa1629928122cee +https://conda.anaconda.org/conda-forge/win-64/libwebp-base-1.5.0-h3b0e114_0.conda#33f7313967072c6e6d8f865f5493c7ae +https://conda.anaconda.org/conda-forge/win-64/libzlib-1.3.1-h2466b09_2.conda#41fbfac52c601159df6c01f875de31b9 +https://conda.anaconda.org/conda-forge/win-64/ninja-1.12.1-hc790b64_1.conda#3974c522f3248d4a93e6940c463d2de7 +https://conda.anaconda.org/conda-forge/win-64/openssl-3.5.0-ha4e3fda_1.conda#72c07e46b6766bb057018a9a74861b89 +https://conda.anaconda.org/conda-forge/win-64/pixman-0.46.0-had0cd8c_0.conda#01617534ef71b5385ebba940a6d6150d +https://conda.anaconda.org/conda-forge/win-64/qhull-2020.2-hc790b64_5.conda#854fbdff64b572b5c0b470f334d34c11 +https://conda.anaconda.org/conda-forge/win-64/tk-8.6.13-h2c6b04d_2.conda#ebd0e761de9aa879a51d22cc721bd095 +https://conda.anaconda.org/conda-forge/win-64/krb5-1.21.3-hdf4eb48_0.conda#31aec030344e962fbd7dbbbbd68e60a9 +https://conda.anaconda.org/conda-forge/win-64/libbrotlidec-1.1.0-h2466b09_2.conda#9bae75ce723fa34e98e239d21d752a7e +https://conda.anaconda.org/conda-forge/win-64/libbrotlienc-1.1.0-h2466b09_2.conda#85741a24d97954a991e55e34bc55990b +https://conda.anaconda.org/conda-forge/win-64/libgcc-15.1.0-h1383e82_2.conda#9bedb24480136bfeb81ebc81d4285e70 +https://conda.anaconda.org/conda-forge/win-64/libintl-0.22.5-h5728263_3.conda#2cf0cf76cc15d360dfa2f17fd6cf9772 +https://conda.anaconda.org/conda-forge/win-64/libpng-1.6.47-h7a4582a_0.conda#ad620e92b82d2948bc019e029c574ebb +https://conda.anaconda.org/conda-forge/win-64/libxml2-2.13.8-h442d1da_0.conda#833c2dbc1a5020007b520b044c713ed3 +https://conda.anaconda.org/conda-forge/win-64/pcre2-10.45-h99c9b8b_0.conda#f4c483274001678e129f5cbaf3a8d765 
+https://conda.anaconda.org/conda-forge/win-64/python-3.10.17-h8c5b53a_0_cpython.conda#0c59918f056ab2e9c7bb45970d32b2ea +https://conda.anaconda.org/conda-forge/win-64/zstd-1.5.7-hbeecb71_2.conda#21f56217d6125fb30c3c3f10c786d751 +https://conda.anaconda.org/conda-forge/win-64/brotli-bin-1.1.0-h2466b09_2.conda#d22534a9be5771fc58eb7564947f669d +https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda#962b9857ee8e7018c22f2776ffa0b2d7 +https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_1.conda#44600c4667a319d67dbe0681fc0bc833 +https://conda.anaconda.org/conda-forge/win-64/cython-3.1.1-py310h6bd2d47_1.conda#165131d296d24f798fa76a26694d4565 +https://conda.anaconda.org/conda-forge/noarch/execnet-2.1.1-pyhd8ed1ab_1.conda#a71efeae2c160f6789900ba2631a2c90 +https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_1.conda#6837f3eff7dcea42ecd714ce1ac2b108 +https://conda.anaconda.org/conda-forge/win-64/kiwisolver-1.4.7-py310hc19bc0b_0.conda#50d96539497fc7493cbe469fbb6b8b6e +https://conda.anaconda.org/conda-forge/win-64/libclang13-20.1.6-default_h6e92b77_0.conda#3920536319b052a9a49639e02fda2db7 +https://conda.anaconda.org/conda-forge/win-64/libfreetype6-2.13.3-h0b5ce68_1.conda#a84b7d1a13060a9372bea961a8131dbc +https://conda.anaconda.org/conda-forge/win-64/libglib-2.84.2-hbc94333_0.conda#fee05801cc5db97bec20a5e78fb3905b +https://conda.anaconda.org/conda-forge/win-64/libhwloc-2.11.2-default_ha69328c_1001.conda#b87a0ac5ab6495d8225db5dc72dd21cd +https://conda.anaconda.org/conda-forge/win-64/libtiff-4.7.0-h05922d8_5.conda#75370aba951b47ec3b5bfe689f1bcf7f +https://conda.anaconda.org/conda-forge/win-64/libxslt-1.1.39-h3df6e99_0.conda#279ee338c9b34871d578cb3c7aa68f70 +https://conda.anaconda.org/conda-forge/noarch/meson-1.8.1-pyhe01879c_0.conda#f3cccd9a6ce5331ae33f69ade5529162 +https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyh9f0ad1d_0.tar.bz2#2ba8498c1018c1e9c61eb99b973dfe19 +https://conda.anaconda.org/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda#58335b26c38bf4a20f399384c33cbcf9 +https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhd8ed1ab_0.conda#7da7ccd349dbf6487a7778579d2bb971 +https://conda.anaconda.org/conda-forge/win-64/pthread-stubs-0.4-h0e40799_1002.conda#3c8f2573569bb816483e5cf57efbbe29 +https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.2.3-pyhd8ed1ab_1.conda#513d3c262ee49b54a8fec85c5bc99764 +https://conda.anaconda.org/conda-forge/noarch/setuptools-80.9.0-pyhff2d567_0.conda#4de79c071274a53dcaf2a8c749d1499e +https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhd8ed1ab_0.conda#a451d576819089b0d672f18768be0f65 +https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-3.6.0-pyhecae5ae_0.conda#9d64911b31d57ca443e9f1e36b04385f +https://conda.anaconda.org/conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_1.conda#b0dd904de08b7db706167240bf37b164 +https://conda.anaconda.org/conda-forge/noarch/tomli-2.2.1-pyhd8ed1ab_1.conda#ac944244f1fed2eb49bae07193ae8215 +https://conda.anaconda.org/conda-forge/win-64/tornado-6.5.1-py310ha8f682b_0.conda#4c8f599990e386f3a0aba3f3bd8608da +https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.13.2-pyh29332c3_0.conda#83fc6ae00127671e301c9f44254c31b8 +https://conda.anaconda.org/conda-forge/win-64/unicodedata2-16.0.0-py310ha8f682b_0.conda#b28aead44c6e19a1fbba7752aa242b34 +https://conda.anaconda.org/conda-forge/noarch/wheel-0.45.1-pyhd8ed1ab_1.conda#75cb7132eb58d97896e173ef12ac9986 
+https://conda.anaconda.org/conda-forge/win-64/xorg-libxau-1.0.12-h0e40799_0.conda#2ffbfae4548098297c033228256eb96e +https://conda.anaconda.org/conda-forge/win-64/xorg-libxdmcp-1.1.5-h0e40799_0.conda#8393c0f7e7870b4eb45553326f81f0ff +https://conda.anaconda.org/conda-forge/win-64/brotli-1.1.0-h2466b09_2.conda#378f1c9421775dfe644731cb121c8979 +https://conda.anaconda.org/conda-forge/win-64/coverage-7.8.2-py310h38315fa_0.conda#5e09090744ab0b70b2882bc415c0d5ad +https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.0-pyhd8ed1ab_0.conda#72e42d28960d875c7654614f8b50939a +https://conda.anaconda.org/conda-forge/noarch/joblib-1.5.1-pyhd8ed1ab_0.conda#fb1c14694de51a476ce8636d92b6f42c +https://conda.anaconda.org/conda-forge/win-64/lcms2-2.17-hbcf6048_0.conda#3538827f77b82a837fa681a4579e37a1 +https://conda.anaconda.org/conda-forge/win-64/libfreetype-2.13.3-h57928b3_1.conda#410ba2c8e7bdb278dfbb5d40220e39d2 +https://conda.anaconda.org/conda-forge/win-64/libxcb-1.17.0-h0e4246c_0.conda#a69bbf778a462da324489976c84cfc8c +https://conda.anaconda.org/conda-forge/win-64/openjpeg-2.5.3-h4d64b90_0.conda#fc050366dd0b8313eb797ed1ffef3a29 +https://conda.anaconda.org/conda-forge/noarch/pip-25.1.1-pyh8b19718_0.conda#32d0781ace05105cc99af55d36cbec7c +https://conda.anaconda.org/conda-forge/noarch/pyproject-metadata-0.9.1-pyhd8ed1ab_0.conda#22ae7c6ea81e0c8661ef32168dda929b +https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhff2d567_1.conda#5ba79d7c71f03c678c8ead841f347d6e +https://conda.anaconda.org/conda-forge/win-64/tbb-2021.13.0-h62715c5_1.conda#9190dd0a23d925f7602f9628b3aed511 +https://conda.anaconda.org/conda-forge/win-64/fonttools-4.58.1-py310h38315fa_0.conda#76a9c04ac1c23cee8b00733eb942f8e5 +https://conda.anaconda.org/conda-forge/win-64/freetype-2.13.3-h57928b3_1.conda#633504fe3f96031192e40e3e6c18ef06 +https://conda.anaconda.org/conda-forge/noarch/meson-python-0.18.0-pyh70fd9c4_0.conda#576c04b9d9f8e45285fb4d9452c26133 +https://conda.anaconda.org/conda-forge/win-64/mkl-2024.2.2-h66d3029_15.conda#302dff2807f2927b3e9e0d19d60121de +https://conda.anaconda.org/conda-forge/win-64/pillow-11.2.1-py310h9595edc_0.conda#33d0663d469cc146b5fc68587348f450 +https://conda.anaconda.org/conda-forge/noarch/pytest-8.3.5-pyhd8ed1ab_0.conda#c3c9316209dec74a705a36797970c6be +https://conda.anaconda.org/conda-forge/win-64/fontconfig-2.15.0-h765892d_1.conda#9bb0026a2131b09404c59c4290c697cd +https://conda.anaconda.org/conda-forge/win-64/libblas-3.9.0-31_h641d27c_mkl.conda#d05563c577fe2f37693a554b3f271e8f +https://conda.anaconda.org/conda-forge/win-64/mkl-devel-2024.2.2-h57928b3_15.conda#a85f53093da069c7c657f090e388f3ef +https://conda.anaconda.org/conda-forge/noarch/pytest-cov-6.1.1-pyhd8ed1ab_0.conda#1e35d8f975bc0e984a19819aa91c440a +https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.7.0-pyhd8ed1ab_0.conda#15353a2a0ea6dfefaa52fc5ab5b98f41 +https://conda.anaconda.org/conda-forge/win-64/cairo-1.18.4-h5782bbf_0.conda#20e32ced54300292aff690a69c5e7b97 +https://conda.anaconda.org/conda-forge/win-64/libcblas-3.9.0-31_h5e41251_mkl.conda#43c100b94ad2607382b0cf0f3a6b0bf3 +https://conda.anaconda.org/conda-forge/win-64/liblapack-3.9.0-31_h1aa476e_mkl.conda#40b47ee720a185289760960fc6185750 +https://conda.anaconda.org/conda-forge/win-64/harfbuzz-11.2.1-h8796e6f_0.conda#bccea58fbf7910ce868b084f27ffe8bd +https://conda.anaconda.org/conda-forge/win-64/liblapacke-3.9.0-31_h845c4fa_mkl.conda#003a2041cb07a7cf698f48dd26301273 
+https://conda.anaconda.org/conda-forge/win-64/numpy-2.2.6-py310h4987827_0.conda#d2596785ac2cf5bab04e2ee9e5d04041 +https://conda.anaconda.org/conda-forge/win-64/blas-devel-3.9.0-31_hfb1a452_mkl.conda#0deeb3d9d6f0e56393c55ef382899010 +https://conda.anaconda.org/conda-forge/win-64/contourpy-1.3.2-py310hc19bc0b_0.conda#039416813b5290e7d100a05bb4326110 +https://conda.anaconda.org/conda-forge/win-64/qt6-main-6.9.0-h02ddd7d_3.conda#8aeebdf27e439648236c3eb856ce7777 +https://conda.anaconda.org/conda-forge/win-64/scipy-1.15.2-py310h15c175c_0.conda#81798168111d1021e3d815217c444418 +https://conda.anaconda.org/conda-forge/win-64/blas-2.131-mkl.conda#1842bfaa4e349875c47bde1d9871bda6 +https://conda.anaconda.org/conda-forge/win-64/matplotlib-base-3.10.3-py310h37e0a56_0.conda#de9ddae6f97b78860c256de480ea1a84 +https://conda.anaconda.org/conda-forge/win-64/pyside6-6.9.0-py310hc1b6536_0.conda#e90c8d8a817b5d63b7785d7d18c99ae0 +https://conda.anaconda.org/conda-forge/win-64/matplotlib-3.10.3-py310h5588dad_0.conda#103adee33db124a0263d0b4551e232e3 diff --git a/build_tools/azure/pymin_conda_forge_openblas_min_dependencies_environment.yml b/build_tools/azure/pymin_conda_forge_openblas_min_dependencies_environment.yml new file mode 100644 index 0000000000000..a179c55fed993 --- /dev/null +++ b/build_tools/azure/pymin_conda_forge_openblas_min_dependencies_environment.yml @@ -0,0 +1,26 @@ +# DO NOT EDIT: this file is generated from the specification found in the +# following script to centralize the configuration for CI builds: +# build_tools/update_environments_and_lock_files.py +channels: + - conda-forge +dependencies: + - python=3.10 + - numpy=1.22.0 # min + - blas[build=openblas] + - scipy=1.8.0 # min + - cython=3.0.10 # min + - joblib=1.2.0 # min + - threadpoolctl=3.1.0 # min + - matplotlib=3.5.0 # min + - pandas=1.4.0 # min + - pyamg=4.2.1 # min + - pytest + - pytest-xdist + - pillow + - pip + - ninja + - meson-python=0.16.0 # min + - pytest-cov + - coverage + - ccache + - polars=0.20.30 # min diff --git a/build_tools/azure/pymin_conda_forge_openblas_min_dependencies_linux-64_conda.lock b/build_tools/azure/pymin_conda_forge_openblas_min_dependencies_linux-64_conda.lock new file mode 100644 index 0000000000000..f55381fb64f3f --- /dev/null +++ b/build_tools/azure/pymin_conda_forge_openblas_min_dependencies_linux-64_conda.lock @@ -0,0 +1,189 @@ +# Generated by conda-lock. 
+# platform: linux-64 +# input_hash: 41111e5656d9d33f83f1160f643ec4ab314aa8e179923dbe1350c87b0ac2f400 +@EXPLICIT +https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2#0c96522c6bdaed4b1566d11387caaf45 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2#34893075a5c9e55cdafac56607368fc6 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2#4d59c254e01d9cde7957100457e2d5fb +https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda#49023d73832ef61042f6a237cb2687e7 +https://conda.anaconda.org/conda-forge/noarch/python_abi-3.10-7_cp310.conda#44e871cba2b162368476a84b8d040b6c +https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda#4222072737ccff51314b5ece9c7d6f5a +https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.4.26-hbd8a1cb_0.conda#95db94f75ba080a22eb623590993167b +https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-0.tar.bz2#f766549260d6815b0c52253f1fb1bb29 +https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_4.conda#01f8d123c96816249efd255a31ad7712 +https://conda.anaconda.org/conda-forge/linux-64/libglvnd-1.7.0-ha4b6fd6_2.conda#434ca7e50e40f4918ab701e3facd59a0 +https://conda.anaconda.org/conda-forge/linux-64/llvm-openmp-20.1.6-h024ca30_0.conda#e4ece7ed81e43ae97a3b58ac4230c3c5 +https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-3_kmp_llvm.conda#ee5c2118262e30b972bc0b4db8ef0ba5 +https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2#fee5683a3f04bd15cbd8318b096a27ab +https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda#c151d5eb730e9b7480e6d48c0fc44048 +https://conda.anaconda.org/conda-forge/linux-64/libopengl-1.7.0-ha4b6fd6_2.conda#7df50d44d4a14d6c31a2c54f2cd92157 +https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_2.conda#ea8ac52380885ed41c1baa8f1d6d2b93 +https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.14-hb9d3cd8_0.conda#76df83c2a9035c54df5d04ff81bcc02d +https://conda.anaconda.org/conda-forge/linux-64/gettext-tools-0.24.1-h5888daf_0.conda#d54305672f0361c2f3886750e7165b5f +https://conda.anaconda.org/conda-forge/linux-64/libbrotlicommon-1.1.0-hb9d3cd8_2.conda#41b599ed2b02abcfdd84302bff174b23 +https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.24-h86f0d12_0.conda#64f0c503da58ec25ebd359e4d990afa8 +https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.0-h5888daf_0.conda#db0bfbe7dd197b68ad5f30333bae6ce0 +https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda#ede4673863426c0883c0063d853bbd85 +https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_2.conda#ddca86c7040dd0e73b2b69bd7833d225 +https://conda.anaconda.org/conda-forge/linux-64/libgettextpo-0.24.1-h5888daf_0.conda#2ee6d71b72f75d50581f2f68e965efdb +https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.1.0-hcea5267_2.conda#01de444988ed960031dbe84cf4f9b1fc +https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h4ce23a2_1.conda#e796ff8ddc598affdf7c173d6145f087 +https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.0-hb9d3cd8_0.conda#9fa334557db9f63da6c9285fd2a48638 +https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_1.conda#a76fd702c93cd2dfd89eff30a5fd45a8 +https://conda.anaconda.org/conda-forge/linux-64/libntlm-1.8-hb9d3cd8_0.conda#7c7927b404672409d9917d49bff5f2d6 
+https://conda.anaconda.org/conda-forge/linux-64/libogg-1.3.5-hd0c01bc_1.conda#68e52064ed3897463c0e958ab5c8f91b +https://conda.anaconda.org/conda-forge/linux-64/libopus-1.5.2-hd0c01bc_0.conda#b64523fb87ac6f87f0790f324ad43046 +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.1.0-h8f9b012_2.conda#1cb1c67961f6dd257eae9e9691b341aa +https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.5.0-h851e524_0.conda#63f790534398730f59e1b899c3644d4a +https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda#edb0dca6bc32e4f4789199455a1dbeb8 +https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda#47e340acb35de30501a76c7c799c41d7 +https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.0-h7b32b05_1.conda#de356753cfdbffcde5bb1e86e3aa6cd0 +https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-hb9d3cd8_1002.conda#b3c17d95b5a10c6e64a21fa17573e70e +https://conda.anaconda.org/conda-forge/linux-64/xorg-libice-1.1.2-hb9d3cd8_0.conda#fb901ff28063514abb6046c9ec2c4a45 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb9d3cd8_0.conda#f6ebe2cb3f82ba6c057dde5d9debe4f7 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb9d3cd8_0.conda#8035c64cb77ed555e3f150b7b3972480 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxshmfence-1.3.3-hb9d3cd8_0.conda#9a809ce9f65460195777f2f2116bae02 +https://conda.anaconda.org/conda-forge/linux-64/attr-2.5.1-h166bdaf_1.tar.bz2#d9c69a24ad678ffce24c6543a0176b00 +https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda#62ee74e96c5ebb0af99386de58cf9553 +https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2#30186d27e2c9fa62b45fb1476b7200e3 +https://conda.anaconda.org/conda-forge/linux-64/lame-3.100-h166bdaf_1003.tar.bz2#a8832b479f93521a9e7b5b743803be51 +https://conda.anaconda.org/conda-forge/linux-64/lerc-4.0.0-h0aef613_1.conda#9344155d33912347b37f0ae6c410a835 +https://conda.anaconda.org/conda-forge/linux-64/libasprintf-0.24.1-h8e693c7_0.conda#57566a81dd1e5aa3d98ac7582e8bfe03 +https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.1.0-hb9d3cd8_2.conda#9566f0bd264fbd463002e759b8a82401 +https://conda.anaconda.org/conda-forge/linux-64/libbrotlienc-1.1.0-hb9d3cd8_2.conda#06f70867945ea6a84d35836af780f1de +https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda#c277e0a4d549b03ac1e9d6cbbe3d017b +https://conda.anaconda.org/conda-forge/linux-64/libevent-2.1.12-hf998b51_1.conda#a1cfcc585f0c42bf8d5546bb1dfb668d +https://conda.anaconda.org/conda-forge/linux-64/libgettextpo-devel-0.24.1-h5888daf_0.conda#8f04c7aae6a46503bc36d1ed5abc8c7c +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.1.0-h69a702a_2.conda#f92e6e0a3c0c0c85561ef61aa59d555d +https://conda.anaconda.org/conda-forge/linux-64/libgpg-error-1.55-h3f2d84a_0.conda#2bd47db5807daade8500ed7ca4c512a4 +https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda#30fd6e37fe21f86f4bd26d6ee73eeec7 +https://conda.anaconda.org/conda-forge/linux-64/libpciaccess-0.18-hd590300_0.conda#48f4330bfcd959c3cfb704d424903c82 +https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.47-h943b412_0.conda#55199e2ae2c3651f6f9b2a447b47bdc9 +https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.0-hee588c1_0.conda#71888e92098d0f8c41b09a671ad289bc +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.1.0-h4852527_2.conda#9d2072af184b5caa29492bf2344597bb 
+https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b +https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.17.0-h8a09558_0.conda#92ed62436b625154323d40d5f2f11dd7 +https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda#5aa797f8787fe7a17d1b0821485b5adc +https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.10.0-h5888daf_1.conda#9de5350a85c4a20c685259b889aa6393 +https://conda.anaconda.org/conda-forge/linux-64/mpg123-1.32.9-hc50e24c_0.conda#c7f302fd11eeb0987a6a5e1f3aed6a21 +https://conda.anaconda.org/conda-forge/linux-64/ninja-1.12.1-hff21bea_1.conda#2322531904f27501ee19847b87ba7c64 +https://conda.anaconda.org/conda-forge/linux-64/nspr-4.36-h5888daf_0.conda#de9cd5bca9e4918527b9b72b6e2e1409 +https://conda.anaconda.org/conda-forge/linux-64/pixman-0.46.0-h29eaf8c_0.conda#d2f1c87d4416d1e7344cf92b1aaee1c4 +https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda#283b96675859b20a825f8fa30f311446 +https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda#a0116df4f4ed05c303811a837d5b39d8 +https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb8e6e7a_2.conda#6432cb5d4ac0046c3ac0a8a0f95842f9 +https://conda.anaconda.org/conda-forge/linux-64/brotli-bin-1.1.0-hb9d3cd8_2.conda#c63b5e52939e795ba8d26e35d767a843 +https://conda.anaconda.org/conda-forge/linux-64/graphite2-1.3.13-h59595ed_1003.conda#f87c7b7c2cb45f323ffbce941c78ab7c +https://conda.anaconda.org/conda-forge/linux-64/icu-75.1-he02047a_0.conda#8b189310083baabfb622af68fd9d3ae3 +https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.3-h659f571_0.conda#3f43953b7d3fb3aaa1d0d0723d91e368 +https://conda.anaconda.org/conda-forge/linux-64/libasprintf-devel-0.24.1-h8e693c7_0.conda#8f66ed2e34507b7ae44afa31c3e4ec79 +https://conda.anaconda.org/conda-forge/linux-64/libcap-2.75-h39aace5_0.conda#c44c16d6976d2aebbd65894d7741e67e +https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.124-hb9d3cd8_0.conda#8bc89311041d7fcb510238cf0848ccae +https://conda.anaconda.org/conda-forge/linux-64/libfreetype6-2.13.3-h48d6fc4_1.conda#3c255be50a506c50765a93a6644f32fe +https://conda.anaconda.org/conda-forge/linux-64/libgcrypt-lib-1.11.1-hb9d3cd8_0.conda#8504a291085c9fb809b66cabd5834307 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-15.1.0-h69a702a_2.conda#a483a87b71e974bb75d1b9413d4436dd +https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.7.0-hf01ce69_5.conda#e79a094918988bb1807462cd42c83962 +https://conda.anaconda.org/conda-forge/linux-64/libvorbis-1.3.7-h9c3ff4c_0.tar.bz2#309dec04b70a3cc0f1e84a4013683bc0 +https://conda.anaconda.org/conda-forge/linux-64/nss-3.112-h159eef7_0.conda#688a8bc02e57e6b741a040c84e931a7d +https://conda.anaconda.org/conda-forge/linux-64/pcre2-10.45-hc749103_0.conda#b90bece58b4c2bf25969b70f3be42d25 +https://conda.anaconda.org/conda-forge/linux-64/python-3.10.17-hd6af730_0_cpython.conda#7bb89638dae9ce1b8e051d0b721e83c2 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-0.4.1-hb711507_2.conda#8637c3e5821654d0edf97e2b0404b443 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-keysyms-0.4.1-hb711507_0.conda#ad748ccca349aec3e91743e08b5e2b50 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-renderutil-0.3.10-hb711507_0.conda#0e0cbe0564d03a99afd5fd7b362feecd +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-wm-0.4.2-hb711507_0.conda#608e0ef8256b81d04456e8d211eee3e8 
+https://conda.anaconda.org/conda-forge/linux-64/xorg-libsm-1.2.6-he73a12e_0.conda#1c74ff8c35dcadf952a16f752ca5aa49 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.12-h4f16b4b_0.conda#db038ce880f100acc74dba10302b5630 +https://conda.anaconda.org/conda-forge/linux-64/brotli-1.1.0-hb9d3cd8_2.conda#98514fe74548d768907ce7a13f680e8f +https://conda.anaconda.org/conda-forge/noarch/certifi-2025.4.26-pyhd8ed1ab_0.conda#c33eeaaa33f45031be34cda513df39b6 +https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda#962b9857ee8e7018c22f2776ffa0b2d7 +https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_1.conda#44600c4667a319d67dbe0681fc0bc833 +https://conda.anaconda.org/conda-forge/linux-64/cyrus-sasl-2.1.27-h54b06d7_7.conda#dce22f70b4e5a407ce88f2be046f4ceb +https://conda.anaconda.org/conda-forge/linux-64/cython-3.0.10-py310hc6cd4ac_0.conda#bd1d71ee240be36f1d85c86177d6964f +https://conda.anaconda.org/conda-forge/noarch/execnet-2.1.1-pyhd8ed1ab_1.conda#a71efeae2c160f6789900ba2631a2c90 +https://conda.anaconda.org/conda-forge/linux-64/gettext-0.24.1-h5888daf_0.conda#c63e7590d4d6f4c85721040ed8b12888 +https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_1.conda#6837f3eff7dcea42ecd714ce1ac2b108 +https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.4.7-py310h3788b33_0.conda#4186d9b4d004b0fe0de6aa62496fb48a +https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.17-h717163a_0.conda#000e85703f0fd9594c81710dd5066471 +https://conda.anaconda.org/conda-forge/linux-64/libcups-2.3.3-h4637d8d_4.conda#d4529f4dff3057982a7617c7ac58fde3 +https://conda.anaconda.org/conda-forge/linux-64/libfreetype-2.13.3-ha770c72_1.conda#51f5be229d83ecd401fb369ab96ae669 +https://conda.anaconda.org/conda-forge/linux-64/libglib-2.84.2-h3618099_0.conda#072ab14a02164b7c0c089055368ff776 +https://conda.anaconda.org/conda-forge/linux-64/libglx-1.7.0-ha4b6fd6_2.conda#c8013e438185f33b13814c5c488acd5c +https://conda.anaconda.org/conda-forge/linux-64/libhiredis-1.0.2-h2cc385e_0.tar.bz2#b34907d3a81a3cd8095ee83d174c074a +https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.25-pthreads_h413a1c8_0.conda#d172b34a443b95f86089e8229ddc9a17 +https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.4-h4e0b6ca_1.conda#04bcf3055e51f8dde6fab9672fb9fca0 +https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.13.8-h4bc477f_0.conda#14dbe05b929e329dbaa6f2d0aa19466d +https://conda.anaconda.org/conda-forge/noarch/meson-1.8.1-pyhe01879c_0.conda#f3cccd9a6ce5331ae33f69ade5529162 +https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyh9f0ad1d_0.tar.bz2#2ba8498c1018c1e9c61eb99b973dfe19 +https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.3-h5fbd93e_0.conda#9e5816bc95d285c115a3ebc2f8563564 +https://conda.anaconda.org/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda#58335b26c38bf4a20f399384c33cbcf9 +https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhd8ed1ab_0.conda#7da7ccd349dbf6487a7778579d2bb971 +https://conda.anaconda.org/conda-forge/noarch/ply-3.11-pyhd8ed1ab_3.conda#fd5062942bfa1b0bd5e0d2a4397b099e +https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.2.3-pyhd8ed1ab_1.conda#513d3c262ee49b54a8fec85c5bc99764 +https://conda.anaconda.org/conda-forge/noarch/pytz-2025.2-pyhd8ed1ab_0.conda#bc8e3267d44011051f2eb14d22fb0960 +https://conda.anaconda.org/conda-forge/noarch/setuptools-80.9.0-pyhff2d567_0.conda#4de79c071274a53dcaf2a8c749d1499e 
+https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhd8ed1ab_0.conda#a451d576819089b0d672f18768be0f65 +https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-3.1.0-pyh8a188c0_0.tar.bz2#a2995ee828f65687ac5b1e71a2ab1e0c +https://conda.anaconda.org/conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_1.conda#b0dd904de08b7db706167240bf37b164 +https://conda.anaconda.org/conda-forge/noarch/tomli-2.2.1-pyhd8ed1ab_1.conda#ac944244f1fed2eb49bae07193ae8215 +https://conda.anaconda.org/conda-forge/linux-64/tornado-6.5.1-py310ha75aee5_0.conda#6f3da1072c0c4d2a1beb1e84615f7c9c +https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.13.2-pyh29332c3_0.conda#83fc6ae00127671e301c9f44254c31b8 +https://conda.anaconda.org/conda-forge/linux-64/unicodedata2-16.0.0-py310ha75aee5_0.conda#1d7a4b9202cdd10d56ecdd7f6c347190 +https://conda.anaconda.org/conda-forge/noarch/wheel-0.45.1-pyhd8ed1ab_1.conda#75cb7132eb58d97896e173ef12ac9986 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-image-0.4.0-hb711507_2.conda#a0901183f08b6c7107aab109733a3c91 +https://conda.anaconda.org/conda-forge/linux-64/xkeyboard-config-2.44-hb9d3cd8_0.conda#7c91bfc90672888259675ad2ad28af9c +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxext-1.3.6-hb9d3cd8_0.conda#febbab7d15033c913d53c7a2c102309d +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxfixes-6.0.1-hb9d3cd8_0.conda#4bdb303603e9821baf5fe5fdff1dc8f8 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrender-0.9.12-hb9d3cd8_0.conda#96d57aba173e878a2089d5638016dc5e +https://conda.anaconda.org/conda-forge/linux-64/ccache-4.11.3-h80c52d3_0.conda#eb517c6a2b960c3ccb6f1db1005f063a +https://conda.anaconda.org/conda-forge/linux-64/coverage-7.8.2-py310h89163eb_0.conda#5ca8ab35287adc83b2d1996e5c2ac14c +https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h3c4dab8_0.conda#679616eb5ad4e521c83da4650860aba7 +https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.0-pyhd8ed1ab_0.conda#72e42d28960d875c7654614f8b50939a +https://conda.anaconda.org/conda-forge/linux-64/fonttools-4.58.1-py310h89163eb_0.conda#f4f46207c6defa5ea17b0299298ba849 +https://conda.anaconda.org/conda-forge/linux-64/freetype-2.13.3-ha770c72_1.conda#9ccd736d31e0c6e41f54e704e5312811 +https://conda.anaconda.org/conda-forge/linux-64/glib-tools-2.84.2-h4833e2c_0.conda#f2ec1facec64147850b7674633978050 +https://conda.anaconda.org/conda-forge/noarch/joblib-1.2.0-pyhd8ed1ab_0.tar.bz2#7583652522d71ad78ba536bba06940eb +https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-20_linux64_openblas.conda#2b7bb4f7562c8cf334fc2e20c2d28abc +https://conda.anaconda.org/conda-forge/linux-64/libflac-1.4.3-h59595ed_0.conda#ee48bf17cc83a00f59ca1494d5646869 +https://conda.anaconda.org/conda-forge/linux-64/libgl-1.7.0-ha4b6fd6_2.conda#928b8be80851f5d8ffb016f9c81dae7a +https://conda.anaconda.org/conda-forge/linux-64/libllvm20-20.1.6-he9d0ab4_0.conda#bf8ccdd2c1c1a54a3fa25bb61f26460e +https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.10.0-h65c71a3_0.conda#fedf6bfe5d21d21d2b1785ec00a8889a +https://conda.anaconda.org/conda-forge/linux-64/openblas-0.3.25-pthreads_h7a3da1a_0.conda#87661673941b5e702275fdf0fc095ad0 +https://conda.anaconda.org/conda-forge/linux-64/openldap-2.6.10-he970967_0.conda#2e5bf4f1da39c0b32778561c3c4e5878 +https://conda.anaconda.org/conda-forge/linux-64/pillow-11.2.1-py310h7e6dc6c_0.conda#5645a243d90adb50909b9edc209d84fe +https://conda.anaconda.org/conda-forge/noarch/pip-25.1.1-pyh8b19718_0.conda#32d0781ace05105cc99af55d36cbec7c 
+https://conda.anaconda.org/conda-forge/noarch/pyproject-metadata-0.9.1-pyhd8ed1ab_0.conda#22ae7c6ea81e0c8661ef32168dda929b +https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhff2d567_1.conda#5ba79d7c71f03c678c8ead841f347d6e +https://conda.anaconda.org/conda-forge/linux-64/sip-6.10.0-py310hf71b8c6_0.conda#2d7e4445be227e8210140b75725689ad +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxcomposite-0.4.6-hb9d3cd8_2.conda#d3c295b50f092ab525ffe3c2aa4b7413 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdamage-1.1.6-hb9d3cd8_0.conda#b5fcc7172d22516e1f965490e65e33a4 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxxf86vm-1.1.6-hb9d3cd8_0.conda#5efa5fa6243a622445fdfd72aee15efa +https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.15.0-h7e30c49_1.conda#8f5b0b297b59e1ac160ad4beec99dbee +https://conda.anaconda.org/conda-forge/linux-64/glib-2.84.2-h6287aef_0.conda#704648df3a01d4d24bc2c0466b718d63 +https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-20_linux64_openblas.conda#36d486d72ab64ffea932329a1d3729a3 +https://conda.anaconda.org/conda-forge/linux-64/libclang-cpp20.1-20.1.6-default_h1df26ce_0.conda#99ead3b974685e44df8b1e3953503cfc +https://conda.anaconda.org/conda-forge/linux-64/libclang13-20.1.6-default_he06ed0a_0.conda#cc6c469d9d7fc0ac106cef5f45d973a9 +https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-20_linux64_openblas.conda#6fabc51f5e647d09cc010c40061557e0 +https://conda.anaconda.org/conda-forge/linux-64/libpq-17.5-h27ae623_0.conda#6458be24f09e1b034902ab44fe9de908 +https://conda.anaconda.org/conda-forge/linux-64/libsndfile-1.2.2-hc60ed4a_1.conda#ef1910918dd895516a769ed36b5b3a4e +https://conda.anaconda.org/conda-forge/noarch/meson-python-0.16.0-pyh0c530f3_0.conda#e16f0dbf502da873be9f9adb0dc52547 +https://conda.anaconda.org/conda-forge/linux-64/pyqt5-sip-12.17.0-py310hf71b8c6_0.conda#012465861673a67a30bc8ca6284074f3 +https://conda.anaconda.org/conda-forge/noarch/pytest-8.3.5-pyhd8ed1ab_0.conda#c3c9316209dec74a705a36797970c6be +https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-h3394656_0.conda#09262e66b19567aff4f592fb53b28760 +https://conda.anaconda.org/conda-forge/linux-64/gstreamer-1.24.11-hc37bda9_0.conda#056d86cacf2b48c79c6a562a2486eb8c +https://conda.anaconda.org/conda-forge/linux-64/liblapacke-3.9.0-20_linux64_openblas.conda#05c5862c7dc25e65ba6c471d96429dae +https://conda.anaconda.org/conda-forge/linux-64/numpy-1.22.0-py310h454958d_1.tar.bz2#607c66f0cce2986515a8fe9e136b2b57 +https://conda.anaconda.org/conda-forge/linux-64/pulseaudio-client-17.0-hac146a9_1.conda#66b1fa9608d8836e25f9919159adc9c6 +https://conda.anaconda.org/conda-forge/noarch/pytest-cov-6.1.1-pyhd8ed1ab_0.conda#1e35d8f975bc0e984a19819aa91c440a +https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.7.0-pyhd8ed1ab_0.conda#15353a2a0ea6dfefaa52fc5ab5b98f41 +https://conda.anaconda.org/conda-forge/linux-64/blas-devel-3.9.0-20_linux64_openblas.conda#9932a1d4e9ecf2d35fb19475446e361e +https://conda.anaconda.org/conda-forge/linux-64/gst-plugins-base-1.24.11-h651a532_0.conda#d8d8894f8ced2c9be76dc9ad1ae531ce +https://conda.anaconda.org/conda-forge/linux-64/harfbuzz-11.2.1-h3beb420_0.conda#0e6e192d4b3d95708ad192d957cf3163 +https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.5.0-py310h23f4a51_0.tar.bz2#9911225650b298776c8e8c083b5cacf1 +https://conda.anaconda.org/conda-forge/linux-64/pandas-1.4.0-py310hb5077e9_0.tar.bz2#43e920bc9856daa7d8d18fcbfb244c4e 
+https://conda.anaconda.org/conda-forge/linux-64/polars-0.20.30-py310h031f9ce_0.conda#0743f5db9f978b6df92d412935ff8371 +https://conda.anaconda.org/conda-forge/linux-64/scipy-1.8.0-py310hea5193d_1.tar.bz2#664d80ddeb51241629b3ada5ea926e4d +https://conda.anaconda.org/conda-forge/linux-64/blas-2.120-openblas.conda#c8f6916a81a340650078171b1d852574 +https://conda.anaconda.org/conda-forge/linux-64/pyamg-4.2.1-py310h7c3ba0c_0.tar.bz2#89f5a48e1f23b5cf3163a6094903d181 +https://conda.anaconda.org/conda-forge/linux-64/qt-main-5.15.15-hea1682b_4.conda#c054d7f22cc719e12c72d454b2328d6c +https://conda.anaconda.org/conda-forge/linux-64/pyqt-5.15.11-py310hf392a12_0.conda#65924d3e57be25342c76530d23d75f0f +https://conda.anaconda.org/conda-forge/linux-64/matplotlib-3.5.0-py310hff52083_0.tar.bz2#1b2f3b135d5d9c594b5e0e6150c03b7b diff --git a/build_tools/azure/pymin_conda_forge_openblas_ubuntu_2204_environment.yml b/build_tools/azure/pymin_conda_forge_openblas_ubuntu_2204_environment.yml new file mode 100644 index 0000000000000..267c149fd1c35 --- /dev/null +++ b/build_tools/azure/pymin_conda_forge_openblas_ubuntu_2204_environment.yml @@ -0,0 +1,24 @@ +# DO NOT EDIT: this file is generated from the specification found in the +# following script to centralize the configuration for CI builds: +# build_tools/update_environments_and_lock_files.py +channels: + - conda-forge +dependencies: + - python=3.10 + - numpy + - blas[build=openblas] + - scipy + - cython + - joblib + - threadpoolctl + - pandas + - pyamg + - pytest + - pytest-xdist + - pillow + - pip + - ninja + - meson-python + - sphinx + - numpydoc + - ccache diff --git a/build_tools/azure/pymin_conda_forge_openblas_ubuntu_2204_linux-64_conda.lock b/build_tools/azure/pymin_conda_forge_openblas_ubuntu_2204_linux-64_conda.lock new file mode 100644 index 0000000000000..08a8597ed4fae --- /dev/null +++ b/build_tools/azure/pymin_conda_forge_openblas_ubuntu_2204_linux-64_conda.lock @@ -0,0 +1,116 @@ +# Generated by conda-lock. 
+# platform: linux-64 +# input_hash: 26bb2530999c20f24bbab0f7b6e3545ad84d059a25027cb624997210afc23693 +@EXPLICIT +https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 +https://conda.anaconda.org/conda-forge/noarch/python_abi-3.10-7_cp310.conda#44e871cba2b162368476a84b8d040b6c +https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda#4222072737ccff51314b5ece9c7d6f5a +https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.4.26-hbd8a1cb_0.conda#95db94f75ba080a22eb623590993167b +https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_4.conda#01f8d123c96816249efd255a31ad7712 +https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_2.conda#fbe7d535ff9d3a168c148e07358cd5b1 +https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d +https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_2.conda#ea8ac52380885ed41c1baa8f1d6d2b93 +https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.24-h86f0d12_0.conda#64f0c503da58ec25ebd359e4d990afa8 +https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.0-h5888daf_0.conda#db0bfbe7dd197b68ad5f30333bae6ce0 +https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda#ede4673863426c0883c0063d853bbd85 +https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_2.conda#ddca86c7040dd0e73b2b69bd7833d225 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.1.0-hcea5267_2.conda#01de444988ed960031dbe84cf4f9b1fc +https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.0-hb9d3cd8_0.conda#9fa334557db9f63da6c9285fd2a48638 +https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_1.conda#a76fd702c93cd2dfd89eff30a5fd45a8 +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.1.0-h8f9b012_2.conda#1cb1c67961f6dd257eae9e9691b341aa +https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.5.0-h851e524_0.conda#63f790534398730f59e1b899c3644d4a +https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda#edb0dca6bc32e4f4789199455a1dbeb8 +https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda#47e340acb35de30501a76c7c799c41d7 +https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.0-h7b32b05_1.conda#de356753cfdbffcde5bb1e86e3aa6cd0 +https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-hb9d3cd8_1002.conda#b3c17d95b5a10c6e64a21fa17573e70e +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb9d3cd8_0.conda#f6ebe2cb3f82ba6c057dde5d9debe4f7 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb9d3cd8_0.conda#8035c64cb77ed555e3f150b7b3972480 +https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda#62ee74e96c5ebb0af99386de58cf9553 +https://conda.anaconda.org/conda-forge/linux-64/lerc-4.0.0-h0aef613_1.conda#9344155d33912347b37f0ae6c410a835 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.1.0-h69a702a_2.conda#f92e6e0a3c0c0c85561ef61aa59d555d +https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda#30fd6e37fe21f86f4bd26d6ee73eeec7 +https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.47-h943b412_0.conda#55199e2ae2c3651f6f9b2a447b47bdc9 +https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.0-hee588c1_0.conda#71888e92098d0f8c41b09a671ad289bc 
+https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.1.0-h4852527_2.conda#9d2072af184b5caa29492bf2344597bb +https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b +https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.17.0-h8a09558_0.conda#92ed62436b625154323d40d5f2f11dd7 +https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda#5aa797f8787fe7a17d1b0821485b5adc +https://conda.anaconda.org/conda-forge/linux-64/ninja-1.12.1-hff21bea_1.conda#2322531904f27501ee19847b87ba7c64 +https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda#283b96675859b20a825f8fa30f311446 +https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda#a0116df4f4ed05c303811a837d5b39d8 +https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb8e6e7a_2.conda#6432cb5d4ac0046c3ac0a8a0f95842f9 +https://conda.anaconda.org/conda-forge/linux-64/libfreetype6-2.13.3-h48d6fc4_1.conda#3c255be50a506c50765a93a6644f32fe +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-15.1.0-h69a702a_2.conda#a483a87b71e974bb75d1b9413d4436dd +https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.29-pthreads_h94d23a6_0.conda#0a4d0252248ef9a0f88f2ba8b8a08e12 +https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.7.0-hf01ce69_5.conda#e79a094918988bb1807462cd42c83962 +https://conda.anaconda.org/conda-forge/linux-64/python-3.10.17-hd6af730_0_cpython.conda#7bb89638dae9ce1b8e051d0b721e83c2 +https://conda.anaconda.org/conda-forge/noarch/alabaster-1.0.0-pyhd8ed1ab_1.conda#1fd9696649f65fd6611fcdb4ffec738a +https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py310hf71b8c6_2.conda#bf502c169c71e3c6ac0d6175addfacc2 +https://conda.anaconda.org/conda-forge/noarch/certifi-2025.4.26-pyhd8ed1ab_0.conda#c33eeaaa33f45031be34cda513df39b6 +https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.2-pyhd8ed1ab_0.conda#40fe4284b8b5835a9073a645139f35af +https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda#962b9857ee8e7018c22f2776ffa0b2d7 +https://conda.anaconda.org/conda-forge/linux-64/cython-3.1.1-py310had8cdd9_1.conda#4904cb1ba6e72940ff22a5235554532d +https://conda.anaconda.org/conda-forge/noarch/docutils-0.21.2-pyhd8ed1ab_1.conda#24c1ca34138ee57de72a943237cde4cc +https://conda.anaconda.org/conda-forge/noarch/execnet-2.1.1-pyhd8ed1ab_1.conda#a71efeae2c160f6789900ba2631a2c90 +https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda#0a802cb9888dd14eeefc611f05c40b6e +https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda#8e6923fc12f1fe8f8c4e5c9f343256ac +https://conda.anaconda.org/conda-forge/noarch/idna-3.10-pyhd8ed1ab_1.conda#39a4f67be3286c86d696df570b1201b7 +https://conda.anaconda.org/conda-forge/noarch/imagesize-1.4.1-pyhd8ed1ab_0.tar.bz2#7de5386c8fea29e76b303f37dde4c352 +https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_1.conda#6837f3eff7dcea42ecd714ce1ac2b108 +https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.17-h717163a_0.conda#000e85703f0fd9594c81710dd5066471 +https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-31_h59b9bed_openblas.conda#728dbebd0f7a20337218beacffd37916 +https://conda.anaconda.org/conda-forge/linux-64/libfreetype-2.13.3-ha770c72_1.conda#51f5be229d83ecd401fb369ab96ae669 +https://conda.anaconda.org/conda-forge/linux-64/libhiredis-1.0.2-h2cc385e_0.tar.bz2#b34907d3a81a3cd8095ee83d174c074a 
+https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py310h89163eb_1.conda#8ce3f0332fd6de0d737e2911d329523f +https://conda.anaconda.org/conda-forge/noarch/meson-1.8.1-pyhe01879c_0.conda#f3cccd9a6ce5331ae33f69ade5529162 +https://conda.anaconda.org/conda-forge/linux-64/openblas-0.3.29-pthreads_h6ec200e_0.conda#7e4d48870b3258bea920d51b7f495a81 +https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.3-h5fbd93e_0.conda#9e5816bc95d285c115a3ebc2f8563564 +https://conda.anaconda.org/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda#58335b26c38bf4a20f399384c33cbcf9 +https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhd8ed1ab_0.conda#7da7ccd349dbf6487a7778579d2bb971 +https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda#12c566707c80111f9799308d9e265aef +https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.1-pyhd8ed1ab_0.conda#232fb4577b6687b2d503ef8e254270c9 +https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda#461219d1a5bd61342293efa2c0c90eac +https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2025.2-pyhd8ed1ab_0.conda#88476ae6ebd24f39261e0854ac244f33 +https://conda.anaconda.org/conda-forge/noarch/pytz-2025.2-pyhd8ed1ab_0.conda#bc8e3267d44011051f2eb14d22fb0960 +https://conda.anaconda.org/conda-forge/noarch/setuptools-80.9.0-pyhff2d567_0.conda#4de79c071274a53dcaf2a8c749d1499e +https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhd8ed1ab_0.conda#a451d576819089b0d672f18768be0f65 +https://conda.anaconda.org/conda-forge/noarch/snowballstemmer-3.0.1-pyhd8ed1ab_0.conda#755cf22df8693aa0d1aec1c123fa5863 +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-jsmath-1.0.1-pyhd8ed1ab_1.conda#fa839b5ff59e192f411ccc7dae6588bb +https://conda.anaconda.org/conda-forge/noarch/tabulate-0.9.0-pyhd8ed1ab_2.conda#959484a66b4b76befcddc4fa97c95567 +https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-3.6.0-pyhecae5ae_0.conda#9d64911b31d57ca443e9f1e36b04385f +https://conda.anaconda.org/conda-forge/noarch/tomli-2.2.1-pyhd8ed1ab_1.conda#ac944244f1fed2eb49bae07193ae8215 +https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.13.2-pyh29332c3_0.conda#83fc6ae00127671e301c9f44254c31b8 +https://conda.anaconda.org/conda-forge/noarch/wheel-0.45.1-pyhd8ed1ab_1.conda#75cb7132eb58d97896e173ef12ac9986 +https://conda.anaconda.org/conda-forge/noarch/babel-2.17.0-pyhd8ed1ab_0.conda#0a01c169f0ab0f91b26e77a3301fbfe4 +https://conda.anaconda.org/conda-forge/linux-64/ccache-4.11.3-h80c52d3_0.conda#eb517c6a2b960c3ccb6f1db1005f063a +https://conda.anaconda.org/conda-forge/linux-64/cffi-1.17.1-py310h8deb56e_0.conda#1fc24a3196ad5ede2a68148be61894f4 +https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.0-pyhd8ed1ab_0.conda#72e42d28960d875c7654614f8b50939a +https://conda.anaconda.org/conda-forge/noarch/h2-4.2.0-pyhd8ed1ab_0.conda#b4754fb1bdcb70c8fd54f918301582c6 +https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda#446bd6c8cb26050d528881df495ce646 +https://conda.anaconda.org/conda-forge/noarch/joblib-1.5.1-pyhd8ed1ab_0.conda#fb1c14694de51a476ce8636d92b6f42c +https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-31_he106b2a_openblas.conda#abb32c727da370c481a1c206f5159ce9 +https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-31_h7ac8fdf_openblas.conda#452b98eafe050ecff932f0ec832dd03f +https://conda.anaconda.org/conda-forge/linux-64/pillow-11.2.1-py310h7e6dc6c_0.conda#5645a243d90adb50909b9edc209d84fe 
+https://conda.anaconda.org/conda-forge/noarch/pip-25.1.1-pyh8b19718_0.conda#32d0781ace05105cc99af55d36cbec7c +https://conda.anaconda.org/conda-forge/noarch/pyproject-metadata-0.9.1-pyhd8ed1ab_0.conda#22ae7c6ea81e0c8661ef32168dda929b +https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhff2d567_1.conda#5ba79d7c71f03c678c8ead841f347d6e +https://conda.anaconda.org/conda-forge/linux-64/liblapacke-3.9.0-31_he2f377e_openblas.conda#7e5fff7d0db69be3a266f7e79a3bb0e2 +https://conda.anaconda.org/conda-forge/noarch/meson-python-0.18.0-pyh70fd9c4_0.conda#576c04b9d9f8e45285fb4d9452c26133 +https://conda.anaconda.org/conda-forge/linux-64/numpy-2.2.6-py310hefbff90_0.conda#b0cea2c364bf65cd19e023040eeab05d +https://conda.anaconda.org/conda-forge/noarch/pytest-8.3.5-pyhd8ed1ab_0.conda#c3c9316209dec74a705a36797970c6be +https://conda.anaconda.org/conda-forge/linux-64/zstandard-0.23.0-py310ha75aee5_2.conda#f9254b5b0193982416b91edcb4b2676f +https://conda.anaconda.org/conda-forge/linux-64/blas-devel-3.9.0-31_h1ea3ea9_openblas.conda#ba652ee0576396d4765e567f043c57f9 +https://conda.anaconda.org/conda-forge/linux-64/pandas-2.2.3-py310h5eaa309_3.conda#07697a584fab513ce895c4511f7a2403 +https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.7.0-pyhd8ed1ab_0.conda#15353a2a0ea6dfefaa52fc5ab5b98f41 +https://conda.anaconda.org/conda-forge/linux-64/scipy-1.15.2-py310h1d65ade_0.conda#8c29cd33b64b2eb78597fa28b5595c8d +https://conda.anaconda.org/conda-forge/noarch/urllib3-2.4.0-pyhd8ed1ab_0.conda#c1e349028e0052c4eea844e94f773065 +https://conda.anaconda.org/conda-forge/linux-64/blas-2.131-openblas.conda#38b2ec894c69bb4be0e66d2ef7fc60bf +https://conda.anaconda.org/conda-forge/linux-64/pyamg-5.2.1-py310ha2bacc8_1.conda#817d32861729e14f474249f1036291c4 +https://conda.anaconda.org/conda-forge/noarch/requests-2.32.3-pyhd8ed1ab_1.conda#a9b9368f3701a417eac9edbcae7cb737 +https://conda.anaconda.org/conda-forge/noarch/numpydoc-1.8.0-pyhd8ed1ab_1.conda#5af206d64d18d6c8dfb3122b4d9e643b +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-applehelp-2.0.0-pyhd8ed1ab_1.conda#16e3f039c0aa6446513e94ab18a8784b +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-devhelp-2.0.0-pyhd8ed1ab_1.conda#910f28a05c178feba832f842155cbfff +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-htmlhelp-2.1.0-pyhd8ed1ab_1.conda#e9fb3fe8a5b758b4aff187d434f94f03 +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-qthelp-2.0.0-pyhd8ed1ab_1.conda#00534ebcc0375929b45c3039b5ba7636 +https://conda.anaconda.org/conda-forge/noarch/sphinx-8.1.3-pyhd8ed1ab_1.conda#1a3281a0dc355c02b5506d87db2d78ac +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-serializinghtml-1.1.10-pyhd8ed1ab_1.conda#3bc61f7161d28137797e038263c04c54 diff --git a/build_tools/azure/test_docs.sh b/build_tools/azure/test_docs.sh index 18b3ccb148b5e..f3f824d5806b0 100755 --- a/build_tools/azure/test_docs.sh +++ b/build_tools/azure/test_docs.sh @@ -1,15 +1,21 @@ #!/bin/bash -set -e +set -ex -if [[ "$DISTRIB" =~ ^conda.* ]]; then - source activate $VIRTUALENV -elif [[ "$DISTRIB" == "ubuntu" ]]; then - source $VIRTUALENV/bin/activate -fi +source build_tools/shared.sh +activate_environment -if [[ "$BUILD_WITH_ICC" == "true" ]]; then - source /opt/intel/oneapi/setvars.sh +scipy_doctest_installed=$(python -c 'import scipy_doctest' && echo "True" || echo "False") +if [[ "$scipy_doctest_installed" == "True" ]]; then + doc_rst_files=$(find $PWD/doc -name '*.rst' | sort) + # Changing dir, as we do in build_tools/azure/test_script.sh, avoids an + 
# error when importing sklearn. The reason is not fully understood; it may
+    # have to do with the bespoke way we set up conda, putting conda on the
+    # PATH and using source activate rather than
+    # source <conda_root>/etc/profile.d/conda.sh + conda activate.
+    cd $TEST_DIR
+    # With scipy-doctest, --doctest-modules only runs doctests (in contrast to
+    # vanilla pytest, where it runs doctests on top of the normal tests).
+    python -m pytest --doctest-modules --pyargs sklearn
+    python -m pytest --doctest-modules $doc_rst_files
 fi
-
-make test-doc
diff --git a/build_tools/azure/test_script.cmd b/build_tools/azure/test_script.cmd
deleted file mode 100644
index c5f8d0e33889c..0000000000000
--- a/build_tools/azure/test_script.cmd
+++ /dev/null
@@ -1,24 +0,0 @@
-@echo on
-
-@rem Only 64 bit uses conda and uses a python newer than 3.5
-IF "%PYTHON_ARCH%"=="64" (
-    call activate %VIRTUALENV%
-)
-
-mkdir %TMP_FOLDER%
-cd %TMP_FOLDER%
-
-if "%PYTEST_XDIST%" == "true" (
-    set PYTEST_ARGS=%PYTEST_ARGS% -n2
-)
-
-if "%CHECK_WARNINGS%" == "true" (
-    REM numpy's 1.19.0's tostring() deprecation is ignored until scipy and joblib removes its usage
-    set PYTEST_ARGS=%PYTEST_ARGS% -Werror::DeprecationWarning -Werror::FutureWarning -Wignore:tostring:DeprecationWarning
-)
-
-if "%COVERAGE%" == "true" (
-    set PYTEST_ARGS=%PYTEST_ARGS% --cov sklearn
-)
-
-pytest --junitxml=%JUNITXML% --showlocals --durations=20 %PYTEST_ARGS% --pyargs sklearn
diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh
index 858d691b38216..eb4414283be2b 100755
--- a/build_tools/azure/test_script.sh
+++ b/build_tools/azure/test_script.sh
@@ -2,30 +2,44 @@

set -e

-if [[ "$DISTRIB" =~ ^conda.* ]]; then
-    source activate $VIRTUALENV
-elif [[ "$DISTRIB" == "ubuntu" ]] || [[ "$DISTRIB" == "ubuntu-32" ]]; then
-    source $VIRTUALENV/bin/activate
+# Defines the show_installed_libraries and activate_environment functions.
+source build_tools/shared.sh
+
+activate_environment
+
+if [[ "$BUILD_REASON" == "Schedule" ]]; then
+    # Enable global random seed randomization to discover seed-sensitive tests
+    # only on nightly builds.
+    # https://scikit-learn.org/stable/computing/parallelism.html#environment-variables
+    export SKLEARN_TESTS_GLOBAL_RANDOM_SEED=$(($RANDOM % 100))
+    echo "To reproduce this test run, set the following environment variable:"
+    echo "    SKLEARN_TESTS_GLOBAL_RANDOM_SEED=$SKLEARN_TESTS_GLOBAL_RANDOM_SEED"
+    echo "See: https://scikit-learn.org/dev/computing/parallelism.html#sklearn-tests-global-random-seed"
+
+    # Enable global dtype fixture for all nightly builds to discover
+    # numerically sensitive tests. 
+ # https://scikit-learn.org/stable/computing/parallelism.html#environment-variables + export SKLEARN_RUN_FLOAT32_TESTS=1 fi -if [[ "$BUILD_WITH_ICC" == "true" ]]; then - source /opt/intel/oneapi/setvars.sh +COMMIT_MESSAGE=$(python build_tools/azure/get_commit_message.py --only-show-message) + +if [[ "$COMMIT_MESSAGE" =~ \[float32\] ]]; then + echo "float32 tests will be run due to commit message" + export SKLEARN_RUN_FLOAT32_TESTS=1 fi -python --version -python -c "import numpy; print('numpy %s' % numpy.__version__)" -python -c "import scipy; print('scipy %s' % scipy.__version__)" -python -c "\ -try: - import pandas - print('pandas %s' % pandas.__version__) -except ImportError: - print('pandas not installed') -" -python -c "import multiprocessing as mp; print('%d CPUs' % mp.cpu_count())" -pip list - -TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML" +mkdir -p $TEST_DIR +cp pyproject.toml $TEST_DIR +cd $TEST_DIR + +python -c "import joblib; print(f'Number of cores (physical): \ +{joblib.cpu_count()} ({joblib.cpu_count(only_physical_cores=True)})')" +python -c "import sklearn; sklearn.show_versions()" + +show_installed_libraries + +TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML -o junit_family=legacy" if [[ "$COVERAGE" == "true" ]]; then # Note: --cov-report= is used to disable to long text output report in the @@ -34,22 +48,43 @@ if [[ "$COVERAGE" == "true" ]]; then # report that otherwise hides the test failures and forces long scrolls in # the CI logs. export COVERAGE_PROCESS_START="$BUILD_SOURCESDIRECTORY/.coveragerc" - TEST_CMD="$TEST_CMD --cov-config=$COVERAGE_PROCESS_START --cov sklearn --cov-report=" -fi -if [[ -n "$CHECK_WARNINGS" ]]; then - # numpy's 1.19.0's tostring() deprecation is ignored until scipy and joblib removes its usage - TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning -Wignore:tostring:DeprecationWarning" + # Use sys.monitoring to make coverage faster for Python >= 3.12 + HAS_SYSMON=$(python -c 'import sys; print(sys.version_info >= (3, 12))') + if [[ "$HAS_SYSMON" == "True" ]]; then + export COVERAGE_CORE=sysmon + fi + TEST_CMD="$TEST_CMD --cov-config='$COVERAGE_PROCESS_START' --cov sklearn --cov-report=" fi if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then - TEST_CMD="$TEST_CMD -n2" + XDIST_WORKERS=$(python -c "import joblib; print(joblib.cpu_count(only_physical_cores=True))") + TEST_CMD="$TEST_CMD -n$XDIST_WORKERS" fi -mkdir -p $TEST_DIR -cp setup.cfg $TEST_DIR -cd $TEST_DIR +if [[ -n "$SELECTED_TESTS" ]]; then + TEST_CMD="$TEST_CMD -k $SELECTED_TESTS" + + # Override to make selected tests run on all random seeds + export SKLEARN_TESTS_GLOBAL_RANDOM_SEED="all" +fi + +if which lscpu ; then + lscpu +else + echo "Could not inspect CPU architecture." +fi + +if [[ "$DISTRIB" == "conda-free-threaded" ]]; then + # Make sure that GIL is disabled even when importing extensions that have + # not declared free-threaded compatibility. This can be removed when numpy, + # scipy and scikit-learn extensions all have declared free-threaded + # compatibility. 
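+    #
+    # Illustrative sanity check (not executed by this script, and assuming a
+    # CPython 3.13+ free-threaded interpreter where sys._is_gil_enabled()
+    # exists): once PYTHON_GIL=0 is exported below,
+    #   python -c "import sys; print(sys._is_gil_enabled())"
+    # is expected to print False.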
+ export PYTHON_GIL=0 +fi + +TEST_CMD="$TEST_CMD --pyargs sklearn" set -x -$TEST_CMD --pyargs sklearn +eval "$TEST_CMD" set +x diff --git a/build_tools/azure/ubuntu_atlas_lock.txt b/build_tools/azure/ubuntu_atlas_lock.txt new file mode 100644 index 0000000000000..9c1faa23ab962 --- /dev/null +++ b/build_tools/azure/ubuntu_atlas_lock.txt @@ -0,0 +1,45 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=build_tools/azure/ubuntu_atlas_lock.txt build_tools/azure/ubuntu_atlas_requirements.txt +# +cython==3.0.10 + # via -r build_tools/azure/ubuntu_atlas_requirements.txt +exceptiongroup==1.3.0 + # via pytest +execnet==2.1.1 + # via pytest-xdist +iniconfig==2.1.0 + # via pytest +joblib==1.2.0 + # via -r build_tools/azure/ubuntu_atlas_requirements.txt +meson==1.8.1 + # via meson-python +meson-python==0.18.0 + # via -r build_tools/azure/ubuntu_atlas_requirements.txt +ninja==1.11.1.4 + # via -r build_tools/azure/ubuntu_atlas_requirements.txt +packaging==25.0 + # via + # meson-python + # pyproject-metadata + # pytest +pluggy==1.6.0 + # via pytest +pyproject-metadata==0.9.1 + # via meson-python +pytest==8.3.5 + # via + # -r build_tools/azure/ubuntu_atlas_requirements.txt + # pytest-xdist +pytest-xdist==3.7.0 + # via -r build_tools/azure/ubuntu_atlas_requirements.txt +threadpoolctl==3.1.0 + # via -r build_tools/azure/ubuntu_atlas_requirements.txt +tomli==2.2.1 + # via + # meson-python + # pytest +typing-extensions==4.13.2 + # via exceptiongroup diff --git a/build_tools/azure/ubuntu_atlas_requirements.txt b/build_tools/azure/ubuntu_atlas_requirements.txt new file mode 100644 index 0000000000000..dfb0cfebc54d1 --- /dev/null +++ b/build_tools/azure/ubuntu_atlas_requirements.txt @@ -0,0 +1,10 @@ +# DO NOT EDIT: this file is generated from the specification found in the +# following script to centralize the configuration for CI builds: +# build_tools/update_environments_and_lock_files.py +cython==3.0.10 # min +joblib==1.2.0 # min +threadpoolctl==3.1.0 # min +pytest +pytest-xdist +ninja +meson-python diff --git a/build_tools/azure/upload_codecov.cmd b/build_tools/azure/upload_codecov.cmd deleted file mode 100644 index 6150b75a1ea54..0000000000000 --- a/build_tools/azure/upload_codecov.cmd +++ /dev/null @@ -1,10 +0,0 @@ -@echo on - -@rem Only 64 bit uses conda -IF "%PYTHON_ARCH%"=="64" ( - call activate %VIRTUALENV% -) - -copy %TMP_FOLDER%\.coverage %BUILD_REPOSITORY_LOCALPATH% - -codecov --root %BUILD_REPOSITORY_LOCALPATH% -t %CODECOV_TOKEN% diff --git a/build_tools/azure/upload_codecov.sh b/build_tools/azure/upload_codecov.sh index 274106cb19f75..4c3db8fe8bbd6 100755 --- a/build_tools/azure/upload_codecov.sh +++ b/build_tools/azure/upload_codecov.sh @@ -2,15 +2,58 @@ set -e -# called when COVERAGE=="true" and DISTRIB=="conda" -export PATH=$HOME/miniconda3/bin:$PATH -source activate $VIRTUALENV +# Do not upload to codecov on forks +if [[ "$BUILD_REPOSITORY_NAME" != "scikit-learn/scikit-learn" ]]; then + exit 0 +fi -# Need to run codecov from a git checkout, so we copy .coverage -# from TEST_DIR where pytest has been run -pushd $TEST_DIR -coverage combine --append -popd -cp $TEST_DIR/.coverage $BUILD_REPOSITORY_LOCALPATH +# When we update the codecov uploader version, we need to update the checksums. +# The checksum for each codecov binary is available at +# https://cli.codecov.io e.g. for linux +# https://cli.codecov.io/v10.2.1/linux/codecov.SHA256SUM. 
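+#
+# For example, the pinned value for a given release can be fetched manually
+# (illustrative command, not executed by CI) with:
+#   curl -Os "https://cli.codecov.io/v10.2.1/linux/codecov.SHA256SUM" && cat codecov.SHA256SUM
+# and copied into the per-platform SHA256SUM variables below.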
-codecov --root $BUILD_REPOSITORY_LOCALPATH -t $CODECOV_TOKEN || echo "codecov upload failed" +# Instead of hardcoding a specific version and signature in this script, it +# would be possible to use the "latest" symlink URL but then we need to +# download both the codecov.SHA256SUM files each time and check the signatures +# with the codecov gpg key as well, see: +# https://docs.codecov.com/docs/codecov-uploader#integrity-checking-the-uploader +# However this approach would yield a larger number of downloads from +# codecov.io and keybase.io, therefore increasing the risk of running into +# network failures. +CODECOV_CLI_VERSION=10.2.1 +CODECOV_BASE_URL="https://cli.codecov.io/v$CODECOV_CLI_VERSION" + +# Check that the git repo is located at the expected location: +if [[ ! -d "$BUILD_REPOSITORY_LOCALPATH/.git" ]]; then + echo "Could not find the git checkout at $BUILD_REPOSITORY_LOCALPATH" + exit 1 +fi + +# Check that the combined coverage file exists at the expected location: +export COVERAGE_XML="$BUILD_REPOSITORY_LOCALPATH/coverage.xml" +if [[ ! -f "$COVERAGE_XML" ]]; then + echo "Could not find the combined coverage file at $COVERAGE_XML" + exit 1 +fi + +if [[ $OSTYPE == *"linux"* ]]; then + curl -Os "$CODECOV_BASE_URL/linux/codecov" + SHA256SUM="39dd112393680356daf701c07f375303aef5de62f06fc80b466b5c3571336014 codecov" + echo "$SHA256SUM" | shasum -a256 -c + chmod +x codecov + ./codecov upload-coverage -t ${CODECOV_TOKEN} -f coverage.xml -Z + ./codecov do-upload --disable-search --report-type test_results --file $JUNIT_FILE +elif [[ $OSTYPE == *"darwin"* ]]; then + curl -Os "$CODECOV_BASE_URL/macos/codecov" + SHA256SUM="01183f6367c7baff4947cce389eaa511b7a6d938e37ae579b08a86b51f769fd9 codecov" + echo "$SHA256SUM" | shasum -a256 -c + chmod +x codecov + ./codecov upload-coverage -t ${CODECOV_TOKEN} -f coverage.xml -Z + ./codecov do-upload --disable-search --report-type test_results --file $JUNIT_FILE +else + curl -Os "$CODECOV_BASE_URL/windows/codecov.exe" + SHA256SUM="e54e9520428701a510ef451001db56b56fb17f9b0484a266f184b73dd27b77e7 codecov.exe" + echo "$SHA256SUM" | sha256sum -c + ./codecov.exe upload-coverage -t ${CODECOV_TOKEN} -f coverage.xml -Z + ./codecov.exe do-upload --disable-search --report-type test_results --file $JUNIT_FILE +fi diff --git a/build_tools/azure/windows.yml b/build_tools/azure/windows.yml index 8a5edd4b93019..b3fcf130f9350 100644 --- a/build_tools/azure/windows.yml +++ b/build_tools/azure/windows.yml @@ -16,39 +16,71 @@ jobs: VIRTUALENV: 'testvenv' JUNITXML: 'test-data.xml' SKLEARN_SKIP_NETWORK_TESTS: '1' - PYTEST_VERSION: '5.2.1' - PYTEST_XDIST: 'true' - TMP_FOLDER: '$(Agent.WorkFolder)\tmp_folder' + PYTEST_XDIST_VERSION: 'latest' + TEST_DIR: '$(Agent.WorkFolder)/tmp_folder' + SHOW_SHORT_SUMMARY: 'false' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} steps: - - powershell: Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" - displayName: Add conda to PATH for 64 bit Python - condition: eq(variables['PYTHON_ARCH'], '64') + - bash: python build_tools/azure/get_selected_tests.py + displayName: Check selected tests for all random seeds + condition: eq(variables['Build.Reason'], 'PullRequest') + - bash: echo "##vso[task.prependpath]$CONDA/Scripts" + displayName: Add conda to PATH + condition: startsWith(variables['DISTRIB'], 'conda') - task: UsePythonVersion@0 inputs: versionSpec: '$(PYTHON_VERSION)' addToPath: true architecture: 'x86' displayName: Use 32 bit System Python - condition: eq(variables['PYTHON_ARCH'], '32') - - script: | - 
build_tools\\azure\\install.cmd + condition: and(succeeded(), eq(variables['PYTHON_ARCH'], '32')) + - bash: ./build_tools/azure/install.sh displayName: 'Install' - - script: | - build_tools\\azure\\test_script.cmd + - bash: ./build_tools/azure/test_script.sh displayName: 'Test Library' - - script: | - build_tools\\azure\\upload_codecov.cmd - condition: and(succeeded(), eq(variables['COVERAGE'], 'true')) - displayName: 'Upload To Codecov' - env: - CODECOV_TOKEN: $(CODECOV_TOKEN) + - bash: ./build_tools/azure/combine_coverage_reports.sh + condition: and(succeeded(), eq(variables['COVERAGE'], 'true'), + eq(variables['SELECTED_TESTS'], '')) + displayName: 'Combine coverage' - task: PublishTestResults@2 inputs: - testResultsFiles: '$(TMP_FOLDER)\$(JUNITXML)' + testResultsFiles: '$(TEST_DIR)/$(JUNITXML)' testRunTitle: ${{ format('{0}-$(Agent.JobName)', parameters.name) }} displayName: 'Publish Test Results' condition: succeededOrFailed() + - bash: | + set -ex + if [[ $(BOT_GITHUB_TOKEN) == "" ]]; then + echo "GitHub Token is not set. Issue tracker will not be updated." + exit + fi + + LINK_TO_RUN="https://dev.azure.com/$BUILD_REPOSITORY_NAME/_build/results?buildId=$BUILD_BUILDID&view=logs&j=$SYSTEM_JOBID" + CI_NAME="$SYSTEM_JOBIDENTIFIER" + ISSUE_REPO="$BUILD_REPOSITORY_NAME" + + $(pyTools.pythonLocation)/bin/pip install defusedxml PyGithub + $(pyTools.pythonLocation)/bin/python maint_tools/update_tracking_issue.py \ + $(BOT_GITHUB_TOKEN) \ + $CI_NAME \ + $ISSUE_REPO \ + $LINK_TO_RUN \ + --junit-file $JUNIT_FILE \ + --auto-close false + displayName: 'Update issue tracker' + env: + JUNIT_FILE: $(TEST_DIR)/$(JUNITXML) + condition: and(succeededOrFailed(), eq(variables['CREATE_ISSUE_ON_TRACKER'], 'true'), + eq(variables['Build.Reason'], 'Schedule')) + - bash: ./build_tools/azure/upload_codecov.sh + condition: and(succeeded(), + eq(variables['COVERAGE'], 'true'), + eq(variables['SELECTED_TESTS'], '')) + displayName: 'Upload To Codecov' + retryCountOnTaskFailure: 5 + env: + CODECOV_TOKEN: $(CODECOV_TOKEN) + JUNIT_FILE: $(TEST_DIR)/$(JUNITXML) diff --git a/build_tools/check-meson-openmp-dependencies.py b/build_tools/check-meson-openmp-dependencies.py new file mode 100644 index 0000000000000..43a7426494160 --- /dev/null +++ b/build_tools/check-meson-openmp-dependencies.py @@ -0,0 +1,172 @@ +""" +Check that OpenMP dependencies are correctly defined in meson.build files. + +This is based on trying to make sure the the following two things match: +- the Cython files using OpenMP (based on a git grep regex) +- the Cython extension modules that are built with OpenMP compiler flags (based + on meson introspect json output) +""" + +import json +import re +import subprocess +from pathlib import Path + + +def has_source_openmp_flags(target_source): + return any("openmp" in arg for arg in target_source["parameters"]) + + +def has_openmp_flags(target): + """Return whether target sources use OpenMP flags. + + Make sure that both compiler and linker source use OpenMP. + Look at `get_meson_info` docstring to see what `target` looks like. 
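+
+    As an illustration (not actual introspect output): a target whose compiler
+    and linker "parameters" both contain "-fopenmp" is reported as using
+    OpenMP, whereas a target where only one of the two contains it trips the
+    consistency assertions below.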
+ """ + target_sources = target["target_sources"] + + target_use_openmp_flags = any( + has_source_openmp_flags(target_source) for target_source in target_sources + ) + + if not target_use_openmp_flags: + return False + + # When the target use OpenMP we expect a compiler + linker source and we + # want to make sure that both the compiler and the linker use OpenMP + assert len(target_sources) == 2 + compiler_source, linker_source = target_sources + assert "compiler" in compiler_source + assert "linker" in linker_source + + compiler_use_openmp_flags = any( + "openmp" in arg for arg in compiler_source["parameters"] + ) + linker_use_openmp_flags = any( + "openmp" in arg for arg in linker_source["parameters"] + ) + + assert compiler_use_openmp_flags == linker_use_openmp_flags + return compiler_use_openmp_flags + + +def get_canonical_name_meson(target, build_path): + """Return a name based on generated shared library. + + The goal is to return a name that can be easily matched with the output + from `git_grep_info`. + + Look at `get_meson_info` docstring to see what `target` looks like. + """ + # Expect a list with one element with the name of the shared library + assert len(target["filename"]) == 1 + shared_library_path = Path(target["filename"][0]) + shared_library_relative_path = shared_library_path.relative_to( + build_path.absolute() + ) + # Needed on Windows to match git grep output + rel_path = shared_library_relative_path.as_posix() + # OS-specific naming of the shared library .cpython- on POSIX and + # something like .cp312- on Windows + pattern = r"\.(cpython|cp\d+)-.+" + return re.sub(pattern, "", str(rel_path)) + + +def get_canonical_name_git_grep(filename): + """Return name based on filename. + + The goal is to return a name that can easily be matched with the output + from `get_meson_info`. + """ + return re.sub(r"\.pyx(\.tp)?", "", filename) + + +def get_meson_info(): + """Return names of extension that use OpenMP based on meson introspect output. + + The meson introspect json info is a list of targets where a target is a dict + that looks like this (parts not used in this script are not shown for simplicity): + { + 'name': '_k_means_elkan.cpython-312-x86_64-linux-gnu', + 'filename': [ + '<meson_build_dir>/sklearn/cluster/_k_means_elkan.cpython-312-x86_64-linux-gnu.so' + ], + 'target_sources': [ + { + 'compiler': ['ccache', 'cc'], + 'parameters': [ + '-Wall', + '-std=c11', + '-fopenmp', + ... + ], + ... + }, + { + 'linker': ['cc'], + 'parameters': [ + '-shared', + '-fPIC', + '-fopenmp', + ... 
+ ] + } + ] + } + """ + build_path = Path("build/introspect") + subprocess.check_call(["meson", "setup", build_path, "--reconfigure"]) + + json_out = subprocess.check_output( + ["meson", "introspect", build_path, "--targets"], text=True + ) + target_list = json.loads(json_out) + meson_targets = [target for target in target_list if has_openmp_flags(target)] + + return [get_canonical_name_meson(each, build_path) for each in meson_targets] + + +def get_git_grep_info(): + """Return names of extensions that use OpenMP based on git grep regex.""" + git_grep_filenames = subprocess.check_output( + ["git", "grep", "-lP", "cython.*parallel|_openmp_helpers"], text=True + ).splitlines() + git_grep_filenames = [f for f in git_grep_filenames if ".pyx" in f] + + return [get_canonical_name_git_grep(each) for each in git_grep_filenames] + + +def main(): + from_meson = set(get_meson_info()) + from_git_grep = set(get_git_grep_info()) + + only_in_git_grep = from_git_grep - from_meson + only_in_meson = from_meson - from_git_grep + + msg = "" + if only_in_git_grep: + only_in_git_grep_msg = "\n".join( + [f" {each}" for each in sorted(only_in_git_grep)] + ) + msg += ( + "Some Cython files use OpenMP," + " but their meson.build is missing the openmp_dep dependency:\n" + f"{only_in_git_grep_msg}\n\n" + ) + + if only_in_meson: + only_in_meson_msg = "\n".join([f" {each}" for each in sorted(only_in_meson)]) + msg += ( + "Some Cython files do not use OpenMP," + " you should remove openmp_dep from their meson.build:\n" + f"{only_in_meson_msg}\n\n" + ) + + if from_meson != from_git_grep: + raise ValueError( + f"Some issues have been found in Meson OpenMP dependencies:\n\n{msg}" + ) + + +if __name__ == "__main__": + main() diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 37afb1841d368..e85f3ab15e617 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -set -x set -e +set -x # Decide what kind of documentation build to run, and run it. # @@ -17,6 +17,32 @@ set -e # If the inspection of the current commit fails for any reason, the default # behavior is to quick build the documentation. 
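+
+# For reference, the GitHub API query used further below returns the pull
+# request metadata as JSON and jq extracts the target branch name, e.g.
+# (illustrative PR number and output):
+#   curl -s "https://api.github.com/repos/scikit-learn/scikit-learn/pulls/12345" | jq -r .base.ref
+#   main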
+# defines the get_dep and show_installed_libraries functions +source build_tools/shared.sh + +if [ -n "$GITHUB_ACTION" ] +then + # Map the variables from Github Action to CircleCI + CIRCLE_SHA1=$(git log -1 --pretty=format:%H) + + CIRCLE_JOB=$GITHUB_JOB + + if [ "$GITHUB_EVENT_NAME" == "pull_request" ] + then + CIRCLE_BRANCH=$GITHUB_HEAD_REF + CI_PULL_REQUEST=true + CI_TARGET_BRANCH=$GITHUB_BASE_REF + else + CIRCLE_BRANCH=$GITHUB_REF_NAME + fi +fi + +if [[ -n "$CI_PULL_REQUEST" && -z "$CI_TARGET_BRANCH" ]] +then + # Get the target branch name when using CircleCI + CI_TARGET_BRANCH=$(curl -s "https://api.github.com/repos/scikit-learn/scikit-learn/pulls/$CIRCLE_PR_NUMBER" | jq -r .base.ref) +fi + get_build_type() { if [ -z "$CIRCLE_SHA1" ] then @@ -130,83 +156,81 @@ else make_args=html fi -make_args="SPHINXOPTS=-T $make_args" # show full traceback on exception - # Installing required system packages to support the rendering of math # notation in the HTML documentation and to optimize the image files -sudo -E apt-get -yq update +sudo -E apt-get -yq update --allow-releaseinfo-change sudo -E apt-get -yq --no-install-suggests --no-install-recommends \ install dvipng gsfonts ccache zip optipng -# deactivate circleci virtualenv and setup a miniconda env instead +# deactivate circleci virtualenv and setup a conda env instead if [[ `type -t deactivate` ]]; then deactivate fi -MINICONDA_PATH=$HOME/miniconda -# Install dependencies with miniconda -wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ - -O miniconda.sh -chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH -export PATH="/usr/lib/ccache:$MINICONDA_PATH/bin:$PATH" +# Install Miniforge +MINIFORGE_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh" +curl -L --retry 10 $MINIFORGE_URL -o miniconda.sh +MINIFORGE_PATH=$HOME/miniforge3 +bash ./miniconda.sh -b -p $MINIFORGE_PATH +source $MINIFORGE_PATH/etc/profile.d/conda.sh +conda activate + +create_conda_environment_from_lock_file $CONDA_ENV_NAME $LOCK_FILE +conda activate $CONDA_ENV_NAME + +# Sets up ccache when using system compiler +export PATH="/usr/lib/ccache:$PATH" +# Sets up ccache when using conda-forge compilers (needs to be after conda +# activate which sets CC and CXX) +export CC="ccache $CC" +export CXX="ccache $CXX" ccache -M 512M export CCACHE_COMPRESS=1 +# Zeroing statistics so that ccache statistics are shown only for this build +ccache -z -# Old packages coming from the 'free' conda channel have been removed but we -# are using them for our min-dependencies doc generation. See -# https://www.anaconda.com/why-we-removed-the-free-channel-in-conda-4-7/ for -# more details. -if [[ "$CIRCLE_JOB" == "doc-min-dependencies" ]]; then - conda config --set restore_free_channel true -fi +show_installed_libraries -# imports get_dep -source build_tools/shared.sh +# Specify explicitly ninja -j argument because ninja does not handle cgroups v2 and +# use the same default rule as ninja (-j3 since we have 2 cores on CircleCI), see +# https://github.com/scikit-learn/scikit-learn/pull/30333 +pip install -e . 
--no-build-isolation --config-settings=compile-args="-j 3" -# packaging won't be needed once setuptools starts shipping packaging>=17.0 -conda create -n $CONDA_ENV_NAME --yes --quiet \ - python="${PYTHON_VERSION:-*}" \ - "$(get_dep numpy $NUMPY_VERSION)" \ - "$(get_dep scipy $SCIPY_VERSION)" \ - "$(get_dep cython $CYTHON_VERSION)" \ - "$(get_dep matplotlib $MATPLOTLIB_VERSION)" \ - "$(get_dep sphinx $SPHINX_VERSION)" \ - "$(get_dep scikit-image $SCIKIT_IMAGE_VERSION)" \ - "$(get_dep pandas $PANDAS_VERSION)" \ - joblib memory_profiler packaging seaborn pillow pytest coverage - -source activate testenv -pip install sphinx-gallery -pip install numpydoc -pip install sphinx-prompt - -# Set parallelism to 3 to overlap IO bound tasks with CPU bound tasks on CI -# workers with 2 cores when building the compiled extensions of scikit-learn. -export SKLEARN_BUILD_PARALLEL=3 -python setup.py develop +echo "ccache build summary:" +ccache -s export OMP_NUM_THREADS=1 +if [[ "$CIRCLE_BRANCH" == "main" || "$CI_TARGET_BRANCH" == "main" ]] +then + towncrier build --yes +fi + if [[ "$CIRCLE_BRANCH" =~ ^main$ && -z "$CI_PULL_REQUEST" ]] then # List available documentation versions if on main - python build_tools/circle/list_versions.py > doc/versions.rst + python build_tools/circle/list_versions.py --json doc/js/versions.json --rst doc/versions.rst fi + # The pipefail is requested to propagate exit code set -o pipefail && cd doc && make $make_args 2>&1 | tee ~/log.txt -# Insert the version warning for deployment -find _build/html/stable -name "*.html" | xargs sed -i '/<\/body>/ i \ -\ <script src="https://scikit-learn.org/versionwarning.js"></script>' - cd - set +o pipefail affected_doc_paths() { + scikit_learn_version=$(python -c 'import re; import sklearn; print(re.sub(r"(\d+\.\d+).+", r"\1", sklearn.__version__))') files=$(git diff --name-only origin/main...$CIRCLE_SHA1) - echo "$files" | grep ^doc/.*\.rst | sed 's/^doc\/\(.*\)\.rst$/\1.html/' + # use sed to replace files ending by .rst or .rst.template by .html + echo "$files" | grep -vP 'upcoming_changes/.*/\d+.*\.rst' | grep ^doc/.*\.rst | \ + sed 's/^doc\/\(.*\)\.rst$/\1.html/; s/^doc\/\(.*\)\.rst\.template$/\1.html/' + # replace towncrier fragment files by link to changelog. uniq is used + # because in some edge cases multiple fragments can be added and we want a + # single link to the changelog. 
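+    # Illustrative example (hypothetical fragment path): a changed file such as
+    #   doc/whats_new/upcoming_changes/sklearn.linear_model/12345.fix.rst
+    # matches the pattern below and is rewritten to a single entry like
+    #   whats_new/v1.6.html
+    # where the version prefix is derived from sklearn.__version__.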
+ echo "$files" | grep -P 'upcoming_changes/.*/\d+.*\.rst' | sed "s@.*@whats_new/v${scikit_learn_version}.html@" | uniq + echo "$files" | grep ^examples/.*.py | sed 's/^\(.*\)\.py$/auto_\1.html/' sklearn_files=$(echo "$files" | grep '^sklearn/') if [ -n "$sklearn_files" ] @@ -244,7 +268,7 @@ then ( echo '<html><body><ul>' echo "$affected" | sed 's|.*|<li><a href="&">&</a> [<a href="https://scikit-learn.org/dev/&">dev</a>, <a href="https://scikit-learn.org/stable/&">stable</a>]</li>|' - echo '</ul><p>General: <a href="index.html">Home</a> | <a href="modules/classes.html">API Reference</a> | <a href="auto_examples/index.html">Examples</a></p>' + echo '</ul><p>General: <a href="index.html">Home</a> | <a href="api/index.html">API Reference</a> | <a href="auto_examples/index.html">Examples</a></p>' echo '<strong>Sphinx Warnings in affected files</strong><ul>' echo "$warnings" | sed 's/\/home\/circleci\/project\//<li>/g' echo '</ul></body></html>' diff --git a/build_tools/circle/build_test_pypy.sh b/build_tools/circle/build_test_pypy.sh deleted file mode 100755 index c1def8ce2670b..0000000000000 --- a/build_tools/circle/build_test_pypy.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env bash -set -x -set -e - -# System build tools -apt-get -yq update -apt-get -yq install wget bzip2 build-essential ccache - -# Install pypy and all the scikit-learn dependencies from conda-forge. In -# particular, we want to install pypy compatible binary packages for numpy and -# scipy as it would be to costly to build those from source. -conda install -y mamba -mamba create -n pypy -y \ - pypy numpy scipy cython \ - joblib threadpoolctl pillow pytest \ - sphinx numpydoc docutils - -eval "$(conda shell.bash hook)" -conda activate pypy - -# Check that we are running PyPy instead of CPython in this environment. -python --version -which python -python -c "import platform; assert platform.python_implementation() == 'PyPy'" - -# Build and install scikit-learn in dev mode -ccache -M 512M -export CCACHE_COMPRESS=1 -export PATH=/usr/lib/ccache:$PATH -export LOKY_MAX_CPU_COUNT="2" -export OMP_NUM_THREADS="1" -# Set parallelism to 3 to overlap IO bound tasks with CPU bound tasks on CI -# workers with 2 cores when building the compiled extensions of scikit-learn. -export SKLEARN_BUILD_PARALLEL=3 -pip install --no-build-isolation -e . 
- -python -m pytest sklearn diff --git a/build_tools/circle/doc_environment.yml b/build_tools/circle/doc_environment.yml new file mode 100644 index 0000000000000..bc36e178de058 --- /dev/null +++ b/build_tools/circle/doc_environment.yml @@ -0,0 +1,44 @@ +# DO NOT EDIT: this file is generated from the specification found in the +# following script to centralize the configuration for CI builds: +# build_tools/update_environments_and_lock_files.py +channels: + - conda-forge +dependencies: + - python=3.10 + - numpy + - blas + - scipy + - cython + - joblib + - threadpoolctl + - matplotlib + - pandas + - pyamg + - pytest + - pytest-xdist + - pillow + - pip + - ninja + - meson-python + - scikit-image + - seaborn + - memory_profiler + - compilers + - sphinx + - sphinx-gallery + - sphinx-copybutton + - numpydoc + - sphinx-prompt + - plotly + - polars + - pooch + - sphinxext-opengraph + - sphinx-remove-toctrees + - sphinx-design + - pydata-sphinx-theme + - towncrier + - pip + - pip: + - jupyterlite-sphinx + - jupyterlite-pyodide-kernel + - sphinxcontrib-sass diff --git a/build_tools/circle/doc_linux-64_conda.lock b/build_tools/circle/doc_linux-64_conda.lock new file mode 100644 index 0000000000000..d19f830684796 --- /dev/null +++ b/build_tools/circle/doc_linux-64_conda.lock @@ -0,0 +1,329 @@ +# Generated by conda-lock. +# platform: linux-64 +# input_hash: 93cb6f7aa17dce662512650f1419e87eae56ed49163348847bf965697cd268bb +@EXPLICIT +https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2#0c96522c6bdaed4b1566d11387caaf45 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2#34893075a5c9e55cdafac56607368fc6 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2#4d59c254e01d9cde7957100457e2d5fb +https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda#49023d73832ef61042f6a237cb2687e7 +https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-3.10.0-he073ed8_18.conda#ad8527bf134a90e1c9ed35fa0b64318c +https://conda.anaconda.org/conda-forge/noarch/python_abi-3.10-7_cp310.conda#44e871cba2b162368476a84b8d040b6c +https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda#4222072737ccff51314b5ece9c7d6f5a +https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.4.26-hbd8a1cb_0.conda#95db94f75ba080a22eb623590993167b +https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-0.tar.bz2#f766549260d6815b0c52253f1fb1bb29 +https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_4.conda#01f8d123c96816249efd255a31ad7712 +https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-13.3.0-hc03c837_102.conda#4c1d6961a6a54f602ae510d9bf31fa60 +https://conda.anaconda.org/conda-forge/linux-64/libglvnd-1.7.0-ha4b6fd6_2.conda#434ca7e50e40f4918ab701e3facd59a0 +https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_2.conda#fbe7d535ff9d3a168c148e07358cd5b1 +https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-13.3.0-hc03c837_102.conda#aa38de2738c5f4a72a880e3d31ffe8b4 +https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.17-h0157908_18.conda#460eba7851277ec1fd80a1a24080787a +https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d 
+https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.43-h4bf12b8_4.conda#ef67db625ad0d2dce398837102f875ed +https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2#fee5683a3f04bd15cbd8318b096a27ab +https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda#c151d5eb730e9b7480e6d48c0fc44048 +https://conda.anaconda.org/conda-forge/linux-64/libopengl-1.7.0-ha4b6fd6_2.conda#7df50d44d4a14d6c31a2c54f2cd92157 +https://conda.anaconda.org/conda-forge/linux-64/binutils-2.43-h4852527_4.conda#29782348a527eda3ecfc673109d28e93 +https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.43-h4852527_4.conda#c87e146f5b685672d4aa6b527c6d3b5e +https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_2.conda#ea8ac52380885ed41c1baa8f1d6d2b93 +https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.14-hb9d3cd8_0.conda#76df83c2a9035c54df5d04ff81bcc02d +https://conda.anaconda.org/conda-forge/linux-64/libbrotlicommon-1.1.0-hb9d3cd8_2.conda#41b599ed2b02abcfdd84302bff174b23 +https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.24-h86f0d12_0.conda#64f0c503da58ec25ebd359e4d990afa8 +https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.0-h5888daf_0.conda#db0bfbe7dd197b68ad5f30333bae6ce0 +https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda#ede4673863426c0883c0063d853bbd85 +https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_2.conda#ddca86c7040dd0e73b2b69bd7833d225 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.1.0-hcea5267_2.conda#01de444988ed960031dbe84cf4f9b1fc +https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h4ce23a2_1.conda#e796ff8ddc598affdf7c173d6145f087 +https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.0-hb9d3cd8_0.conda#9fa334557db9f63da6c9285fd2a48638 +https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_1.conda#a76fd702c93cd2dfd89eff30a5fd45a8 +https://conda.anaconda.org/conda-forge/linux-64/libntlm-1.8-hb9d3cd8_0.conda#7c7927b404672409d9917d49bff5f2d6 +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.1.0-h8f9b012_2.conda#1cb1c67961f6dd257eae9e9691b341aa +https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.5.0-h851e524_0.conda#63f790534398730f59e1b899c3644d4a +https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda#edb0dca6bc32e4f4789199455a1dbeb8 +https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda#47e340acb35de30501a76c7c799c41d7 +https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.0-h7b32b05_1.conda#de356753cfdbffcde5bb1e86e3aa6cd0 +https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-hb9d3cd8_1002.conda#b3c17d95b5a10c6e64a21fa17573e70e +https://conda.anaconda.org/conda-forge/linux-64/rav1e-0.7.1-h8fae777_3.conda#2c42649888aac645608191ffdc80d13a +https://conda.anaconda.org/conda-forge/linux-64/xorg-libice-1.1.2-hb9d3cd8_0.conda#fb901ff28063514abb6046c9ec2c4a45 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb9d3cd8_0.conda#f6ebe2cb3f82ba6c057dde5d9debe4f7 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb9d3cd8_0.conda#8035c64cb77ed555e3f150b7b3972480 +https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda#62ee74e96c5ebb0af99386de58cf9553 +https://conda.anaconda.org/conda-forge/linux-64/dav1d-1.2.1-hd590300_0.conda#418c6ca5929a611cbd69204907a83995 
+https://conda.anaconda.org/conda-forge/linux-64/double-conversion-3.3.1-h5888daf_0.conda#bfd56492d8346d669010eccafe0ba058 +https://conda.anaconda.org/conda-forge/linux-64/giflib-5.2.2-hd590300_0.conda#3bf7b9fd5a7136126e0234db4b87c8b6 +https://conda.anaconda.org/conda-forge/linux-64/jxrlib-1.1-hd590300_3.conda#5aeabe88534ea4169d4c49998f293d6c +https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2#30186d27e2c9fa62b45fb1476b7200e3 +https://conda.anaconda.org/conda-forge/linux-64/lerc-4.0.0-h0aef613_1.conda#9344155d33912347b37f0ae6c410a835 +https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.1.0-hb9d3cd8_2.conda#9566f0bd264fbd463002e759b8a82401 +https://conda.anaconda.org/conda-forge/linux-64/libbrotlienc-1.1.0-hb9d3cd8_2.conda#06f70867945ea6a84d35836af780f1de +https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda#c277e0a4d549b03ac1e9d6cbbe3d017b +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.1.0-h69a702a_2.conda#f92e6e0a3c0c0c85561ef61aa59d555d +https://conda.anaconda.org/conda-forge/linux-64/libhwy-1.2.0-hf40a0c7_0.conda#2f433d593a66044c3f163cb25f0a09de +https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda#30fd6e37fe21f86f4bd26d6ee73eeec7 +https://conda.anaconda.org/conda-forge/linux-64/libpciaccess-0.18-hd590300_0.conda#48f4330bfcd959c3cfb704d424903c82 +https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.47-h943b412_0.conda#55199e2ae2c3651f6f9b2a447b47bdc9 +https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-13.3.0-he8ea267_2.conda#2b6cdf7bb95d3d10ef4e38ce0bc95dba +https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.0-hee588c1_0.conda#71888e92098d0f8c41b09a671ad289bc +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.1.0-h4852527_2.conda#9d2072af184b5caa29492bf2344597bb +https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b +https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.17.0-h8a09558_0.conda#92ed62436b625154323d40d5f2f11dd7 +https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda#5aa797f8787fe7a17d1b0821485b5adc +https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.10.0-h5888daf_1.conda#9de5350a85c4a20c685259b889aa6393 +https://conda.anaconda.org/conda-forge/linux-64/ninja-1.12.1-hff21bea_1.conda#2322531904f27501ee19847b87ba7c64 +https://conda.anaconda.org/conda-forge/linux-64/pixman-0.46.0-h29eaf8c_0.conda#d2f1c87d4416d1e7344cf92b1aaee1c4 +https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda#283b96675859b20a825f8fa30f311446 +https://conda.anaconda.org/conda-forge/linux-64/snappy-1.2.1-h8bd8927_1.conda#3b3e64af585eadfb52bb90b553db5edf +https://conda.anaconda.org/conda-forge/linux-64/svt-av1-3.0.2-h5888daf_0.conda#0096882bd623e6cc09e8bf920fc8fb47 +https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda#a0116df4f4ed05c303811a837d5b39d8 +https://conda.anaconda.org/conda-forge/linux-64/wayland-1.23.1-h3e06ad9_1.conda#a37843723437ba75f42c9270ffe800b1 +https://conda.anaconda.org/conda-forge/linux-64/zfp-1.0.1-h5888daf_2.conda#e0409515c467b87176b070bff5d9442e +https://conda.anaconda.org/conda-forge/linux-64/zlib-ng-2.2.4-h7955e40_0.conda#c8a816dbf59eb8ba6346a8f10014b302 +https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb8e6e7a_2.conda#6432cb5d4ac0046c3ac0a8a0f95842f9 +https://conda.anaconda.org/conda-forge/linux-64/aom-3.9.1-hac33072_0.conda#346722a0be40f6edc53f12640d301338 
+https://conda.anaconda.org/conda-forge/linux-64/blosc-1.21.6-he440d0b_1.conda#2c2fae981fd2afd00812c92ac47d023d +https://conda.anaconda.org/conda-forge/linux-64/brotli-bin-1.1.0-hb9d3cd8_2.conda#c63b5e52939e795ba8d26e35d767a843 +https://conda.anaconda.org/conda-forge/linux-64/c-blosc2-2.17.1-h3122c55_0.conda#009d16d3c9ed3e70d58ed46dab1571d1 +https://conda.anaconda.org/conda-forge/linux-64/charls-2.4.2-h59595ed_0.conda#4336bd67920dd504cd8c6761d6a99645 +https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-13.3.0-h1e990d8_2.conda#f46cf0acdcb6019397d37df1e407ab91 +https://conda.anaconda.org/conda-forge/linux-64/graphite2-1.3.13-h59595ed_1003.conda#f87c7b7c2cb45f323ffbce941c78ab7c +https://conda.anaconda.org/conda-forge/linux-64/icu-75.1-he02047a_0.conda#8b189310083baabfb622af68fd9d3ae3 +https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.3-h659f571_0.conda#3f43953b7d3fb3aaa1d0d0723d91e368 +https://conda.anaconda.org/conda-forge/linux-64/libaec-1.1.3-h59595ed_0.conda#5e97e271911b8b2001a8b71860c32faa +https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.124-hb9d3cd8_0.conda#8bc89311041d7fcb510238cf0848ccae +https://conda.anaconda.org/conda-forge/linux-64/libfreetype6-2.13.3-h48d6fc4_1.conda#3c255be50a506c50765a93a6644f32fe +https://conda.anaconda.org/conda-forge/linux-64/libjxl-0.11.1-h7b0646d_1.conda#959fc2b6c0df7883e070b3fe525219a5 +https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.29-pthreads_h94d23a6_0.conda#0a4d0252248ef9a0f88f2ba8b8a08e12 +https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.7.0-hf01ce69_5.conda#e79a094918988bb1807462cd42c83962 +https://conda.anaconda.org/conda-forge/linux-64/libzopfli-1.0.3-h9c3ff4c_0.tar.bz2#c66fe2d123249af7651ebde8984c51c2 +https://conda.anaconda.org/conda-forge/linux-64/pcre2-10.45-hc749103_0.conda#b90bece58b4c2bf25969b70f3be42d25 +https://conda.anaconda.org/conda-forge/linux-64/python-3.10.17-hd6af730_0_cpython.conda#7bb89638dae9ce1b8e051d0b721e83c2 +https://conda.anaconda.org/conda-forge/linux-64/qhull-2020.2-h434a139_5.conda#353823361b1d27eb3960efb076dfcaf6 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-0.4.1-hb711507_2.conda#8637c3e5821654d0edf97e2b0404b443 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-keysyms-0.4.1-hb711507_0.conda#ad748ccca349aec3e91743e08b5e2b50 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-renderutil-0.3.10-hb711507_0.conda#0e0cbe0564d03a99afd5fd7b362feecd +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-wm-0.4.2-hb711507_0.conda#608e0ef8256b81d04456e8d211eee3e8 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libsm-1.2.6-he73a12e_0.conda#1c74ff8c35dcadf952a16f752ca5aa49 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.12-h4f16b4b_0.conda#db038ce880f100acc74dba10302b5630 +https://conda.anaconda.org/conda-forge/noarch/alabaster-1.0.0-pyhd8ed1ab_1.conda#1fd9696649f65fd6611fcdb4ffec738a +https://conda.anaconda.org/conda-forge/linux-64/brotli-1.1.0-hb9d3cd8_2.conda#98514fe74548d768907ce7a13f680e8f +https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py310hf71b8c6_2.conda#bf502c169c71e3c6ac0d6175addfacc2 +https://conda.anaconda.org/conda-forge/noarch/certifi-2025.4.26-pyhd8ed1ab_0.conda#c33eeaaa33f45031be34cda513df39b6 +https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.2-pyhd8ed1ab_0.conda#40fe4284b8b5835a9073a645139f35af +https://conda.anaconda.org/conda-forge/noarch/click-8.2.1-pyh707e725_0.conda#94b550b8d3a614dbd326af798c7dfb40 
+https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda#962b9857ee8e7018c22f2776ffa0b2d7 +https://conda.anaconda.org/conda-forge/noarch/cpython-3.10.17-py310hd8ed1ab_0.conda#e2b81369f0473107784f8b7da8e6a8e9 +https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_1.conda#44600c4667a319d67dbe0681fc0bc833 +https://conda.anaconda.org/conda-forge/linux-64/cyrus-sasl-2.1.27-h54b06d7_7.conda#dce22f70b4e5a407ce88f2be046f4ceb +https://conda.anaconda.org/conda-forge/linux-64/cython-3.1.1-py310had8cdd9_1.conda#4904cb1ba6e72940ff22a5235554532d +https://conda.anaconda.org/conda-forge/noarch/docutils-0.21.2-pyhd8ed1ab_1.conda#24c1ca34138ee57de72a943237cde4cc +https://conda.anaconda.org/conda-forge/noarch/execnet-2.1.1-pyhd8ed1ab_1.conda#a71efeae2c160f6789900ba2631a2c90 +https://conda.anaconda.org/conda-forge/linux-64/gcc-13.3.0-h9576a4e_2.conda#d92e51bf4b6bdbfe45e5884fb0755afe +https://conda.anaconda.org/conda-forge/linux-64/gcc_linux-64-13.3.0-hc28eda2_10.conda#d151142bbafe5e68ec7fc065c5e6f80c +https://conda.anaconda.org/conda-forge/linux-64/gfortran_impl_linux-64-13.3.0-h84c1745_2.conda#4e21ed177b76537067736f20f54fee0a +https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-13.3.0-hae580e1_2.conda#b55f02540605c322a47719029f8404cc +https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda#0a802cb9888dd14eeefc611f05c40b6e +https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda#8e6923fc12f1fe8f8c4e5c9f343256ac +https://conda.anaconda.org/conda-forge/noarch/idna-3.10-pyhd8ed1ab_1.conda#39a4f67be3286c86d696df570b1201b7 +https://conda.anaconda.org/conda-forge/noarch/imagesize-1.4.1-pyhd8ed1ab_0.tar.bz2#7de5386c8fea29e76b303f37dde4c352 +https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_1.conda#6837f3eff7dcea42ecd714ce1ac2b108 +https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.4.7-py310h3788b33_0.conda#4186d9b4d004b0fe0de6aa62496fb48a +https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.17-h717163a_0.conda#000e85703f0fd9594c81710dd5066471 +https://conda.anaconda.org/conda-forge/linux-64/libavif16-1.3.0-h766b0b6_0.conda#f17f2d0e5c9ad6b958547fd67b155771 +https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-31_h59b9bed_openblas.conda#728dbebd0f7a20337218beacffd37916 +https://conda.anaconda.org/conda-forge/linux-64/libcups-2.3.3-h4637d8d_4.conda#d4529f4dff3057982a7617c7ac58fde3 +https://conda.anaconda.org/conda-forge/linux-64/libfreetype-2.13.3-ha770c72_1.conda#51f5be229d83ecd401fb369ab96ae669 +https://conda.anaconda.org/conda-forge/linux-64/libglib-2.84.2-h3618099_0.conda#072ab14a02164b7c0c089055368ff776 +https://conda.anaconda.org/conda-forge/linux-64/libglx-1.7.0-ha4b6fd6_2.conda#c8013e438185f33b13814c5c488acd5c +https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.13.8-h4bc477f_0.conda#14dbe05b929e329dbaa6f2d0aa19466d +https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py310h89163eb_1.conda#8ce3f0332fd6de0d737e2911d329523f +https://conda.anaconda.org/conda-forge/noarch/meson-1.8.1-pyhe01879c_0.conda#f3cccd9a6ce5331ae33f69ade5529162 +https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyh9f0ad1d_0.tar.bz2#2ba8498c1018c1e9c61eb99b973dfe19 +https://conda.anaconda.org/conda-forge/noarch/narwhals-1.41.0-pyhe01879c_0.conda#580a340cc0f5eab2b18adb1b55e032e5 +https://conda.anaconda.org/conda-forge/noarch/networkx-3.4.2-pyh267e887_2.conda#fd40bf7f7f4bc4b647dc8512053d9873 
+https://conda.anaconda.org/conda-forge/linux-64/openblas-0.3.29-pthreads_h6ec200e_0.conda#7e4d48870b3258bea920d51b7f495a81 +https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.3-h5fbd93e_0.conda#9e5816bc95d285c115a3ebc2f8563564 +https://conda.anaconda.org/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda#58335b26c38bf4a20f399384c33cbcf9 +https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.3.8-pyhe01879c_0.conda#424844562f5d337077b445ec6b1398a7 +https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhd8ed1ab_0.conda#7da7ccd349dbf6487a7778579d2bb971 +https://conda.anaconda.org/conda-forge/linux-64/psutil-7.0.0-py310ha75aee5_0.conda#da7d592394ff9084a23f62a1186451a2 +https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda#12c566707c80111f9799308d9e265aef +https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.1-pyhd8ed1ab_0.conda#232fb4577b6687b2d503ef8e254270c9 +https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.2.3-pyhd8ed1ab_1.conda#513d3c262ee49b54a8fec85c5bc99764 +https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda#461219d1a5bd61342293efa2c0c90eac +https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2025.2-pyhd8ed1ab_0.conda#88476ae6ebd24f39261e0854ac244f33 +https://conda.anaconda.org/conda-forge/noarch/pytz-2025.2-pyhd8ed1ab_0.conda#bc8e3267d44011051f2eb14d22fb0960 +https://conda.anaconda.org/conda-forge/noarch/setuptools-80.9.0-pyhff2d567_0.conda#4de79c071274a53dcaf2a8c749d1499e +https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhd8ed1ab_0.conda#a451d576819089b0d672f18768be0f65 +https://conda.anaconda.org/conda-forge/noarch/snowballstemmer-3.0.1-pyhd8ed1ab_0.conda#755cf22df8693aa0d1aec1c123fa5863 +https://conda.anaconda.org/conda-forge/noarch/soupsieve-2.7-pyhd8ed1ab_0.conda#fb32097c717486aa34b38a9db57eb49e +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-jsmath-1.0.1-pyhd8ed1ab_1.conda#fa839b5ff59e192f411ccc7dae6588bb +https://conda.anaconda.org/conda-forge/noarch/tabulate-0.9.0-pyhd8ed1ab_2.conda#959484a66b4b76befcddc4fa97c95567 +https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-3.6.0-pyhecae5ae_0.conda#9d64911b31d57ca443e9f1e36b04385f +https://conda.anaconda.org/conda-forge/noarch/tomli-2.2.1-pyhd8ed1ab_1.conda#ac944244f1fed2eb49bae07193ae8215 +https://conda.anaconda.org/conda-forge/linux-64/tornado-6.5.1-py310ha75aee5_0.conda#6f3da1072c0c4d2a1beb1e84615f7c9c +https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.13.2-pyh29332c3_0.conda#83fc6ae00127671e301c9f44254c31b8 +https://conda.anaconda.org/conda-forge/linux-64/unicodedata2-16.0.0-py310ha75aee5_0.conda#1d7a4b9202cdd10d56ecdd7f6c347190 +https://conda.anaconda.org/conda-forge/noarch/wheel-0.45.1-pyhd8ed1ab_1.conda#75cb7132eb58d97896e173ef12ac9986 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-image-0.4.0-hb711507_2.conda#a0901183f08b6c7107aab109733a3c91 +https://conda.anaconda.org/conda-forge/linux-64/xkeyboard-config-2.44-hb9d3cd8_0.conda#7c91bfc90672888259675ad2ad28af9c +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxext-1.3.6-hb9d3cd8_0.conda#febbab7d15033c913d53c7a2c102309d +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxfixes-6.0.1-hb9d3cd8_0.conda#4bdb303603e9821baf5fe5fdff1dc8f8 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrender-0.9.12-hb9d3cd8_0.conda#96d57aba173e878a2089d5638016dc5e +https://conda.anaconda.org/conda-forge/noarch/zipp-3.22.0-pyhd8ed1ab_0.conda#234be740b00b8e41567e5b0ed95aaba9 
+https://conda.anaconda.org/conda-forge/noarch/accessible-pygments-0.0.5-pyhd8ed1ab_1.conda#74ac5069774cdbc53910ec4d631a3999 +https://conda.anaconda.org/conda-forge/noarch/babel-2.17.0-pyhd8ed1ab_0.conda#0a01c169f0ab0f91b26e77a3301fbfe4 +https://conda.anaconda.org/conda-forge/linux-64/brunsli-0.1-h9c3ff4c_0.tar.bz2#c1ac6229d0bfd14f8354ff9ad2a26cad +https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.9.0-h2b85faf_0.conda#3cb814f83f1f71ac1985013697f80cc1 +https://conda.anaconda.org/conda-forge/linux-64/cffi-1.17.1-py310h8deb56e_0.conda#1fc24a3196ad5ede2a68148be61894f4 +https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h3c4dab8_0.conda#679616eb5ad4e521c83da4650860aba7 +https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.0-pyhd8ed1ab_0.conda#72e42d28960d875c7654614f8b50939a +https://conda.anaconda.org/conda-forge/linux-64/fonttools-4.58.1-py310h89163eb_0.conda#f4f46207c6defa5ea17b0299298ba849 +https://conda.anaconda.org/conda-forge/linux-64/freetype-2.13.3-ha770c72_1.conda#9ccd736d31e0c6e41f54e704e5312811 +https://conda.anaconda.org/conda-forge/linux-64/gfortran-13.3.0-h9576a4e_2.conda#19e6d3c9cde10a0a9a170a684082588e +https://conda.anaconda.org/conda-forge/linux-64/gfortran_linux-64-13.3.0-hb919d3a_10.conda#7ce070e3329cd10bf79dbed562a21bd4 +https://conda.anaconda.org/conda-forge/linux-64/gxx-13.3.0-h9576a4e_2.conda#07e8df00b7cd3084ad3ef598ce32a71c +https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-13.3.0-h6834431_10.conda#9a8ebde471cec5cc9c48f8682f434f92 +https://conda.anaconda.org/conda-forge/noarch/h2-4.2.0-pyhd8ed1ab_0.conda#b4754fb1bdcb70c8fd54f918301582c6 +https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda#63ccfdc3a3ce25b027b8767eb722fca8 +https://conda.anaconda.org/conda-forge/noarch/importlib_resources-6.5.2-pyhd8ed1ab_0.conda#c85c76dc67d75619a92f51dfbce06992 +https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda#446bd6c8cb26050d528881df495ce646 +https://conda.anaconda.org/conda-forge/noarch/joblib-1.5.1-pyhd8ed1ab_0.conda#fb1c14694de51a476ce8636d92b6f42c +https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-31_he106b2a_openblas.conda#abb32c727da370c481a1c206f5159ce9 +https://conda.anaconda.org/conda-forge/linux-64/libgl-1.7.0-ha4b6fd6_2.conda#928b8be80851f5d8ffb016f9c81dae7a +https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-31_h7ac8fdf_openblas.conda#452b98eafe050ecff932f0ec832dd03f +https://conda.anaconda.org/conda-forge/linux-64/libllvm20-20.1.6-he9d0ab4_0.conda#bf8ccdd2c1c1a54a3fa25bb61f26460e +https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.10.0-h65c71a3_0.conda#fedf6bfe5d21d21d2b1785ec00a8889a +https://conda.anaconda.org/conda-forge/linux-64/libxslt-1.1.39-h76b75d6_0.conda#e71f31f8cfb0a91439f2086fc8aa0461 +https://conda.anaconda.org/conda-forge/noarch/memory_profiler-0.61.0-pyhd8ed1ab_1.conda#71abbefb6f3b95e1668cd5e0af3affb9 +https://conda.anaconda.org/conda-forge/linux-64/openldap-2.6.10-he970967_0.conda#2e5bf4f1da39c0b32778561c3c4e5878 +https://conda.anaconda.org/conda-forge/linux-64/pillow-11.2.1-py310h7e6dc6c_0.conda#5645a243d90adb50909b9edc209d84fe +https://conda.anaconda.org/conda-forge/noarch/pip-25.1.1-pyh8b19718_0.conda#32d0781ace05105cc99af55d36cbec7c +https://conda.anaconda.org/conda-forge/noarch/plotly-6.1.2-pyhd8ed1ab_0.conda#f547ee092ef42452ddaffdfa59ff4987 +https://conda.anaconda.org/conda-forge/noarch/pyproject-metadata-0.9.1-pyhd8ed1ab_0.conda#22ae7c6ea81e0c8661ef32168dda929b 
+https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhff2d567_1.conda#5ba79d7c71f03c678c8ead841f347d6e +https://conda.anaconda.org/conda-forge/noarch/python-gil-3.10.17-hd8ed1ab_0.conda#c856adbd93a57004e21cd26564f4f724 +https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.13.2-h0e9735f_0.conda#568ed1300869dca0ba09fb750cda5dbb +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-cursor-0.1.5-hb9d3cd8_0.conda#eb44b3b6deb1cab08d72cb61686fe64c +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxcomposite-0.4.6-hb9d3cd8_2.conda#d3c295b50f092ab525ffe3c2aa4b7413 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxcursor-1.2.3-hb9d3cd8_0.conda#2ccd714aa2242315acaf0a67faea780b +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdamage-1.1.6-hb9d3cd8_0.conda#b5fcc7172d22516e1f965490e65e33a4 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxi-1.8.2-hb9d3cd8_0.conda#17dcc85db3c7886650b8908b183d6876 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrandr-1.5.4-hb9d3cd8_0.conda#2de7f99d6581a4a7adbff607b5c278ca +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxxf86vm-1.1.6-hb9d3cd8_0.conda#5efa5fa6243a622445fdfd72aee15efa +https://conda.anaconda.org/conda-forge/noarch/_python_abi3_support-1.0-hd8ed1ab_2.conda#aaa2a381ccc56eac91d63b6c1240312f +https://conda.anaconda.org/conda-forge/noarch/beautifulsoup4-4.13.4-pyha770c72_0.conda#9f07c4fc992adb2d6c30da7fab3959a7 +https://conda.anaconda.org/conda-forge/linux-64/cxx-compiler-1.9.0-h1a2810e_0.conda#1ce8b218d359d9ed0ab481f2a3f3c512 +https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.15.0-h7e30c49_1.conda#8f5b0b297b59e1ac160ad4beec99dbee +https://conda.anaconda.org/conda-forge/linux-64/fortran-compiler-1.9.0-h36df796_0.conda#cc0cf942201f9d3b0e9654ea02e12486 +https://conda.anaconda.org/conda-forge/noarch/importlib-resources-6.5.2-pyhd8ed1ab_0.conda#e376ea42e9ae40f3278b0f79c9bf9826 +https://conda.anaconda.org/conda-forge/noarch/lazy-loader-0.4-pyhd8ed1ab_2.conda#d10d9393680734a8febc4b362a4c94f2 +https://conda.anaconda.org/conda-forge/linux-64/libclang-cpp20.1-20.1.6-default_h1df26ce_0.conda#99ead3b974685e44df8b1e3953503cfc +https://conda.anaconda.org/conda-forge/linux-64/libclang13-20.1.6-default_he06ed0a_0.conda#cc6c469d9d7fc0ac106cef5f45d973a9 +https://conda.anaconda.org/conda-forge/linux-64/liblapacke-3.9.0-31_he2f377e_openblas.conda#7e5fff7d0db69be3a266f7e79a3bb0e2 +https://conda.anaconda.org/conda-forge/linux-64/libpq-17.5-h27ae623_0.conda#6458be24f09e1b034902ab44fe9de908 +https://conda.anaconda.org/conda-forge/noarch/meson-python-0.18.0-pyh70fd9c4_0.conda#576c04b9d9f8e45285fb4d9452c26133 +https://conda.anaconda.org/conda-forge/linux-64/numpy-2.2.6-py310hefbff90_0.conda#b0cea2c364bf65cd19e023040eeab05d +https://conda.anaconda.org/conda-forge/noarch/pytest-8.3.5-pyhd8ed1ab_0.conda#c3c9316209dec74a705a36797970c6be +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxtst-1.2.5-hb9d3cd8_3.conda#7bbe9a0cc0df0ac5f5a8ad6d6a11af2f +https://conda.anaconda.org/conda-forge/linux-64/zstandard-0.23.0-py310ha75aee5_2.conda#f9254b5b0193982416b91edcb4b2676f +https://conda.anaconda.org/conda-forge/linux-64/blas-devel-3.9.0-31_h1ea3ea9_openblas.conda#ba652ee0576396d4765e567f043c57f9 +https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-h3394656_0.conda#09262e66b19567aff4f592fb53b28760 +https://conda.anaconda.org/conda-forge/linux-64/compilers-1.9.0-ha770c72_0.conda#5859096e397aba423340d0bbbb11ec64 
+https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.2-py310h3788b33_0.conda#b6420d29123c7c823de168f49ccdfe6a +https://conda.anaconda.org/conda-forge/linux-64/imagecodecs-2025.3.30-py310h481ba9f_0.conda#453c8da1b70f7b76b3884e18015bc568 +https://conda.anaconda.org/conda-forge/noarch/imageio-2.37.0-pyhfb79c49_0.conda#b5577bc2212219566578fd5af9993af6 +https://conda.anaconda.org/conda-forge/linux-64/pandas-2.2.3-py310h5eaa309_3.conda#07697a584fab513ce895c4511f7a2403 +https://conda.anaconda.org/conda-forge/noarch/patsy-1.0.1-pyhd8ed1ab_1.conda#ee23fabfd0a8c6b8d6f3729b47b2859d +https://conda.anaconda.org/conda-forge/linux-64/polars-default-1.30.0-py39hfac2b71_0.conda#cd33cf1e631b4d766858c90e333b4832 +https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.7.0-pyhd8ed1ab_0.conda#15353a2a0ea6dfefaa52fc5ab5b98f41 +https://conda.anaconda.org/conda-forge/linux-64/pywavelets-1.8.0-py310hf462985_0.conda#4c441eff2be2e65bd67765c5642051c5 +https://conda.anaconda.org/conda-forge/linux-64/scipy-1.15.2-py310h1d65ade_0.conda#8c29cd33b64b2eb78597fa28b5595c8d +https://conda.anaconda.org/conda-forge/noarch/towncrier-24.8.0-pyhd8ed1ab_1.conda#820b6a1ddf590fba253f8204f7200d82 +https://conda.anaconda.org/conda-forge/noarch/urllib3-2.4.0-pyhd8ed1ab_0.conda#c1e349028e0052c4eea844e94f773065 +https://conda.anaconda.org/conda-forge/linux-64/blas-2.131-openblas.conda#38b2ec894c69bb4be0e66d2ef7fc60bf +https://conda.anaconda.org/conda-forge/linux-64/harfbuzz-11.2.1-h3beb420_0.conda#0e6e192d4b3d95708ad192d957cf3163 +https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.10.3-py310h68603db_0.conda#50084ca38bf28440e2762966bac143fc +https://conda.anaconda.org/conda-forge/linux-64/polars-1.30.0-default_h1443d73_0.conda#19698b29e8544d2dd615699826037039 +https://conda.anaconda.org/conda-forge/linux-64/pyamg-5.2.1-py310ha2bacc8_1.conda#817d32861729e14f474249f1036291c4 +https://conda.anaconda.org/conda-forge/noarch/requests-2.32.3-pyhd8ed1ab_1.conda#a9b9368f3701a417eac9edbcae7cb737 +https://conda.anaconda.org/conda-forge/linux-64/statsmodels-0.14.4-py310hf462985_0.conda#636d3c500d8a851e377360e88ec95372 +https://conda.anaconda.org/conda-forge/noarch/tifffile-2025.5.10-pyhd8ed1ab_0.conda#1fdb801f28bf4987294c49aaa314bf5e +https://conda.anaconda.org/conda-forge/noarch/pooch-1.8.2-pyhd8ed1ab_1.conda#b3e783e8e8ed7577cf0b6dee37d1fbac +https://conda.anaconda.org/conda-forge/linux-64/qt6-main-6.9.0-h0384650_3.conda#8aa69e15597a205fd6f81781fe62c232 +https://conda.anaconda.org/conda-forge/linux-64/scikit-image-0.25.2-py310h5eaa309_1.conda#ed21ab72d049ecdb60f829f04b4dca1c +https://conda.anaconda.org/conda-forge/noarch/seaborn-base-0.13.2-pyhd8ed1ab_3.conda#fd96da444e81f9e6fcaac38590f3dd42 +https://conda.anaconda.org/conda-forge/linux-64/pyside6-6.9.0-py310hfd10a26_0.conda#1610ccfe262ee519716bb69bd4395572 +https://conda.anaconda.org/conda-forge/noarch/seaborn-0.13.2-hd8ed1ab_3.conda#62afb877ca2c2b4b6f9ecb37320085b6 +https://conda.anaconda.org/conda-forge/linux-64/matplotlib-3.10.3-py310hff52083_0.conda#4162a00ddf1d805557aff34ddf113f46 +https://conda.anaconda.org/conda-forge/noarch/numpydoc-1.8.0-pyhd8ed1ab_1.conda#5af206d64d18d6c8dfb3122b4d9e643b +https://conda.anaconda.org/conda-forge/noarch/pydata-sphinx-theme-0.16.1-pyhd8ed1ab_0.conda#837aaf71ddf3b27acae0e7e9015eebc6 +https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda#bf22cb9c439572760316ce0748af3713 +https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.6.1-pyhd8ed1ab_2.conda#3e6c15d914b03f83fc96344f917e0838 
+https://conda.anaconda.org/conda-forge/noarch/sphinx-gallery-0.19.0-pyhd8ed1ab_0.conda#3cfa26d23bd7987d84051879f202a855 +https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.4.0-pyhd8ed1ab_0.tar.bz2#88ee91e8679603f2a5bd036d52919cc2 +https://conda.anaconda.org/conda-forge/noarch/sphinx-remove-toctrees-1.0.0.post1-pyhd8ed1ab_1.conda#b275c865b753413caaa8548b9d44c024 +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-applehelp-2.0.0-pyhd8ed1ab_1.conda#16e3f039c0aa6446513e94ab18a8784b +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-devhelp-2.0.0-pyhd8ed1ab_1.conda#910f28a05c178feba832f842155cbfff +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-htmlhelp-2.1.0-pyhd8ed1ab_1.conda#e9fb3fe8a5b758b4aff187d434f94f03 +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-qthelp-2.0.0-pyhd8ed1ab_1.conda#00534ebcc0375929b45c3039b5ba7636 +https://conda.anaconda.org/conda-forge/noarch/sphinx-8.1.3-pyhd8ed1ab_1.conda#1a3281a0dc355c02b5506d87db2d78ac +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-serializinghtml-1.1.10-pyhd8ed1ab_1.conda#3bc61f7161d28137797e038263c04c54 +https://conda.anaconda.org/conda-forge/noarch/sphinxext-opengraph-0.9.1-pyhd8ed1ab_1.conda#79f5d05ad914baf152fb7f75073fe36d +# pip attrs @ https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl#sha256=427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3 +# pip cloudpickle @ https://files.pythonhosted.org/packages/7e/e8/64c37fadfc2816a7701fa8a6ed8d87327c7d54eacfbfb6edab14a2f2be75/cloudpickle-3.1.1-py3-none-any.whl#sha256=c8c5a44295039331ee9dad40ba100a9c7297b6f988e50e87ccdf3765a668350e +# pip defusedxml @ https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl#sha256=a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61 +# pip fastjsonschema @ https://files.pythonhosted.org/packages/90/2b/0817a2b257fe88725c25589d89aec060581aabf668707a8d03b2e9e0cb2a/fastjsonschema-2.21.1-py3-none-any.whl#sha256=c9e5b7e908310918cf494a434eeb31384dd84a98b57a30bcb1f535015b554667 +# pip fqdn @ https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl#sha256=3a179af3761e4df6eb2e026ff9e1a3033d3587bf980a0b1b2e1e5d08d7358014 +# pip json5 @ https://files.pythonhosted.org/packages/41/9f/3500910d5a98549e3098807493851eeef2b89cdd3032227558a104dfe926/json5-0.12.0-py3-none-any.whl#sha256=6d37aa6c08b0609f16e1ec5ff94697e2cbbfbad5ac112afa05794da9ab7810db +# pip jsonpointer @ https://files.pythonhosted.org/packages/71/92/5e77f98553e9e75130c78900d000368476aed74276eb8ae8796f65f00918/jsonpointer-3.0.0-py2.py3-none-any.whl#sha256=13e088adc14fca8b6aa8177c044e12701e6ad4b28ff10e65f2267a90109c9942 +# pip jupyterlab-pygments @ https://files.pythonhosted.org/packages/b1/dd/ead9d8ea85bf202d90cc513b533f9c363121c7792674f78e0d8a854b63b4/jupyterlab_pygments-0.3.0-py3-none-any.whl#sha256=841a89020971da1d8693f1a99997aefc5dc424bb1b251fd6322462a1b8842780 +# pip libsass @ https://files.pythonhosted.org/packages/fd/5a/eb5b62641df0459a3291fc206cf5bd669c0feed7814dded8edef4ade8512/libsass-0.23.0-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.whl#sha256=4a218406d605f325d234e4678bd57126a66a88841cb95bee2caeafdc6f138306 +# pip mdurl @ 
https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl#sha256=84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 +# pip overrides @ https://files.pythonhosted.org/packages/2c/ab/fc8290c6a4c722e5514d80f62b2dc4c4df1a68a41d1364e625c35990fcf3/overrides-7.7.0-py3-none-any.whl#sha256=c7ed9d062f78b8e4c1a7b70bd8796b35ead4d9f510227ef9c5dc7626c60d7e49 +# pip pandocfilters @ https://files.pythonhosted.org/packages/ef/af/4fbc8cab944db5d21b7e2a5b8e9211a03a79852b1157e2c102fcc61ac440/pandocfilters-1.5.1-py2.py3-none-any.whl#sha256=93be382804a9cdb0a7267585f157e5d1731bbe5545a85b268d6f5fe6232de2bc +# pip pkginfo @ https://files.pythonhosted.org/packages/fa/3d/f4f2ba829efb54b6cd2d91349c7463316a9cc55a43fc980447416c88540f/pkginfo-1.12.1.2-py3-none-any.whl#sha256=c783ac885519cab2c34927ccfa6bf64b5a704d7c69afaea583dd9b7afe969343 +# pip prometheus-client @ https://files.pythonhosted.org/packages/50/c7/cee159ba3d7192e84a4c166ec1752f44a5fa859ac0eeda2d73a1da65ab47/prometheus_client-0.22.0-py3-none-any.whl#sha256=c8951bbe64e62b96cd8e8f5d917279d1b9b91ab766793f33d4dce6c228558713 +# pip ptyprocess @ https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl#sha256=4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35 +# pip python-json-logger @ https://files.pythonhosted.org/packages/08/20/0f2523b9e50a8052bc6a8b732dfc8568abbdc42010aef03a2d750bdab3b2/python_json_logger-3.3.0-py3-none-any.whl#sha256=dd980fae8cffb24c13caf6e158d3d61c0d6d22342f932cb6e9deedab3d35eec7 +# pip pyyaml @ https://files.pythonhosted.org/packages/6b/4e/1523cb902fd98355e2e9ea5e5eb237cbc5f3ad5f3075fa65087aa0ecb669/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed +# pip rfc3986-validator @ https://files.pythonhosted.org/packages/9e/51/17023c0f8f1869d8806b979a2bffa3f861f26a3f1a66b094288323fba52f/rfc3986_validator-0.1.1-py2.py3-none-any.whl#sha256=2f235c432ef459970b4306369336b9d5dbdda31b510ca1e327636e01f528bfa9 +# pip rpds-py @ https://files.pythonhosted.org/packages/eb/76/66b523ffc84cf47db56efe13ae7cf368dee2bacdec9d89b9baca5e2e6301/rpds_py-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=0701942049095741a8aeb298a31b203e735d1c61f4423511d2b1a41dcd8a16da +# pip send2trash @ https://files.pythonhosted.org/packages/40/b0/4562db6223154aa4e22f939003cb92514c79f3d4dccca3444253fd17f902/Send2Trash-1.8.3-py3-none-any.whl#sha256=0c31227e0bd08961c7665474a3d1ef7193929fedda4233843689baa056be46c9 +# pip sniffio @ https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl#sha256=2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 +# pip traitlets @ https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl#sha256=b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f +# pip types-python-dateutil @ https://files.pythonhosted.org/packages/c5/3f/b0e8db149896005adc938a1e7f371d6d7e9eca4053a29b108978ed15e0c2/types_python_dateutil-2.9.0.20250516-py3-none-any.whl#sha256=2b2b3f57f9c6a61fba26a9c0ffb9ea5681c9b83e69cd897c6b5f668d9c0cab93 +# pip uri-template @ 
https://files.pythonhosted.org/packages/e7/00/3fca040d7cf8a32776d3d81a00c8ee7457e00f80c649f1e4a863c8321ae9/uri_template-1.3.0-py3-none-any.whl#sha256=a44a133ea12d44a0c0f06d7d42a52d71282e77e2f937d8abd5655b8d56fc1363 +# pip webcolors @ https://files.pythonhosted.org/packages/60/e8/c0e05e4684d13459f93d312077a9a2efbe04d59c393bc2b8802248c908d4/webcolors-24.11.1-py3-none-any.whl#sha256=515291393b4cdf0eb19c155749a096f779f7d909f7cceea072791cb9095b92e9 +# pip webencodings @ https://files.pythonhosted.org/packages/f4/24/2a3e3df732393fed8b3ebf2ec078f05546de641fe1b667ee316ec1dcf3b7/webencodings-0.5.1-py2.py3-none-any.whl#sha256=a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78 +# pip websocket-client @ https://files.pythonhosted.org/packages/5a/84/44687a29792a70e111c5c477230a72c4b957d88d16141199bf9acb7537a3/websocket_client-1.8.0-py3-none-any.whl#sha256=17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526 +# pip anyio @ https://files.pythonhosted.org/packages/a1/ee/48ca1a7c89ffec8b6a0c5d02b89c305671d5ffd8d3c94acf8b8c408575bb/anyio-4.9.0-py3-none-any.whl#sha256=9f76d541cad6e36af7beb62e978876f3b41e3e04f2c1fbf0884604c0a9c4d93c +# pip argon2-cffi-bindings @ https://files.pythonhosted.org/packages/ec/f7/378254e6dd7ae6f31fe40c8649eea7d4832a42243acaf0f1fff9083b2bed/argon2_cffi_bindings-21.2.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=b746dba803a79238e925d9046a63aa26bf86ab2a2fe74ce6b009a1c3f5c8f2ae +# pip arrow @ https://files.pythonhosted.org/packages/f8/ed/e97229a566617f2ae958a6b13e7cc0f585470eac730a73e9e82c32a3cdd2/arrow-1.3.0-py3-none-any.whl#sha256=c728b120ebc00eb84e01882a6f5e7927a53960aa990ce7dd2b10f39005a67f80 +# pip doit @ https://files.pythonhosted.org/packages/44/83/a2960d2c975836daa629a73995134fd86520c101412578c57da3d2aa71ee/doit-0.36.0-py3-none-any.whl#sha256=ebc285f6666871b5300091c26eafdff3de968a6bd60ea35dd1e3fc6f2e32479a +# pip jupyter-core @ https://files.pythonhosted.org/packages/2f/57/6bffd4b20b88da3800c5d691e0337761576ee688eb01299eae865689d2df/jupyter_core-5.8.1-py3-none-any.whl#sha256=c28d268fc90fb53f1338ded2eb410704c5449a358406e8a948b75706e24863d0 +# pip markdown-it-py @ https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl#sha256=355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 +# pip mistune @ https://files.pythonhosted.org/packages/01/4d/23c4e4f09da849e127e9f123241946c23c1e30f45a88366879e064211815/mistune-3.1.3-py3-none-any.whl#sha256=1a32314113cff28aa6432e99e522677c8587fd83e3d51c29b82a52409c842bd9 +# pip pyzmq @ https://files.pythonhosted.org/packages/c1/3e/2de5928cdadc2105e7c8f890cc5f404136b41ce5b6eae5902167f1d5641c/pyzmq-26.4.0-cp310-cp310-manylinux_2_28_x86_64.whl#sha256=7dacb06a9c83b007cc01e8e5277f94c95c453c5851aac5e83efe93e72226353f +# pip referencing @ https://files.pythonhosted.org/packages/c1/b1/3baf80dc6d2b7bc27a95a67752d0208e410351e3feb4eb78de5f77454d8d/referencing-0.36.2-py3-none-any.whl#sha256=e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0 +# pip rfc3339-validator @ https://files.pythonhosted.org/packages/7b/44/4e421b96b67b2daff264473f7465db72fbdf36a07e05494f50300cc7b0c6/rfc3339_validator-0.1.4-py2.py3-none-any.whl#sha256=24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa +# pip sphinxcontrib-sass @ 
https://files.pythonhosted.org/packages/3f/ec/194f2dbe55b3fe0941b43286c21abb49064d9d023abfb99305c79ad77cad/sphinxcontrib_sass-0.3.5-py2.py3-none-any.whl#sha256=850c83a36ed2d2059562504ccf496ca626c9c0bb89ec642a2d9c42105704bef6 +# pip terminado @ https://files.pythonhosted.org/packages/6a/9e/2064975477fdc887e47ad42157e214526dcad8f317a948dee17e1659a62f/terminado-0.18.1-py3-none-any.whl#sha256=a4468e1b37bb318f8a86514f65814e1afc977cf29b3992a4500d9dd305dcceb0 +# pip tinycss2 @ https://files.pythonhosted.org/packages/e6/34/ebdc18bae6aa14fbee1a08b63c015c72b64868ff7dae68808ab500c492e2/tinycss2-1.4.0-py3-none-any.whl#sha256=3a49cf47b7675da0b15d0c6e1df8df4ebd96e9394bb905a5775adb0d884c5289 +# pip argon2-cffi @ https://files.pythonhosted.org/packages/a4/6a/e8a041599e78b6b3752da48000b14c8d1e8a04ded09c88c714ba047f34f5/argon2_cffi-23.1.0-py3-none-any.whl#sha256=c670642b78ba29641818ab2e68bd4e6a78ba53b7eff7b4c3815ae16abf91c7ea +# pip bleach @ https://files.pythonhosted.org/packages/fc/55/96142937f66150805c25c4d0f31ee4132fd33497753400734f9dfdcbdc66/bleach-6.2.0-py3-none-any.whl#sha256=117d9c6097a7c3d22fd578fcd8d35ff1e125df6736f554da4e432fdd63f31e5e +# pip isoduration @ https://files.pythonhosted.org/packages/7b/55/e5326141505c5d5e34c5e0935d2908a74e4561eca44108fbfb9c13d2911a/isoduration-20.11.0-py3-none-any.whl#sha256=b2904c2a4228c3d44f409c8ae8e2370eb21a26f7ac2ec5446df141dde3452042 +# pip jsonschema-specifications @ https://files.pythonhosted.org/packages/01/0e/b27cdbaccf30b890c40ed1da9fd4a3593a5cf94dae54fb34f8a4b74fcd3f/jsonschema_specifications-2025.4.1-py3-none-any.whl#sha256=4653bffbd6584f7de83a67e0d620ef16900b390ddc7939d56684d6c81e33f1af +# pip jupyter-client @ https://files.pythonhosted.org/packages/11/85/b0394e0b6fcccd2c1eeefc230978a6f8cb0c5df1e4cd3e7625735a0d7d1e/jupyter_client-8.6.3-py3-none-any.whl#sha256=e8a19cc986cc45905ac3362915f410f3af85424b4c0905e94fa5f2cb08e8f23f +# pip jupyter-server-terminals @ https://files.pythonhosted.org/packages/07/2d/2b32cdbe8d2a602f697a649798554e4f072115438e92249624e532e8aca6/jupyter_server_terminals-0.5.3-py3-none-any.whl#sha256=41ee0d7dc0ebf2809c668e0fc726dfaf258fcd3e769568996ca731b6194ae9aa +# pip jupyterlite-core @ https://files.pythonhosted.org/packages/46/15/1d9160819d1e6e018d15de0e98b9297d0a09cfcfdc73add6e24ee3b2b83c/jupyterlite_core-0.5.1-py3-none-any.whl#sha256=76381619a632f06bf67fb47e5464af762ad8836df5ffe3d7e7ee0e316c1407ee +# pip mdit-py-plugins @ https://files.pythonhosted.org/packages/a7/f7/7782a043553ee469c1ff49cfa1cdace2d6bf99a1f333cf38676b3ddf30da/mdit_py_plugins-0.4.2-py3-none-any.whl#sha256=0c673c3f889399a33b95e88d2f0d111b4447bdfea7f237dab2d488f459835636 +# pip jsonschema @ https://files.pythonhosted.org/packages/a2/3d/023389198f69c722d039351050738d6755376c8fd343e91dc493ea485905/jsonschema-4.24.0-py3-none-any.whl#sha256=a462455f19f5faf404a7902952b6f0e3ce868f3ee09a359b05eca6673bd8412d +# pip jupyterlite-pyodide-kernel @ https://files.pythonhosted.org/packages/1b/b5/959a03ca011d1031abac03c18af9e767c18d6a9beb443eb106dda609748c/jupyterlite_pyodide_kernel-0.5.2-py3-none-any.whl#sha256=63ba6ce28d32f2cd19f636c40c153e171369a24189e11e2235457bd7000c5907 +# pip jupyter-events @ https://files.pythonhosted.org/packages/e2/48/577993f1f99c552f18a0428731a755e06171f9902fa118c379eb7c04ea22/jupyter_events-0.12.0-py3-none-any.whl#sha256=6464b2fa5ad10451c3d35fabc75eab39556ae1e2853ad0c0cc31b656731a97fb +# pip nbformat @ 
https://files.pythonhosted.org/packages/a9/82/0340caa499416c78e5d8f5f05947ae4bc3cba53c9f038ab6e9ed964e22f1/nbformat-5.10.4-py3-none-any.whl#sha256=3b48d6c8fbca4b299bf3982ea7db1af21580e4fec269ad087b9e81588891200b +# pip jupytext @ https://files.pythonhosted.org/packages/ed/f1/82ea8e783433707cafd9790099a2d19f113c22f32a31c8bb5abdc7a61dbb/jupytext-1.17.2-py3-none-any.whl#sha256=4f85dc43bb6a24b75491c5c434001ad5ef563932f68f15dd3e1c8ce12a4a426b +# pip nbclient @ https://files.pythonhosted.org/packages/34/6d/e7fa07f03a4a7b221d94b4d586edb754a9b0dc3c9e2c93353e9fa4e0d117/nbclient-0.10.2-py3-none-any.whl#sha256=4ffee11e788b4a27fabeb7955547e4318a5298f34342a4bfd01f2e1faaeadc3d +# pip nbconvert @ https://files.pythonhosted.org/packages/cc/9a/cd673b2f773a12c992f41309ef81b99da1690426bd2f96957a7ade0d3ed7/nbconvert-7.16.6-py3-none-any.whl#sha256=1375a7b67e0c2883678c48e506dc320febb57685e5ee67faa51b18a90f3a712b +# pip jupyter-server @ https://files.pythonhosted.org/packages/46/1f/5ebbced977171d09a7b0c08a285ff9a20aafb9c51bde07e52349ff1ddd71/jupyter_server-2.16.0-py3-none-any.whl#sha256=3d8db5be3bc64403b1c65b400a1d7f4647a5ce743f3b20dbdefe8ddb7b55af9e +# pip jupyterlab-server @ https://files.pythonhosted.org/packages/54/09/2032e7d15c544a0e3cd831c51d77a8ca57f7555b2e1b2922142eddb02a84/jupyterlab_server-2.27.3-py3-none-any.whl#sha256=e697488f66c3db49df675158a77b3b017520d772c6e1548c7d9bcc5df7944ee4 +# pip jupyterlite-sphinx @ https://files.pythonhosted.org/packages/b8/68/d35f70a5ae17b30da996c48138c2d655232c2ee839c881ef44587d75d0d3/jupyterlite_sphinx-0.20.1-py3-none-any.whl#sha256=6f477879e9793813b5ed554f08d87b2d949b68595ec5b7570332aa2d0fe0a8c1 diff --git a/build_tools/circle/doc_min_dependencies_environment.yml b/build_tools/circle/doc_min_dependencies_environment.yml new file mode 100644 index 0000000000000..1a93231019fbb --- /dev/null +++ b/build_tools/circle/doc_min_dependencies_environment.yml @@ -0,0 +1,42 @@ +# DO NOT EDIT: this file is generated from the specification found in the +# following script to centralize the configuration for CI builds: +# build_tools/update_environments_and_lock_files.py +channels: + - conda-forge +dependencies: + - python=3.10 + - numpy=1.22.0 # min + - blas + - scipy=1.8.0 # min + - cython=3.0.10 # min + - joblib + - threadpoolctl + - matplotlib=3.5.0 # min + - pandas=1.4.0 # min + - pyamg=4.2.1 # min + - pytest + - pytest-xdist + - pillow + - pip + - ninja + - meson-python + - scikit-image=0.19.0 # min + - seaborn + - memory_profiler + - compilers + - sphinx=7.3.7 # min + - sphinx-gallery=0.17.1 # min + - sphinx-copybutton=0.5.2 # min + - numpydoc=1.2.0 # min + - sphinx-prompt=1.4.0 # min + - plotly=5.14.0 # min + - polars=0.20.30 # min + - pooch=1.6.0 # min + - sphinx-remove-toctrees=1.0.0.post1 # min + - sphinx-design=0.6.0 # min + - pydata-sphinx-theme=0.15.3 # min + - towncrier=24.8.0 # min + - pip + - pip: + - sphinxext-opengraph==0.9.1 # min + - sphinxcontrib-sass==0.3.4 # min diff --git a/build_tools/circle/doc_min_dependencies_linux-64_conda.lock b/build_tools/circle/doc_min_dependencies_linux-64_conda.lock new file mode 100644 index 0000000000000..89d34a59f3b7b --- /dev/null +++ b/build_tools/circle/doc_min_dependencies_linux-64_conda.lock @@ -0,0 +1,296 @@ +# Generated by conda-lock. 
+# platform: linux-64 +# input_hash: cf86af2534e8e281654ed19bc893b468656b355b2b200b12321dbc61cce562db +@EXPLICIT +https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2#0c96522c6bdaed4b1566d11387caaf45 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2#34893075a5c9e55cdafac56607368fc6 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2#4d59c254e01d9cde7957100457e2d5fb +https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda#49023d73832ef61042f6a237cb2687e7 +https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-3.10.0-he073ed8_18.conda#ad8527bf134a90e1c9ed35fa0b64318c +https://conda.anaconda.org/conda-forge/noarch/python_abi-3.10-7_cp310.conda#44e871cba2b162368476a84b8d040b6c +https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda#4222072737ccff51314b5ece9c7d6f5a +https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.4.26-hbd8a1cb_0.conda#95db94f75ba080a22eb623590993167b +https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-0.tar.bz2#f766549260d6815b0c52253f1fb1bb29 +https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_4.conda#01f8d123c96816249efd255a31ad7712 +https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-13.3.0-hc03c837_102.conda#4c1d6961a6a54f602ae510d9bf31fa60 +https://conda.anaconda.org/conda-forge/linux-64/libglvnd-1.7.0-ha4b6fd6_2.conda#434ca7e50e40f4918ab701e3facd59a0 +https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_2.conda#fbe7d535ff9d3a168c148e07358cd5b1 +https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-13.3.0-hc03c837_102.conda#aa38de2738c5f4a72a880e3d31ffe8b4 +https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.17-h0157908_18.conda#460eba7851277ec1fd80a1a24080787a +https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d +https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.43-h4bf12b8_4.conda#ef67db625ad0d2dce398837102f875ed +https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2#fee5683a3f04bd15cbd8318b096a27ab +https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda#c151d5eb730e9b7480e6d48c0fc44048 +https://conda.anaconda.org/conda-forge/linux-64/libopengl-1.7.0-ha4b6fd6_2.conda#7df50d44d4a14d6c31a2c54f2cd92157 +https://conda.anaconda.org/conda-forge/linux-64/binutils-2.43-h4852527_4.conda#29782348a527eda3ecfc673109d28e93 +https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.43-h4852527_4.conda#c87e146f5b685672d4aa6b527c6d3b5e +https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_2.conda#ea8ac52380885ed41c1baa8f1d6d2b93 +https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.14-hb9d3cd8_0.conda#76df83c2a9035c54df5d04ff81bcc02d +https://conda.anaconda.org/conda-forge/linux-64/gettext-tools-0.24.1-h5888daf_0.conda#d54305672f0361c2f3886750e7165b5f +https://conda.anaconda.org/conda-forge/linux-64/libbrotlicommon-1.1.0-hb9d3cd8_2.conda#41b599ed2b02abcfdd84302bff174b23 +https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.24-h86f0d12_0.conda#64f0c503da58ec25ebd359e4d990afa8 
+https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.0-h5888daf_0.conda#db0bfbe7dd197b68ad5f30333bae6ce0 +https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda#ede4673863426c0883c0063d853bbd85 +https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_2.conda#ddca86c7040dd0e73b2b69bd7833d225 +https://conda.anaconda.org/conda-forge/linux-64/libgettextpo-0.24.1-h5888daf_0.conda#2ee6d71b72f75d50581f2f68e965efdb +https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.1.0-hcea5267_2.conda#01de444988ed960031dbe84cf4f9b1fc +https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h4ce23a2_1.conda#e796ff8ddc598affdf7c173d6145f087 +https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.0-hb9d3cd8_0.conda#9fa334557db9f63da6c9285fd2a48638 +https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_1.conda#a76fd702c93cd2dfd89eff30a5fd45a8 +https://conda.anaconda.org/conda-forge/linux-64/libntlm-1.8-hb9d3cd8_0.conda#7c7927b404672409d9917d49bff5f2d6 +https://conda.anaconda.org/conda-forge/linux-64/libogg-1.3.5-hd0c01bc_1.conda#68e52064ed3897463c0e958ab5c8f91b +https://conda.anaconda.org/conda-forge/linux-64/libopus-1.5.2-hd0c01bc_0.conda#b64523fb87ac6f87f0790f324ad43046 +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.1.0-h8f9b012_2.conda#1cb1c67961f6dd257eae9e9691b341aa +https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.5.0-h851e524_0.conda#63f790534398730f59e1b899c3644d4a +https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda#edb0dca6bc32e4f4789199455a1dbeb8 +https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda#47e340acb35de30501a76c7c799c41d7 +https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.0-h7b32b05_1.conda#de356753cfdbffcde5bb1e86e3aa6cd0 +https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-hb9d3cd8_1002.conda#b3c17d95b5a10c6e64a21fa17573e70e +https://conda.anaconda.org/conda-forge/linux-64/rav1e-0.7.1-h8fae777_3.conda#2c42649888aac645608191ffdc80d13a +https://conda.anaconda.org/conda-forge/linux-64/xorg-libice-1.1.2-hb9d3cd8_0.conda#fb901ff28063514abb6046c9ec2c4a45 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb9d3cd8_0.conda#f6ebe2cb3f82ba6c057dde5d9debe4f7 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb9d3cd8_0.conda#8035c64cb77ed555e3f150b7b3972480 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxshmfence-1.3.3-hb9d3cd8_0.conda#9a809ce9f65460195777f2f2116bae02 +https://conda.anaconda.org/conda-forge/linux-64/attr-2.5.1-h166bdaf_1.tar.bz2#d9c69a24ad678ffce24c6543a0176b00 +https://conda.anaconda.org/conda-forge/linux-64/blis-0.9.0-h4ab18f5_2.conda#6f77ba1352b69c4a6f8a6d20def30e4e +https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda#62ee74e96c5ebb0af99386de58cf9553 +https://conda.anaconda.org/conda-forge/linux-64/dav1d-1.2.1-hd590300_0.conda#418c6ca5929a611cbd69204907a83995 +https://conda.anaconda.org/conda-forge/linux-64/giflib-5.2.2-hd590300_0.conda#3bf7b9fd5a7136126e0234db4b87c8b6 +https://conda.anaconda.org/conda-forge/linux-64/jxrlib-1.1-hd590300_3.conda#5aeabe88534ea4169d4c49998f293d6c +https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2#30186d27e2c9fa62b45fb1476b7200e3 +https://conda.anaconda.org/conda-forge/linux-64/lame-3.100-h166bdaf_1003.tar.bz2#a8832b479f93521a9e7b5b743803be51 +https://conda.anaconda.org/conda-forge/linux-64/lerc-4.0.0-h0aef613_1.conda#9344155d33912347b37f0ae6c410a835 
+https://conda.anaconda.org/conda-forge/linux-64/libasprintf-0.24.1-h8e693c7_0.conda#57566a81dd1e5aa3d98ac7582e8bfe03 +https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.1.0-hb9d3cd8_2.conda#9566f0bd264fbd463002e759b8a82401 +https://conda.anaconda.org/conda-forge/linux-64/libbrotlienc-1.1.0-hb9d3cd8_2.conda#06f70867945ea6a84d35836af780f1de +https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda#c277e0a4d549b03ac1e9d6cbbe3d017b +https://conda.anaconda.org/conda-forge/linux-64/libevent-2.1.12-hf998b51_1.conda#a1cfcc585f0c42bf8d5546bb1dfb668d +https://conda.anaconda.org/conda-forge/linux-64/libgettextpo-devel-0.24.1-h5888daf_0.conda#8f04c7aae6a46503bc36d1ed5abc8c7c +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.1.0-h69a702a_2.conda#f92e6e0a3c0c0c85561ef61aa59d555d +https://conda.anaconda.org/conda-forge/linux-64/libgpg-error-1.55-h3f2d84a_0.conda#2bd47db5807daade8500ed7ca4c512a4 +https://conda.anaconda.org/conda-forge/linux-64/libhwy-1.2.0-hf40a0c7_0.conda#2f433d593a66044c3f163cb25f0a09de +https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda#30fd6e37fe21f86f4bd26d6ee73eeec7 +https://conda.anaconda.org/conda-forge/linux-64/libpciaccess-0.18-hd590300_0.conda#48f4330bfcd959c3cfb704d424903c82 +https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.47-h943b412_0.conda#55199e2ae2c3651f6f9b2a447b47bdc9 +https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-13.3.0-he8ea267_2.conda#2b6cdf7bb95d3d10ef4e38ce0bc95dba +https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.0-hee588c1_0.conda#71888e92098d0f8c41b09a671ad289bc +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.1.0-h4852527_2.conda#9d2072af184b5caa29492bf2344597bb +https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b +https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.17.0-h8a09558_0.conda#92ed62436b625154323d40d5f2f11dd7 +https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda#5aa797f8787fe7a17d1b0821485b5adc +https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.10.0-h5888daf_1.conda#9de5350a85c4a20c685259b889aa6393 +https://conda.anaconda.org/conda-forge/linux-64/mpg123-1.32.9-hc50e24c_0.conda#c7f302fd11eeb0987a6a5e1f3aed6a21 +https://conda.anaconda.org/conda-forge/linux-64/ninja-1.12.1-hff21bea_1.conda#2322531904f27501ee19847b87ba7c64 +https://conda.anaconda.org/conda-forge/linux-64/nspr-4.36-h5888daf_0.conda#de9cd5bca9e4918527b9b72b6e2e1409 +https://conda.anaconda.org/conda-forge/linux-64/pixman-0.46.0-h29eaf8c_0.conda#d2f1c87d4416d1e7344cf92b1aaee1c4 +https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda#283b96675859b20a825f8fa30f311446 +https://conda.anaconda.org/conda-forge/linux-64/snappy-1.2.1-h8bd8927_1.conda#3b3e64af585eadfb52bb90b553db5edf +https://conda.anaconda.org/conda-forge/linux-64/svt-av1-3.0.2-h5888daf_0.conda#0096882bd623e6cc09e8bf920fc8fb47 +https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda#a0116df4f4ed05c303811a837d5b39d8 +https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h7f98852_2.tar.bz2#4cb3ad778ec2d5a7acbdf254eb1c42ae +https://conda.anaconda.org/conda-forge/linux-64/zfp-1.0.1-h5888daf_2.conda#e0409515c467b87176b070bff5d9442e +https://conda.anaconda.org/conda-forge/linux-64/zlib-ng-2.2.4-h7955e40_0.conda#c8a816dbf59eb8ba6346a8f10014b302 
+https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb8e6e7a_2.conda#6432cb5d4ac0046c3ac0a8a0f95842f9 +https://conda.anaconda.org/conda-forge/linux-64/aom-3.9.1-hac33072_0.conda#346722a0be40f6edc53f12640d301338 +https://conda.anaconda.org/conda-forge/linux-64/blosc-1.21.6-he440d0b_1.conda#2c2fae981fd2afd00812c92ac47d023d +https://conda.anaconda.org/conda-forge/linux-64/brotli-bin-1.1.0-hb9d3cd8_2.conda#c63b5e52939e795ba8d26e35d767a843 +https://conda.anaconda.org/conda-forge/linux-64/c-blosc2-2.17.1-h3122c55_0.conda#009d16d3c9ed3e70d58ed46dab1571d1 +https://conda.anaconda.org/conda-forge/linux-64/charls-2.4.2-h59595ed_0.conda#4336bd67920dd504cd8c6761d6a99645 +https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-13.3.0-h1e990d8_2.conda#f46cf0acdcb6019397d37df1e407ab91 +https://conda.anaconda.org/conda-forge/linux-64/graphite2-1.3.13-h59595ed_1003.conda#f87c7b7c2cb45f323ffbce941c78ab7c +https://conda.anaconda.org/conda-forge/linux-64/icu-75.1-he02047a_0.conda#8b189310083baabfb622af68fd9d3ae3 +https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.3-h659f571_0.conda#3f43953b7d3fb3aaa1d0d0723d91e368 +https://conda.anaconda.org/conda-forge/linux-64/libaec-1.1.3-h59595ed_0.conda#5e97e271911b8b2001a8b71860c32faa +https://conda.anaconda.org/conda-forge/linux-64/libasprintf-devel-0.24.1-h8e693c7_0.conda#8f66ed2e34507b7ae44afa31c3e4ec79 +https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-31_h66dfbfd_blis.conda#612d513ce8103e41dbcb4d941a325027 +https://conda.anaconda.org/conda-forge/linux-64/libcap-2.75-h39aace5_0.conda#c44c16d6976d2aebbd65894d7741e67e +https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.124-hb9d3cd8_0.conda#8bc89311041d7fcb510238cf0848ccae +https://conda.anaconda.org/conda-forge/linux-64/libfreetype6-2.13.3-h48d6fc4_1.conda#3c255be50a506c50765a93a6644f32fe +https://conda.anaconda.org/conda-forge/linux-64/libgcrypt-lib-1.11.1-hb9d3cd8_0.conda#8504a291085c9fb809b66cabd5834307 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-15.1.0-h69a702a_2.conda#a483a87b71e974bb75d1b9413d4436dd +https://conda.anaconda.org/conda-forge/linux-64/libjxl-0.11.1-h7b0646d_1.conda#959fc2b6c0df7883e070b3fe525219a5 +https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.7.0-hf01ce69_5.conda#e79a094918988bb1807462cd42c83962 +https://conda.anaconda.org/conda-forge/linux-64/libvorbis-1.3.7-h9c3ff4c_0.tar.bz2#309dec04b70a3cc0f1e84a4013683bc0 +https://conda.anaconda.org/conda-forge/linux-64/libzopfli-1.0.3-h9c3ff4c_0.tar.bz2#c66fe2d123249af7651ebde8984c51c2 +https://conda.anaconda.org/conda-forge/linux-64/nss-3.112-h159eef7_0.conda#688a8bc02e57e6b741a040c84e931a7d +https://conda.anaconda.org/conda-forge/linux-64/pcre2-10.45-hc749103_0.conda#b90bece58b4c2bf25969b70f3be42d25 +https://conda.anaconda.org/conda-forge/linux-64/python-3.10.17-hd6af730_0_cpython.conda#7bb89638dae9ce1b8e051d0b721e83c2 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-0.4.1-hb711507_2.conda#8637c3e5821654d0edf97e2b0404b443 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-keysyms-0.4.1-hb711507_0.conda#ad748ccca349aec3e91743e08b5e2b50 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-renderutil-0.3.10-hb711507_0.conda#0e0cbe0564d03a99afd5fd7b362feecd +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-wm-0.4.2-hb711507_0.conda#608e0ef8256b81d04456e8d211eee3e8 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libsm-1.2.6-he73a12e_0.conda#1c74ff8c35dcadf952a16f752ca5aa49 
+https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.12-h4f16b4b_0.conda#db038ce880f100acc74dba10302b5630 +https://conda.anaconda.org/conda-forge/noarch/alabaster-0.7.16-pyhd8ed1ab_0.conda#def531a3ac77b7fb8c21d17bb5d0badb +https://conda.anaconda.org/conda-forge/noarch/appdirs-1.4.4-pyhd8ed1ab_1.conda#f4e90937bbfc3a4a92539545a37bb448 +https://conda.anaconda.org/conda-forge/linux-64/brotli-1.1.0-hb9d3cd8_2.conda#98514fe74548d768907ce7a13f680e8f +https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py310hf71b8c6_2.conda#bf502c169c71e3c6ac0d6175addfacc2 +https://conda.anaconda.org/conda-forge/noarch/certifi-2025.4.26-pyhd8ed1ab_0.conda#c33eeaaa33f45031be34cda513df39b6 +https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.2-pyhd8ed1ab_0.conda#40fe4284b8b5835a9073a645139f35af +https://conda.anaconda.org/conda-forge/noarch/click-8.2.1-pyh707e725_0.conda#94b550b8d3a614dbd326af798c7dfb40 +https://conda.anaconda.org/conda-forge/noarch/cloudpickle-3.1.1-pyhd8ed1ab_0.conda#364ba6c9fb03886ac979b482f39ebb92 +https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda#962b9857ee8e7018c22f2776ffa0b2d7 +https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_1.conda#44600c4667a319d67dbe0681fc0bc833 +https://conda.anaconda.org/conda-forge/linux-64/cyrus-sasl-2.1.27-h54b06d7_7.conda#dce22f70b4e5a407ce88f2be046f4ceb +https://conda.anaconda.org/conda-forge/linux-64/cython-3.0.10-py310hc6cd4ac_0.conda#bd1d71ee240be36f1d85c86177d6964f +https://conda.anaconda.org/conda-forge/noarch/docutils-0.21.2-pyhd8ed1ab_1.conda#24c1ca34138ee57de72a943237cde4cc +https://conda.anaconda.org/conda-forge/noarch/execnet-2.1.1-pyhd8ed1ab_1.conda#a71efeae2c160f6789900ba2631a2c90 +https://conda.anaconda.org/conda-forge/noarch/fsspec-2025.5.1-pyhd8ed1ab_0.conda#2d2c9ef879a7e64e2dc657b09272c2b6 +https://conda.anaconda.org/conda-forge/linux-64/gcc-13.3.0-h9576a4e_2.conda#d92e51bf4b6bdbfe45e5884fb0755afe +https://conda.anaconda.org/conda-forge/linux-64/gcc_linux-64-13.3.0-hc28eda2_10.conda#d151142bbafe5e68ec7fc065c5e6f80c +https://conda.anaconda.org/conda-forge/linux-64/gettext-0.24.1-h5888daf_0.conda#c63e7590d4d6f4c85721040ed8b12888 +https://conda.anaconda.org/conda-forge/linux-64/gfortran_impl_linux-64-13.3.0-h84c1745_2.conda#4e21ed177b76537067736f20f54fee0a +https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-13.3.0-hae580e1_2.conda#b55f02540605c322a47719029f8404cc +https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda#0a802cb9888dd14eeefc611f05c40b6e +https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda#8e6923fc12f1fe8f8c4e5c9f343256ac +https://conda.anaconda.org/conda-forge/noarch/idna-3.10-pyhd8ed1ab_1.conda#39a4f67be3286c86d696df570b1201b7 +https://conda.anaconda.org/conda-forge/noarch/imagesize-1.4.1-pyhd8ed1ab_0.tar.bz2#7de5386c8fea29e76b303f37dde4c352 +https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_1.conda#6837f3eff7dcea42ecd714ce1ac2b108 +https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.4.7-py310h3788b33_0.conda#4186d9b4d004b0fe0de6aa62496fb48a +https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.17-h717163a_0.conda#000e85703f0fd9594c81710dd5066471 +https://conda.anaconda.org/conda-forge/linux-64/libavif16-1.3.0-h766b0b6_0.conda#f17f2d0e5c9ad6b958547fd67b155771 +https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-31_hba4ea11_blis.conda#1ea7ae3db0fea0c5222388d841583c51 
+https://conda.anaconda.org/conda-forge/linux-64/libcups-2.3.3-h4637d8d_4.conda#d4529f4dff3057982a7617c7ac58fde3 +https://conda.anaconda.org/conda-forge/linux-64/libfreetype-2.13.3-ha770c72_1.conda#51f5be229d83ecd401fb369ab96ae669 +https://conda.anaconda.org/conda-forge/linux-64/libglib-2.84.2-h3618099_0.conda#072ab14a02164b7c0c089055368ff776 +https://conda.anaconda.org/conda-forge/linux-64/libglx-1.7.0-ha4b6fd6_2.conda#c8013e438185f33b13814c5c488acd5c +https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-12_hd37a5e2_netlib.conda#4b181b55915cefcd35c8398c9274e629 +https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.4-h4e0b6ca_1.conda#04bcf3055e51f8dde6fab9672fb9fca0 +https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.13.8-h4bc477f_0.conda#14dbe05b929e329dbaa6f2d0aa19466d +https://conda.anaconda.org/conda-forge/noarch/locket-1.0.0-pyhd8ed1ab_0.tar.bz2#91e27ef3d05cc772ce627e51cff111c4 +https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py310h89163eb_1.conda#8ce3f0332fd6de0d737e2911d329523f +https://conda.anaconda.org/conda-forge/noarch/meson-1.8.1-pyhe01879c_0.conda#f3cccd9a6ce5331ae33f69ade5529162 +https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyh9f0ad1d_0.tar.bz2#2ba8498c1018c1e9c61eb99b973dfe19 +https://conda.anaconda.org/conda-forge/noarch/networkx-3.2-pyhd8ed1ab_0.conda#cec8cc498664cc00a070676aa89e69a7 +https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.3-h5fbd93e_0.conda#9e5816bc95d285c115a3ebc2f8563564 +https://conda.anaconda.org/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda#58335b26c38bf4a20f399384c33cbcf9 +https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhd8ed1ab_0.conda#7da7ccd349dbf6487a7778579d2bb971 +https://conda.anaconda.org/conda-forge/noarch/ply-3.11-pyhd8ed1ab_3.conda#fd5062942bfa1b0bd5e0d2a4397b099e +https://conda.anaconda.org/conda-forge/linux-64/psutil-7.0.0-py310ha75aee5_0.conda#da7d592394ff9084a23f62a1186451a2 +https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda#12c566707c80111f9799308d9e265aef +https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.1-pyhd8ed1ab_0.conda#232fb4577b6687b2d503ef8e254270c9 +https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.2.3-pyhd8ed1ab_1.conda#513d3c262ee49b54a8fec85c5bc99764 +https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda#461219d1a5bd61342293efa2c0c90eac +https://conda.anaconda.org/conda-forge/noarch/pytz-2025.2-pyhd8ed1ab_0.conda#bc8e3267d44011051f2eb14d22fb0960 +https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.2-py310h89163eb_2.conda#fd343408e64cf1e273ab7c710da374db +https://conda.anaconda.org/conda-forge/noarch/setuptools-80.9.0-pyhff2d567_0.conda#4de79c071274a53dcaf2a8c749d1499e +https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhd8ed1ab_0.conda#a451d576819089b0d672f18768be0f65 +https://conda.anaconda.org/conda-forge/noarch/snowballstemmer-3.0.1-pyhd8ed1ab_0.conda#755cf22df8693aa0d1aec1c123fa5863 +https://conda.anaconda.org/conda-forge/noarch/soupsieve-2.7-pyhd8ed1ab_0.conda#fb32097c717486aa34b38a9db57eb49e +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-jsmath-1.0.1-pyhd8ed1ab_1.conda#fa839b5ff59e192f411ccc7dae6588bb +https://conda.anaconda.org/conda-forge/noarch/tenacity-9.1.2-pyhd8ed1ab_0.conda#5d99943f2ae3cc69e1ada12ce9d4d701 +https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-3.6.0-pyhecae5ae_0.conda#9d64911b31d57ca443e9f1e36b04385f 
+https://conda.anaconda.org/conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_1.conda#b0dd904de08b7db706167240bf37b164 +https://conda.anaconda.org/conda-forge/noarch/tomli-2.2.1-pyhd8ed1ab_1.conda#ac944244f1fed2eb49bae07193ae8215 +https://conda.anaconda.org/conda-forge/noarch/toolz-1.0.0-pyhd8ed1ab_1.conda#40d0ed782a8aaa16ef248e68c06c168d +https://conda.anaconda.org/conda-forge/linux-64/tornado-6.5.1-py310ha75aee5_0.conda#6f3da1072c0c4d2a1beb1e84615f7c9c +https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.13.2-pyh29332c3_0.conda#83fc6ae00127671e301c9f44254c31b8 +https://conda.anaconda.org/conda-forge/linux-64/unicodedata2-16.0.0-py310ha75aee5_0.conda#1d7a4b9202cdd10d56ecdd7f6c347190 +https://conda.anaconda.org/conda-forge/noarch/wheel-0.45.1-pyhd8ed1ab_1.conda#75cb7132eb58d97896e173ef12ac9986 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-image-0.4.0-hb711507_2.conda#a0901183f08b6c7107aab109733a3c91 +https://conda.anaconda.org/conda-forge/linux-64/xkeyboard-config-2.44-hb9d3cd8_0.conda#7c91bfc90672888259675ad2ad28af9c +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxext-1.3.6-hb9d3cd8_0.conda#febbab7d15033c913d53c7a2c102309d +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxfixes-6.0.1-hb9d3cd8_0.conda#4bdb303603e9821baf5fe5fdff1dc8f8 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrender-0.9.12-hb9d3cd8_0.conda#96d57aba173e878a2089d5638016dc5e +https://conda.anaconda.org/conda-forge/noarch/zipp-3.22.0-pyhd8ed1ab_0.conda#234be740b00b8e41567e5b0ed95aaba9 +https://conda.anaconda.org/conda-forge/noarch/accessible-pygments-0.0.5-pyhd8ed1ab_1.conda#74ac5069774cdbc53910ec4d631a3999 +https://conda.anaconda.org/conda-forge/noarch/babel-2.17.0-pyhd8ed1ab_0.conda#0a01c169f0ab0f91b26e77a3301fbfe4 +https://conda.anaconda.org/conda-forge/linux-64/brunsli-0.1-h9c3ff4c_0.tar.bz2#c1ac6229d0bfd14f8354ff9ad2a26cad +https://conda.anaconda.org/conda-forge/linux-64/c-compiler-1.9.0-h2b85faf_0.conda#3cb814f83f1f71ac1985013697f80cc1 +https://conda.anaconda.org/conda-forge/linux-64/cffi-1.17.1-py310h8deb56e_0.conda#1fc24a3196ad5ede2a68148be61894f4 +https://conda.anaconda.org/conda-forge/linux-64/cytoolz-1.0.1-py310ha75aee5_0.conda#d0be1adaa04a03aed745f3d02afb59ce +https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h3c4dab8_0.conda#679616eb5ad4e521c83da4650860aba7 +https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.0-pyhd8ed1ab_0.conda#72e42d28960d875c7654614f8b50939a +https://conda.anaconda.org/conda-forge/linux-64/fonttools-4.58.1-py310h89163eb_0.conda#f4f46207c6defa5ea17b0299298ba849 +https://conda.anaconda.org/conda-forge/linux-64/freetype-2.13.3-ha770c72_1.conda#9ccd736d31e0c6e41f54e704e5312811 +https://conda.anaconda.org/conda-forge/linux-64/gfortran-13.3.0-h9576a4e_2.conda#19e6d3c9cde10a0a9a170a684082588e +https://conda.anaconda.org/conda-forge/linux-64/gfortran_linux-64-13.3.0-hb919d3a_10.conda#7ce070e3329cd10bf79dbed562a21bd4 +https://conda.anaconda.org/conda-forge/linux-64/glib-tools-2.84.2-h4833e2c_0.conda#f2ec1facec64147850b7674633978050 +https://conda.anaconda.org/conda-forge/linux-64/gxx-13.3.0-h9576a4e_2.conda#07e8df00b7cd3084ad3ef598ce32a71c +https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-13.3.0-h6834431_10.conda#9a8ebde471cec5cc9c48f8682f434f92 +https://conda.anaconda.org/conda-forge/noarch/h2-4.2.0-pyhd8ed1ab_0.conda#b4754fb1bdcb70c8fd54f918301582c6 +https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda#63ccfdc3a3ce25b027b8767eb722fca8 
+https://conda.anaconda.org/conda-forge/noarch/importlib_resources-6.5.2-pyhd8ed1ab_0.conda#c85c76dc67d75619a92f51dfbce06992 +https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda#446bd6c8cb26050d528881df495ce646 +https://conda.anaconda.org/conda-forge/noarch/joblib-1.5.1-pyhd8ed1ab_0.conda#fb1c14694de51a476ce8636d92b6f42c +https://conda.anaconda.org/conda-forge/linux-64/libflac-1.4.3-h59595ed_0.conda#ee48bf17cc83a00f59ca1494d5646869 +https://conda.anaconda.org/conda-forge/linux-64/libgl-1.7.0-ha4b6fd6_2.conda#928b8be80851f5d8ffb016f9c81dae7a +https://conda.anaconda.org/conda-forge/linux-64/liblapacke-3.9.0-12_hce4cc19_netlib.conda#bdcf65db13abdddba7af29592f93600b +https://conda.anaconda.org/conda-forge/linux-64/libllvm20-20.1.6-he9d0ab4_0.conda#bf8ccdd2c1c1a54a3fa25bb61f26460e +https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.10.0-h65c71a3_0.conda#fedf6bfe5d21d21d2b1785ec00a8889a +https://conda.anaconda.org/conda-forge/noarch/memory_profiler-0.61.0-pyhd8ed1ab_1.conda#71abbefb6f3b95e1668cd5e0af3affb9 +https://conda.anaconda.org/conda-forge/linux-64/numpy-1.22.0-py310h454958d_1.tar.bz2#607c66f0cce2986515a8fe9e136b2b57 +https://conda.anaconda.org/conda-forge/linux-64/openldap-2.6.10-he970967_0.conda#2e5bf4f1da39c0b32778561c3c4e5878 +https://conda.anaconda.org/conda-forge/noarch/partd-1.4.2-pyhd8ed1ab_0.conda#0badf9c54e24cecfb0ad2f99d680c163 +https://conda.anaconda.org/conda-forge/linux-64/pillow-11.2.1-py310h7e6dc6c_0.conda#5645a243d90adb50909b9edc209d84fe +https://conda.anaconda.org/conda-forge/noarch/pip-25.1.1-pyh8b19718_0.conda#32d0781ace05105cc99af55d36cbec7c +https://conda.anaconda.org/conda-forge/noarch/plotly-5.14.0-pyhd8ed1ab_0.conda#6a7bcc42ef58dd6cf3da9333ea102433 +https://conda.anaconda.org/conda-forge/noarch/pyproject-metadata-0.9.1-pyhd8ed1ab_0.conda#22ae7c6ea81e0c8661ef32168dda929b +https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhff2d567_1.conda#5ba79d7c71f03c678c8ead841f347d6e +https://conda.anaconda.org/conda-forge/linux-64/sip-6.10.0-py310hf71b8c6_0.conda#2d7e4445be227e8210140b75725689ad +https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.13.2-h0e9735f_0.conda#568ed1300869dca0ba09fb750cda5dbb +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxcomposite-0.4.6-hb9d3cd8_2.conda#d3c295b50f092ab525ffe3c2aa4b7413 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdamage-1.1.6-hb9d3cd8_0.conda#b5fcc7172d22516e1f965490e65e33a4 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxxf86vm-1.1.6-hb9d3cd8_0.conda#5efa5fa6243a622445fdfd72aee15efa +https://conda.anaconda.org/conda-forge/noarch/beautifulsoup4-4.13.4-pyha770c72_0.conda#9f07c4fc992adb2d6c30da7fab3959a7 +https://conda.anaconda.org/conda-forge/linux-64/blas-devel-3.9.0-31_hdec4247_blis.conda#1675e95a742c910204645f7b6d7e56dc +https://conda.anaconda.org/conda-forge/linux-64/cxx-compiler-1.9.0-h1a2810e_0.conda#1ce8b218d359d9ed0ab481f2a3f3c512 +https://conda.anaconda.org/conda-forge/noarch/dask-core-2025.5.1-pyhd8ed1ab_0.conda#8f0ef561cd615a17df3256742a3457c4 +https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.15.0-h7e30c49_1.conda#8f5b0b297b59e1ac160ad4beec99dbee +https://conda.anaconda.org/conda-forge/linux-64/fortran-compiler-1.9.0-h36df796_0.conda#cc0cf942201f9d3b0e9654ea02e12486 +https://conda.anaconda.org/conda-forge/linux-64/glib-2.84.2-h6287aef_0.conda#704648df3a01d4d24bc2c0466b718d63 +https://conda.anaconda.org/conda-forge/linux-64/imagecodecs-2025.3.30-py310h481ba9f_0.conda#453c8da1b70f7b76b3884e18015bc568 
+https://conda.anaconda.org/conda-forge/noarch/imageio-2.37.0-pyhfb79c49_0.conda#b5577bc2212219566578fd5af9993af6 +https://conda.anaconda.org/conda-forge/noarch/importlib-resources-6.5.2-pyhd8ed1ab_0.conda#e376ea42e9ae40f3278b0f79c9bf9826 +https://conda.anaconda.org/conda-forge/linux-64/libclang-cpp20.1-20.1.6-default_h1df26ce_0.conda#99ead3b974685e44df8b1e3953503cfc +https://conda.anaconda.org/conda-forge/linux-64/libclang13-20.1.6-default_he06ed0a_0.conda#cc6c469d9d7fc0ac106cef5f45d973a9 +https://conda.anaconda.org/conda-forge/linux-64/libpq-17.5-h27ae623_0.conda#6458be24f09e1b034902ab44fe9de908 +https://conda.anaconda.org/conda-forge/linux-64/libsndfile-1.2.2-hc60ed4a_1.conda#ef1910918dd895516a769ed36b5b3a4e +https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.5.0-py310h23f4a51_0.tar.bz2#9911225650b298776c8e8c083b5cacf1 +https://conda.anaconda.org/conda-forge/noarch/meson-python-0.18.0-pyh70fd9c4_0.conda#576c04b9d9f8e45285fb4d9452c26133 +https://conda.anaconda.org/conda-forge/linux-64/pandas-1.4.0-py310hb5077e9_0.tar.bz2#43e920bc9856daa7d8d18fcbfb244c4e +https://conda.anaconda.org/conda-forge/noarch/patsy-1.0.1-pyhd8ed1ab_1.conda#ee23fabfd0a8c6b8d6f3729b47b2859d +https://conda.anaconda.org/conda-forge/linux-64/polars-0.20.30-py310h031f9ce_0.conda#0743f5db9f978b6df92d412935ff8371 +https://conda.anaconda.org/conda-forge/linux-64/pyqt5-sip-12.17.0-py310hf71b8c6_0.conda#012465861673a67a30bc8ca6284074f3 +https://conda.anaconda.org/conda-forge/noarch/pytest-8.3.5-pyhd8ed1ab_0.conda#c3c9316209dec74a705a36797970c6be +https://conda.anaconda.org/conda-forge/linux-64/pywavelets-1.6.0-py310h261611a_0.conda#04a405ee0bccb4de8d1ed0c87704f5f6 +https://conda.anaconda.org/conda-forge/linux-64/scipy-1.8.0-py310hea5193d_1.tar.bz2#664d80ddeb51241629b3ada5ea926e4d +https://conda.anaconda.org/conda-forge/linux-64/zstandard-0.23.0-py310ha75aee5_2.conda#f9254b5b0193982416b91edcb4b2676f +https://conda.anaconda.org/conda-forge/linux-64/blas-2.131-blis.conda#87829e6b9fe49a926280e100959b7d2b +https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-h3394656_0.conda#09262e66b19567aff4f592fb53b28760 +https://conda.anaconda.org/conda-forge/linux-64/compilers-1.9.0-ha770c72_0.conda#5859096e397aba423340d0bbbb11ec64 +https://conda.anaconda.org/conda-forge/linux-64/gstreamer-1.24.11-hc37bda9_0.conda#056d86cacf2b48c79c6a562a2486eb8c +https://conda.anaconda.org/conda-forge/linux-64/pulseaudio-client-17.0-hac146a9_1.conda#66b1fa9608d8836e25f9919159adc9c6 +https://conda.anaconda.org/conda-forge/linux-64/pyamg-4.2.1-py310h7c3ba0c_0.tar.bz2#89f5a48e1f23b5cf3163a6094903d181 +https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.7.0-pyhd8ed1ab_0.conda#15353a2a0ea6dfefaa52fc5ab5b98f41 +https://conda.anaconda.org/conda-forge/noarch/seaborn-base-0.13.2-pyhd8ed1ab_3.conda#fd96da444e81f9e6fcaac38590f3dd42 +https://conda.anaconda.org/conda-forge/linux-64/statsmodels-0.14.2-py310h261611a_0.conda#4b8508bab02b2aa2cef12eab4883f4a1 +https://conda.anaconda.org/conda-forge/noarch/tifffile-2025.5.10-pyhd8ed1ab_0.conda#1fdb801f28bf4987294c49aaa314bf5e +https://conda.anaconda.org/conda-forge/noarch/towncrier-24.8.0-pyhd8ed1ab_1.conda#820b6a1ddf590fba253f8204f7200d82 +https://conda.anaconda.org/conda-forge/noarch/urllib3-2.4.0-pyhd8ed1ab_0.conda#c1e349028e0052c4eea844e94f773065 +https://conda.anaconda.org/conda-forge/linux-64/gst-plugins-base-1.24.11-h651a532_0.conda#d8d8894f8ced2c9be76dc9ad1ae531ce +https://conda.anaconda.org/conda-forge/linux-64/harfbuzz-11.2.1-h3beb420_0.conda#0e6e192d4b3d95708ad192d957cf3163 
+https://conda.anaconda.org/conda-forge/noarch/requests-2.32.3-pyhd8ed1ab_1.conda#a9b9368f3701a417eac9edbcae7cb737 +https://conda.anaconda.org/conda-forge/linux-64/scikit-image-0.19.0-py310hb5077e9_0.tar.bz2#aa24b3a4aa979641ac3144405209cd89 +https://conda.anaconda.org/conda-forge/noarch/seaborn-0.13.2-hd8ed1ab_3.conda#62afb877ca2c2b4b6f9ecb37320085b6 +https://conda.anaconda.org/conda-forge/noarch/pooch-1.6.0-pyhd8ed1ab_0.tar.bz2#6429e1d1091c51f626b5dcfdd38bf429 +https://conda.anaconda.org/conda-forge/linux-64/qt-main-5.15.15-hea1682b_4.conda#c054d7f22cc719e12c72d454b2328d6c +https://conda.anaconda.org/conda-forge/linux-64/pyqt-5.15.11-py310hf392a12_0.conda#65924d3e57be25342c76530d23d75f0f +https://conda.anaconda.org/conda-forge/linux-64/matplotlib-3.5.0-py310hff52083_0.tar.bz2#1b2f3b135d5d9c594b5e0e6150c03b7b +https://conda.anaconda.org/conda-forge/noarch/numpydoc-1.2-pyhd8ed1ab_0.tar.bz2#025ad7ca2c7f65007ab6b6f5d93a56eb +https://conda.anaconda.org/conda-forge/noarch/pydata-sphinx-theme-0.15.3-pyhd8ed1ab_0.conda#55e445f4fcb07f2471fb0e1102d36488 +https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda#bf22cb9c439572760316ce0748af3713 +https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.6.0-pyhd8ed1ab_0.conda#b04f3c04e4f7939c6207dc0c0355f468 +https://conda.anaconda.org/conda-forge/noarch/sphinx-gallery-0.17.1-pyhd8ed1ab_0.conda#0adfccc6e7269a29a63c1c8ee3c6d8ba +https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.4.0-pyhd8ed1ab_0.tar.bz2#88ee91e8679603f2a5bd036d52919cc2 +https://conda.anaconda.org/conda-forge/noarch/sphinx-remove-toctrees-1.0.0.post1-pyhd8ed1ab_1.conda#b275c865b753413caaa8548b9d44c024 +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-applehelp-2.0.0-pyhd8ed1ab_1.conda#16e3f039c0aa6446513e94ab18a8784b +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-devhelp-2.0.0-pyhd8ed1ab_1.conda#910f28a05c178feba832f842155cbfff +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-htmlhelp-2.1.0-pyhd8ed1ab_1.conda#e9fb3fe8a5b758b4aff187d434f94f03 +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-qthelp-2.0.0-pyhd8ed1ab_1.conda#00534ebcc0375929b45c3039b5ba7636 +https://conda.anaconda.org/conda-forge/noarch/sphinx-7.3.7-pyhd8ed1ab_0.conda#7b1465205e28d75d2c0e1a868ee00a67 +https://conda.anaconda.org/conda-forge/noarch/sphinxcontrib-serializinghtml-1.1.10-pyhd8ed1ab_1.conda#3bc61f7161d28137797e038263c04c54 +# pip libsass @ https://files.pythonhosted.org/packages/fd/5a/eb5b62641df0459a3291fc206cf5bd669c0feed7814dded8edef4ade8512/libsass-0.23.0-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.whl#sha256=4a218406d605f325d234e4678bd57126a66a88841cb95bee2caeafdc6f138306 +# pip sphinxcontrib-sass @ https://files.pythonhosted.org/packages/2e/87/7c2eb08e3ca1d6baae32c0a5e005330fe1cec93a36aa085e714c3b3a3c7d/sphinxcontrib_sass-0.3.4-py2.py3-none-any.whl#sha256=a0c79a44ae8b8935c02dc340ebe40c9e002c839331201c899dc93708970c355a +# pip sphinxext-opengraph @ https://files.pythonhosted.org/packages/92/0a/970b80b4fa1feeb6deb6f2e22d4cb14e388b27b315a1afdb9db930ff91a4/sphinxext_opengraph-0.9.1-py3-none-any.whl#sha256=b3b230cc6a5b5189139df937f0d9c7b23c7c204493b22646273687969dcb760e diff --git a/build_tools/circle/download_documentation.sh b/build_tools/circle/download_documentation.sh new file mode 100755 index 0000000000000..c2d6d09d0abb9 --- /dev/null +++ b/build_tools/circle/download_documentation.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e +set -x + +wget $GITHUB_ARTIFACT_URL +mkdir -p doc/_build/html/stable +unzip 
doc*.zip -d doc/_build/html/stable
diff --git a/build_tools/circle/linting.sh b/build_tools/circle/linting.sh
deleted file mode 100755
index aebe42dfecc70..0000000000000
--- a/build_tools/circle/linting.sh
+++ /dev/null
@@ -1,179 +0,0 @@
-#!/bin/bash
-
-# This script is used in CircleCI to check that PRs do not add obvious
-# flake8 violations. It relies on two things:
-#   - find common ancestor between branch and
-#     scikit-learn/scikit-learn remote
-#   - run flake8 --diff on the diff between the branch and the common
-#     ancestor
-#
-# Additional features:
-#   - the line numbers in Travis match the local branch on the PR
-#     author machine.
-#   - ./build_tools/circle/flake8_diff.sh can be run locally for quick
-#     turn-around
-
-set -e
-# pipefail is necessary to propagate exit codes
-set -o pipefail
-
-PROJECT=scikit-learn/scikit-learn
-PROJECT_URL=https://github.com/$PROJECT.git
-
-# Find the remote with the project name (upstream in most cases)
-REMOTE=$(git remote -v | grep $PROJECT | cut -f1 | head -1 || echo '')
-
-# Add a temporary remote if needed. For example this is necessary when
-# Travis is configured to run in a fork. In this case 'origin' is the
-# fork and not the reference repo we want to diff against.
-if [[ -z "$REMOTE" ]]; then
-    TMP_REMOTE=tmp_reference_upstream
-    REMOTE=$TMP_REMOTE
-    git remote add $REMOTE $PROJECT_URL
-fi
-
-echo "Remotes:"
-echo '--------------------------------------------------------------------------------'
-git remote --verbose
-
-# Travis does the git clone with a limited depth (50 at the time of
-# writing). This may not be enough to find the common ancestor with
-# $REMOTE/main so we unshallow the git checkout
-if [[ -a .git/shallow ]]; then
-    echo -e '\nTrying to unshallow the repo:'
-    echo '--------------------------------------------------------------------------------'
-    git fetch --unshallow
-fi
-
-if [[ "$TRAVIS" == "true" ]]; then
-    if [[ "$TRAVIS_PULL_REQUEST" == "false" ]]
-    then
-        # In main repo, using TRAVIS_COMMIT_RANGE to test the commits
-        # that were pushed into a branch
-        if [[ "$PROJECT" == "$TRAVIS_REPO_SLUG" ]]; then
-            if [[ -z "$TRAVIS_COMMIT_RANGE" ]]; then
-                echo "New branch, no commit range from Travis so passing this test by convention"
-                exit 0
-            fi
-            COMMIT_RANGE=$TRAVIS_COMMIT_RANGE
-        fi
-    else
-        # We want to fetch the code as it is in the PR branch and not
-        # the result of the merge into main. This way line numbers
-        # reported by Travis will match with the local code.
-        LOCAL_BRANCH_REF=travis_pr_$TRAVIS_PULL_REQUEST
-        # In Travis the PR target is always origin
-        git fetch origin pull/$TRAVIS_PULL_REQUEST/head:refs/$LOCAL_BRANCH_REF
-    fi
-fi
-
-# If not using the commit range from Travis we need to find the common
-# ancestor between $LOCAL_BRANCH_REF and $REMOTE/main
-if [[ -z "$COMMIT_RANGE" ]]; then
-    if [[ -z "$LOCAL_BRANCH_REF" ]]; then
-        LOCAL_BRANCH_REF=$(git rev-parse --abbrev-ref HEAD)
-    fi
-    echo -e "\nLast 2 commits in $LOCAL_BRANCH_REF:"
-    echo '--------------------------------------------------------------------------------'
-    git --no-pager log -2 $LOCAL_BRANCH_REF
-
-    REMOTE_MAIN_REF="$REMOTE/main"
-    # Make sure that $REMOTE_MAIN_REF is a valid reference
-    echo -e "\nFetching $REMOTE_MAIN_REF"
-    echo '--------------------------------------------------------------------------------'
-    git fetch $REMOTE main:refs/remotes/$REMOTE_MAIN_REF
-    LOCAL_BRANCH_SHORT_HASH=$(git rev-parse --short $LOCAL_BRANCH_REF)
-    REMOTE_MAIN_SHORT_HASH=$(git rev-parse --short $REMOTE_MAIN_REF)
-
-    COMMIT=$(git merge-base $LOCAL_BRANCH_REF $REMOTE_MAIN_REF) || \
-        echo "No common ancestor found for $(git show $LOCAL_BRANCH_REF -q) and $(git show $REMOTE_MAIN_REF -q)"
-
-    if [ -z "$COMMIT" ]; then
-        exit 1
-    fi
-
-    COMMIT_SHORT_HASH=$(git rev-parse --short $COMMIT)
-
-    echo -e "\nCommon ancestor between $LOCAL_BRANCH_REF ($LOCAL_BRANCH_SHORT_HASH)"\
-        "and $REMOTE_MAIN_REF ($REMOTE_MAIN_SHORT_HASH) is $COMMIT_SHORT_HASH:"
-    echo '--------------------------------------------------------------------------------'
-    git --no-pager show --no-patch $COMMIT_SHORT_HASH
-
-    COMMIT_RANGE="$COMMIT_SHORT_HASH..$LOCAL_BRANCH_SHORT_HASH"
-
-    if [[ -n "$TMP_REMOTE" ]]; then
-        git remote remove $TMP_REMOTE
-    fi
-
-else
-    echo "Got the commit range from Travis: $COMMIT_RANGE"
-fi
-
-echo -e '\nRunning flake8 on the diff in the range' "$COMMIT_RANGE" \
-    "($(git rev-list $COMMIT_RANGE | wc -l) commit(s)):"
-echo '--------------------------------------------------------------------------------'
-
-# We ignore files from sklearn/externals. Unfortunately there is no
-# way to do it with flake8 directly (the --exclude does not seem to
-# work with --diff). We could use the exclude magic in the git pathspec
-# ':!sklearn/externals' but it is only available on git 1.9 and Travis
-# uses git 1.8.
-# We need the following command to exit with 0 hence the echo in case
-# there is no match
-MODIFIED_FILES="$(git diff --name-only $COMMIT_RANGE | grep -v 'sklearn/externals' | \
-    grep -v 'doc/sphinxext' || echo "no_match")"
-
-check_files() {
-    files="$1"
-    shift
-    options="$*"
-    if [ -n "$files" ]; then
-        # Conservative approach: diff without context (--unified=0) so that code
-        # that was not changed does not create failures
-        git diff --unified=0 $COMMIT_RANGE -- $files | flake8 --diff --show-source $options
-    fi
-}
-
-if [[ "$MODIFIED_FILES" == "no_match" ]]; then
-    echo "No file outside sklearn/externals and doc/sphinxext has been modified"
-else
-    check_files "$MODIFIED_FILES"
-    # check code for unused imports
-    flake8 --exclude=sklearn/externals/ --select=F401 sklearn/ examples/
-fi
-echo -e "No problem detected by flake8\n"
-
-# For docstrings and warnings of deprecated attributes to be rendered
-# properly, the property decorator must come before the deprecated decorator
-# (else they are treated as functions)
-
-# do not error when grep -B1 "@property" finds nothing
-set +e
-bad_deprecation_property_order=`git grep -A 10 "@property" -- "*.py" | awk '/@property/,/def /' | grep -B1 "@deprecated"`
-
-if [ ! -z "$bad_deprecation_property_order" ]
-then
-    echo "property decorator should come before deprecated decorator"
-    echo "found the following occurrencies:"
-    echo $bad_deprecation_property_order
-    exit 1
-fi
-
-# Check for default doctest directives ELLIPSIS and NORMALIZE_WHITESPACE
-
-doctest_directive="$(git grep -nw -E "# doctest\: \+(ELLIPSIS|NORMALIZE_WHITESPACE)")"
-
-if [ ! -z "$doctest_directive" ]
-then
-    echo "ELLIPSIS and NORMALIZE_WHITESPACE doctest directives are enabled by default, but were found in:"
-    echo "$doctest_directive"
-    exit 1
-fi
-
-joblib_import="$(git grep -l -A 10 -E "joblib import.+delayed" -- "*.py" ":!sklearn/utils/_joblib.py" ":!sklearn/utils/fixes.py")"
-
-if [ ! -z "$joblib_import" ]; then
-    echo "Use from sklearn.utils.fixes import delayed instead of joblib delayed. The following files contains imports to joblib.delayed:"
-    echo "$joblib_import"
-    exit 1
-fi
diff --git a/build_tools/circle/list_versions.py b/build_tools/circle/list_versions.py
index 19bee5ae1cfc7..00526f062f200 100755
--- a/build_tools/circle/list_versions.py
+++ b/build_tools/circle/list_versions.py
@@ -1,19 +1,24 @@
 #!/usr/bin/env python3
-# List all available versions of the documentation
+# Write the available versions page (--rst) and the version switcher JSON (--json).
+# Version switcher see: +# https://pydata-sphinx-theme.readthedocs.io/en/stable/user_guide/version-dropdown.html +# https://pydata-sphinx-theme.readthedocs.io/en/stable/user_guide/announcements.html#announcement-banners + +import argparse import json import re import sys - -from distutils.version import LooseVersion from urllib.request import urlopen +from sklearn.utils.fixes import parse_version + def json_urlread(url): try: - return json.loads(urlopen(url).read().decode('utf8')) + return json.loads(urlopen(url).read().decode("utf8")) except Exception: - print('Error reading', url, file=sys.stderr) + print("Error reading", url, file=sys.stderr) raise @@ -21,8 +26,7 @@ def human_readable_data_quantity(quantity, multiple=1024): # https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size if quantity == 0: quantity = +0 - SUFFIXES = ["B"] + [i + {1000: "B", 1024: "iB"}[multiple] - for i in "KMGTPEZY"] + SUFFIXES = ["B"] + [i + {1000: "B", 1024: "iB"}[multiple] for i in "KMGTPEZY"] for suffix in SUFFIXES: if quantity < multiple or suffix == SUFFIXES[-1]: if suffix == SUFFIXES[0]: @@ -34,55 +38,60 @@ def human_readable_data_quantity(quantity, multiple=1024): def get_file_extension(version): - if 'dev' in version: - # The 'dev' branch should be explictly handled - return 'zip' + if "dev" in version: + # The 'dev' branch should be explicitly handled + return "zip" - current_version = LooseVersion(version) - min_zip_version = LooseVersion('0.24') + current_version = parse_version(version) + min_zip_version = parse_version("0.24") - return 'zip' if current_version >= min_zip_version else 'pdf' + return "zip" if current_version >= min_zip_version else "pdf" def get_file_size(version): - api_url = ROOT_URL + '%s/_downloads' % version + api_url = ROOT_URL + "%s/_downloads" % version for path_details in json_urlread(api_url): file_extension = get_file_extension(version) - file_path = f'scikit-learn-docs.{file_extension}' - if path_details['name'] == file_path: - return human_readable_data_quantity(path_details['size'], 1000) - - -print(':orphan:') -print() -heading = 'Available documentation for Scikit-learn' -print(heading) -print('=' * len(heading)) -print() -print('Web-based documentation is available for versions listed below:') -print() - -ROOT_URL = 'https://api.github.com/repos/scikit-learn/scikit-learn.github.io/contents/' # noqa -RAW_FMT = 'https://raw.githubusercontent.com/scikit-learn/scikit-learn.github.io/master/%s/index.html' # noqa + file_path = f"scikit-learn-docs.{file_extension}" + if path_details["name"] == file_path: + return human_readable_data_quantity(path_details["size"], 1000) + + +parser = argparse.ArgumentParser() +parser.add_argument("--rst", type=str, required=True) +parser.add_argument("--json", type=str, required=True) +args = parser.parse_args() + +heading = "Available documentation for scikit-learn" +json_content = [] +rst_content = [ + ":orphan:\n", + heading, + "=" * len(heading) + "\n", + "Web-based documentation is available for versions listed below:\n", +] + +ROOT_URL = "https://api.github.com/repos/scikit-learn/scikit-learn.github.io/contents/" +RAW_FMT = "https://raw.githubusercontent.com/scikit-learn/scikit-learn.github.io/master/%s/index.html" VERSION_RE = re.compile(r"scikit-learn ([\w\.\-]+) documentation") -NAMED_DIRS = ['dev', 'stable'] +NAMED_DIRS = ["dev", "stable"] # Gather data for each version directory, including symlinks dirs = {} symlinks = {} root_listing = json_urlread(ROOT_URL) for path_details in 
root_listing: - name = path_details['name'] + name = path_details["name"] if not (name[:1].isdigit() or name in NAMED_DIRS): continue - if path_details['type'] == 'dir': - html = urlopen(RAW_FMT % name).read().decode('utf8') + if path_details["type"] == "dir": + html = urlopen(RAW_FMT % name).read().decode("utf8") version_num = VERSION_RE.search(html).group(1) file_size = get_file_size(name) dirs[name] = (version_num, file_size) - if path_details['type'] == 'symlink': - symlinks[name] = json_urlread(path_details['_links']['self'])['target'] + if path_details["type"] == "symlink": + symlinks[name] = json_urlread(path_details["_links"]["self"])["target"] # Symlinks should have same data as target @@ -92,21 +101,42 @@ def get_file_size(version): # Output in order: dev, stable, decreasing other version seen = set() -for name in (NAMED_DIRS + - sorted((k for k in dirs if k[:1].isdigit()), - key=LooseVersion, reverse=True)): +for i, name in enumerate( + NAMED_DIRS + + sorted((k for k in dirs if k[:1].isdigit()), key=parse_version, reverse=True) +): version_num, file_size = dirs[name] if version_num in seen: # symlink came first continue else: seen.add(version_num) - name_display = '' if name[:1].isdigit() else ' (%s)' % name - path = 'https://scikit-learn.org/%s/' % name - out = ('* `Scikit-learn %s%s documentation <%s>`_' - % (version_num, name_display, path)) + + full_name = f"{version_num}" if name[:1].isdigit() else f"{version_num} ({name})" + path = f"https://scikit-learn.org/{name}/" + + # Update JSON for the version switcher; only keep the 8 latest versions to avoid + # overloading the version switcher dropdown + if i < 8: + info = {"name": full_name, "version": version_num, "url": path} + if name == "stable": + info["preferred"] = True + json_content.append(info) + + # Printout for the historical version page + out = f"* `scikit-learn {full_name} documentation <{path}>`_" if file_size is not None: file_extension = get_file_extension(version_num) - out += (f' (`{file_extension.upper()} {file_size} <{path}/' - f'_downloads/scikit-learn-docs.{file_extension}>`_)') - print(out) + out += ( + f" (`{file_extension.upper()} {file_size} <{path}/" + f"_downloads/scikit-learn-docs.{file_extension}>`_)" + ) + rst_content.append(out) + +with open(args.rst, "w", encoding="utf-8") as f: + f.write("\n".join(rst_content) + "\n") +print(f"Written {args.rst}") + +with open(args.json, "w", encoding="utf-8") as f: + json.dump(json_content, f, indent=2) +print(f"Written {args.json}") diff --git a/build_tools/circle/push_doc.sh b/build_tools/circle/push_doc.sh index 5b94211e4e30e..f959b8b65c85c 100755 --- a/build_tools/circle/push_doc.sh +++ b/build_tools/circle/push_doc.sh @@ -1,8 +1,8 @@ #!/bin/bash # This script is meant to be called in the "deploy" step defined in -# circle.yml. See https://circleci.com/docs/ for more details. +# .circleci/config.yml. See https://circleci.com/docs/ for more details. # The behavior of the script is controlled by environment variable defined -# in the circle.yml in the top level folder of the project. +# in the .circleci/config.yml file. 
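Stepping back to `list_versions.py` above: its `--json` output feeds the pydata-sphinx-theme version switcher. The sketch below shows the shape of the entries it writes; only the `name`/`version`/`url`/`preferred` keys come from the script, while the concrete version numbers and the file name `versions.json` are placeholders.

```python
import json

# Illustrative switcher entries in the shape produced by list_versions.py --json.
# The version numbers below are placeholders, not a statement about releases.
switcher = [
    {"name": "1.6 (dev)", "version": "1.6", "url": "https://scikit-learn.org/dev/"},
    {
        "name": "1.5 (stable)",
        "version": "1.5",
        "url": "https://scikit-learn.org/stable/",
        "preferred": True,  # only the entry behind the "stable" symlink gets this flag
    },
    {"name": "1.4", "version": "1.4", "url": "https://scikit-learn.org/1.4/"},
]

with open("versions.json", "w", encoding="utf-8") as f:
    json.dump(switcher, f, indent=2)
```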
set -ex @@ -56,7 +56,7 @@ then git rm -rf $dir/ && rm -rf $dir/ fi cp -R $GENERATED_DOC_DIR $dir -git config user.email "olivier.grisel+sklearn-ci@gmail.com" +git config user.email "ci@scikit-learn.org" git config user.name $USERNAME git config push.default matching git add -f $dir/ diff --git a/build_tools/codespell_ignore_words.txt b/build_tools/codespell_ignore_words.txt new file mode 100644 index 0000000000000..6b942a2eabe6d --- /dev/null +++ b/build_tools/codespell_ignore_words.txt @@ -0,0 +1,56 @@ +achin +aggresive +aline +ba +basf +boun +bre +bu +cach +chanel +complies +coo +copys +datas +deine +didi +feld +fo +fpr +fro +fwe +gool +hart +heping +hist +ines +inout +ist +jaques +lamas +linke +lod +mape +mis +mor +nd +nmae +ocur +pullrequest +repid +ro +ser +soler +suh +suprised +te +technic +teh +thi +usal +vie +vor +wan +whis +winn +yau diff --git a/build_tools/generate_authors_table.py b/build_tools/generate_authors_table.py index f8b1191d14d9b..6dcddda40af4d 100644 --- a/build_tools/generate_authors_table.py +++ b/build_tools/generate_authors_table.py @@ -6,27 +6,31 @@ The table should be updated for each new inclusion in the teams. Generating the table requires admin rights. """ -import sys -import requests + import getpass +import sys import time -from pathlib import Path from os import path +from pathlib import Path + +import requests -print("user:", file=sys.stderr) +print("Input user:", file=sys.stderr) user = input() -token = getpass.getpass("access token:\n") +token = getpass.getpass("Input access token:\n") auth = (user, token) -LOGO_URL = 'https://avatars2.githubusercontent.com/u/365630?v=4' +LOGO_URL = "https://avatars2.githubusercontent.com/u/365630?v=4" REPO_FOLDER = Path(path.abspath(__file__)).parent.parent def get(url): for sleep_time in [10, 30, 0]: reply = requests.get(url, auth=auth) - api_limit = ("message" in reply.json() - and "API rate limit exceeded" in reply.json()["message"]) + api_limit = ( + "message" in reply.json() + and "API rate limit exceeded" in reply.json()["message"] + ) if not api_limit: break print("API rate limit exceeded, waiting..") @@ -38,55 +42,113 @@ def get(url): def get_contributors(): """Get the list of contributor profiles. 
Require admin rights.""" - # get core devs and triage team + # get core devs and contributor experience team core_devs = [] - triage_team = [] - for team_id, lst in zip((11523, 3593183), (core_devs, triage_team)): + documentation_team = [] + contributor_experience_team = [] + comm_team = [] + core_devs_slug = "core-devs" + contributor_experience_team_slug = "contributor-experience-team" + comm_team_slug = "communication-team" + documentation_team_slug = "documentation-team" + + entry_point = "https://api.github.com/orgs/scikit-learn/" + + for team_slug, lst in zip( + ( + core_devs_slug, + contributor_experience_team_slug, + comm_team_slug, + documentation_team_slug, + ), + (core_devs, contributor_experience_team, comm_team, documentation_team), + ): + print(f"Retrieving {team_slug}\n") for page in [1, 2]: # 30 per page - reply = get( - f"https://api.github.com/teams/{team_id}/members?page={page}" - ) + reply = get(f"{entry_point}teams/{team_slug}/members?page={page}") lst.extend(reply.json()) # get members of scikit-learn on GitHub + print("Retrieving members\n") members = [] - for page in [1, 2]: # 30 per page - reply = get( - "https://api.github.com/orgs/scikit-learn/members?page=%d" % - (page, )) + for page in [1, 2, 3]: # 30 per page + reply = get(f"{entry_point}members?page={page}") members.extend(reply.json()) # keep only the logins - core_devs = set(c['login'] for c in core_devs) - triage_team = set(c['login'] for c in triage_team) - members = set(c['login'] for c in members) + core_devs = set(c["login"] for c in core_devs) + documentation_team = set(c["login"] for c in documentation_team) + contributor_experience_team = set(c["login"] for c in contributor_experience_team) + comm_team = set(c["login"] for c in comm_team) + members = set(c["login"] for c in members) # add missing contributors with GitHub accounts - members |= {'dubourg', 'mbrucher', 'thouis', 'jarrodmillman'} + members |= {"dubourg", "mbrucher", "thouis", "jarrodmillman"} # add missing contributors without GitHub accounts - members |= {'Angel Soler Gollonet'} + members |= {"Angel Soler Gollonet"} # remove CI bots - members -= {'sklearn-ci', 'sklearn-lgtm', 'sklearn-wheels'} - triage_team -= core_devs # remove ogrisel from triage_team + members -= {"sklearn-ci", "sklearn-wheels", "sklearn-lgtm"} + contributor_experience_team -= ( + core_devs # remove ogrisel from contributor_experience_team + ) + + emeritus = ( + members + - core_devs + - contributor_experience_team + - comm_team + - documentation_team + ) + + # hard coded + emeritus_contributor_experience_team = { + "cmarmo", + } + emeritus_comm_team = {"reshamas"} + + # Up-to-now, we can subtract the team emeritus from the original emeritus + emeritus -= emeritus_contributor_experience_team | emeritus_comm_team - emeritus = members - core_devs - triage_team + comm_team -= {"reshamas"} # in the comm team but not on the web page # get profiles from GitHub core_devs = [get_profile(login) for login in core_devs] emeritus = [get_profile(login) for login in emeritus] - triage_team = [get_profile(login) for login in triage_team] + contributor_experience_team = [ + get_profile(login) for login in contributor_experience_team + ] + emeritus_contributor_experience_team = [ + get_profile(login) for login in emeritus_contributor_experience_team + ] + comm_team = [get_profile(login) for login in comm_team] + emeritus_comm_team = [get_profile(login) for login in emeritus_comm_team] + documentation_team = [get_profile(login) for login in documentation_team] # sort by last name 
core_devs = sorted(core_devs, key=key) emeritus = sorted(emeritus, key=key) - triage_team = sorted(triage_team, key=key) - - return core_devs, emeritus, triage_team + contributor_experience_team = sorted(contributor_experience_team, key=key) + emeritus_contributor_experience_team = sorted( + emeritus_contributor_experience_team, key=key + ) + documentation_team = sorted(documentation_team, key=key) + comm_team = sorted(comm_team, key=key) + emeritus_comm_team = sorted(emeritus_comm_team, key=key) + + return ( + core_devs, + emeritus, + contributor_experience_team, + emeritus_contributor_experience_team, + comm_team, + emeritus_comm_team, + documentation_team, + ) def get_profile(login): """Get the GitHub profile from login""" - print("get profile for %s" % (login, )) + print("get profile for %s" % (login,)) try: profile = get("https://api.github.com/users/%s" % login).json() except requests.exceptions.HTTPError: @@ -97,11 +159,11 @@ def get_profile(login): # fix missing names missing_names = { - 'bthirion': 'Bertrand Thirion', - 'dubourg': 'Vincent Dubourg', - 'Duchesnay': 'Edouard Duchesnay', - 'Lars': 'Lars Buitinck', - 'MechCoder': 'Manoj Kumar', + "bthirion": "Bertrand Thirion", + "dubourg": "Vincent Dubourg", + "Duchesnay": "Edouard Duchesnay", + "Lars": "Lars Buitinck", + "MechCoder": "Manoj Kumar", } if profile["name"] in missing_names: profile["name"] = missing_names[profile["name"]] @@ -111,46 +173,83 @@ def get_profile(login): def key(profile): """Get a sorting key based on the lower case last name, then firstname""" - components = profile["name"].lower().split(' ') + components = profile["name"].lower().split(" ") return " ".join([components[-1]] + components[:-1]) def generate_table(contributors): lines = [ - (".. raw :: html\n"), - (" "), - ("
"), - (" "), + ".. raw :: html\n", + " ", + '
', + " ", ] for contributor in contributors: lines.append("
") lines.append( - "
" % - (contributor["html_url"], contributor["avatar_url"])) - lines.append("

%s

" % (contributor["name"], )) + "
" + % (contributor["html_url"], contributor["avatar_url"]) + ) + lines.append("

%s

" % (contributor["name"],)) lines.append("
") lines.append("
") - return '\n'.join(lines) + return "\n".join(lines) + "\n" def generate_list(contributors): lines = [] for contributor in contributors: - lines.append("- %s" % (contributor["name"], )) - return '\n'.join(lines) + lines.append("- %s" % (contributor["name"],)) + return "\n".join(lines) + "\n" if __name__ == "__main__": - - core_devs, emeritus, triage_team = get_contributors() - - with open(REPO_FOLDER / "doc" / "authors.rst", "w+") as rst_file: + ( + core_devs, + emeritus, + contributor_experience_team, + emeritus_contributor_experience_team, + comm_team, + emeritus_comm_team, + documentation_team, + ) = get_contributors() + + print("Generating rst files") + with open( + REPO_FOLDER / "doc" / "maintainers.rst", "w+", encoding="utf-8" + ) as rst_file: rst_file.write(generate_table(core_devs)) - with open(REPO_FOLDER / "doc" / "authors_emeritus.rst", "w+") as rst_file: + with open( + REPO_FOLDER / "doc" / "maintainers_emeritus.rst", "w+", encoding="utf-8" + ) as rst_file: rst_file.write(generate_list(emeritus)) - with open(REPO_FOLDER / "doc" / "triage_team.rst", "w+") as rst_file: - rst_file.write(generate_table(triage_team)) + with open( + REPO_FOLDER / "doc" / "contributor_experience_team.rst", "w+", encoding="utf-8" + ) as rst_file: + rst_file.write(generate_table(contributor_experience_team)) + + with open( + REPO_FOLDER / "doc" / "contributor_experience_team_emeritus.rst", + "w+", + encoding="utf-8", + ) as rst_file: + rst_file.write(generate_list(emeritus_contributor_experience_team)) + + with open( + REPO_FOLDER / "doc" / "communication_team.rst", "w+", encoding="utf-8" + ) as rst_file: + rst_file.write(generate_table(comm_team)) + + with open( + REPO_FOLDER / "doc" / "communication_team_emeritus.rst", "w+", encoding="utf-8" + ) as rst_file: + rst_file.write(generate_list(emeritus_comm_team)) + + with open( + REPO_FOLDER / "doc" / "documentation_team.rst", "w+", encoding="utf-8" + ) as rst_file: + rst_file.write(generate_table(documentation_team)) diff --git a/build_tools/get_comment.py b/build_tools/get_comment.py new file mode 100644 index 0000000000000..48ff14a058c9a --- /dev/null +++ b/build_tools/get_comment.py @@ -0,0 +1,351 @@ +# This script is used to generate a comment for a PR when linting issues are +# detected. It is used by the `Comment on failed linting` GitHub Action. +# This script fails if there are not comments to be posted. + +import os + +import requests + + +def get_versions(versions_file): + """Get the versions of the packages used in the linter job. + + Parameters + ---------- + versions_file : str + The path to the file that contains the versions of the packages. + + Returns + ------- + versions : dict + A dictionary with the versions of the packages. + """ + with open("versions.txt", "r") as f: + return dict(line.strip().split("=") for line in f) + + +def get_step_message(log, start, end, title, message, details): + """Get the message for a specific test. + + Parameters + ---------- + log : str + The log of the linting job. + + start : str + The string that marks the start of the test. + + end : str + The string that marks the end of the test. + + title : str + The title for this section. + + message : str + The message to be added at the beginning of the section. + + details : bool + Whether to add the details of each step. + + Returns + ------- + message : str + The message to be added to the comment. 
+ """ + if end not in log: + return "" + res = ( + f"-----------------------------------------------\n### {title}\n\n{message}\n\n" + ) + if details: + res += ( + "
\n\n```\n" + + log[log.find(start) + len(start) + 1 : log.find(end) - 1] + + "\n```\n\n
\n\n" + ) + return res + + +def get_message(log_file, repo, pr_number, sha, run_id, details, versions): + with open(log_file, "r") as f: + log = f.read() + + sub_text = ( + "\n\n _Generated for commit:" + f" [{sha[:7]}](https://github.com/{repo}/pull/{pr_number}/commits/{sha}). " + "Link to the linter CI: [here]" + f"(https://github.com/{repo}/actions/runs/{run_id})_ " + ) + + if "### Linting completed ###" not in log: + return ( + "## ❌ Linting issues\n\n" + "There was an issue running the linter job. Please update with " + "`upstream/main` ([link](" + "https://scikit-learn.org/dev/developers/contributing.html" + "#how-to-contribute)) and push the changes. If you already have done " + "that, please send an empty commit with `git commit --allow-empty` " + "and push the changes to trigger the CI.\n\n" + sub_text + ) + + message = "" + + # ruff check + message += get_step_message( + log, + start="### Running the ruff linter ###", + end="Problems detected by ruff check", + title="`ruff check`", + message=( + "`ruff` detected issues. Please run " + "`ruff check --fix --output-format=full` locally, fix the remaining " + "issues, and push the changes. Here you can see the detected issues. Note " + f"that the installed `ruff` version is `ruff={versions['ruff']}`." + ), + details=details, + ) + + # ruff format + message += get_step_message( + log, + start="### Running the ruff formatter ###", + end="Problems detected by ruff format", + title="`ruff format`", + message=( + "`ruff` detected issues. Please run `ruff format` locally and push " + "the changes. Here you can see the detected issues. Note that the " + f"installed `ruff` version is `ruff={versions['ruff']}`." + ), + details=details, + ) + + # mypy + message += get_step_message( + log, + start="### Running mypy ###", + end="Problems detected by mypy", + title="`mypy`", + message=( + "`mypy` detected issues. Please fix them locally and push the changes. " + "Here you can see the detected issues. Note that the installed `mypy` " + f"version is `mypy={versions['mypy']}`." + ), + details=details, + ) + + # cython-lint + message += get_step_message( + log, + start="### Running cython-lint ###", + end="Problems detected by cython-lint", + title="`cython-lint`", + message=( + "`cython-lint` detected issues. Please fix them locally and push " + "the changes. Here you can see the detected issues. Note that the " + "installed `cython-lint` version is " + f"`cython-lint={versions['cython-lint']}`." + ), + details=details, + ) + + # deprecation order + message += get_step_message( + log, + start="### Checking for bad deprecation order ###", + end="Problems detected by deprecation order check", + title="Deprecation Order", + message=( + "Deprecation order check detected issues. Please fix them locally and " + "push the changes. Here you can see the detected issues." + ), + details=details, + ) + + # doctest directives + message += get_step_message( + log, + start="### Checking for default doctest directives ###", + end="Problems detected by doctest directive check", + title="Doctest Directives", + message=( + "doctest directive check detected issues. Please fix them locally and " + "push the changes. Here you can see the detected issues." + ), + details=details, + ) + + # joblib imports + message += get_step_message( + log, + start="### Checking for joblib imports ###", + end="Problems detected by joblib import check", + title="Joblib Imports", + message=( + "`joblib` import check detected issues. Please fix them locally and " + "push the changes. 
Here you can see the detected issues." + ), + details=details, + ) + + if not message: + # no issues detected, so this script "fails" + return ( + "## âœ”ī¸ Linting Passed\n" + "All linting checks passed. Your pull request is in excellent shape! â˜€ī¸" + + sub_text + ) + + if not details: + # This happens if posting the log fails, which happens if the log is too + # long. Typically, this happens if the PR branch hasn't been updated + # since we've introduced import sorting. + branch_not_updated = ( + "_Merging with `upstream/main` might fix / improve the issues if you " + "haven't done that since 21.06.2023._\n\n" + ) + else: + branch_not_updated = "" + + message = ( + "## ❌ Linting issues\n\n" + + branch_not_updated + + "This PR is introducing linting issues. Here's a summary of the issues. " + + "Note that you can avoid having linting issues by enabling `pre-commit` " + + "hooks. Instructions to enable them can be found [here](" + + "https://scikit-learn.org/dev/developers/contributing.html#how-to-contribute)" + + ".\n\n" + + "You can see the details of the linting issues under the `lint` job [here]" + + f"(https://github.com/{repo}/actions/runs/{run_id})\n\n" + + message + + sub_text + ) + + return message + + +def get_headers(token): + """Get the headers for the GitHub API.""" + return { + "Accept": "application/vnd.github+json", + "Authorization": f"Bearer {token}", + "X-GitHub-Api-Version": "2022-11-28", + } + + +def find_lint_bot_comments(repo, token, pr_number): + """Get the comment from the linting bot.""" + # repo is in the form of "org/repo" + # API doc: https://docs.github.com/en/rest/issues/comments?apiVersion=2022-11-28#list-issue-comments + response = requests.get( + f"https://api.github.com/repos/{repo}/issues/{pr_number}/comments", + headers=get_headers(token), + ) + response.raise_for_status() + all_comments = response.json() + + failed_comment = "❌ Linting issues" + success_comment = "âœ”ī¸ Linting Passed" + + # Find all comments that match the linting bot, and return the first one. + # There should always be only one such comment, or none, if the PR is + # just created. + comments = [ + comment + for comment in all_comments + if comment["user"]["login"] == "github-actions[bot]" + and (failed_comment in comment["body"] or success_comment in comment["body"]) + ] + + if len(all_comments) > 25 and not comments: + # By default the API returns the first 30 comments. If we can't find the + # comment created by the bot in those, then we raise and we skip creating + # a comment in the first place. 
+ raise RuntimeError("Comment not found in the first 30 comments.") + + return comments[0] if comments else None + + +def create_or_update_comment(comment, message, repo, pr_number, token): + """Create a new comment or update existing one.""" + # repo is in the form of "org/repo" + if comment is not None: + print("updating existing comment") + # API doc: https://docs.github.com/en/rest/issues/comments?apiVersion=2022-11-28#update-an-issue-comment + response = requests.patch( + f"https://api.github.com/repos/{repo}/issues/comments/{comment['id']}", + headers=get_headers(token), + json={"body": message}, + ) + else: + print("creating new comment") + # API doc: https://docs.github.com/en/rest/issues/comments?apiVersion=2022-11-28#create-an-issue-comment + response = requests.post( + f"https://api.github.com/repos/{repo}/issues/{pr_number}/comments", + headers=get_headers(token), + json={"body": message}, + ) + + response.raise_for_status() + + +if __name__ == "__main__": + repo = os.environ["GITHUB_REPOSITORY"] + token = os.environ["GITHUB_TOKEN"] + pr_number = os.environ["PR_NUMBER"] + sha = os.environ["BRANCH_SHA"] + log_file = os.environ["LOG_FILE"] + run_id = os.environ["RUN_ID"] + versions_file = os.environ["VERSIONS_FILE"] + + versions = get_versions(versions_file) + + if not repo or not token or not pr_number or not log_file or not run_id: + raise ValueError( + "One of the following environment variables is not set: " + "GITHUB_REPOSITORY, GITHUB_TOKEN, PR_NUMBER, LOG_FILE, RUN_ID" + ) + + try: + comment = find_lint_bot_comments(repo, token, pr_number) + except RuntimeError: + print("Comment not found in the first 30 comments. Skipping!") + exit(0) + + try: + message = get_message( + log_file, + repo=repo, + pr_number=pr_number, + sha=sha, + run_id=run_id, + details=True, + versions=versions, + ) + create_or_update_comment( + comment=comment, + message=message, + repo=repo, + pr_number=pr_number, + token=token, + ) + print(message) + except requests.HTTPError: + # The above fails if the message is too long. In that case, we + # try again without the details. 
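Before the fallback below retries without details, it is worth illustrating how `get_step_message` above builds each section: it slices the linter log between a per-check start marker and end marker. A self-contained sketch with a fabricated log follows; the two marker strings mirror the ones used for the ruff check in this file, while the finding line is made up.

```python
# Fabricated linter log; only the two marker lines mirror what the lint job prints
# around the ruff check output.
log = (
    "### Running the ruff linter ###\n"
    "sklearn/foo.py:1:1: F401 `os` imported but unused\n"
    "Problems detected by ruff check\n"
)
start = "### Running the ruff linter ###"
end = "Problems detected by ruff check"

# Same slice expression as get_step_message(): skip the start marker and its
# trailing newline, stop just before the newline preceding the end marker.
section = log[log.find(start) + len(start) + 1 : log.find(end) - 1]
print(section)  # -> sklearn/foo.py:1:1: F401 `os` imported but unused
```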
+ message = get_message( + log_file, + repo=repo, + pr_number=pr_number, + sha=sha, + run_id=run_id, + details=False, + versions=versions, + ) + create_or_update_comment( + comment=comment, + message=message, + repo=repo, + pr_number=pr_number, + token=token, + ) + print(message) diff --git a/build_tools/github/Windows b/build_tools/github/Windows deleted file mode 100644 index 5ba35f790ca5e..0000000000000 --- a/build_tools/github/Windows +++ /dev/null @@ -1,15 +0,0 @@ -# Get the Python version of the base image from a build argument -ARG PYTHON_VERSION -FROM winamd64/python:$PYTHON_VERSION-windowsservercore - -ARG WHEEL_NAME -ARG CONFTEST_NAME -ARG CIBW_TEST_REQUIRES - -# Copy and install the Windows wheel -COPY $WHEEL_NAME $WHEEL_NAME -COPY $CONFTEST_NAME $CONFTEST_NAME -RUN pip install $env:WHEEL_NAME - -# Install the testing dependencies -RUN pip install $env:CIBW_TEST_REQUIRES.split(" ") diff --git a/build_tools/github/build_minimal_windows_image.sh b/build_tools/github/build_minimal_windows_image.sh old mode 100644 new mode 100755 index 67a6180952591..8cc9af937dfd9 --- a/build_tools/github/build_minimal_windows_image.sh +++ b/build_tools/github/build_minimal_windows_image.sh @@ -4,28 +4,48 @@ set -e set -x PYTHON_VERSION=$1 -BITNESS=$2 -if [[ "$PYTHON_VERSION" == "36" || "$BITNESS" == "32" ]]; then - # Python 3.6 and 32-bit architectures are not supported - # by the official Docker images: Tests will just be run - # on the host (instead of the minimal Docker container). - exit 0 -fi +FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" + +if [[ $FREE_THREADED_BUILD == "False" ]]; then + # Prepare a minimal Windows environment without any developer runtime libraries + # installed to check that the scikit-learn wheel does not implicitly rely on + # external DLLs when running the tests. + TEMP_FOLDER="$HOME/AppData/Local/Temp" + WHEEL_PATH=$(ls -d $TEMP_FOLDER/**/*/repaired_wheel/*) + WHEEL_NAME=$(basename $WHEEL_PATH) + + cp $WHEEL_PATH $WHEEL_NAME -TEMP_FOLDER="$HOME/AppData/Local/Temp" -WHEEL_PATH=$(ls -d $TEMP_FOLDER/*/repaired_wheel/*) -WHEEL_NAME=$(basename $WHEEL_PATH) + # Dot the Python version for identifying the base Docker image + PYTHON_DOCKER_IMAGE_PART=$(echo ${PYTHON_VERSION:0:1}.${PYTHON_VERSION:1:2}) -cp $WHEEL_PATH $WHEEL_NAME + if [[ "$CIBW_PRERELEASE_PYTHONS" =~ [tT]rue ]]; then + PYTHON_DOCKER_IMAGE_PART="${PYTHON_DOCKER_IMAGE_PART}-rc" + fi -# Dot the Python version for identyfing the base Docker image -PYTHON_VERSION=$(echo ${PYTHON_VERSION:0:1}.${PYTHON_VERSION:1:2}) + # We could have all of the following logic in a Dockerfile but it's a lot + # easier to do it in bash rather than figure out how to do it in Powershell + # inside the Dockerfile ... + DOCKER_IMAGE="winamd64/python:${PYTHON_DOCKER_IMAGE_PART}-windowsservercore" + MNT_FOLDER="C:/mnt" + CONTAINER_ID=$(docker run -it -v "$(cygpath -w $PWD):$MNT_FOLDER" -d $DOCKER_IMAGE) -# Build a minimal Windows Docker image for testing the wheels -docker build --build-arg PYTHON_VERSION=$PYTHON_VERSION \ - --build-arg WHEEL_NAME=$WHEEL_NAME \ - --build-arg CONFTEST_NAME=$CONFTEST_NAME \ - --build-arg CIBW_TEST_REQUIRES="$CIBW_TEST_REQUIRES" \ - -f build_tools/github/Windows \ - -t scikit-learn/minimal-windows . 
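In place of the deleted Dockerfile build above, the replacement code that follows starts a stock `winamd64/python` container, installs the repaired wheel inside it with `docker exec`, and snapshots the result with `docker commit`. A rough Python sketch of that run/exec/commit sequence, assuming a Docker CLI on PATH; the image tag and wheel file name are placeholders:

```python
import os
import subprocess

IMAGE = "winamd64/python:3.12-windowsservercore"  # placeholder tag
WHEEL = "scikit_learn-0.0.0-cp312-cp312-win_amd64.whl"  # placeholder wheel name

def run(*cmd):
    out = subprocess.run(cmd, check=True, capture_output=True, text=True)
    return out.stdout.strip()

# Start a detached container with the working directory mounted at C:/mnt.
container_id = run("docker", "run", "-d", "-v", f"{os.getcwd()}:C:/mnt", IMAGE)

# Install the wheel and the test requirements inside the otherwise bare image.
run("docker", "exec", container_id, "python", "-m", "pip", "install", f"C:/mnt/{WHEEL}")
run("docker", "exec", container_id, "python", "-m", "pip", "install", "pytest")

# Snapshot the container as the image the wheel tests run against.
run("docker", "stop", container_id)
run("docker", "commit", container_id, "scikit-learn/minimal-windows")
```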
+ function exec_inside_container() { + docker exec $CONTAINER_ID powershell -Command $1 + } + + exec_inside_container "python -m pip install $MNT_FOLDER/$WHEEL_NAME" + exec_inside_container "python -m pip install $CIBW_TEST_REQUIRES" + + # Save container state to scikit-learn/minimal-windows image. On Windows the + # container needs to be stopped first. + docker stop $CONTAINER_ID + docker commit $CONTAINER_ID scikit-learn/minimal-windows +else + # This is too cumbersome to use a Docker image in the free-threaded case + # TODO When pandas has a release with a Windows free-threaded wheel we can + # replace the next line with + # python -m pip install CIBW_TEST_REQUIRES + python -m pip install pytest +fi diff --git a/build_tools/github/build_source.sh b/build_tools/github/build_source.sh old mode 100644 new mode 100755 index a4d9c7bd05387..ec53284012fa4 --- a/build_tools/github/build_source.sh +++ b/build_tools/github/build_source.sh @@ -11,10 +11,10 @@ python -m venv build_env source build_env/bin/activate python -m pip install numpy scipy cython -python -m pip install twine +python -m pip install twine build cd scikit-learn/scikit-learn -python setup.py sdist +python -m build --sdist # Check whether the source distribution will render correctly twine check dist/*.tar.gz diff --git a/build_tools/github/build_test_arm.sh b/build_tools/github/build_test_arm.sh new file mode 100755 index 0000000000000..db11fdc0e82f0 --- /dev/null +++ b/build_tools/github/build_test_arm.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +set -e +set -x + +UNAMESTR=`uname` +N_CORES=`nproc --all` + +# defines the get_dep and show_installed_libraries functions +source build_tools/shared.sh + +setup_ccache() { + echo "Setting up ccache" + mkdir /tmp/ccache/ + which ccache + for name in gcc g++ cc c++ x86_64-linux-gnu-gcc x86_64-linux-gnu-c++; do + ln -s $(which ccache) "/tmp/ccache/${name}" + done + export PATH="/tmp/ccache:${PATH}" + # Unset ccache limits + ccache -F 0 + ccache -M 0 +} + +setup_ccache + +python --version + +# Disable the build isolation and build in the tree so that the same folder can be +# cached between CI runs. +pip install --verbose --no-build-isolation . + +# Report cache usage +ccache -s --verbose + +micromamba list + +# Changing directory not to have module resolution use scikit-learn source +# directory but to the installed package. +cd /tmp +python -c "import sklearn; sklearn.show_versions()" +python -m threadpoolctl --import sklearn +# Test using as many workers as available cores +pytest --pyargs -n $N_CORES sklearn diff --git a/build_tools/github/build_wheels.sh b/build_tools/github/build_wheels.sh deleted file mode 100644 index 9b45481cbb978..0000000000000 --- a/build_tools/github/build_wheels.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -set -e -set -x - -# OpenMP is not present on macOS by default -if [[ "$RUNNER_OS" == "macOS" ]]; then - # Make sure to use a libomp version binary compatible with the oldest - # supported version of the macos SDK as libomp will be vendored into the - # scikit-learn wheels for macos. The list of bottles can be found at: - # https://formulae.brew.sh/api/formula/libomp.json. Currently, the oldest - # supported macos version is: High Sierra / 10.13. When upgrading this, be - # sure to update the MACOSX_DEPLOYMENT_TARGET environment variable in - # wheels.yml accordingly. 
- wget https://homebrew.bintray.com/bottles/libomp-11.0.0.high_sierra.bottle.tar.gz - brew install libomp-11.0.0.high_sierra.bottle.tar.gz - export CC=/usr/bin/clang - export CXX=/usr/bin/clang++ - export CPPFLAGS="$CPPFLAGS -Xpreprocessor -fopenmp" - export CFLAGS="$CFLAGS -I/usr/local/opt/libomp/include" - export CXXFLAGS="$CXXFLAGS -I/usr/local/opt/libomp/include" - export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib -L/usr/local/opt/libomp/lib -lomp" -fi - -# The version of the built dependencies are specified -# in the pyproject.toml file, while the tests are run -# against the most recent version of the dependencies - -python -m pip install cibuildwheel -python -m cibuildwheel --output-dir wheelhouse diff --git a/build_tools/github/check_build_trigger.sh b/build_tools/github/check_build_trigger.sh old mode 100644 new mode 100755 index b80fe4674864d..e6bc77b00e71f --- a/build_tools/github/check_build_trigger.sh +++ b/build_tools/github/check_build_trigger.sh @@ -7,6 +7,7 @@ COMMIT_MSG=$(git log --no-merges -1 --oneline) # The commit marker "[cd build]" will trigger the build when required if [[ "$GITHUB_EVENT_NAME" == schedule || + "$GITHUB_EVENT_NAME" == workflow_dispatch || "$COMMIT_MSG" =~ \[cd\ build\] ]]; then - echo "::set-output name=build::true" + echo "build=true" >> $GITHUB_OUTPUT fi diff --git a/build_tools/github/check_wheels.py b/build_tools/github/check_wheels.py index c213991394a6b..21c9a529b265b 100644 --- a/build_tools/github/check_wheels.py +++ b/build_tools/github/check_wheels.py @@ -1,41 +1,29 @@ """Checks that dist/* contains the number of wheels built from the .github/workflows/wheels.yml config.""" -import yaml -from pathlib import Path + import sys +from pathlib import Path + +import yaml gh_wheel_path = Path.cwd() / ".github" / "workflows" / "wheels.yml" -with gh_wheel_path.open('r') as f: +with gh_wheel_path.open("r") as f: wheel_config = yaml.safe_load(f) -build_matrix = wheel_config['jobs']['build_wheels']['strategy']['matrix'] -n_python_versions = len(build_matrix['python']) - -# For each python version we have: 7 wheels -# 1 osx wheel (x86_64) -# 4 linux wheel (i686 + x86_64) * (manylinux1 + manylinux2010) -# 2 windows wheel (win32 + wind_amd64) -n_wheels = 7 * n_python_versions +build_matrix = wheel_config["jobs"]["build_wheels"]["strategy"]["matrix"]["include"] +n_wheels = len(build_matrix) # plus one more for the sdist n_wheels += 1 -# aarch64 builds from travis -travis_config_path = Path.cwd() / ".travis.yml" -with travis_config_path.open('r') as f: - travis_config = yaml.safe_load(f) - -jobs = travis_config['jobs']['include'] -travis_builds = [j for j in jobs - if any("CIBW_BUILD" in env for env in j["env"])] -n_wheels += len(travis_builds) - -dist_files = list(Path("dist").glob('**/*')) +dist_files = list(Path("dist").glob("**/*")) n_dist_files = len(dist_files) if n_dist_files != n_wheels: - print(f"Expected {n_wheels} wheels in dist/* but " - f"got {n_dist_files} artifacts instead.") + print( + f"Expected {n_wheels} wheels in dist/* but " + f"got {n_dist_files} artifacts instead." 
+ ) sys.exit(1) print(f"dist/* has the expected {n_wheels} wheels:") diff --git a/build_tools/github/create_gpu_environment.sh b/build_tools/github/create_gpu_environment.sh new file mode 100755 index 0000000000000..96a62d7678566 --- /dev/null +++ b/build_tools/github/create_gpu_environment.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -e +set -x + +curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" +bash Miniforge3-$(uname)-$(uname -m).sh -b -p "${HOME}/conda" +source "${HOME}/conda/etc/profile.d/conda.sh" + + +# defines the get_dep and show_installed_libraries functions +source build_tools/shared.sh +conda activate base + +CONDA_ENV_NAME=sklearn +LOCK_FILE=build_tools/github/pylatest_conda_forge_cuda_array-api_linux-64_conda.lock +create_conda_environment_from_lock_file $CONDA_ENV_NAME $LOCK_FILE + +conda activate $CONDA_ENV_NAME +conda list diff --git a/build_tools/github/pylatest_conda_forge_cuda_array-api_linux-64_conda.lock b/build_tools/github/pylatest_conda_forge_cuda_array-api_linux-64_conda.lock new file mode 100644 index 0000000000000..315164f96c77c --- /dev/null +++ b/build_tools/github/pylatest_conda_forge_cuda_array-api_linux-64_conda.lock @@ -0,0 +1,254 @@ +# Generated by conda-lock. +# platform: linux-64 +# input_hash: 0c167b26e12c284b769bf4d76bd3e604db266ed21c8f9e11e4bb737419ccdc93 +@EXPLICIT +https://conda.anaconda.org/conda-forge/noarch/cuda-version-11.8-h70ddcb2_3.conda#670f0e1593b8c1d84f57ad5fe5256799 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2#0c96522c6bdaed4b1566d11387caaf45 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2#34893075a5c9e55cdafac56607368fc6 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2#4d59c254e01d9cde7957100457e2d5fb +https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda#49023d73832ef61042f6a237cb2687e7 +https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-3.10.0-he073ed8_18.conda#ad8527bf134a90e1c9ed35fa0b64318c +https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-headers-1.18.0-ha770c72_1.conda#4fb055f57404920a43b147031471e03b +https://conda.anaconda.org/conda-forge/linux-64/nlohmann_json-3.12.0-h3f2d84a_0.conda#d76872d096d063e226482c99337209dc +https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-7_cp313.conda#e84b44e6300f1703cb25d29120c5b1d8 +https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda#4222072737ccff51314b5ece9c7d6f5a +https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.4.26-hbd8a1cb_0.conda#95db94f75ba080a22eb623590993167b +https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-0.tar.bz2#f766549260d6815b0c52253f1fb1bb29 +https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_4.conda#01f8d123c96816249efd255a31ad7712 +https://conda.anaconda.org/conda-forge/linux-64/libglvnd-1.7.0-ha4b6fd6_2.conda#434ca7e50e40f4918ab701e3facd59a0 +https://conda.anaconda.org/conda-forge/linux-64/llvm-openmp-20.1.6-h024ca30_0.conda#e4ece7ed81e43ae97a3b58ac4230c3c5 +https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.17-h0157908_18.conda#460eba7851277ec1fd80a1a24080787a +https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-3_kmp_llvm.conda#ee5c2118262e30b972bc0b4db8ef0ba5 +https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2#fee5683a3f04bd15cbd8318b096a27ab 
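The lock file being added here is a conda-lock `@EXPLICIT` file, and `create_gpu_environment.sh` above hands it to `create_conda_environment_from_lock_file`, which lives in `build_tools/shared.sh` and is not part of this diff. A rough equivalent of that step, assuming conda's standard support for explicit spec files via `--file`:

```python
import subprocess

def create_env_from_lock(env_name, lock_file):
    # @EXPLICIT lock files list fully pinned package URLs with md5 hashes,
    # so conda installs them directly without running the solver.
    subprocess.run(
        ["conda", "create", "--yes", "--name", env_name, "--file", lock_file],
        check=True,
    )

create_env_from_lock(
    "sklearn",
    "build_tools/github/pylatest_conda_forge_cuda_array-api_linux-64_conda.lock",
)
```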
+https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda#c151d5eb730e9b7480e6d48c0fc44048 +https://conda.anaconda.org/conda-forge/linux-64/libopengl-1.7.0-ha4b6fd6_2.conda#7df50d44d4a14d6c31a2c54f2cd92157 +https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_2.conda#ea8ac52380885ed41c1baa8f1d6d2b93 +https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.14-hb9d3cd8_0.conda#76df83c2a9035c54df5d04ff81bcc02d +https://conda.anaconda.org/conda-forge/linux-64/aws-c-common-0.12.0-hb9d3cd8_0.conda#f65c946f28f0518f41ced702f44c52b7 +https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.5-hb9d3cd8_0.conda#f7f0d6cc2dc986d42ac2689ec88192be +https://conda.anaconda.org/conda-forge/linux-64/libbrotlicommon-1.1.0-hb9d3cd8_2.conda#41b599ed2b02abcfdd84302bff174b23 +https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.24-h86f0d12_0.conda#64f0c503da58ec25ebd359e4d990afa8 +https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.0-h5888daf_0.conda#db0bfbe7dd197b68ad5f30333bae6ce0 +https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda#ede4673863426c0883c0063d853bbd85 +https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_2.conda#ddca86c7040dd0e73b2b69bd7833d225 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.1.0-hcea5267_2.conda#01de444988ed960031dbe84cf4f9b1fc +https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h4ce23a2_1.conda#e796ff8ddc598affdf7c173d6145f087 +https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.0-hb9d3cd8_0.conda#9fa334557db9f63da6c9285fd2a48638 +https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_1.conda#a76fd702c93cd2dfd89eff30a5fd45a8 +https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda#c7e925f37e3b40d893459e625f6a53f1 +https://conda.anaconda.org/conda-forge/linux-64/libntlm-1.8-hb9d3cd8_0.conda#7c7927b404672409d9917d49bff5f2d6 +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.1.0-h8f9b012_2.conda#1cb1c67961f6dd257eae9e9691b341aa +https://conda.anaconda.org/conda-forge/linux-64/libutf8proc-2.10.0-h202a827_0.conda#0f98f3e95272d118f7931b6bef69bfe5 +https://conda.anaconda.org/conda-forge/linux-64/libuv-1.51.0-hb9d3cd8_0.conda#1349c022c92c5efd3fd705a79a5804d8 +https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.5.0-h851e524_0.conda#63f790534398730f59e1b899c3644d4a +https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda#edb0dca6bc32e4f4789199455a1dbeb8 +https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda#47e340acb35de30501a76c7c799c41d7 +https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.0-h7b32b05_1.conda#de356753cfdbffcde5bb1e86e3aa6cd0 +https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-hb9d3cd8_1002.conda#b3c17d95b5a10c6e64a21fa17573e70e +https://conda.anaconda.org/conda-forge/linux-64/xorg-libice-1.1.2-hb9d3cd8_0.conda#fb901ff28063514abb6046c9ec2c4a45 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb9d3cd8_0.conda#f6ebe2cb3f82ba6c057dde5d9debe4f7 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb9d3cd8_0.conda#8035c64cb77ed555e3f150b7b3972480 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-cal-0.8.7-h043a21b_0.conda#4fdf835d66ea197e693125c64fbd4482 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-compression-0.3.1-h3870646_2.conda#17ccde79d864e6183a83c5bbb8fff34d 
+https://conda.anaconda.org/conda-forge/linux-64/aws-c-sdkutils-0.2.3-h3870646_2.conda#06008b5ab42117c89c982aa2a32a5b25 +https://conda.anaconda.org/conda-forge/linux-64/aws-checksums-0.2.3-h3870646_2.conda#303d9e83e0518f1dcb66e90054635ca6 +https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda#62ee74e96c5ebb0af99386de58cf9553 +https://conda.anaconda.org/conda-forge/linux-64/double-conversion-3.3.1-h5888daf_0.conda#bfd56492d8346d669010eccafe0ba058 +https://conda.anaconda.org/conda-forge/linux-64/gflags-2.2.2-h5888daf_1005.conda#d411fc29e338efb48c5fd4576d71d881 +https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2#30186d27e2c9fa62b45fb1476b7200e3 +https://conda.anaconda.org/conda-forge/linux-64/lerc-4.0.0-h0aef613_1.conda#9344155d33912347b37f0ae6c410a835 +https://conda.anaconda.org/conda-forge/linux-64/libabseil-20240722.0-cxx17_hbbce691_4.conda#488f260ccda0afaf08acb286db439c2f +https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.1.0-hb9d3cd8_2.conda#9566f0bd264fbd463002e759b8a82401 +https://conda.anaconda.org/conda-forge/linux-64/libbrotlienc-1.1.0-hb9d3cd8_2.conda#06f70867945ea6a84d35836af780f1de +https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda#c277e0a4d549b03ac1e9d6cbbe3d017b +https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda#172bf1cd1ff8629f2b1179945ed45055 +https://conda.anaconda.org/conda-forge/linux-64/libevent-2.1.12-hf998b51_1.conda#a1cfcc585f0c42bf8d5546bb1dfb668d +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.1.0-h69a702a_2.conda#f92e6e0a3c0c0c85561ef61aa59d555d +https://conda.anaconda.org/conda-forge/linux-64/libpciaccess-0.18-hd590300_0.conda#48f4330bfcd959c3cfb704d424903c82 +https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.47-h943b412_0.conda#55199e2ae2c3651f6f9b2a447b47bdc9 +https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.0-hee588c1_0.conda#71888e92098d0f8c41b09a671ad289bc +https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda#eecce068c7e4eddeb169591baac20ac4 +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.1.0-h4852527_2.conda#9d2072af184b5caa29492bf2344597bb +https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b +https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.17.0-h8a09558_0.conda#92ed62436b625154323d40d5f2f11dd7 +https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.10.0-h5888daf_1.conda#9de5350a85c4a20c685259b889aa6393 +https://conda.anaconda.org/conda-forge/linux-64/ninja-1.12.1-hff21bea_1.conda#2322531904f27501ee19847b87ba7c64 +https://conda.anaconda.org/conda-forge/linux-64/pixman-0.46.0-h29eaf8c_0.conda#d2f1c87d4416d1e7344cf92b1aaee1c4 +https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda#283b96675859b20a825f8fa30f311446 +https://conda.anaconda.org/conda-forge/linux-64/s2n-1.5.14-h6c98b2b_0.conda#efab4ad81ba5731b2fefa0ab4359e884 +https://conda.anaconda.org/conda-forge/linux-64/sleef-3.8-h1b44611_0.conda#aec4dba5d4c2924730088753f6fa164b +https://conda.anaconda.org/conda-forge/linux-64/snappy-1.2.1-h8bd8927_1.conda#3b3e64af585eadfb52bb90b553db5edf +https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda#a0116df4f4ed05c303811a837d5b39d8 +https://conda.anaconda.org/conda-forge/linux-64/wayland-1.23.1-h3e06ad9_1.conda#a37843723437ba75f42c9270ffe800b1 
+https://conda.anaconda.org/conda-forge/linux-64/zlib-1.3.1-hb9d3cd8_2.conda#c9f075ab2f33b3bbee9e62d4ad0a6cd8 +https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb8e6e7a_2.conda#6432cb5d4ac0046c3ac0a8a0f95842f9 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-io-0.17.0-h3dad3f2_6.conda#3a127d28266cdc0da93384d1f59fe8df +https://conda.anaconda.org/conda-forge/linux-64/brotli-bin-1.1.0-hb9d3cd8_2.conda#c63b5e52939e795ba8d26e35d767a843 +https://conda.anaconda.org/conda-forge/linux-64/cudatoolkit-11.8.0-h4ba93d1_13.conda#eb43f5f1f16e2fad2eba22219c3e499b +https://conda.anaconda.org/conda-forge/linux-64/glog-0.7.1-hbabe93e_0.conda#ff862eebdfeb2fd048ae9dc92510baca +https://conda.anaconda.org/conda-forge/linux-64/gmp-6.3.0-hac33072_2.conda#c94a5994ef49749880a8139cf9afcbe1 +https://conda.anaconda.org/conda-forge/linux-64/graphite2-1.3.13-h59595ed_1003.conda#f87c7b7c2cb45f323ffbce941c78ab7c +https://conda.anaconda.org/conda-forge/linux-64/icu-75.1-he02047a_0.conda#8b189310083baabfb622af68fd9d3ae3 +https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.3-h659f571_0.conda#3f43953b7d3fb3aaa1d0d0723d91e368 +https://conda.anaconda.org/conda-forge/linux-64/libcrc32c-1.1.2-h9c3ff4c_0.tar.bz2#c965a5aa0d5c1c37ffc62dff36e28400 +https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.124-hb9d3cd8_0.conda#8bc89311041d7fcb510238cf0848ccae +https://conda.anaconda.org/conda-forge/linux-64/libfreetype6-2.13.3-h48d6fc4_1.conda#3c255be50a506c50765a93a6644f32fe +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-15.1.0-h69a702a_2.conda#a483a87b71e974bb75d1b9413d4436dd +https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.64.0-h161d5f1_0.conda#19e57602824042dfd0446292ef90488b +https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.29-pthreads_h94d23a6_0.conda#0a4d0252248ef9a0f88f2ba8b8a08e12 +https://conda.anaconda.org/conda-forge/linux-64/libprotobuf-5.28.3-h6128344_1.conda#d8703f1ffe5a06356f06467f1d0b9464 +https://conda.anaconda.org/conda-forge/linux-64/libre2-11-2024.07.02-hbbce691_2.conda#b2fede24428726dd867611664fb372e8 +https://conda.anaconda.org/conda-forge/linux-64/libthrift-0.21.0-h0e7cc3e_0.conda#dcb95c0a98ba9ff737f7ae482aef7833 +https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.7.0-hf01ce69_5.conda#e79a094918988bb1807462cd42c83962 +https://conda.anaconda.org/conda-forge/linux-64/nccl-2.26.6.1-h03a54cd_0.conda#323f9253803b715728e8d0c94c495f53 +https://conda.anaconda.org/conda-forge/linux-64/pcre2-10.45-hc749103_0.conda#b90bece58b4c2bf25969b70f3be42d25 +https://conda.anaconda.org/conda-forge/linux-64/python-3.13.3-hf636f53_101_cp313.conda#10622e12d649154af0bd76bcf33a7c5c +https://conda.anaconda.org/conda-forge/linux-64/qhull-2020.2-h434a139_5.conda#353823361b1d27eb3960efb076dfcaf6 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-0.4.1-hb711507_2.conda#8637c3e5821654d0edf97e2b0404b443 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-keysyms-0.4.1-hb711507_0.conda#ad748ccca349aec3e91743e08b5e2b50 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-renderutil-0.3.10-hb711507_0.conda#0e0cbe0564d03a99afd5fd7b362feecd +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-wm-0.4.2-hb711507_0.conda#608e0ef8256b81d04456e8d211eee3e8 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libsm-1.2.6-he73a12e_0.conda#1c74ff8c35dcadf952a16f752ca5aa49 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.12-h4f16b4b_0.conda#db038ce880f100acc74dba10302b5630 
+https://conda.anaconda.org/conda-forge/linux-64/aws-c-event-stream-0.5.4-h04a3f94_2.conda#81096a80f03fc2f0fb2a230f5d028643 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-http-0.9.4-hb9b18c6_4.conda#773c99d0dbe2b3704af165f97ff399e5 +https://conda.anaconda.org/conda-forge/linux-64/brotli-1.1.0-hb9d3cd8_2.conda#98514fe74548d768907ce7a13f680e8f +https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda#962b9857ee8e7018c22f2776ffa0b2d7 +https://conda.anaconda.org/conda-forge/noarch/cpython-3.13.3-py313hd8ed1ab_101.conda#904a822cbd380adafb9070debf8579a8 +https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_1.conda#44600c4667a319d67dbe0681fc0bc833 +https://conda.anaconda.org/conda-forge/linux-64/cyrus-sasl-2.1.27-h54b06d7_7.conda#dce22f70b4e5a407ce88f2be046f4ceb +https://conda.anaconda.org/conda-forge/linux-64/cython-3.1.1-py313h5dec8f5_1.conda#f114755cdd37627732b1884b7b15d4b5 +https://conda.anaconda.org/conda-forge/noarch/execnet-2.1.1-pyhd8ed1ab_1.conda#a71efeae2c160f6789900ba2631a2c90 +https://conda.anaconda.org/conda-forge/linux-64/fastrlock-0.8.3-py313h9800cb9_1.conda#54dd71b3be2ed6ccc50f180347c901db +https://conda.anaconda.org/conda-forge/noarch/filelock-3.18.0-pyhd8ed1ab_0.conda#4547b39256e296bb758166893e909a7c +https://conda.anaconda.org/conda-forge/noarch/fsspec-2025.5.1-pyhd8ed1ab_0.conda#2d2c9ef879a7e64e2dc657b09272c2b6 +https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_1.conda#6837f3eff7dcea42ecd714ce1ac2b108 +https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.4.7-py313h33d0bda_0.conda#9862d13a5e466273d5a4738cffcb8d6c +https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.17-h717163a_0.conda#000e85703f0fd9594c81710dd5066471 +https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-31_h59b9bed_openblas.conda#728dbebd0f7a20337218beacffd37916 +https://conda.anaconda.org/conda-forge/linux-64/libcudnn-9.10.1.4-h7d33bf5_0.conda#93fe78190bc6fe40d5e7a737c8065286 +https://conda.anaconda.org/conda-forge/linux-64/libcups-2.3.3-h4637d8d_4.conda#d4529f4dff3057982a7617c7ac58fde3 +https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.14.0-h332b0f4_0.conda#d1738cf06503218acee63669029fd8e8 +https://conda.anaconda.org/conda-forge/linux-64/libfreetype-2.13.3-ha770c72_1.conda#51f5be229d83ecd401fb369ab96ae669 +https://conda.anaconda.org/conda-forge/linux-64/libglib-2.84.2-h3618099_0.conda#072ab14a02164b7c0c089055368ff776 +https://conda.anaconda.org/conda-forge/linux-64/libglx-1.7.0-ha4b6fd6_2.conda#c8013e438185f33b13814c5c488acd5c +https://conda.anaconda.org/conda-forge/linux-64/libhiredis-1.0.2-h2cc385e_0.tar.bz2#b34907d3a81a3cd8095ee83d174c074a +https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.13.8-h4bc477f_0.conda#14dbe05b929e329dbaa6f2d0aa19466d +https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py313h8060acc_1.conda#21b62c55924f01b6eef6827167b46acb +https://conda.anaconda.org/conda-forge/noarch/meson-1.8.1-pyhe01879c_0.conda#f3cccd9a6ce5331ae33f69ade5529162 +https://conda.anaconda.org/conda-forge/linux-64/mpfr-4.2.1-h90cbb55_3.conda#2eeb50cab6652538eee8fc0bc3340c81 +https://conda.anaconda.org/conda-forge/noarch/mpmath-1.3.0-pyhd8ed1ab_1.conda#3585aa87c43ab15b167b574cd73b057b +https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyh9f0ad1d_0.tar.bz2#2ba8498c1018c1e9c61eb99b973dfe19 +https://conda.anaconda.org/conda-forge/noarch/networkx-3.4.2-pyh267e887_2.conda#fd40bf7f7f4bc4b647dc8512053d9873 
+https://conda.anaconda.org/conda-forge/linux-64/openblas-0.3.29-pthreads_h6ec200e_0.conda#7e4d48870b3258bea920d51b7f495a81 +https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.3-h5fbd93e_0.conda#9e5816bc95d285c115a3ebc2f8563564 +https://conda.anaconda.org/conda-forge/linux-64/orc-2.1.1-h2271f48_0.conda#67075ef2cb33079efee3abfe58127a3b +https://conda.anaconda.org/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda#58335b26c38bf4a20f399384c33cbcf9 +https://conda.anaconda.org/conda-forge/noarch/pip-25.1.1-pyh145f28c_0.conda#01384ff1639c6330a0924791413b8714 +https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhd8ed1ab_0.conda#7da7ccd349dbf6487a7778579d2bb971 +https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.2.3-pyhd8ed1ab_1.conda#513d3c262ee49b54a8fec85c5bc99764 +https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2025.2-pyhd8ed1ab_0.conda#88476ae6ebd24f39261e0854ac244f33 +https://conda.anaconda.org/conda-forge/noarch/pytz-2025.2-pyhd8ed1ab_0.conda#bc8e3267d44011051f2eb14d22fb0960 +https://conda.anaconda.org/conda-forge/linux-64/re2-2024.07.02-h9925aae_2.conda#e84ddf12bde691e8ec894b00ea829ddf +https://conda.anaconda.org/conda-forge/noarch/setuptools-80.9.0-pyhff2d567_0.conda#4de79c071274a53dcaf2a8c749d1499e +https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhd8ed1ab_0.conda#a451d576819089b0d672f18768be0f65 +https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-3.6.0-pyhecae5ae_0.conda#9d64911b31d57ca443e9f1e36b04385f +https://conda.anaconda.org/conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_1.conda#b0dd904de08b7db706167240bf37b164 +https://conda.anaconda.org/conda-forge/noarch/tomli-2.2.1-pyhd8ed1ab_1.conda#ac944244f1fed2eb49bae07193ae8215 +https://conda.anaconda.org/conda-forge/linux-64/tornado-6.5.1-py313h536fd9c_0.conda#e9434a5155db25c38ade26f71a2f5a48 +https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.13.2-pyh29332c3_0.conda#83fc6ae00127671e301c9f44254c31b8 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-image-0.4.0-hb711507_2.conda#a0901183f08b6c7107aab109733a3c91 +https://conda.anaconda.org/conda-forge/linux-64/xkeyboard-config-2.44-hb9d3cd8_0.conda#7c91bfc90672888259675ad2ad28af9c +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxext-1.3.6-hb9d3cd8_0.conda#febbab7d15033c913d53c7a2c102309d +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxfixes-6.0.1-hb9d3cd8_0.conda#4bdb303603e9821baf5fe5fdff1dc8f8 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrender-0.9.12-hb9d3cd8_0.conda#96d57aba173e878a2089d5638016dc5e +https://conda.anaconda.org/conda-forge/linux-64/aws-c-auth-0.8.6-hd08a7f5_4.conda#f5a770ac1fd2cb34b21327fc513013a7 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-mqtt-0.12.2-h108da3e_2.conda#90e07c8bac8da6378ee1882ef0a9374a +https://conda.anaconda.org/conda-forge/linux-64/azure-core-cpp-1.14.0-h5cfcd09_0.conda#0a8838771cc2e985cd295e01ae83baf1 +https://conda.anaconda.org/conda-forge/linux-64/ccache-4.11.3-h80c52d3_0.conda#eb517c6a2b960c3ccb6f1db1005f063a +https://conda.anaconda.org/conda-forge/linux-64/coverage-7.8.2-py313h8060acc_0.conda#b278629953bd3424060870fca744de4a +https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h3c4dab8_0.conda#679616eb5ad4e521c83da4650860aba7 +https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.0-pyhd8ed1ab_0.conda#72e42d28960d875c7654614f8b50939a +https://conda.anaconda.org/conda-forge/linux-64/fonttools-4.58.1-py313h8060acc_0.conda#f03a1dc39346922cb5cf2ee190ac9b95 
+https://conda.anaconda.org/conda-forge/linux-64/freetype-2.13.3-ha770c72_1.conda#9ccd736d31e0c6e41f54e704e5312811 +https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda#446bd6c8cb26050d528881df495ce646 +https://conda.anaconda.org/conda-forge/noarch/joblib-1.5.1-pyhd8ed1ab_0.conda#fb1c14694de51a476ce8636d92b6f42c +https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-31_he106b2a_openblas.conda#abb32c727da370c481a1c206f5159ce9 +https://conda.anaconda.org/conda-forge/linux-64/libcudnn-dev-9.10.1.4-h0fdc2d1_0.conda#a0c0b44d26a4710e6ea577fcddbe09d1 +https://conda.anaconda.org/conda-forge/linux-64/libgl-1.7.0-ha4b6fd6_2.conda#928b8be80851f5d8ffb016f9c81dae7a +https://conda.anaconda.org/conda-forge/linux-64/libgrpc-1.67.1-h25350d4_2.conda#bfcedaf5f9b003029cc6abe9431f66bf +https://conda.anaconda.org/conda-forge/linux-64/libhwloc-2.11.2-default_h0d58e46_1001.conda#804ca9e91bcaea0824a341d55b1684f2 +https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-31_h7ac8fdf_openblas.conda#452b98eafe050ecff932f0ec832dd03f +https://conda.anaconda.org/conda-forge/linux-64/libllvm20-20.1.6-he9d0ab4_0.conda#bf8ccdd2c1c1a54a3fa25bb61f26460e +https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.10.0-h65c71a3_0.conda#fedf6bfe5d21d21d2b1785ec00a8889a +https://conda.anaconda.org/conda-forge/linux-64/libxslt-1.1.39-h76b75d6_0.conda#e71f31f8cfb0a91439f2086fc8aa0461 +https://conda.anaconda.org/conda-forge/linux-64/mpc-1.3.1-h24ddda3_1.conda#aa14b9a5196a6d8dd364164b7ce56acf +https://conda.anaconda.org/conda-forge/linux-64/openldap-2.6.10-he970967_0.conda#2e5bf4f1da39c0b32778561c3c4e5878 +https://conda.anaconda.org/conda-forge/linux-64/pillow-11.2.1-py313h8db990d_0.conda#91b00afee98d72d29dc3d1c1ab0008d7 +https://conda.anaconda.org/conda-forge/linux-64/prometheus-cpp-1.3.0-ha5d0236_0.conda#a83f6a2fdc079e643237887a37460668 +https://conda.anaconda.org/conda-forge/noarch/pyproject-metadata-0.9.1-pyhd8ed1ab_0.conda#22ae7c6ea81e0c8661ef32168dda929b +https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhff2d567_1.conda#5ba79d7c71f03c678c8ead841f347d6e +https://conda.anaconda.org/conda-forge/noarch/python-gil-3.13.3-h4df99d1_101.conda#82c2641f2f0f513f7d2d1b847a2588e3 +https://conda.anaconda.org/conda-forge/linux-64/xcb-util-cursor-0.1.5-hb9d3cd8_0.conda#eb44b3b6deb1cab08d72cb61686fe64c +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxcomposite-0.4.6-hb9d3cd8_2.conda#d3c295b50f092ab525ffe3c2aa4b7413 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxcursor-1.2.3-hb9d3cd8_0.conda#2ccd714aa2242315acaf0a67faea780b +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdamage-1.1.6-hb9d3cd8_0.conda#b5fcc7172d22516e1f965490e65e33a4 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxi-1.8.2-hb9d3cd8_0.conda#17dcc85db3c7886650b8908b183d6876 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrandr-1.5.4-hb9d3cd8_0.conda#2de7f99d6581a4a7adbff607b5c278ca +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxxf86vm-1.1.6-hb9d3cd8_0.conda#5efa5fa6243a622445fdfd72aee15efa +https://conda.anaconda.org/conda-forge/noarch/_python_abi3_support-1.0-hd8ed1ab_2.conda#aaa2a381ccc56eac91d63b6c1240312f +https://conda.anaconda.org/conda-forge/linux-64/aws-c-s3-0.7.13-h822ba82_2.conda#9cf2c3c13468f2209ee814be2c88655f +https://conda.anaconda.org/conda-forge/linux-64/azure-identity-cpp-1.10.0-h113e628_0.conda#73f73f60854f325a55f1d31459f2ab73 
+https://conda.anaconda.org/conda-forge/linux-64/azure-storage-common-cpp-12.8.0-h736e048_1.conda#13de36be8de3ae3f05ba127631599213 +https://conda.anaconda.org/conda-forge/linux-64/cudnn-9.10.1.4-haad7af6_0.conda#8382d957333e0d3280dcbf5691516dc1 +https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.15.0-h7e30c49_1.conda#8f5b0b297b59e1ac160ad4beec99dbee +https://conda.anaconda.org/conda-forge/linux-64/gmpy2-2.2.1-py313h11186cd_0.conda#54d020e0eaacf1e99bfb2410b9aa2e5e +https://conda.anaconda.org/conda-forge/linux-64/libclang-cpp20.1-20.1.6-default_h1df26ce_0.conda#99ead3b974685e44df8b1e3953503cfc +https://conda.anaconda.org/conda-forge/linux-64/libclang13-20.1.6-default_he06ed0a_0.conda#cc6c469d9d7fc0ac106cef5f45d973a9 +https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-2.36.0-h2b5623c_0.conda#c96ca58ad3352a964bfcb85de6cd1496 +https://conda.anaconda.org/conda-forge/linux-64/liblapacke-3.9.0-31_he2f377e_openblas.conda#7e5fff7d0db69be3a266f7e79a3bb0e2 +https://conda.anaconda.org/conda-forge/linux-64/libmagma-2.9.0-h45b15fe_0.conda#703a1ab01e36111d8bb40bc7517e900b +https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-1.18.0-hfcad708_1.conda#1f5a5d66e77a39dc5bd639ec953705cf +https://conda.anaconda.org/conda-forge/linux-64/libpq-17.5-h27ae623_0.conda#6458be24f09e1b034902ab44fe9de908 +https://conda.anaconda.org/conda-forge/noarch/meson-python-0.18.0-pyh70fd9c4_0.conda#576c04b9d9f8e45285fb4d9452c26133 +https://conda.anaconda.org/conda-forge/linux-64/numpy-2.2.6-py313h17eae1a_0.conda#7a2d2f9adecd86ed5c29c2115354f615 +https://conda.anaconda.org/conda-forge/noarch/pytest-8.3.5-pyhd8ed1ab_0.conda#c3c9316209dec74a705a36797970c6be +https://conda.anaconda.org/conda-forge/linux-64/tbb-2021.13.0-hceb3a55_1.conda#ba7726b8df7b9d34ea80e82b097a4893 +https://conda.anaconda.org/conda-forge/linux-64/xorg-libxtst-1.2.5-hb9d3cd8_3.conda#7bbe9a0cc0df0ac5f5a8ad6d6a11af2f +https://conda.anaconda.org/conda-forge/noarch/array-api-strict-2.3.1-pyhd8ed1ab_0.conda#11107d0aeb8c590a34fee0894909816b +https://conda.anaconda.org/conda-forge/linux-64/aws-crt-cpp-0.31.0-h55f77e1_4.conda#0627af705ed70681f5bede31e72348e5 +https://conda.anaconda.org/conda-forge/linux-64/azure-storage-blobs-cpp-12.13.0-h3cf044e_1.conda#7eb66060455c7a47d9dcdbfa9f46579b +https://conda.anaconda.org/conda-forge/linux-64/blas-devel-3.9.0-31_h1ea3ea9_openblas.conda#ba652ee0576396d4765e567f043c57f9 +https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-h3394656_0.conda#09262e66b19567aff4f592fb53b28760 +https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.2-py313h33d0bda_0.conda#5dc81fffe102f63045225007a33d6199 +https://conda.anaconda.org/conda-forge/linux-64/cupy-core-13.4.1-py313hc2a895b_0.conda#46dd595e816b278b178e3bef8a6acf71 +https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-2.36.0-h0121fbd_0.conda#fc5efe1833a4d709953964037985bb72 +https://conda.anaconda.org/conda-forge/linux-64/libmagma_sparse-2.9.0-h45b15fe_0.conda#beac0a5bbe0af75db6b16d3d8fd24f7e +https://conda.anaconda.org/conda-forge/linux-64/mkl-2024.2.2-ha957f24_16.conda#1459379c79dda834673426504d52b319 +https://conda.anaconda.org/conda-forge/linux-64/pandas-2.2.3-py313ha87cce1_3.conda#6248b529e537b1d4cb5ab3ef7f537795 +https://conda.anaconda.org/conda-forge/linux-64/polars-default-1.30.0-py39hfac2b71_0.conda#cd33cf1e631b4d766858c90e333b4832 +https://conda.anaconda.org/conda-forge/noarch/pytest-cov-6.1.1-pyhd8ed1ab_0.conda#1e35d8f975bc0e984a19819aa91c440a 
+https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.7.0-pyhd8ed1ab_0.conda#15353a2a0ea6dfefaa52fc5ab5b98f41 +https://conda.anaconda.org/conda-forge/linux-64/scipy-1.15.2-py313h86fcf2b_0.conda#ca68acd9febc86448eeed68d0c6c8643 +https://conda.anaconda.org/conda-forge/noarch/sympy-1.14.0-pyh2585a3b_105.conda#8c09fac3785696e1c477156192d64b91 +https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.11.510-h37a5c72_3.conda#beb8577571033140c6897d257acc7724 +https://conda.anaconda.org/conda-forge/linux-64/azure-storage-files-datalake-cpp-12.12.0-ha633028_1.conda#7c1980f89dd41b097549782121a73490 +https://conda.anaconda.org/conda-forge/linux-64/blas-2.131-openblas.conda#38b2ec894c69bb4be0e66d2ef7fc60bf +https://conda.anaconda.org/conda-forge/linux-64/cupy-13.4.1-py313h66a2ee2_0.conda#784d6bd149ef2b5d9c733ea3dd4d15ad +https://conda.anaconda.org/conda-forge/linux-64/harfbuzz-11.2.1-h3beb420_0.conda#0e6e192d4b3d95708ad192d957cf3163 +https://conda.anaconda.org/conda-forge/linux-64/libtorch-2.4.1-cuda118_mkl_hee7131c_306.conda#28b3b3da11973494ed0100aa50f47328 +https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.10.3-py313h129903b_0.conda#4f8816d006b1c155ec416bcf7ff6cee2 +https://conda.anaconda.org/conda-forge/linux-64/polars-1.30.0-default_h1443d73_0.conda#19698b29e8544d2dd615699826037039 +https://conda.anaconda.org/conda-forge/linux-64/pyamg-5.2.1-py313hf0ab243_1.conda#4c769bf3858f424cb2ecf952175ec600 +https://conda.anaconda.org/conda-forge/linux-64/libarrow-19.0.1-hc7b3859_3_cpu.conda#9ed3ded6da29dec8417f2e1db68798f2 +https://conda.anaconda.org/conda-forge/linux-64/pytorch-2.4.1-cuda118_mkl_py313_h909c4c2_306.conda#de6e45613bbdb51127e9ff483c31bf41 +https://conda.anaconda.org/conda-forge/linux-64/qt6-main-6.9.0-h0384650_3.conda#8aa69e15597a205fd6f81781fe62c232 +https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-19.0.1-hcb10f89_3_cpu.conda#8f8dc214d89e06933f1bc1dcd2310b9c +https://conda.anaconda.org/conda-forge/linux-64/libparquet-19.0.1-h081d1f1_3_cpu.conda#1d04307cdb1d8aeb5f55b047d5d403ea +https://conda.anaconda.org/conda-forge/linux-64/pyarrow-core-19.0.1-py313he5f92c8_0_cpu.conda#7d8649531c807b24295c8f9a0a396a78 +https://conda.anaconda.org/conda-forge/linux-64/pyside6-6.9.0-py313h5f61773_0.conda#f51f25ec8fcbf777f8b186bb5deeed40 +https://conda.anaconda.org/conda-forge/linux-64/pytorch-gpu-2.4.1-cuda118_mkl_hf8a3b2d_306.conda#b1802a39f1ca7ebed5f8c35755bffec1 +https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-19.0.1-hcb10f89_3_cpu.conda#a28f04b6e68a1c76de76783108ad729d +https://conda.anaconda.org/conda-forge/linux-64/matplotlib-3.10.3-py313h78bf25f_0.conda#cc9324e614a297fdf23439d887d3513d +https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-19.0.1-h08228c5_3_cpu.conda#a58e4763af8293deaac77b63bc7804d8 +https://conda.anaconda.org/conda-forge/linux-64/pyarrow-19.0.1-py313h78bf25f_0.conda#e8efe6998a383dd149787c83d3d6a92e diff --git a/build_tools/github/pylatest_conda_forge_cuda_array-api_linux-64_environment.yml b/build_tools/github/pylatest_conda_forge_cuda_array-api_linux-64_environment.yml new file mode 100644 index 0000000000000..bbfb91d24fd1a --- /dev/null +++ b/build_tools/github/pylatest_conda_forge_cuda_array-api_linux-64_environment.yml @@ -0,0 +1,32 @@ +# DO NOT EDIT: this file is generated from the specification found in the +# following script to centralize the configuration for CI builds: +# build_tools/update_environments_and_lock_files.py +channels: + - conda-forge + - pytorch + - nvidia +dependencies: + - python + - numpy 
+ - blas + - scipy + - cython + - joblib + - threadpoolctl + - matplotlib + - pandas + - pyamg + - pytest + - pytest-xdist + - pillow + - pip + - ninja + - meson-python + - pytest-cov + - coverage + - ccache + - pytorch-gpu + - polars + - pyarrow + - cupy + - array-api-strict diff --git a/build_tools/github/pymin_conda_forge_arm_environment.yml b/build_tools/github/pymin_conda_forge_arm_environment.yml new file mode 100644 index 0000000000000..c65ab4aaecf14 --- /dev/null +++ b/build_tools/github/pymin_conda_forge_arm_environment.yml @@ -0,0 +1,22 @@ +# DO NOT EDIT: this file is generated from the specification found in the +# following script to centralize the configuration for CI builds: +# build_tools/update_environments_and_lock_files.py +channels: + - conda-forge +dependencies: + - python=3.10 + - numpy + - blas + - scipy + - cython + - joblib + - threadpoolctl + - matplotlib + - pytest + - pytest-xdist + - pillow + - pip + - ninja + - meson-python + - pip + - ccache diff --git a/build_tools/github/pymin_conda_forge_arm_linux-aarch64_conda.lock b/build_tools/github/pymin_conda_forge_arm_linux-aarch64_conda.lock new file mode 100644 index 0000000000000..fa19d32158855 --- /dev/null +++ b/build_tools/github/pymin_conda_forge_arm_linux-aarch64_conda.lock @@ -0,0 +1,160 @@ +# Generated by conda-lock. +# platform: linux-aarch64 +# input_hash: f12646c755adbf5f02f95c5d07e868bf1570777923e737bc27273eb1a5e40cd7 +@EXPLICIT +https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2#0c96522c6bdaed4b1566d11387caaf45 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2#34893075a5c9e55cdafac56607368fc6 +https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2#4d59c254e01d9cde7957100457e2d5fb +https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda#49023d73832ef61042f6a237cb2687e7 +https://conda.anaconda.org/conda-forge/linux-aarch64/ld_impl_linux-aarch64-2.43-h80caac9_4.conda#80c9ad5e05e91bb6c0967af3880c9742 +https://conda.anaconda.org/conda-forge/linux-aarch64/libglvnd-1.7.0-hd24410f_2.conda#9e115653741810778c9a915a2f8439e7 +https://conda.anaconda.org/conda-forge/linux-aarch64/libgomp-15.1.0-he277a41_2.conda#a28544b28961994eab37e1132a7dadcf +https://conda.anaconda.org/conda-forge/noarch/python_abi-3.10-7_cp310.conda#44e871cba2b162368476a84b8d040b6c +https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda#4222072737ccff51314b5ece9c7d6f5a +https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-2_gnu.tar.bz2#6168d71addc746e8f2b8d57dfd2edcea +https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.4.26-hbd8a1cb_0.conda#95db94f75ba080a22eb623590993167b +https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-0.tar.bz2#f766549260d6815b0c52253f1fb1bb29 +https://conda.anaconda.org/conda-forge/linux-aarch64/libegl-1.7.0-hd24410f_2.conda#cf105bce884e4ef8c8ccdca9fe6695e7 +https://conda.anaconda.org/conda-forge/linux-aarch64/libopengl-1.7.0-hd24410f_2.conda#cf9d12bfab305e48d095a4c79002c922 +https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2#fee5683a3f04bd15cbd8318b096a27ab +https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-15.1.0-he277a41_2.conda#224e999bbcad260d7bd4c0c27fdb99a4 +https://conda.anaconda.org/conda-forge/linux-aarch64/alsa-lib-1.2.14-h86ecc28_0.conda#a696b24c1b473ecc4774bcb5a6ac6337 
+https://conda.anaconda.org/conda-forge/linux-aarch64/libbrotlicommon-1.1.0-h86ecc28_2.conda#3ee026955c688f551a9999840cff4c67 +https://conda.anaconda.org/conda-forge/linux-aarch64/libdeflate-1.24-he377734_0.conda#f0b3d6494663b3385bf87fc206d7451a +https://conda.anaconda.org/conda-forge/linux-aarch64/libexpat-2.7.0-h5ad3122_0.conda#d41a057e7968705dae8dcb7c8ba2c8dd +https://conda.anaconda.org/conda-forge/linux-aarch64/libffi-3.4.6-he21f813_1.conda#15a131f30cae36e9a655ca81fee9a285 +https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-ng-15.1.0-he9431aa_2.conda#d12a4b26073751bbc3db18de83ccba5f +https://conda.anaconda.org/conda-forge/linux-aarch64/libgfortran5-15.1.0-hbc25352_2.conda#4b5f4d119f9b28f254f82dbe56b2406f +https://conda.anaconda.org/conda-forge/linux-aarch64/libiconv-1.18-hc99b53d_1.conda#81541d85a45fbf4d0a29346176f1f21c +https://conda.anaconda.org/conda-forge/linux-aarch64/libjpeg-turbo-3.1.0-h86ecc28_0.conda#a689388210d502364b79e8b19e7fa2cb +https://conda.anaconda.org/conda-forge/linux-aarch64/liblzma-5.8.1-h86ecc28_1.conda#8ced9a547a29f7a71b7f15a4443ad1de +https://conda.anaconda.org/conda-forge/linux-aarch64/libstdcxx-15.1.0-h3f4de04_2.conda#6247ea6d1ecac20a9e98674342984726 +https://conda.anaconda.org/conda-forge/linux-aarch64/libwebp-base-1.5.0-h0886dbf_0.conda#95ef4a689b8cc1b7e18b53784d88f96b +https://conda.anaconda.org/conda-forge/linux-aarch64/libzlib-1.3.1-h86ecc28_2.conda#08aad7cbe9f5a6b460d0976076b6ae64 +https://conda.anaconda.org/conda-forge/linux-aarch64/ncurses-6.5-ha32ae93_3.conda#182afabe009dc78d8b73100255ee6868 +https://conda.anaconda.org/conda-forge/linux-aarch64/openssl-3.5.0-hd08dc88_1.conda#ee68fdc3a8723e9c58bdd2f10544658f +https://conda.anaconda.org/conda-forge/linux-aarch64/pthread-stubs-0.4-h86ecc28_1002.conda#bb5a90c93e3bac3d5690acf76b4a6386 +https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libice-1.1.2-h86ecc28_0.conda#c8d8ec3e00cd0fd8a231789b91a7c5b7 +https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxau-1.0.12-h86ecc28_0.conda#d5397424399a66d33c80b1f2345a36a6 +https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxdmcp-1.1.5-h57736b2_0.conda#25a5a7b797fe6e084e04ffe2db02fc62 +https://conda.anaconda.org/conda-forge/linux-aarch64/bzip2-1.0.8-h68df207_7.conda#56398c28220513b9ea13d7b450acfb20 +https://conda.anaconda.org/conda-forge/linux-aarch64/double-conversion-3.3.1-h5ad3122_0.conda#399959d889e1a73fc99f12ce480e77e1 +https://conda.anaconda.org/conda-forge/linux-aarch64/keyutils-1.6.1-h4e544f5_0.tar.bz2#1f24853e59c68892452ef94ddd8afd4b +https://conda.anaconda.org/conda-forge/linux-aarch64/lerc-4.0.0-hfdc4d58_1.conda#60dceb7e876f4d74a9cbd42bbbc6b9cf +https://conda.anaconda.org/conda-forge/linux-aarch64/libbrotlidec-1.1.0-h86ecc28_2.conda#e64d0f3b59c7c4047446b97a8624a72d +https://conda.anaconda.org/conda-forge/linux-aarch64/libbrotlienc-1.1.0-h86ecc28_2.conda#0e9bd365480c72b25c71a448257b537d +https://conda.anaconda.org/conda-forge/linux-aarch64/libedit-3.1.20250104-pl5321h976ea20_0.conda#fb640d776fc92b682a14e001980825b1 +https://conda.anaconda.org/conda-forge/linux-aarch64/libgfortran-15.1.0-he9431aa_2.conda#dc8675aa2658bb0d92cefbff83ce2db8 +https://conda.anaconda.org/conda-forge/linux-aarch64/libnsl-2.0.1-h31becfc_0.conda#c14f32510f694e3185704d89967ec422 +https://conda.anaconda.org/conda-forge/linux-aarch64/libntlm-1.4-hf897c2e_1002.tar.bz2#835c7c4137821de5c309f4266a51ba89 +https://conda.anaconda.org/conda-forge/linux-aarch64/libpciaccess-0.18-h31becfc_0.conda#6d48179630f00e8c9ad9e30879ce1e54 
+https://conda.anaconda.org/conda-forge/linux-aarch64/libpng-1.6.47-hec79eb8_0.conda#c4b1ba0d7cef5002759d2f156722feee +https://conda.anaconda.org/conda-forge/linux-aarch64/libsqlite-3.50.0-h5eb1b54_0.conda#634a05a598cd4b3b852443f8e3b45003 +https://conda.anaconda.org/conda-forge/linux-aarch64/libstdcxx-ng-15.1.0-hf1166c9_2.conda#18e532d1a39ae9f78cc8988a034f1cae +https://conda.anaconda.org/conda-forge/linux-aarch64/libuuid-2.38.1-hb4cce97_0.conda#000e30b09db0b7c775b21695dff30969 +https://conda.anaconda.org/conda-forge/linux-aarch64/libxcb-1.17.0-h262b8f6_0.conda#cd14ee5cca2464a425b1dbfc24d90db2 +https://conda.anaconda.org/conda-forge/linux-aarch64/libxcrypt-4.4.36-h31becfc_1.conda#b4df5d7d4b63579d081fd3a4cf99740e +https://conda.anaconda.org/conda-forge/linux-aarch64/ninja-1.12.1-h17cf362_1.conda#885414635e2a65ed06f284f6d569cdff +https://conda.anaconda.org/conda-forge/linux-aarch64/pixman-0.46.0-h86a87f0_0.conda#1328d5bad76f7b31926ccd2a33e0d6ef +https://conda.anaconda.org/conda-forge/linux-aarch64/readline-8.2-h8382b9d_2.conda#c0f08fc2737967edde1a272d4bf41ed9 +https://conda.anaconda.org/conda-forge/linux-aarch64/tk-8.6.13-noxft_h5688188_102.conda#2562c9bfd1de3f9c590f0fe53858d85c +https://conda.anaconda.org/conda-forge/linux-aarch64/wayland-1.23.1-h698ed42_1.conda#229b00f81a229af79547a7e4776ccf6e +https://conda.anaconda.org/conda-forge/linux-aarch64/zstd-1.5.7-hbcf94c1_2.conda#5be90c5a3e4b43c53e38f50a85e11527 +https://conda.anaconda.org/conda-forge/linux-aarch64/brotli-bin-1.1.0-h86ecc28_2.conda#7d48b185fe1f722f8cda4539bb931f85 +https://conda.anaconda.org/conda-forge/linux-aarch64/graphite2-1.3.13-h2f0025b_1003.conda#f33009add6a08358bc12d114ceec1304 +https://conda.anaconda.org/conda-forge/linux-aarch64/icu-75.1-hf9b3779_0.conda#268203e8b983fddb6412b36f2024e75c +https://conda.anaconda.org/conda-forge/linux-aarch64/krb5-1.21.3-h50a48e9_0.conda#29c10432a2ca1472b53f299ffb2ffa37 +https://conda.anaconda.org/conda-forge/linux-aarch64/libdrm-2.4.124-h86ecc28_0.conda#a8058bcb6b4fa195aaa20452437c7727 +https://conda.anaconda.org/conda-forge/linux-aarch64/libfreetype6-2.13.3-he93130f_1.conda#51eae9012d75b8f7e4b0adfe61a83330 +https://conda.anaconda.org/conda-forge/linux-aarch64/libgfortran-ng-15.1.0-he9431aa_2.conda#55c5691e8b65612aaa0ef109cf645724 +https://conda.anaconda.org/conda-forge/linux-aarch64/libopenblas-0.3.29-pthreads_h9d3fd7e_0.conda#a99e2bfcb1ad6362544c71281eb617e9 +https://conda.anaconda.org/conda-forge/linux-aarch64/libtiff-4.7.0-h7c15681_5.conda#264a9aac20276b1784dac8c5f8d3704a +https://conda.anaconda.org/conda-forge/linux-aarch64/pcre2-10.45-hf4ec17f_0.conda#ad22a9a9497f7aedce73e0da53cd215f +https://conda.anaconda.org/conda-forge/linux-aarch64/python-3.10.17-h256493d_0_cpython.conda#c496213b6ede3c5a30ce1bf02bebf382 +https://conda.anaconda.org/conda-forge/linux-aarch64/qhull-2020.2-h70be974_5.conda#bb138086d938e2b64f5f364945793ebf +https://conda.anaconda.org/conda-forge/linux-aarch64/xcb-util-0.4.1-h5c728e9_2.conda#b4cf8ba6cff9cdf1249bcfe1314222b0 +https://conda.anaconda.org/conda-forge/linux-aarch64/xcb-util-keysyms-0.4.1-h5c728e9_0.conda#57ca8564599ddf8b633c4ea6afee6f3a +https://conda.anaconda.org/conda-forge/linux-aarch64/xcb-util-renderutil-0.3.10-h5c728e9_0.conda#7beeda4223c5484ef72d89fb66b7e8c1 +https://conda.anaconda.org/conda-forge/linux-aarch64/xcb-util-wm-0.4.2-h5c728e9_0.conda#f14dcda6894722e421da2b7dcffb0b78 +https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libsm-1.2.6-h0808dbd_0.conda#2d1409c50882819cb1af2de82e2b7208 
+https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libx11-1.8.12-hca56bd8_0.conda#3df132f0048b9639bc091ef22937c111 +https://conda.anaconda.org/conda-forge/linux-aarch64/brotli-1.1.0-h86ecc28_2.conda#5094acc34eb173f74205c0b55f0dd4a4 +https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda#962b9857ee8e7018c22f2776ffa0b2d7 +https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_1.conda#44600c4667a319d67dbe0681fc0bc833 +https://conda.anaconda.org/conda-forge/linux-aarch64/cyrus-sasl-2.1.27-hf6b2984_7.conda#7a85d417c8acd7a5215c082c5b9219e5 +https://conda.anaconda.org/conda-forge/linux-aarch64/cython-3.1.1-py310hc86cfe9_1.conda#927cfc32db9db46c1804d977aaa11173 +https://conda.anaconda.org/conda-forge/noarch/execnet-2.1.1-pyhd8ed1ab_1.conda#a71efeae2c160f6789900ba2631a2c90 +https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_1.conda#6837f3eff7dcea42ecd714ce1ac2b108 +https://conda.anaconda.org/conda-forge/linux-aarch64/kiwisolver-1.4.7-py310h5d7f10c_0.conda#b86d594bf17c9ad7a291593368ae8ba7 +https://conda.anaconda.org/conda-forge/linux-aarch64/lcms2-2.17-hc88f144_0.conda#b87b1abd2542cf65a00ad2e2461a3083 +https://conda.anaconda.org/conda-forge/linux-aarch64/libblas-3.9.0-31_h1a9f1db_openblas.conda#48bd5bf15ccf3e409840be9caafc0ad5 +https://conda.anaconda.org/conda-forge/linux-aarch64/libcups-2.3.3-h405e4a8_4.conda#d42c670b0c96c1795fd859d5e0275a55 +https://conda.anaconda.org/conda-forge/linux-aarch64/libfreetype-2.13.3-h8af1aa0_1.conda#2d4a1c3dcabb80b4a56d5c34bdacea08 +https://conda.anaconda.org/conda-forge/linux-aarch64/libglib-2.84.2-hc022ef1_0.conda#51323eab8e9f049d001424828c4c25a4 +https://conda.anaconda.org/conda-forge/linux-aarch64/libglx-1.7.0-hd24410f_2.conda#1d4269e233636148696a67e2d30dad2a +https://conda.anaconda.org/conda-forge/linux-aarch64/libhiredis-1.0.2-h05efe27_0.tar.bz2#a87f068744fd20334cd41489eb163bee +https://conda.anaconda.org/conda-forge/linux-aarch64/libxml2-2.13.8-he060846_0.conda#c73dfe6886cc8d39a09c357a36f91fb2 +https://conda.anaconda.org/conda-forge/noarch/meson-1.8.1-pyhe01879c_0.conda#f3cccd9a6ce5331ae33f69ade5529162 +https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyh9f0ad1d_0.tar.bz2#2ba8498c1018c1e9c61eb99b973dfe19 +https://conda.anaconda.org/conda-forge/linux-aarch64/openblas-0.3.29-pthreads_h3a8cbd8_0.conda#4ec5b6144709ced5e7933977675f61c6 +https://conda.anaconda.org/conda-forge/linux-aarch64/openjpeg-2.5.3-h3f56577_0.conda#04231368e4af50d11184b50e14250993 +https://conda.anaconda.org/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda#58335b26c38bf4a20f399384c33cbcf9 +https://conda.anaconda.org/conda-forge/noarch/pluggy-1.6.0-pyhd8ed1ab_0.conda#7da7ccd349dbf6487a7778579d2bb971 +https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.2.3-pyhd8ed1ab_1.conda#513d3c262ee49b54a8fec85c5bc99764 +https://conda.anaconda.org/conda-forge/noarch/setuptools-80.9.0-pyhff2d567_0.conda#4de79c071274a53dcaf2a8c749d1499e +https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhd8ed1ab_0.conda#a451d576819089b0d672f18768be0f65 +https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-3.6.0-pyhecae5ae_0.conda#9d64911b31d57ca443e9f1e36b04385f +https://conda.anaconda.org/conda-forge/noarch/tomli-2.2.1-pyhd8ed1ab_1.conda#ac944244f1fed2eb49bae07193ae8215 +https://conda.anaconda.org/conda-forge/linux-aarch64/tornado-6.5.1-py310h78583b1_0.conda#e1e576b66cca7642b0a66310b675ea36 +https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.13.2-pyh29332c3_0.conda#83fc6ae00127671e301c9f44254c31b8 
+https://conda.anaconda.org/conda-forge/linux-aarch64/unicodedata2-16.0.0-py310ha766c32_0.conda#2936ce19a675e162962f396c7b40b905 +https://conda.anaconda.org/conda-forge/noarch/wheel-0.45.1-pyhd8ed1ab_1.conda#75cb7132eb58d97896e173ef12ac9986 +https://conda.anaconda.org/conda-forge/linux-aarch64/xcb-util-image-0.4.0-h5c728e9_2.conda#b82e5c78dbbfa931980e8bfe83bce913 +https://conda.anaconda.org/conda-forge/linux-aarch64/xkeyboard-config-2.44-h86ecc28_0.conda#4d91bf5ccb5b31be8e070fda2ed13c50 +https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxext-1.3.6-h57736b2_0.conda#bd1e86dd8aa3afd78a4bfdb4ef918165 +https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxfixes-6.0.1-h57736b2_0.conda#78f8715c002cc66991d7c11e3cf66039 +https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxrender-0.9.12-h86ecc28_0.conda#ae2c2dd0e2d38d249887727db2af960e +https://conda.anaconda.org/conda-forge/linux-aarch64/ccache-4.11.3-h4889ad1_0.conda#e0b9e519da2bf0fb8c48381daf87a194 +https://conda.anaconda.org/conda-forge/linux-aarch64/dbus-1.16.2-heda779d_0.conda#9203b74bb1f3fa0d6f308094b3b44c1e +https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.0-pyhd8ed1ab_0.conda#72e42d28960d875c7654614f8b50939a +https://conda.anaconda.org/conda-forge/linux-aarch64/fonttools-4.58.1-py310heeae437_0.conda#837e7673572a3d0ecd6cf5a31dee2f35 +https://conda.anaconda.org/conda-forge/linux-aarch64/freetype-2.13.3-h8af1aa0_1.conda#71c4cbe1b384a8e7b56993394a435343 +https://conda.anaconda.org/conda-forge/noarch/joblib-1.5.1-pyhd8ed1ab_0.conda#fb1c14694de51a476ce8636d92b6f42c +https://conda.anaconda.org/conda-forge/linux-aarch64/libcblas-3.9.0-31_hab92f65_openblas.conda#6b81dbae56a519f1ec2f25e0ee2f4334 +https://conda.anaconda.org/conda-forge/linux-aarch64/libgl-1.7.0-hd24410f_2.conda#0d00176464ebb25af83d40736a2cd3bb +https://conda.anaconda.org/conda-forge/linux-aarch64/liblapack-3.9.0-31_h411afd4_openblas.conda#41dbff5eb805a75c120a7b7a1c744dc2 +https://conda.anaconda.org/conda-forge/linux-aarch64/libllvm20-20.1.6-h07bd352_0.conda#978603200db5e721247fdb529a6e7321 +https://conda.anaconda.org/conda-forge/linux-aarch64/libxkbcommon-1.10.0-hbab7b08_0.conda#36cd1db31e923c6068b7e0e6fce2cd7b +https://conda.anaconda.org/conda-forge/linux-aarch64/libxslt-1.1.39-h1cc9640_0.conda#13e1d3f9188e85c6d59a98651aced002 +https://conda.anaconda.org/conda-forge/linux-aarch64/openldap-2.6.10-h30c48ee_0.conda#48f31a61be512ec1929f4b4a9cedf4bd +https://conda.anaconda.org/conda-forge/linux-aarch64/pillow-11.2.1-py310h34c99de_0.conda#116816e9f034fcaeafcd878ef8b1e323 +https://conda.anaconda.org/conda-forge/noarch/pip-25.1.1-pyh8b19718_0.conda#32d0781ace05105cc99af55d36cbec7c +https://conda.anaconda.org/conda-forge/noarch/pyproject-metadata-0.9.1-pyhd8ed1ab_0.conda#22ae7c6ea81e0c8661ef32168dda929b +https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhff2d567_1.conda#5ba79d7c71f03c678c8ead841f347d6e +https://conda.anaconda.org/conda-forge/linux-aarch64/xcb-util-cursor-0.1.5-h86ecc28_0.conda#d6bb2038d26fa118d5cbc2761116f3e5 +https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxcomposite-0.4.6-h86ecc28_2.conda#86051eee0766c3542be24844a9c3cf36 +https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxcursor-1.2.3-h86ecc28_0.conda#f2054759c2203d12d0007005e1f1296d +https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxdamage-1.1.6-h86ecc28_0.conda#d5773c4e4d64428d7ddaa01f6f845dc7 +https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxi-1.8.2-h57736b2_0.conda#eeee3bdb31c6acde2b81ad1b8c287087 
+https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxrandr-1.5.4-h86ecc28_0.conda#dd3e74283a082381aa3860312e3c721e +https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxxf86vm-1.1.6-h86ecc28_0.conda#d745faa2d7c15092652e40a22bb261ed +https://conda.anaconda.org/conda-forge/linux-aarch64/fontconfig-2.15.0-h8dda3cd_1.conda#112b71b6af28b47c624bcbeefeea685b +https://conda.anaconda.org/conda-forge/linux-aarch64/libclang-cpp20.1-20.1.6-default_h7d4303a_0.conda#688d99949628971e08e6e44ee8b68a28 +https://conda.anaconda.org/conda-forge/linux-aarch64/libclang13-20.1.6-default_h9e36cb9_0.conda#ad384e458f9b9c2d5b22a399786b226a +https://conda.anaconda.org/conda-forge/linux-aarch64/liblapacke-3.9.0-31_hc659ca5_openblas.conda#256bb281d78e5b8927ff13a1cde9f6f5 +https://conda.anaconda.org/conda-forge/linux-aarch64/libpq-17.5-hf590da8_0.conda#b5a01e5aa04651ccf5865c2d029affa3 +https://conda.anaconda.org/conda-forge/noarch/meson-python-0.18.0-pyh70fd9c4_0.conda#576c04b9d9f8e45285fb4d9452c26133 +https://conda.anaconda.org/conda-forge/linux-aarch64/numpy-2.2.6-py310h6e5608f_0.conda#9e9f1f279eb02c41bda162a42861adc0 +https://conda.anaconda.org/conda-forge/noarch/pytest-8.3.5-pyhd8ed1ab_0.conda#c3c9316209dec74a705a36797970c6be +https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxtst-1.2.5-h57736b2_3.conda#c05698071b5c8e0da82a282085845860 +https://conda.anaconda.org/conda-forge/linux-aarch64/blas-devel-3.9.0-31_h9678261_openblas.conda#a2cc143d7e25e52a915cb320e5b0d592 +https://conda.anaconda.org/conda-forge/linux-aarch64/cairo-1.18.4-h83712da_0.conda#cd55953a67ec727db5dc32b167201aa6 +https://conda.anaconda.org/conda-forge/linux-aarch64/contourpy-1.3.2-py310hf54e67a_0.conda#779694434d1f0a67c5260db76b7b7907 +https://conda.anaconda.org/conda-forge/noarch/pytest-xdist-3.7.0-pyhd8ed1ab_0.conda#15353a2a0ea6dfefaa52fc5ab5b98f41 +https://conda.anaconda.org/conda-forge/linux-aarch64/scipy-1.15.2-py310hf37559f_0.conda#5c9b72f10d2118d943a5eaaf2f396891 +https://conda.anaconda.org/conda-forge/linux-aarch64/blas-2.131-openblas.conda#51c5f346e1ebee750f76066490059df9 +https://conda.anaconda.org/conda-forge/linux-aarch64/harfbuzz-11.2.1-h405b6a2_0.conda#b55680fc90e9747dc858e7ceb0abc2b2 +https://conda.anaconda.org/conda-forge/linux-aarch64/matplotlib-base-3.10.3-py310h2cc5e2d_0.conda#e29f4329f4f76cf14f74ed86dcc59bac +https://conda.anaconda.org/conda-forge/linux-aarch64/qt6-main-6.9.0-h13135bf_3.conda#f3d24ce6f388642e76f4917b5069c2e9 +https://conda.anaconda.org/conda-forge/linux-aarch64/pyside6-6.9.0-py310hee8ad4f_0.conda#68f556281ac23f1780381f00de99d66d +https://conda.anaconda.org/conda-forge/linux-aarch64/matplotlib-3.10.3-py310hbbe02a8_0.conda#08982f6ac753e962d59160b08839221b diff --git a/build_tools/github/repair_windows_wheels.sh b/build_tools/github/repair_windows_wheels.sh old mode 100644 new mode 100755 index de564fc177c89..8f51a34d4039b --- a/build_tools/github/repair_windows_wheels.sh +++ b/build_tools/github/repair_windows_wheels.sh @@ -5,12 +5,12 @@ set -x WHEEL=$1 DEST_DIR=$2 -BITNESS=$3 # By default, the Windows wheels are not repaired. 
# In this case, we need to vendor VCRUNTIME140.dll +pip install wheel wheel unpack "$WHEEL" WHEEL_DIRNAME=$(ls -d scikit_learn-*) -python build_tools/github/vendor.py "$WHEEL_DIRNAME" "$BITNESS" +python build_tools/github/vendor.py "$WHEEL_DIRNAME" wheel pack "$WHEEL_DIRNAME" -d "$DEST_DIR" rm -rf "$WHEEL_DIRNAME" diff --git a/build_tools/github/test_source.sh b/build_tools/github/test_source.sh old mode 100644 new mode 100755 index 3a65a657addec..c93d22a08e791 --- a/build_tools/github/test_source.sh +++ b/build_tools/github/test_source.sh @@ -13,7 +13,6 @@ python -m pip install pytest pandas # Run the tests on the installed source distribution mkdir tmp_for_test -cp scikit-learn/scikit-learn/conftest.py tmp_for_test cd tmp_for_test pytest --pyargs sklearn diff --git a/build_tools/github/test_wheels.sh b/build_tools/github/test_wheels.sh deleted file mode 100644 index 58a05b6182006..0000000000000 --- a/build_tools/github/test_wheels.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -set -e -set -x - -if [[ "$OSTYPE" != "linux-gnu" ]]; then - # The Linux test environment is run in a Docker container and - # it is not possible to copy the test configuration file (yet) - cp $CONFTEST_PATH $CONFTEST_NAME -fi - -pytest --pyargs sklearn - -# Test that there are no links to system libraries -python -m threadpoolctl -i sklearn diff --git a/build_tools/github/test_windows_wheels.sh b/build_tools/github/test_windows_wheels.sh old mode 100644 new mode 100755 index ff3f823be754b..c96ec4ad89d3e --- a/build_tools/github/test_windows_wheels.sh +++ b/build_tools/github/test_windows_wheels.sh @@ -4,21 +4,27 @@ set -e set -x PYTHON_VERSION=$1 -BITNESS=$2 +PROJECT_DIR=$2 -if [[ "$PYTHON_VERSION" == "36" || "$BITNESS" == "32" ]]; then - # For Python 3.6 and 32-bit architecture use the regular - # test command (outside of the minimal Docker container) - cp $CONFTEST_PATH $CONFTEST_NAME - pytest --pyargs sklearn - python -m threadpoolctl -i sklearn -else - docker container run -e SKLEARN_SKIP_NETWORK_TESTS=1 \ - -e OMP_NUM_THREADS=2 \ - -e OPENBLAS_NUM_THREADS=2 \ - --rm scikit-learn/minimal-windows \ - powershell -Command "pytest --pyargs sklearn" +python $PROJECT_DIR/build_tools/wheels/check_license.py + +FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" - docker container run --rm scikit-learn/minimal-windows \ - powershell -Command "python -m threadpoolctl -i sklearn" +if [[ $FREE_THREADED_BUILD == "False" ]]; then + # Run the tests for the scikit-learn wheel in a minimal Windows environment + # without any developer runtime libraries installed to ensure that it does not + # implicitly rely on the presence of the DLLs of such runtime libraries. 
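+    # (For context: FREE_THREADED_BUILD above is derived from sysconfig's
+    # Py_GIL_DISABLED config var, which is 1 on free-threaded CPython builds and
+    # 0 or unset otherwise, so the one-liner prints the literal string "True" or
+    # "False", which is what the string comparison in the surrounding
+    # if-statement relies on.)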
+ docker container run \ + --rm scikit-learn/minimal-windows \ + powershell -Command "python -c 'import sklearn; sklearn.show_versions()'" + + docker container run \ + -e SKLEARN_SKIP_NETWORK_TESTS=1 \ + --rm scikit-learn/minimal-windows \ + powershell -Command "pytest --pyargs sklearn" +else + # This is too cumbersome to use a Docker image in the free-threaded case + export PYTHON_GIL=0 + python -c "import sklearn; sklearn.show_versions()" + pytest --pyargs sklearn fi diff --git a/build_tools/github/upload_anaconda.sh b/build_tools/github/upload_anaconda.sh old mode 100644 new mode 100755 index 7651576cf558e..b53f27b75e72b --- a/build_tools/github/upload_anaconda.sh +++ b/build_tools/github/upload_anaconda.sh @@ -3,16 +3,19 @@ set -e set -x -if [ "$GITHUB_EVENT_NAME" == "schedule" ]; then - ANACONDA_ORG="scipy-wheels-nightly" +if [[ "$GITHUB_EVENT_NAME" == "schedule" \ + || "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]]; then + ANACONDA_ORG="scientific-python-nightly-wheels" ANACONDA_TOKEN="$SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN" else ANACONDA_ORG="scikit-learn-wheels-staging" ANACONDA_TOKEN="$SCIKIT_LEARN_STAGING_UPLOAD_TOKEN" fi -pip install git+https://github.com/Anaconda-Server/anaconda-client +export PATH=$CONDA/bin:$PATH +conda create -n upload -y anaconda-client +source activate upload # Force a replacement if the remote file already exists -anaconda -t $ANACONDA_TOKEN upload --force -u $ANACONDA_ORG dist/artifact/* +anaconda -t $ANACONDA_TOKEN upload --force -u $ANACONDA_ORG $ARTIFACTS_PATH/* echo "Index: https://pypi.anaconda.org/$ANACONDA_ORG/simple" diff --git a/build_tools/github/vendor.py b/build_tools/github/vendor.py index 5b367f3fb4ecc..28b44be3c9aa9 100644 --- a/build_tools/github/vendor.py +++ b/build_tools/github/vendor.py @@ -1,9 +1,4 @@ -"""Embed vcomp140.dll, vcruntime140.dll and vcruntime140_1.dll. - -Note that vcruntime140_1.dll is only required (and available) -for 64-bit architectures. -""" - +"""Embed vcomp140.dll and msvcp140.dll.""" import os import os.path as op @@ -11,67 +6,32 @@ import sys import textwrap - TARGET_FOLDER = op.join("sklearn", ".libs") DISTRIBUTOR_INIT = op.join("sklearn", "_distributor_init.py") VCOMP140_SRC_PATH = "C:\\Windows\\System32\\vcomp140.dll" -VCRUNTIME140_SRC_PATH = "C:\\Windows\\System32\\vcruntime140.dll" -VCRUNTIME140_1_SRC_PATH = "C:\\Windows\\System32\\vcruntime140_1.dll" - - -def make_distributor_init_32_bits(distributor_init, - vcomp140_dll_filename, - vcruntime140_dll_filename): - """Create a _distributor_init.py file for 32-bit architectures. - - This file is imported first when importing the sklearn package - so as to pre-load the vendored vcomp140.dll and vcruntime140.dll. - """ - with open(distributor_init, "wt") as f: - f.write(textwrap.dedent(""" - '''Helper to preload vcomp140.dll and vcruntime140.dll to - prevent "not found" errors. - - Once vcomp140.dll and vcruntime140.dll are preloaded, the - namespace is made available to any subsequent vcomp140.dll - and vcruntime140.dll. This is created as part of the scripts - that build the wheel. 
- ''' +MSVCP140_SRC_PATH = "C:\\Windows\\System32\\msvcp140.dll" - import os - import os.path as op - from ctypes import WinDLL - - - if os.name == "nt": - # Load vcomp140.dll and vcruntime140.dll - libs_path = op.join(op.dirname(__file__), ".libs") - vcomp140_dll_filename = op.join(libs_path, "{0}") - vcruntime140_dll_filename = op.join(libs_path, "{1}") - WinDLL(op.abspath(vcomp140_dll_filename)) - WinDLL(op.abspath(vcruntime140_dll_filename)) - """.format(vcomp140_dll_filename, vcruntime140_dll_filename))) - - -def make_distributor_init_64_bits(distributor_init, - vcomp140_dll_filename, - vcruntime140_dll_filename, - vcruntime140_1_dll_filename): +def make_distributor_init_64_bits( + distributor_init, + vcomp140_dll_filename, + msvcp140_dll_filename, +): """Create a _distributor_init.py file for 64-bit architectures. This file is imported first when importing the sklearn package - so as to pre-load the vendored vcomp140.dll, vcruntime140.dll - and vcruntime140_1.dll. + so as to pre-load the vendored vcomp140.dll and msvcp140.dll. """ with open(distributor_init, "wt") as f: - f.write(textwrap.dedent(""" - '''Helper to preload vcomp140.dll, vcruntime140.dll and - vcruntime140_1.dll to prevent "not found" errors. + f.write( + textwrap.dedent( + """ + '''Helper to preload vcomp140.dll and msvcp140.dll to prevent + "not found" errors. - Once vcomp140.dll, vcruntime140.dll and vcruntime140_1.dll are + Once vcomp140.dll and msvcp140.dll are preloaded, the namespace is made available to any subsequent - vcomp140.dll, vcruntime140.dll and vcruntime140_1.dll. This is + vcomp140.dll and msvcp140.dll. This is created as part of the scripts that build the wheel. ''' @@ -82,36 +42,32 @@ def make_distributor_init_64_bits(distributor_init, if os.name == "nt": - # Load vcomp140.dll, vcruntime140.dll and vcruntime140_1.dll libs_path = op.join(op.dirname(__file__), ".libs") vcomp140_dll_filename = op.join(libs_path, "{0}") - vcruntime140_dll_filename = op.join(libs_path, "{1}") - vcruntime140_1_dll_filename = op.join(libs_path, "{2}") + msvcp140_dll_filename = op.join(libs_path, "{1}") WinDLL(op.abspath(vcomp140_dll_filename)) - WinDLL(op.abspath(vcruntime140_dll_filename)) - WinDLL(op.abspath(vcruntime140_1_dll_filename)) - """.format(vcomp140_dll_filename, - vcruntime140_dll_filename, - vcruntime140_1_dll_filename))) + WinDLL(op.abspath(msvcp140_dll_filename)) + """.format( + vcomp140_dll_filename, + msvcp140_dll_filename, + ) + ) + ) -def main(wheel_dirname, bitness): - """Embed vcomp140.dll, vcruntime140.dll and vcruntime140_1.dll.""" +def main(wheel_dirname): + """Embed vcomp140.dll and msvcp140.dll.""" if not op.exists(VCOMP140_SRC_PATH): raise ValueError(f"Could not find {VCOMP140_SRC_PATH}.") - if not op.exists(VCRUNTIME140_SRC_PATH): - raise ValueError(f"Could not find {VCRUNTIME140_SRC_PATH}.") - - if not op.exists(VCRUNTIME140_1_SRC_PATH) and bitness == "64": - raise ValueError(f"Could not find {VCRUNTIME140_1_SRC_PATH}.") + if not op.exists(MSVCP140_SRC_PATH): + raise ValueError(f"Could not find {MSVCP140_SRC_PATH}.") if not op.isdir(wheel_dirname): raise RuntimeError(f"Could not find {wheel_dirname} file.") vcomp140_dll_filename = op.basename(VCOMP140_SRC_PATH) - vcruntime140_dll_filename = op.basename(VCRUNTIME140_SRC_PATH) - vcruntime140_1_dll_filename = op.basename(VCRUNTIME140_1_SRC_PATH) + msvcp140_dll_filename = op.basename(MSVCP140_SRC_PATH) target_folder = op.join(wheel_dirname, TARGET_FOLDER) distributor_init = op.join(wheel_dirname, DISTRIBUTOR_INIT) @@ -123,26 +79,18 @@ def 
main(wheel_dirname, bitness): print(f"Copying {VCOMP140_SRC_PATH} to {target_folder}.") shutil.copy2(VCOMP140_SRC_PATH, target_folder) - print(f"Copying {VCRUNTIME140_SRC_PATH} to {target_folder}.") - shutil.copy2(VCRUNTIME140_SRC_PATH, target_folder) - - if bitness == "64": - print(f"Copying {VCRUNTIME140_1_SRC_PATH} to {target_folder}.") - shutil.copy2(VCRUNTIME140_1_SRC_PATH, target_folder) + print(f"Copying {MSVCP140_SRC_PATH} to {target_folder}.") + shutil.copy2(MSVCP140_SRC_PATH, target_folder) # Generate the _distributor_init file in the source tree print("Generating the '_distributor_init.py' file.") - if bitness == "32": - make_distributor_init_32_bits(distributor_init, - vcomp140_dll_filename, - vcruntime140_dll_filename) - else: - make_distributor_init_64_bits(distributor_init, - vcomp140_dll_filename, - vcruntime140_dll_filename, - vcruntime140_1_dll_filename) + make_distributor_init_64_bits( + distributor_init, + vcomp140_dll_filename, + msvcp140_dll_filename, + ) if __name__ == "__main__": - _, wheel_file, bitness = sys.argv - main(wheel_file, bitness) + _, wheel_file = sys.argv + main(wheel_file) diff --git a/build_tools/linting.sh b/build_tools/linting.sh new file mode 100755 index 0000000000000..34b37530e10ff --- /dev/null +++ b/build_tools/linting.sh @@ -0,0 +1,123 @@ +#!/bin/bash + +# Note that any change in this file, adding or removing steps or changing the +# printed messages, should be also reflected in the `get_comment.py` file. + +# This script shouldn't exit if a command / pipeline fails +set +e +# pipefail is necessary to propagate exit codes +set -o pipefail + +global_status=0 + +echo -e "### Running the ruff linter ###\n" +ruff check --output-format=full +status=$? +if [[ $status -eq 0 ]] +then + echo -e "No problem detected by the ruff linter\n" +else + echo -e "Problems detected by ruff check, please fix them\n" + global_status=1 +fi + +echo -e "### Running the ruff formatter ###\n" +ruff format --diff +status=$? +if [[ $status -eq 0 ]] +then + echo -e "No problem detected by the ruff formatter\n" +else + echo -e "Problems detected by ruff format, please run ruff format and commit the result\n" + global_status=1 +fi + +echo -e "### Running mypy ###\n" +mypy sklearn/ +status=$? +if [[ $status -eq 0 ]] +then + echo -e "No problem detected by mypy\n" +else + echo -e "Problems detected by mypy, please fix them\n" + global_status=1 +fi + +echo -e "### Running cython-lint ###\n" +cython-lint sklearn/ +status=$? +if [[ $status -eq 0 ]] +then + echo -e "No problem detected by cython-lint\n" +else + echo -e "Problems detected by cython-lint, please fix them\n" + global_status=1 +fi + +# For docstrings and warnings of deprecated attributes to be rendered +# properly, the `deprecated` decorator must come before the `property` decorator +# (else they are treated as functions) + +echo -e "### Checking for bad deprecation order ###\n" +bad_deprecation_property_order=`git grep -A 10 "@property" -- "*.py" | awk '/@property/,/def /' | grep -B1 "@deprecated"` + +if [ ! 
-z "$bad_deprecation_property_order" ] +then + echo "deprecated decorator should come before property decorator" + echo "found the following occurrences:" + echo $bad_deprecation_property_order + echo -e "\nProblems detected by deprecation order check\n" + global_status=1 +else + echo -e "No problems detected related to deprecation order\n" +fi + +# Check for default doctest directives ELLIPSIS and NORMALIZE_WHITESPACE + +echo -e "### Checking for default doctest directives ###\n" +doctest_directive="$(git grep -nw -E "# doctest\: \+(ELLIPSIS|NORMALIZE_WHITESPACE)")" + +if [ ! -z "$doctest_directive" ] +then + echo "ELLIPSIS and NORMALIZE_WHITESPACE doctest directives are enabled by default, but were found in:" + echo "$doctest_directive" + echo -e "\nProblems detected by doctest directive check\n" + global_status=1 +else + echo -e "No problems detected related to doctest directives\n" +fi + +# Check for joblib.delayed and joblib.Parallel imports +echo -e "### Checking for joblib imports ###\n" +joblib_status=0 +joblib_delayed_import="$(git grep -l -A 10 -E "joblib import.+delayed" -- "*.py" ":!sklearn/utils/parallel.py")" +if [ ! -z "$joblib_delayed_import" ]; then + echo "Use from sklearn.utils.parallel import delayed instead of joblib delayed. The following files contains imports to joblib.delayed:" + echo "$joblib_delayed_import" + joblib_status=1 +fi +joblib_Parallel_import="$(git grep -l -A 10 -E "joblib import.+Parallel" -- "*.py" ":!sklearn/utils/parallel.py")" +if [ ! -z "$joblib_Parallel_import" ]; then + echo "Use from sklearn.utils.parallel import Parallel instead of joblib Parallel. The following files contains imports to joblib.Parallel:" + echo "$joblib_Parallel_import" + joblib_status=1 +fi + +if [[ $joblib_status -eq 0 ]] +then + echo -e "No problems detected related to joblib imports\n" +else + echo -e "\nProblems detected by joblib import check\n" + global_status=1 +fi + +echo -e "### Linting completed ###\n" + +if [[ $global_status -eq 1 ]] +then + echo -e "Linting failed\n" + exit 1 +else + echo -e "Linting passed\n" + exit 0 +fi diff --git a/build_tools/shared.sh b/build_tools/shared.sh index 4aa260675b9be..3c6f238385506 100644 --- a/build_tools/shared.sh +++ b/build_tools/shared.sh @@ -5,7 +5,7 @@ get_dep() { # do not install with none echo elif [[ "${version%%[^0-9.]*}" ]]; then - # version number is explicity passed + # version number is explicitly passed echo "$package==$version" elif [[ "$version" == "latest" ]]; then # use latest @@ -14,3 +14,38 @@ get_dep() { echo "$package==$(python sklearn/_min_dependencies.py $package)" fi } + +show_installed_libraries(){ + # use conda list when inside a conda environment. conda list shows more + # info than pip list, e.g. whether OpenBLAS or MKL is installed as well as + # the version of OpenBLAS or MKL + if [[ -n "$CONDA_PREFIX" ]]; then + conda list + else + python -m pip list + fi +} + +activate_environment() { + if [[ "$DISTRIB" =~ ^conda.* ]]; then + source activate $VIRTUALENV + elif [[ "$DISTRIB" == "ubuntu" || "$DISTRIB" == "debian-32" ]]; then + source $VIRTUALENV/bin/activate + fi +} + +create_conda_environment_from_lock_file() { + ENV_NAME=$1 + LOCK_FILE=$2 + # Because we are using lock-files with the "explicit" format, conda can + # install them directly, provided the lock-file does not contain pip solved + # packages. 
For more details, see + # https://conda.github.io/conda-lock/output/#explicit-lockfile + lock_file_has_pip_packages=$(grep -q files.pythonhosted.org $LOCK_FILE && echo "true" || echo "false") + if [[ "$lock_file_has_pip_packages" == "false" ]]; then + conda create --name $ENV_NAME --file $LOCK_FILE + else + python -m pip install "$(get_dep conda-lock min)" + conda-lock install --name $ENV_NAME $LOCK_FILE + fi +} diff --git a/build_tools/travis/after_success.sh b/build_tools/travis/after_success.sh deleted file mode 100755 index 2123f7efafc22..0000000000000 --- a/build_tools/travis/after_success.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -# This script is meant to be called by the "after_success" step -# defined in ".travis.yml". In particular, we upload the wheels -# of the ARM64 architecture for the continuous deployment jobs. - -set -e - -# The wheels cannot be uploaded on PRs -if [[ $BUILD_WHEEL == true && $TRAVIS_EVENT_TYPE != pull_request ]]; then - # Nightly upload token and staging upload token are set in - # Travis settings (originally generated at Anaconda cloud) - if [[ $TRAVIS_EVENT_TYPE == cron ]]; then - ANACONDA_ORG="scipy-wheels-nightly" - ANACONDA_TOKEN="$SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN" - else - ANACONDA_ORG="scikit-learn-wheels-staging" - ANACONDA_TOKEN="$SCIKIT_LEARN_STAGING_UPLOAD_TOKEN" - fi - - pip install git+https://github.com/Anaconda-Server/anaconda-client - - # Force a replacement if the remote file already exists - anaconda -t $ANACONDA_TOKEN upload --force -u $ANACONDA_ORG wheelhouse/*.whl - echo "Index: https://pypi.anaconda.org/$ANACONDA_ORG/simple" -fi diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh deleted file mode 100644 index 1e8e2963711ef..0000000000000 --- a/build_tools/travis/install.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -# This script is meant to be called by the "install" step -# defined in the ".travis.yml" file. In particular, it is -# important that we call to the right installation script. - -set -e - -if [[ $BUILD_WHEEL == true ]]; then - source build_tools/travis/install_wheels.sh -else - source build_tools/travis/install_main.sh -fi diff --git a/build_tools/travis/install_main.sh b/build_tools/travis/install_main.sh deleted file mode 100755 index c0795139859bb..0000000000000 --- a/build_tools/travis/install_main.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -# Travis clone "scikit-learn/scikit-learn" repository into -# a local repository. We use a cached directory with three -# scikit-learn repositories (one for each matrix entry for -# non continuous deployment jobs) from which we pull local -# Travis repository. This allows us to keep build artifact -# for GCC + Cython, and gain time. - -set -e - -echo "CPU Arch: $TRAVIS_CPU_ARCH." - -# Import "get_dep" -source build_tools/shared.sh - -echo "List files from cached directories." 
-echo "pip:" -ls $HOME/.cache/pip - -export CC=/usr/lib/ccache/gcc -export CXX=/usr/lib/ccache/g++ - -# Useful for debugging how ccache is used -# export CCACHE_LOGFILE=/tmp/ccache.log - -# 60MB are (more or less) used by .ccache, when -# compiling from scratch at the time of writing -ccache --max-size 100M --show-stats - -# Deactivate the default virtual environment -# to setup a conda-based environment instead -deactivate - -MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-aarch64.sh" - -# Install Miniconda -wget $MINICONDA_URL -O miniconda.sh -MINICONDA_PATH=$HOME/miniconda -chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH -export PATH=$MINICONDA_PATH/bin:$PATH -conda update --yes conda - -# Create environment and install dependencies -conda create -n testenv --yes python=3.7 - -source activate testenv -conda install -y scipy numpy pandas cython -pip install joblib threadpoolctl - -pip install $(get_dep pytest $PYTEST_VERSION) pytest-xdist - -# Build scikit-learn in this script to collapse the -# verbose build output in the Travis output when it -# succeeds -python --version -python -c "import numpy; print(f'numpy {numpy.__version__}')" -python -c "import scipy; print(f'scipy {scipy.__version__}')" - -pip install -e . -python setup.py develop - -ccache --show-stats - -# Useful for debugging how ccache is used -# cat $CCACHE_LOGFILE diff --git a/build_tools/travis/install_wheels.sh b/build_tools/travis/install_wheels.sh deleted file mode 100644 index 4bb52f51f27f7..0000000000000 --- a/build_tools/travis/install_wheels.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -set -e - -python -m pip install cibuildwheel -python -m cibuildwheel --output-dir wheelhouse diff --git a/build_tools/travis/script.sh b/build_tools/travis/script.sh deleted file mode 100644 index 2b7aecb295d82..0000000000000 --- a/build_tools/travis/script.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# This script is meant to be called by the "script" step defined -# in the ".travis.yml" file. While this step is forbidden by the -# continuous deployment jobs, we have to execute the scripts for -# testing the continuous integration jobs. 
- -set -e - -if [[ $BUILD_WHEEL != true ]]; then - # This trick will make Travis terminate the continuation of the pipeline - bash build_tools/travis/test_script.sh || travis_terminate 1 - bash build_tools/travis/test_docs.sh || travis_terminate 1 -fi diff --git a/build_tools/travis/test_docs.sh b/build_tools/travis/test_docs.sh deleted file mode 100755 index 4907dee1c9789..0000000000000 --- a/build_tools/travis/test_docs.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -set -e - -if [[ $TRAVIS_CPU_ARCH != arm64 ]]; then - # Faster run of the documentation tests - PYTEST="pytest -n $CPU_COUNT" make test-doc -fi diff --git a/build_tools/travis/test_script.sh b/build_tools/travis/test_script.sh deleted file mode 100755 index cb5a3dbfeed33..0000000000000 --- a/build_tools/travis/test_script.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -set -e - -python --version -python -c "import numpy; print(f'numpy {numpy.__version__}')" -python -c "import scipy; print(f'scipy {scipy.__version__}')" -python -c "\ -try: - import pandas - print(f'pandas {pandas.__version__}') -except ImportError: - pass -" -python -c "import joblib; print(f'{joblib.cpu_count()} CPUs')" -python -c "import platform; print(f'{platform.machine()}')" - -TEST_CMD="pytest --showlocals --durations=20 --pyargs" - -# Run the tests on the installed version -mkdir -p $TEST_DIR - -# Copy "setup.cfg" for the test settings -cp setup.cfg $TEST_DIR -cd $TEST_DIR - -if [[ $TRAVIS_CPU_ARCH == arm64 ]]; then - # Faster run of the source code tests - TEST_CMD="$TEST_CMD -n $CPU_COUNT" - - # Remove the option to test the docstring - sed -i -e 's/--doctest-modules//g' setup.cfg -fi - -if [[ -n $CHECK_WARNINGS ]]; then - TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning" -fi - -$TEST_CMD sklearn diff --git a/build_tools/travis/test_wheels.sh b/build_tools/travis/test_wheels.sh deleted file mode 100644 index be2328e3d44d6..0000000000000 --- a/build_tools/travis/test_wheels.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -set -e - -# Faster run of the source code tests -pytest -n $CPU_COUNT --pyargs sklearn - -# Test that there are no links to system libraries -python -m threadpoolctl -i sklearn diff --git a/build_tools/update_environments_and_lock_files.py b/build_tools/update_environments_and_lock_files.py new file mode 100644 index 0000000000000..5efd7f12cffd7 --- /dev/null +++ b/build_tools/update_environments_and_lock_files.py @@ -0,0 +1,778 @@ +"""Script to update CI environment files and associated lock files. + +To run it you need to be in the root folder of the scikit-learn repo: +python build_tools/update_environments_and_lock_files.py + +Two scenarios where this script can be useful: +- make sure that the latest versions of all the dependencies are used in the CI. + There is a scheduled workflow that does this, see + .github/workflows/update-lock-files.yml. This is still useful to run this + script when the automated PR fails and for example some packages need to + be pinned. You can add the pins to this script, run it, and open a PR with + the changes. +- bump minimum dependencies in sklearn/_min_dependencies.py. Running this + script will update both the CI environment files and associated lock files. + You can then open a PR with the changes. +- pin some packages to an older version by adding them to the + default_package_constraints variable. This is useful when regressions are + introduced in our dependencies, this has happened for example with pytest 7 + and coverage 6.3. 
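+
+For example, after bumping a minimum version in sklearn/_min_dependencies.py,
+the affected build alone can be regenerated (assuming conda and conda-lock are
+installed, see the requirements below) with something like:
+
+    python build_tools/update_environments_and_lock_files.py \
+        --select-build pymin_conda_forge_openblas_min_dependencies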
+ +Environments are conda environment.yml or pip requirements.txt. Lock files are +conda-lock lock files or pip-compile requirements.txt. + +pip requirements.txt are used when we install some dependencies (e.g. numpy and +scipy) with apt-get and the rest of the dependencies (e.g. pytest and joblib) +with pip. + +To run this script you need: +- conda +- conda-lock. The version should match the one used in the CI in + sklearn/_min_dependencies.py +- pip-tools + +To only update the environment and lock files for specific builds, you can use +the command line argument `--select-build` which will take a regex. For example, +to only update the documentation builds you can use: +`python build_tools/update_environments_and_lock_files.py --select-build doc` +""" + +import json +import logging +import re +import subprocess +import sys +from importlib.metadata import version +from pathlib import Path + +import click +from jinja2 import Environment +from packaging.version import Version + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +logger.addHandler(handler) + +TRACE = logging.DEBUG - 5 + + +common_dependencies_without_coverage = [ + "python", + "numpy", + "blas", + "scipy", + "cython", + "joblib", + "threadpoolctl", + "matplotlib", + "pandas", + "pyamg", + "pytest", + "pytest-xdist", + "pillow", + "pip", + "ninja", + "meson-python", +] + +common_dependencies = common_dependencies_without_coverage + [ + "pytest-cov", + "coverage", +] + +docstring_test_dependencies = ["sphinx", "numpydoc"] + +default_package_constraints = {} + + +def remove_from(alist, to_remove): + return [each for each in alist if each not in to_remove] + + +build_metadata_list = [ + { + "name": "pylatest_conda_forge_cuda_array-api_linux-64", + "type": "conda", + "tag": "cuda", + "folder": "build_tools/github", + "platform": "linux-64", + "channels": ["conda-forge", "pytorch", "nvidia"], + "conda_dependencies": common_dependencies + + [ + "ccache", + "pytorch-gpu", + "polars", + "pyarrow", + "cupy", + "array-api-strict", + ], + }, + { + "name": "pylatest_conda_forge_mkl_linux-64", + "type": "conda", + "tag": "main-ci", + "folder": "build_tools/azure", + "platform": "linux-64", + "channels": ["conda-forge"], + "conda_dependencies": common_dependencies + + [ + "ccache", + "pytorch", + "pytorch-cpu", + "polars", + "pyarrow", + "array-api-strict", + "scipy-doctest", + ], + "package_constraints": { + "blas": "[build=mkl]", + }, + }, + { + "name": "pylatest_conda_forge_mkl_osx-64", + "type": "conda", + "tag": "main-ci", + "folder": "build_tools/azure", + "platform": "osx-64", + "channels": ["conda-forge"], + "conda_dependencies": common_dependencies + + [ + "ccache", + "compilers", + "llvm-openmp", + ], + "package_constraints": { + "blas": "[build=mkl]", + }, + }, + { + "name": "pylatest_conda_mkl_no_openmp", + "type": "conda", + "tag": "main-ci", + "folder": "build_tools/azure", + "platform": "osx-64", + "channels": ["defaults"], + "conda_dependencies": remove_from( + common_dependencies, ["cython", "threadpoolctl", "meson-python"] + ) + + ["ccache"], + "package_constraints": { + "blas": "[build=mkl]", + # scipy 1.12.x crashes on this platform (https://github.com/scipy/scipy/pull/20086) + # TODO: release scipy constraint when 1.13 is available in the "default" + # channel. 
+ "scipy": "<1.12", + }, + # TODO: put cython, threadpoolctl and meson-python back to conda + # dependencies when required version is available on the main channel + "pip_dependencies": ["cython", "threadpoolctl", "meson-python", "meson"], + }, + { + "name": "pymin_conda_forge_openblas_min_dependencies", + "type": "conda", + "tag": "main-ci", + "folder": "build_tools/azure", + "platform": "linux-64", + "channels": ["conda-forge"], + "conda_dependencies": common_dependencies + ["ccache", "polars"], + "package_constraints": { + "python": "3.10", + "blas": "[build=openblas]", + "numpy": "min", + "scipy": "min", + "matplotlib": "min", + "cython": "min", + "joblib": "min", + "threadpoolctl": "min", + "meson-python": "min", + "pandas": "min", + "polars": "min", + "pyamg": "min", + }, + }, + { + "name": "pymin_conda_forge_openblas_ubuntu_2204", + "type": "conda", + "tag": "main-ci", + "folder": "build_tools/azure", + "platform": "linux-64", + "channels": ["conda-forge"], + "conda_dependencies": ( + remove_from(common_dependencies_without_coverage, ["matplotlib"]) + + docstring_test_dependencies + + ["ccache"] + ), + "package_constraints": { + "python": "3.10", + "blas": "[build=openblas]", + }, + }, + { + "name": "pylatest_pip_openblas_pandas", + "type": "conda", + "tag": "main-ci", + "folder": "build_tools/azure", + "platform": "linux-64", + "channels": ["defaults"], + "conda_dependencies": ["python", "ccache"], + "pip_dependencies": ( + remove_from(common_dependencies, ["python", "blas", "pip"]) + + docstring_test_dependencies + # Test with some optional dependencies + + ["lightgbm", "scikit-image"] + # Test array API on CPU without PyTorch + + ["array-api-strict"] + # doctests dependencies + + ["scipy-doctest"] + ), + }, + { + "name": "pylatest_pip_scipy_dev", + "type": "conda", + "tag": "scipy-dev", + "folder": "build_tools/azure", + "platform": "linux-64", + "channels": ["defaults"], + "conda_dependencies": ["python", "ccache"], + "pip_dependencies": ( + remove_from( + common_dependencies, + [ + "python", + "blas", + "matplotlib", + "pyamg", + # all the dependencies below have a development version + # installed in the CI, so they can be removed from the + # environment.yml + "numpy", + "scipy", + "pandas", + "cython", + "joblib", + "pillow", + ], + ) + + ["pooch"] + + docstring_test_dependencies + # python-dateutil is a dependency of pandas and pandas is removed from + # the environment.yml. 
Adding python-dateutil so it is pinned + + ["python-dateutil"] + ), + }, + { + "name": "pylatest_free_threaded", + "type": "conda", + "tag": "free-threaded", + "folder": "build_tools/azure", + "platform": "linux-64", + "channels": ["conda-forge"], + "conda_dependencies": [ + "python-freethreading", + "numpy", + "scipy", + "cython", + "joblib", + "threadpoolctl", + "pytest", + "pytest-xdist", + "ninja", + "meson-python", + "ccache", + "pip", + ], + }, + { + "name": "pymin_conda_forge_mkl", + "type": "conda", + "tag": "main-ci", + "folder": "build_tools/azure", + "platform": "win-64", + "channels": ["conda-forge"], + "conda_dependencies": remove_from(common_dependencies, ["pandas", "pyamg"]) + + [ + "wheel", + "pip", + ], + "package_constraints": { + "python": "3.10", + "blas": "[build=mkl]", + }, + }, + { + "name": "doc_min_dependencies", + "type": "conda", + "tag": "main-ci", + "folder": "build_tools/circle", + "platform": "linux-64", + "channels": ["conda-forge"], + "conda_dependencies": common_dependencies_without_coverage + + [ + "scikit-image", + "seaborn", + "memory_profiler", + "compilers", + "sphinx", + "sphinx-gallery", + "sphinx-copybutton", + "numpydoc", + "sphinx-prompt", + "plotly", + "polars", + "pooch", + "sphinx-remove-toctrees", + "sphinx-design", + "pydata-sphinx-theme", + "towncrier", + ], + "pip_dependencies": [ + "sphinxext-opengraph", + "sphinxcontrib-sass", + ], + "package_constraints": { + "python": "3.10", + "numpy": "min", + "scipy": "min", + "matplotlib": "min", + "cython": "min", + "scikit-image": "min", + "sphinx": "min", + "pandas": "min", + "sphinx-gallery": "min", + "sphinx-copybutton": "min", + "numpydoc": "min", + "sphinx-prompt": "min", + "sphinxext-opengraph": "min", + "plotly": "min", + "polars": "min", + "pooch": "min", + "pyamg": "min", + "sphinx-design": "min", + "sphinxcontrib-sass": "min", + "sphinx-remove-toctrees": "min", + "pydata-sphinx-theme": "min", + "towncrier": "min", + }, + }, + { + "name": "doc", + "type": "conda", + "tag": "main-ci", + "folder": "build_tools/circle", + "platform": "linux-64", + "channels": ["conda-forge"], + "conda_dependencies": common_dependencies_without_coverage + + [ + "scikit-image", + "seaborn", + "memory_profiler", + "compilers", + "sphinx", + "sphinx-gallery", + "sphinx-copybutton", + "numpydoc", + "sphinx-prompt", + "plotly", + "polars", + "pooch", + "sphinxext-opengraph", + "sphinx-remove-toctrees", + "sphinx-design", + "pydata-sphinx-theme", + "towncrier", + ], + "pip_dependencies": [ + "jupyterlite-sphinx", + "jupyterlite-pyodide-kernel", + "sphinxcontrib-sass", + ], + "package_constraints": { + "python": "3.10", + }, + }, + { + "name": "pymin_conda_forge_arm", + "type": "conda", + "tag": "main-ci", + "folder": "build_tools/github", + "platform": "linux-aarch64", + "channels": ["conda-forge"], + "conda_dependencies": remove_from( + common_dependencies_without_coverage, ["pandas", "pyamg"] + ) + + ["pip", "ccache"], + "package_constraints": { + "python": "3.10", + }, + }, + { + "name": "debian_32bit", + "type": "pip", + "tag": "main-ci", + "folder": "build_tools/azure", + "pip_dependencies": [ + "cython", + "joblib", + "threadpoolctl", + "pytest", + "pytest-cov", + "ninja", + "meson-python", + ], + # Python version from the python3 APT package in the debian-32 docker + # image. 
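        # [Editorial note, not part of the original patch: this pinned interpreter version
        # is consumed by write_pip_lock_file() below, which creates a throwaway
        # "pip-tools-python<version>" conda environment with exactly this Python to run
        # pip-compile, so the value needs to track the Python actually shipped in that
        # docker image.]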
+ "python_version": "3.12.5", + }, + { + "name": "ubuntu_atlas", + "type": "pip", + "tag": "main-ci", + "folder": "build_tools/azure", + "pip_dependencies": [ + "cython", + "joblib", + "threadpoolctl", + "pytest", + "pytest-xdist", + "ninja", + "meson-python", + ], + "package_constraints": { + "joblib": "min", + "threadpoolctl": "min", + "cython": "min", + }, + "python_version": "3.10.4", + }, +] + + +def execute_command(command_list): + logger.debug(" ".join(command_list)) + proc = subprocess.Popen( + command_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + + out, err = proc.communicate() + out, err = out.decode(errors="replace"), err.decode(errors="replace") + + if proc.returncode != 0: + command_str = " ".join(command_list) + raise RuntimeError( + "Command exited with non-zero exit code.\n" + "Exit code: {}\n" + "Command:\n{}\n" + "stdout:\n{}\n" + "stderr:\n{}\n".format(proc.returncode, command_str, out, err) + ) + logger.log(TRACE, out) + return out + + +def get_package_with_constraint(package_name, build_metadata, uses_pip=False): + build_package_constraints = build_metadata.get("package_constraints") + if build_package_constraints is None: + constraint = None + else: + constraint = build_package_constraints.get(package_name) + + constraint = constraint or default_package_constraints.get(package_name) + + if constraint is None: + return package_name + + comment = "" + if constraint == "min": + constraint = execute_command( + [sys.executable, "sklearn/_min_dependencies.py", package_name] + ).strip() + comment = " # min" + + if re.match(r"\d[.\d]*", constraint): + equality = "==" if uses_pip else "=" + constraint = equality + constraint + + return f"{package_name}{constraint}{comment}" + + +environment = Environment(trim_blocks=True, lstrip_blocks=True) +environment.filters["get_package_with_constraint"] = get_package_with_constraint + + +def get_conda_environment_content(build_metadata): + template = environment.from_string( + """ +# DO NOT EDIT: this file is generated from the specification found in the +# following script to centralize the configuration for CI builds: +# build_tools/update_environments_and_lock_files.py +channels: + {% for channel in build_metadata['channels'] %} + - {{ channel }} + {% endfor %} +dependencies: + {% for conda_dep in build_metadata['conda_dependencies'] %} + - {{ conda_dep | get_package_with_constraint(build_metadata) }} + {% endfor %} + {% if build_metadata['pip_dependencies'] %} + - pip + - pip: + {% for pip_dep in build_metadata.get('pip_dependencies', []) %} + - {{ pip_dep | get_package_with_constraint(build_metadata, uses_pip=True) }} + {% endfor %} + {% endif %}""".strip() + ) + return template.render(build_metadata=build_metadata) + + +def write_conda_environment(build_metadata): + content = get_conda_environment_content(build_metadata) + build_name = build_metadata["name"] + folder_path = Path(build_metadata["folder"]) + output_path = folder_path / f"{build_name}_environment.yml" + logger.debug(output_path) + output_path.write_text(content) + + +def write_all_conda_environments(build_metadata_list): + for build_metadata in build_metadata_list: + write_conda_environment(build_metadata) + + +def conda_lock(environment_path, lock_file_path, platform): + execute_command( + [ + "conda-lock", + "lock", + "--mamba", + "--kind", + "explicit", + "--platform", + platform, + "--file", + str(environment_path), + "--filename-template", + str(lock_file_path), + ] + ) + + +def create_conda_lock_file(build_metadata): + build_name = 
build_metadata["name"] + folder_path = Path(build_metadata["folder"]) + environment_path = folder_path / f"{build_name}_environment.yml" + platform = build_metadata["platform"] + lock_file_basename = build_name + if not lock_file_basename.endswith(platform): + lock_file_basename = f"{lock_file_basename}_{platform}" + + lock_file_path = folder_path / f"{lock_file_basename}_conda.lock" + conda_lock(environment_path, lock_file_path, platform) + + +def write_all_conda_lock_files(build_metadata_list): + for build_metadata in build_metadata_list: + logger.info(f"# Locking dependencies for {build_metadata['name']}") + create_conda_lock_file(build_metadata) + + +def get_pip_requirements_content(build_metadata): + template = environment.from_string( + """ +# DO NOT EDIT: this file is generated from the specification found in the +# following script to centralize the configuration for CI builds: +# build_tools/update_environments_and_lock_files.py +{% for pip_dep in build_metadata['pip_dependencies'] %} +{{ pip_dep | get_package_with_constraint(build_metadata, uses_pip=True) }} +{% endfor %}""".strip() + ) + return template.render(build_metadata=build_metadata) + + +def write_pip_requirements(build_metadata): + build_name = build_metadata["name"] + content = get_pip_requirements_content(build_metadata) + folder_path = Path(build_metadata["folder"]) + output_path = folder_path / f"{build_name}_requirements.txt" + logger.debug(output_path) + output_path.write_text(content) + + +def write_all_pip_requirements(build_metadata_list): + for build_metadata in build_metadata_list: + write_pip_requirements(build_metadata) + + +def pip_compile(pip_compile_path, requirements_path, lock_file_path): + execute_command( + [ + str(pip_compile_path), + "--upgrade", + str(requirements_path), + "-o", + str(lock_file_path), + ] + ) + + +def write_pip_lock_file(build_metadata): + build_name = build_metadata["name"] + python_version = build_metadata["python_version"] + environment_name = f"pip-tools-python{python_version}" + # To make sure that the Python used to create the pip lock file is the same + # as the one used during the CI build where the lock file is used, we first + # create a conda environment with the correct Python version and + # pip-compile and run pip-compile in this environment + + execute_command( + [ + "conda", + "create", + "-c", + "conda-forge", + "-n", + f"pip-tools-python{python_version}", + f"python={python_version}", + "pip-tools", + "-y", + ] + ) + + json_output = execute_command(["conda", "info", "--json"]) + conda_info = json.loads(json_output) + environment_folder = next( + each for each in conda_info["envs"] if each.endswith(environment_name) + ) + environment_path = Path(environment_folder) + pip_compile_path = environment_path / "bin" / "pip-compile" + + folder_path = Path(build_metadata["folder"]) + requirement_path = folder_path / f"{build_name}_requirements.txt" + lock_file_path = folder_path / f"{build_name}_lock.txt" + pip_compile(pip_compile_path, requirement_path, lock_file_path) + + +def write_all_pip_lock_files(build_metadata_list): + for build_metadata in build_metadata_list: + logger.info(f"# Locking dependencies for {build_metadata['name']}") + write_pip_lock_file(build_metadata) + + +def check_conda_lock_version(): + # Check that the installed conda-lock version is consistent with _min_dependencies. 
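    # [Editorial note, not part of the original patch: "python sklearn/_min_dependencies.py
    # conda-lock" is expected to print a bare version string, the same mechanism that
    # get_package_with_constraint() uses above to resolve "min" pins; that output is then
    # compared against importlib.metadata.version("conda-lock") below.]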
+ expected_conda_lock_version = execute_command( + [sys.executable, "sklearn/_min_dependencies.py", "conda-lock"] + ).strip() + + installed_conda_lock_version = version("conda-lock") + if installed_conda_lock_version != expected_conda_lock_version: + raise RuntimeError( + f"Expected conda-lock version: {expected_conda_lock_version}, got:" + f" {installed_conda_lock_version}" + ) + + +def check_conda_version(): + # Avoid issues with glibc (https://github.com/conda/conda-lock/issues/292) + # or osx (https://github.com/conda/conda-lock/issues/408) virtual package. + # The glibc one has been fixed in conda 23.1.0 and the osx has been fixed + # in conda 23.7.0. + conda_info_output = execute_command(["conda", "info", "--json"]) + + conda_info = json.loads(conda_info_output) + conda_version = Version(conda_info["conda_version"]) + + if Version("22.9.0") < conda_version < Version("23.7"): + raise RuntimeError( + f"conda version should be <= 22.9.0 or >= 23.7 got: {conda_version}" + ) + + +@click.command() +@click.option( + "--select-build", + default="", + help=( + "Regex to filter the builds we want to update environment and lock files. By" + " default all the builds are selected." + ), +) +@click.option( + "--skip-build", + default=None, + help="Regex to skip some builds from the builds selected by --select-build", +) +@click.option( + "--select-tag", + default=None, + help=( + "Tag to filter the builds, e.g. 'main-ci' or 'scipy-dev'. " + "This is an additional filtering on top of --select-build." + ), +) +@click.option( + "-v", + "--verbose", + is_flag=True, + help="Print commands executed by the script", +) +@click.option( + "-vv", + "--very-verbose", + is_flag=True, + help="Print output of commands executed by the script", +) +def main(select_build, skip_build, select_tag, verbose, very_verbose): + if verbose: + logger.setLevel(logging.DEBUG) + if very_verbose: + logger.setLevel(TRACE) + handler.setLevel(TRACE) + check_conda_lock_version() + check_conda_version() + + filtered_build_metadata_list = [ + each for each in build_metadata_list if re.search(select_build, each["name"]) + ] + if select_tag is not None: + filtered_build_metadata_list = [ + each for each in build_metadata_list if each["tag"] == select_tag + ] + if skip_build is not None: + filtered_build_metadata_list = [ + each + for each in filtered_build_metadata_list + if not re.search(skip_build, each["name"]) + ] + + selected_build_info = "\n".join( + f" - {each['name']}, type: {each['type']}, tag: {each['tag']}" + for each in filtered_build_metadata_list + ) + selected_build_message = ( + f"# {len(filtered_build_metadata_list)} selected builds\n{selected_build_info}" + ) + logger.info(selected_build_message) + + filtered_conda_build_metadata_list = [ + each for each in filtered_build_metadata_list if each["type"] == "conda" + ] + + if filtered_conda_build_metadata_list: + logger.info("# Writing conda environments") + write_all_conda_environments(filtered_conda_build_metadata_list) + logger.info("# Writing conda lock files") + write_all_conda_lock_files(filtered_conda_build_metadata_list) + + filtered_pip_build_metadata_list = [ + each for each in filtered_build_metadata_list if each["type"] == "pip" + ] + if filtered_pip_build_metadata_list: + logger.info("# Writing pip requirements") + write_all_pip_requirements(filtered_pip_build_metadata_list) + logger.info("# Writing pip lock files") + write_all_pip_lock_files(filtered_pip_build_metadata_list) + + +if __name__ == "__main__": + main() diff --git 
a/build_tools/wheels/LICENSE_linux.txt b/build_tools/wheels/LICENSE_linux.txt new file mode 100644 index 0000000000000..057656fcc789d --- /dev/null +++ b/build_tools/wheels/LICENSE_linux.txt @@ -0,0 +1,80 @@ +This binary distribution of scikit-learn also bundles the following software: + +---- + +Name: GCC runtime library +Files: scikit_learn.libs/libgomp*.so* +Availability: https://gcc.gnu.org/git/?p=gcc.git;a=tree;f=libgomp + +GCC RUNTIME LIBRARY EXCEPTION + +Version 3.1, 31 March 2009 + +Copyright (C) 2009 Free Software Foundation, Inc. + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + +This GCC Runtime Library Exception ("Exception") is an additional +permission under section 7 of the GNU General Public License, version +3 ("GPLv3"). It applies to a given file (the "Runtime Library") that +bears a notice placed by the copyright holder of the file stating that +the file is governed by GPLv3 along with this Exception. + +When you use GCC to compile a program, GCC may combine portions of +certain GCC header files and runtime libraries with the compiled +program. The purpose of this Exception is to allow compilation of +non-GPL (including proprietary) programs to use, in this way, the +header files and runtime libraries covered by this Exception. + +0. Definitions. + +A file is an "Independent Module" if it either requires the Runtime +Library for execution after a Compilation Process, or makes use of an +interface provided by the Runtime Library, but is not otherwise based +on the Runtime Library. + +"GCC" means a version of the GNU Compiler Collection, with or without +modifications, governed by version 3 (or a specified later version) of +the GNU General Public License (GPL) with the option of using any +subsequent versions published by the FSF. + +"GPL-compatible Software" is software whose conditions of propagation, +modification and use would permit combination with GCC in accord with +the license of GCC. + +"Target Code" refers to output from any compiler for a real or virtual +target processor architecture, in executable form or suitable for +input to an assembler, loader, linker and/or execution +phase. Notwithstanding that, Target Code does not include data in any +format that is used as a compiler intermediate representation, or used +for producing a compiler intermediate representation. + +The "Compilation Process" transforms code entirely represented in +non-intermediate languages designed for human-written code, and/or in +Java Virtual Machine byte code, into Target Code. Thus, for example, +use of source code generators and preprocessors need not be considered +part of the Compilation Process, since the Compilation Process can be +understood as starting with the output of the generators or +preprocessors. + +A Compilation Process is "Eligible" if it is done using GCC, alone or +with other GPL-compatible software, or if it is done without using any +work based on GCC. For example, using non-GPL-compatible Software to +optimize any GCC intermediate representations would not qualify as an +Eligible Compilation Process. + +1. Grant of Additional Permission. + +You have permission to propagate a work of Target Code formed by +combining the Runtime Library with Independent Modules, even if such +propagation would otherwise violate the terms of GPLv3, provided that +all Target Code was generated by Eligible Compilation Processes. 
You +may then convey such a combination under terms of your choice, +consistent with the licensing of the Independent Modules. + +2. No Weakening of GCC Copyleft. + +The availability of this Exception does not imply any general +presumption that third-party software is unaffected by the copyleft +requirements of the license of GCC. diff --git a/build_tools/wheels/LICENSE_macos.txt b/build_tools/wheels/LICENSE_macos.txt new file mode 100644 index 0000000000000..61a523f47663c --- /dev/null +++ b/build_tools/wheels/LICENSE_macos.txt @@ -0,0 +1,286 @@ +This binary distribution of scikit-learn also bundles the following software: + +---- + +Name: libomp runtime library +Files: sklearn/.dylibs/libomp.dylib +Availability: https://github.com/llvm/llvm-project + +============================================================================== +The LLVM Project is under the Apache License v2.0 with LLVM Exceptions: +============================================================================== + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +---- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. + +============================================================================== +Software from third parties included in the LLVM Project: +============================================================================== +The LLVM Project contains third party software which is under different license +terms. All such code will be identified clearly using at least one of two +mechanisms: +1) It will be in a separate directory tree with its own `LICENSE.txt` or + `LICENSE` file at the top containing the specific license and restrictions + which apply to that software, or +2) It will contain specific license and restriction terms at the top of every + file. + +============================================================================== +Legacy LLVM License (https://llvm.org/docs/DeveloperPolicy.html#legacy): +============================================================================== +University of Illinois/NCSA +Open Source License + +Copyright (c) 2003-2019 University of Illinois at Urbana-Champaign. +All rights reserved. 
+ +Developed by: + + LLVM Team + + University of Illinois at Urbana-Champaign + + http://llvm.org + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal with +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the distribution. + + * Neither the names of the LLVM Team, University of Illinois at + Urbana-Champaign, nor the names of its contributors may be used to + endorse or promote products derived from this Software without specific + prior written permission. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +SOFTWARE. diff --git a/build_tools/wheels/LICENSE_windows.txt b/build_tools/wheels/LICENSE_windows.txt new file mode 100644 index 0000000000000..9e98ad8defac2 --- /dev/null +++ b/build_tools/wheels/LICENSE_windows.txt @@ -0,0 +1,25 @@ +This binary distribution of scikit-learn also bundles the following software: + +---- + +Name: Microsoft Visual C++ Runtime Files +Files: sklearn\.libs\*.dll +Availability: https://learn.microsoft.com/en-us/visualstudio/releases/2015/2015-redistribution-vs + +Subject to the License Terms for the software, you may copy and distribute with your +program any of the files within the followng folder and its subfolders except as noted +below. You may not modify these files. + +C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\redist + +You may not distribute the contents of the following folders: + +C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\redist\debug_nonredist +C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\redist\onecore\debug_nonredist + +Subject to the License Terms for the software, you may copy and distribute the following +files with your program in your program’s application local folder or by deploying them +into the Global Assembly Cache (GAC): + +VC\atlmfc\lib\mfcmifc80.dll +VC\atlmfc\lib\amd64\mfcmifc80.dll diff --git a/build_tools/wheels/build_wheels.sh b/build_tools/wheels/build_wheels.sh new file mode 100755 index 0000000000000..02b05bc8a2795 --- /dev/null +++ b/build_tools/wheels/build_wheels.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +set -e +set -x + +# Set environment variables to make our wheel build easier to reproduce byte +# for byte from source. See https://reproducible-builds.org/. The long term +# motivation would be to be able to detect supply chain attacks. +# +# In particular we set SOURCE_DATE_EPOCH to the commit date of the last commit. +# +# XXX: setting those environment variables is not enough. 
See the following +# issue for more details on what remains to do: +# https://github.com/scikit-learn/scikit-learn/issues/28151 +export SOURCE_DATE_EPOCH=$(git log -1 --pretty=%ct) +export PYTHONHASHSEED=0 + +# OpenMP is not present on macOS by default +if [[ $(uname) == "Darwin" ]]; then + # Make sure to use a libomp version binary compatible with the oldest + # supported version of the macos SDK as libomp will be vendored into the + # scikit-learn wheels for macos. + + if [[ "$CIBW_BUILD" == *-macosx_arm64 ]]; then + if [[ $(uname -m) == "x86_64" ]]; then + # arm64 builds must cross compile because the CI instance is x86 + # This turns off the computation of the test program in + # sklearn/_build_utils/pre_build_helpers.py + export PYTHON_CROSSENV=1 + fi + # SciPy requires 12.0 on arm to prevent kernel panics + # https://github.com/scipy/scipy/issues/14688 + # We use the same deployment target to match SciPy. + export MACOSX_DEPLOYMENT_TARGET=12.0 + OPENMP_URL="https://anaconda.org/conda-forge/llvm-openmp/11.1.0/download/osx-arm64/llvm-openmp-11.1.0-hf3c4609_1.tar.bz2" + else + export MACOSX_DEPLOYMENT_TARGET=10.9 + OPENMP_URL="https://anaconda.org/conda-forge/llvm-openmp/11.1.0/download/osx-64/llvm-openmp-11.1.0-hda6cdc1_1.tar.bz2" + fi + + conda create -n build $OPENMP_URL + PREFIX="$HOME/miniconda3/envs/build" + + export CC=/usr/bin/clang + export CXX=/usr/bin/clang++ + export CPPFLAGS="$CPPFLAGS -Xpreprocessor -fopenmp" + export CFLAGS="$CFLAGS -I$PREFIX/include" + export CXXFLAGS="$CXXFLAGS -I$PREFIX/include" + export LDFLAGS="$LDFLAGS -Wl,-rpath,$PREFIX/lib -L$PREFIX/lib -lomp" +fi + +if [[ "$CIBW_FREE_THREADED_SUPPORT" =~ [tT]rue ]]; then + # Numpy, scipy, Cython only have free-threaded wheels on scientific-python-nightly-wheels + # TODO: remove this after CPython 3.13 is released (scheduled October 2024) + # and our dependencies have free-threaded wheels on PyPI + export CIBW_BUILD_FRONTEND='pip; args: --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" --only-binary :all:' +fi + +# The version of the built dependencies are specified +# in the pyproject.toml file, while the tests are run +# against the most recent version of the dependencies + +python -m pip install cibuildwheel +python -m cibuildwheel --output-dir wheelhouse diff --git a/build_tools/wheels/check_license.py b/build_tools/wheels/check_license.py new file mode 100644 index 0000000000000..00fe4169be65d --- /dev/null +++ b/build_tools/wheels/check_license.py @@ -0,0 +1,30 @@ +"""Checks the bundled license is installed with the wheel.""" + +import platform +import site +from itertools import chain +from pathlib import Path + +site_packages = site.getsitepackages() + +site_packages_path = (Path(p) for p in site_packages) + +try: + distinfo_path = next( + chain( + s + for site_package in site_packages_path + for s in site_package.glob("scikit_learn-*.dist-info") + ) + ) +except StopIteration as e: + raise RuntimeError("Unable to find scikit-learn's dist-info") from e + +license_text = (distinfo_path / "COPYING").read_text() + +assert "Copyright (c)" in license_text + +assert ( + "This binary distribution of scikit-learn also bundles the following software" + in license_text +), f"Unable to find bundled license for {platform.system()}" diff --git a/build_tools/wheels/cibw_before_build.sh b/build_tools/wheels/cibw_before_build.sh new file mode 100755 index 0000000000000..4e4558db5a5bc --- /dev/null +++ b/build_tools/wheels/cibw_before_build.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set 
-euxo pipefail + +PROJECT_DIR="$1" +LICENSE_FILE="$PROJECT_DIR/COPYING" + +echo "" >>"$LICENSE_FILE" +echo "----" >>"$LICENSE_FILE" +echo "" >>"$LICENSE_FILE" + +if [[ $RUNNER_OS == "Linux" ]]; then + cat $PROJECT_DIR/build_tools/wheels/LICENSE_linux.txt >>"$LICENSE_FILE" +elif [[ $RUNNER_OS == "macOS" ]]; then + cat $PROJECT_DIR/build_tools/wheels/LICENSE_macos.txt >>"$LICENSE_FILE" +elif [[ $RUNNER_OS == "Windows" ]]; then + cat $PROJECT_DIR/build_tools/wheels/LICENSE_windows.txt >>"$LICENSE_FILE" +fi diff --git a/build_tools/wheels/test_wheels.sh b/build_tools/wheels/test_wheels.sh new file mode 100755 index 0000000000000..1d6ee19bda8a8 --- /dev/null +++ b/build_tools/wheels/test_wheels.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -e +set -x + +PROJECT_DIR="$1" + +python $PROJECT_DIR/build_tools/wheels/check_license.py + +python -c "import joblib; print(f'Number of cores (physical): \ +{joblib.cpu_count()} ({joblib.cpu_count(only_physical_cores=True)})')" + +FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" +if [[ $FREE_THREADED_BUILD == "True" ]]; then + # TODO: delete when importing numpy no longer enables the GIL + # setting to zero ensures the GIL is disabled while running the + # tests under free-threaded python + export PYTHON_GIL=0 +fi + +# Test that there are no links to system libraries in the +# threadpoolctl output section of the show_versions output: +python -c "import sklearn; sklearn.show_versions()" + +if pip show -qq pytest-xdist; then + XDIST_WORKERS=$(python -c "import joblib; print(joblib.cpu_count(only_physical_cores=True))") + pytest --pyargs sklearn -n $XDIST_WORKERS +else + pytest --pyargs sklearn +fi diff --git a/conftest.py b/conftest.py deleted file mode 100644 index aec49c03ae13d..0000000000000 --- a/conftest.py +++ /dev/null @@ -1,91 +0,0 @@ -# Even if empty this file is useful so that when running from the root folder -# ./sklearn is added to sys.path by pytest. See -# https://docs.pytest.org/en/latest/pythonpath.html for more details. For -# example, this allows to build extensions in place and run pytest -# doc/modules/clustering.rst and use sklearn from the local folder rather than -# the one from site-packages. - -import platform -import sys - -import pytest -from _pytest.doctest import DoctestItem - -from sklearn.utils import _IS_32BIT -from sklearn.externals import _pilutil -from sklearn._min_dependencies import PYTEST_MIN_VERSION -from sklearn.utils.fixes import np_version, parse_version - -if parse_version(pytest.__version__) < parse_version(PYTEST_MIN_VERSION): - raise ImportError('Your version of pytest is too old, you should have ' - 'at least pytest >= {} installed.' - .format(PYTEST_MIN_VERSION)) - - -def pytest_collection_modifyitems(config, items): - for item in items: - # FeatureHasher is not compatible with PyPy - if (item.name.endswith(('_hash.FeatureHasher', - 'text.HashingVectorizer')) - and platform.python_implementation() == 'PyPy'): - marker = pytest.mark.skip( - reason='FeatureHasher is not compatible with PyPy') - item.add_marker(marker) - # Known failure on with GradientBoostingClassifier on ARM64 - elif (item.name.endswith('GradientBoostingClassifier') - and platform.machine() == 'aarch64'): - - marker = pytest.mark.xfail( - reason=( - 'know failure. See ' - 'https://github.com/scikit-learn/scikit-learn/issues/17797' # noqa - ) - ) - item.add_marker(marker) - - # numpy changed the str/repr formatting of numpy arrays in 1.14. We want to - # run doctests only for numpy >= 1.14. 
- skip_doctests = False - try: - if np_version < parse_version('1.14'): - reason = 'doctests are only run for numpy >= 1.14' - skip_doctests = True - elif _IS_32BIT: - reason = ('doctest are only run when the default numpy int is ' - '64 bits.') - skip_doctests = True - elif sys.platform.startswith("win32"): - reason = ("doctests are not run for Windows because numpy arrays " - "repr is inconsistent across platforms.") - skip_doctests = True - except ImportError: - pass - - if skip_doctests: - skip_marker = pytest.mark.skip(reason=reason) - - for item in items: - if isinstance(item, DoctestItem): - item.add_marker(skip_marker) - elif not _pilutil.pillow_installed: - skip_marker = pytest.mark.skip(reason="pillow (or PIL) not installed!") - for item in items: - if item.name in [ - "sklearn.feature_extraction.image.PatchExtractor", - "sklearn.feature_extraction.image.extract_patches_2d"]: - item.add_marker(skip_marker) - - -def pytest_configure(config): - import sys - sys._is_pytest_session = True - # declare our custom markers to avoid PytestUnknownMarkWarning - config.addinivalue_line( - "markers", - "network: mark a test for execution if network available." - ) - - -def pytest_unconfigure(config): - import sys - del sys._is_pytest_session diff --git a/doc/Makefile b/doc/Makefile index 6146d11123017..1419bac49316d 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -2,18 +2,29 @@ # # You can set these variables from the command line. -SPHINXOPTS = -j auto +SPHINXOPTS ?= -T SPHINXBUILD ?= sphinx-build PAPER = BUILDDIR = _build + ifneq ($(EXAMPLES_PATTERN),) EXAMPLES_PATTERN_OPTS := -D sphinx_gallery_conf.filename_pattern="$(EXAMPLES_PATTERN)" endif +ifeq ($(CI), true) + # On CircleCI using -j2 does not seem to speed up the html-noplot build + SPHINX_NUMJOBS_NOPLOT_DEFAULT=1 +else ifeq ($(shell uname), Darwin) + # Avoid stalling issues on MacOS + SPHINX_NUMJOBS_NOPLOT_DEFAULT=1 +else + SPHINX_NUMJOBS_NOPLOT_DEFAULT=auto +endif + # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -T -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS)\ +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS)\ $(EXAMPLES_PATTERN_OPTS) . @@ -36,21 +47,40 @@ help: clean: -rm -rf $(BUILDDIR)/* + @echo "Removed $(BUILDDIR)/*" -rm -rf auto_examples/ + @echo "Removed auto_examples/" -rm -rf generated/* + @echo "Removed generated/" -rm -rf modules/generated/ - + @echo "Removed modules/generated/" + -rm -rf css/styles/ + @echo "Removed css/styles/" + -rm -rf api/*.rst + @echo "Removed api/*.rst" + +# Default to SPHINX_NUMJOBS=1 for full documentation build. Using +# SPHINX_NUMJOBS!=1 may actually slow down the build, or cause weird issues in +# the CI (job stalling or EOFError), see +# https://github.com/scikit-learn/scikit-learn/pull/25836 or +# https://github.com/scikit-learn/scikit-learn/pull/25809 +html: SPHINX_NUMJOBS ?= 1 html: + @echo $(ALLSPHINXOPTS) # These two lines make the build a bit more lengthy, and the # the embedding of images more robust rm -rf $(BUILDDIR)/html/_images #rm -rf _build/doctrees/ - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html/stable + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) -j$(SPHINX_NUMJOBS) $(BUILDDIR)/html/stable @echo @echo "Build finished. 
The HTML pages are in $(BUILDDIR)/html/stable" +# Default to SPHINX_NUMJOBS=auto (except on MacOS and CI) since this makes +# html-noplot build faster +html-noplot: SPHINX_NUMJOBS ?= $(SPHINX_NUMJOBS_NOPLOT_DEFAULT) html-noplot: - $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html/stable + $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) -j$(SPHINX_NUMJOBS) \ + $(BUILDDIR)/html/stable @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html/stable." diff --git a/doc/README.md b/doc/README.md index 18d4bde4f5862..537ed85006006 100644 --- a/doc/README.md +++ b/doc/README.md @@ -1,6 +1,6 @@ # Documentation for scikit-learn -This directory contains the full manual and web site as displayed at -http://scikit-learn.org. See -http://scikit-learn.org/dev/developers/contributing.html#documentation for -detailed information about the documentation. +This directory contains the full manual and website as displayed at +https://scikit-learn.org. See +https://scikit-learn.org/dev/developers/contributing.html#documentation for +detailed information about the documentation. diff --git a/doc/about.rst b/doc/about.rst index fdfe8241b8aec..4db39f9709e73 100644 --- a/doc/about.rst +++ b/doc/about.rst @@ -1,480 +1,512 @@ .. _about: +======== About us ======== History -------- +======= This project was started in 2007 as a Google Summer of Code project by -David Cournapeau. Later that year, Matthieu Brucher started work on -this project as part of his thesis. +David Cournapeau. Later that year, Matthieu Brucher started working on this project +as part of his thesis. In 2010 Fabian Pedregosa, Gael Varoquaux, Alexandre Gramfort and Vincent Michel of INRIA took leadership of the project and made the first public release, February the 1st 2010. Since then, several releases have appeared -following a ~ 3-month cycle, and a thriving international community has -been leading the development. +following an approximately 3-month cycle, and a thriving international +community has been leading the development. As a result, INRIA holds the +copyright over the work done by people who were employed by INRIA at the +time of the contribution. Governance ----------- -The decision making process and governance structure of scikit-learn is laid -out in the :ref:`governance document `. +========== + +The decision making process and governance structure of scikit-learn, like roles and responsibilities, is laid out in the :ref:`governance document `. + +.. The "author" anchors below is there to ensure that old html links (in + the form of "about.html#author" still work) + +.. _authors: + +The people behind scikit-learn +============================== + +Scikit-learn is a community project, developed by a large group of +people, all across the world. A few core contributor teams, listed below, have +central roles, however a more complete list of contributors can be found `on +github +`__. + +Active Core Contributors +------------------------ -Authors -------- +Maintainers Team +................ -The following people are currently core contributors to scikit-learn's development -and maintenance: +The following people are currently maintainers, in charge of +consolidating scikit-learn's development and maintenance: -.. include:: authors.rst +.. include:: maintainers.rst -Please do not email the authors directly to ask for assistance or report issues. -Instead, please see `What's the best way to ask questions about scikit-learn -`_ -in the FAQ. +.. 
note:: + + Please do not email the authors directly to ask for assistance or report issues. + Instead, please see `What's the best way to ask questions about scikit-learn + `_ + in the FAQ. .. seealso:: - :ref:`How you can contribute to the project ` + How you can :ref:`contribute to the project `. + +Documentation Team +.................. + +The following people help with documenting the project: + +.. include:: documentation_team.rst -Triage Team ------------ +Contributor Experience Team +........................... The following people are active contributors who also help with :ref:`triaging issues `, PRs, and general maintenance: -.. include:: triage_team.rst +.. include:: contributor_experience_team.rst -Emeritus Core Developers ------------------------- +Communication Team +.................. + +The following people help with :ref:`communication around scikit-learn +`. + +.. include:: communication_team.rst + +Emeritus Core Contributors +-------------------------- + +Emeritus Maintainers Team +......................... The following people have been active contributors in the past, but are no longer active in the project: -.. include:: authors_emeritus.rst +.. include:: maintainers_emeritus.rst + +Emeritus Communication Team +........................... + +The following people have been active in the communication team in the +past, but no longer have communication responsibilities: + +.. include:: communication_team_emeritus.rst +Emeritus Contributor Experience Team +.................................... + +The following people have been active in the contributor experience team in the +past: + +.. include:: contributor_experience_team_emeritus.rst .. _citing-scikit-learn: Citing scikit-learn -------------------- +=================== If you use scikit-learn in a scientific publication, we would appreciate citations to the following paper: - `Scikit-learn: Machine Learning in Python - `_, Pedregosa - *et al.*, JMLR 12, pp. 2825-2830, 2011. - - Bibtex entry:: - - @article{scikit-learn, - title={Scikit-learn: Machine Learning in {P}ython}, - author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. - and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. - and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and - Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, - journal={Journal of Machine Learning Research}, - volume={12}, - pages={2825--2830}, - year={2011} - } +`Scikit-learn: Machine Learning in Python +`_, Pedregosa +*et al.*, JMLR 12, pp. 2825-2830, 2011. + +Bibtex entry:: + + @article{scikit-learn, + title={Scikit-learn: Machine Learning in {P}ython}, + author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. + and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. + and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and + Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, + journal={Journal of Machine Learning Research}, + volume={12}, + pages={2825--2830}, + year={2011} + } If you want to cite scikit-learn for its API or design, you may also want to consider the following paper: - `API design for machine learning software: experiences from the scikit-learn - project `_, Buitinck *et al.*, 2013. 
- - Bibtex entry:: - - @inproceedings{sklearn_api, - author = {Lars Buitinck and Gilles Louppe and Mathieu Blondel and - Fabian Pedregosa and Andreas Mueller and Olivier Grisel and - Vlad Niculae and Peter Prettenhofer and Alexandre Gramfort - and Jaques Grobler and Robert Layton and Jake VanderPlas and - Arnaud Joly and Brian Holt and Ga{\"{e}}l Varoquaux}, - title = {{API} design for machine learning software: experiences from the scikit-learn - project}, - booktitle = {ECML PKDD Workshop: Languages for Data Mining and Machine Learning}, - year = {2013}, - pages = {108--122}, - } +:arxiv:`API design for machine learning software: experiences from the scikit-learn +project <1309.0238>`, Buitinck *et al.*, 2013. + +Bibtex entry:: + + @inproceedings{sklearn_api, + author = {Lars Buitinck and Gilles Louppe and Mathieu Blondel and + Fabian Pedregosa and Andreas Mueller and Olivier Grisel and + Vlad Niculae and Peter Prettenhofer and Alexandre Gramfort + and Jaques Grobler and Robert Layton and Jake VanderPlas and + Arnaud Joly and Brian Holt and Ga{\"{e}}l Varoquaux}, + title = {{API} design for machine learning software: experiences from the scikit-learn + project}, + booktitle = {ECML PKDD Workshop: Languages for Data Mining and Machine Learning}, + year = {2013}, + pages = {108--122}, + } Artwork -------- +======= High quality PNG and SVG logos are available in the `doc/logos/ `_ source directory. .. image:: images/scikit-learn-logo-notext.png - :align: center + :align: center Funding -------- -Scikit-Learn is a community driven project, however institutional and private +======= + +Scikit-learn is a community driven project, however institutional and private grants help to assure its sustainability. The project would like to thank the following funders. ................................... -.. raw:: html +.. div:: sk-text-image-grid-small -
-
+ .. div:: text-box -The `Members `_ of -the `Scikit-Learn Consortium at Inria Foundation -`_ fund Olivier -Grisel, Guillaume Lemaitre, JÊrÊmie du Boisberranger and Chiara Marmo. + `:probabl. `_ employs Adrin Jalali, Arturo Amor, + François Goupil, Guillaume Lemaitre, JÊrÊmie du Boisberranger, Loïc Estève, + Olivier Grisel, and Stefanie Senger. -.. raw:: html + .. div:: image-box -
+ .. image:: images/probabl.png + :target: https://probabl.ai -.. |msn| image:: images/microsoft.png - :width: 100pt - :target: https://www.microsoft.com/ +.......... -.. |bcg| image:: images/bcg.png - :width: 100pt - :target: https://www.bcg.com/beyond-consulting/bcg-gamma/default.aspx +.. |chanel| image:: images/chanel.png + :target: https://www.chanel.com .. |axa| image:: images/axa.png - :width: 50pt - :target: https://www.axa.fr/ + :target: https://www.axa.fr/ .. |bnp| image:: images/bnp.png - :width: 150pt - :target: https://www.bnpparibascardif.com/ - -.. |fujitsu| image:: images/fujitsu.png - :width: 100pt - :target: https://www.fujitsu.com/global/ + :target: https://www.bnpparibascardif.com/ .. |dataiku| image:: images/dataiku.png - :width: 70pt - :target: https://www.dataiku.com/ + :target: https://www.dataiku.com/ -.. |inria| image:: images/inria-logo.jpg - :width: 100pt - :target: https://www.inria.fr +.. |nvidia| image:: images/nvidia.png + :target: https://www.nvidia.com +.. |inria| image:: images/inria-logo.jpg + :target: https://www.inria.fr .. raw:: html -
- -.. table:: - :class: sk-sponsor-table align-default - - +---------+----------+ - | |bcg| | - +---------+----------+ - | | - +---------+----------+ - | |axa| | |bnp| | - +---------+----------+ - ||fujitsu|| |msn| | - +---------+----------+ - | | - +---------+----------+ - | |dataiku| | - +---------+----------+ - | | - +---------+----------+ - | |inria| | - +---------+----------+ + -........ +.. div:: sk-text-image-grid-small -.. raw:: html + .. div:: text-box -
-
+ The `Members `_ of + the `Scikit-learn Consortium at Inria Foundation + `_ help at maintaining and + improving the project through their financial support. -`Columbia University `_ funds Andreas MÃŧller since 2016 + .. div:: image-box -.. raw:: html + .. table:: + :class: image-subtable + + +----------+-----------+ + | |chanel| | + +----------+-----------+ + | |axa| | |bnp| | + +----------+-----------+ + | |nvidia| | + +----------+-----------+ + | |dataiku| | + +----------+-----------+ + | |inria| | + +----------+-----------+ -
+.......... -
+.. div:: sk-text-image-grid-small -.. image:: themes/scikit-learn/static/img/columbia.png - :width: 50pt - :align: center - :target: https://www.columbia.edu/ + .. div:: text-box -.. raw:: html + `NVidia `_ funds Tim Head since 2022 + and is part of the scikit-learn consortium at Inria. -
-
+ .. div:: image-box + + .. image:: images/nvidia.png + :target: https://nvidia.com .......... -.. raw:: html +.. div:: sk-text-image-grid-small -
-
+ .. div:: text-box -Andreas MÃŧller received a grant to improve scikit-learn from the -`Alfred P. Sloan Foundation `_ . -This grant supports the position of Nicolas Hug and Thomas J. Fan. + `Microsoft `_ funds Andreas MÃŧller since 2020. -.. raw:: html + .. div:: image-box + + .. image:: images/microsoft.png + :target: https://microsoft.com -
+........... -
+.. div:: sk-text-image-grid-small -.. image:: images/sloan_banner.png - :width: 100pt - :align: center - :target: https://sloan.org/ + .. div:: text-box -.. raw:: html + `Quansight Labs `_ funds Lucy Liu since 2022. -
-
+ .. div:: image-box + + .. image:: images/quansight-labs.png + :target: https://labs.quansight.org ........... -.. raw:: html +.. |czi| image:: images/czi.png + :target: https://chanzuckerberg.com -
-
+.. |wellcome| image:: images/wellcome-trust.png + :target: https://wellcome.org/ -`The University of Sydney `_ funds Joel Nothman since -July 2017. +.. div:: sk-text-image-grid-small -.. raw:: html + .. div:: text-box -
+ `The Chan-Zuckerberg Initiative `_ and + `Wellcome Trust `_ fund scikit-learn through the + `Essential Open Source Software for Science (EOSS) `_ + cycle 6. -
+ It supports Lucy Liu and diversity & inclusion initiatives that will + be announced in the future. -.. image:: themes/scikit-learn/static/img/sydney-primary.jpeg - :width: 100pt - :align: center - :target: https://sydney.edu.au/ + .. div:: image-box -.. raw:: html + .. table:: + :class: image-subtable -
-
+ +----------+----------------+ + | |czi| | |wellcome| | + +----------+----------------+ ........... -.. raw:: html +.. div:: sk-text-image-grid-small -
-
+ .. div:: text-box -`Zalando SE `_ funds Adrin Jalali since -August 2020. + `Tidelift `_ supports the project via their service + agreement. -.. raw:: html + .. div:: image-box -
+ .. image:: images/Tidelift-logo-on-light.svg + :target: https://tidelift.com/ -
+........... -.. image:: images/zalando_logo.png - :width: 100pt - :align: center - :target: https://corporate.zalando.com/en -.. raw:: html +Past Sponsors +------------- -
-
+.. div:: sk-text-image-grid-small -Past Sponsors -............. + .. div:: text-box -.. raw:: html + `Quansight Labs `_ funded Meekail Zain in 2022 and 2023, + and funded Thomas J. Fan from 2021 to 2023. -
-
+ .. div:: image-box -`INRIA `_ actively supports this project. It has -provided funding for Fabian Pedregosa (2010-2012), Jaques Grobler -(2012-2013) and Olivier Grisel (2013-2017) to work on this project -full-time. It also hosts coding sprints and other events. + .. image:: images/quansight-labs.png + :target: https://labs.quansight.org -.. raw:: html +........... -
+.. div:: sk-text-image-grid-small -
+ .. div:: text-box -.. image:: images/inria-logo.jpg - :width: 100pt - :align: center - :target: https://www.inria.fr + `Columbia University `_ funded Andreas MÃŧller + (2016-2020). -.. raw:: html + .. div:: image-box -
-
+ .. image:: images/columbia.png + :target: https://columbia.edu -..................... +........ -.. raw:: html +.. div:: sk-text-image-grid-small -
-
+ .. div:: text-box -`Paris-Saclay Center for Data Science -`_ -funded one year for a developer to work on the project full-time -(2014-2015), 50% of the time of Guillaume Lemaitre (2016-2017) and 50% of the -time of Joris van den Bossche (2017-2018). + `The University of Sydney `_ funded Joel Nothman + (2017-2021). -.. raw:: html + .. div:: image-box -
-
+ .. image:: images/sydney-primary.jpeg + :target: https://sydney.edu.au/ -.. image:: images/cds-logo.png - :width: 100pt - :align: center - :target: https://www.datascience-paris-saclay.fr/ +........... -.. raw:: html +.. div:: sk-text-image-grid-small -
-
+ .. div:: text-box -............ + Andreas MÃŧller received a grant to improve scikit-learn from the + `Alfred P. Sloan Foundation `_ . + This grant supported the position of Nicolas Hug and Thomas J. Fan. -.. raw:: html + .. div:: image-box -
-
+ .. image:: images/sloan_banner.png + :target: https://sloan.org/ -`Anaconda, Inc `_ funded Adrin Jalali in 2019. +............. -.. raw:: html +.. div:: sk-text-image-grid-small -
+ .. div:: text-box -
+ `INRIA `_ actively supports this project. It has + provided funding for Fabian Pedregosa (2010-2012), Jaques Grobler + (2012-2013) and Olivier Grisel (2013-2017) to work on this project + full-time. It also hosts coding sprints and other events. -.. image:: images/anaconda.png - :width: 100pt - :align: center - :target: https://www.anaconda.com/ + .. div:: image-box -.. raw:: html + .. image:: images/inria-logo.jpg + :target: https://www.inria.fr -
-
+..................... -.......................... +.. div:: sk-text-image-grid-small -.. raw:: html + .. div:: text-box -
-
+ `Paris-Saclay Center for Data Science `_ + funded one year for a developer to work on the project full-time (2014-2015), 50% + of the time of Guillaume Lemaitre (2016-2017) and 50% of the time of Joris van den + Bossche (2017-2018). -`NYU Moore-Sloan Data Science Environment `_ -funded Andreas Mueller (2014-2016) to work on this project. The Moore-Sloan -Data Science Environment also funds several students to work on the project -part-time. + .. div:: image-box -.. raw:: html + .. image:: images/cds-logo.png + :target: http://www.datascience-paris-saclay.fr/ -
-
+.......................... -.. image:: images/nyu_short_color.png - :width: 100pt - :align: center - :target: https://cds.nyu.edu/mooresloan/ +.. div:: sk-text-image-grid-small -.. raw:: html + .. div:: text-box + + `NYU Moore-Sloan Data Science Environment `_ + funded Andreas Mueller (2014-2016) to work on this project. The Moore-Sloan + Data Science Environment also funds several students to work on the project + part-time. + + .. div:: image-box -
-
+ .. image:: images/nyu_short_color.png + :target: https://cds.nyu.edu/mooresloan/ ........................ -.. raw:: html +.. div:: sk-text-image-grid-small -
-
+ .. div:: text-box -`TÊlÊcom Paristech `_ funded Manoj Kumar -(2014), Tom DuprÊ la Tour (2015), Raghav RV (2015-2017), Thierry Guillemot -(2016-2017) and Albert Thomas (2017) to work on scikit-learn. + `TÊlÊcom Paristech `_ funded Manoj Kumar + (2014), Tom DuprÊ la Tour (2015), Raghav RV (2015-2017), Thierry Guillemot + (2016-2017) and Albert Thomas (2017) to work on scikit-learn. -.. raw:: html + .. div:: image-box -
-
+ .. image:: images/telecom.png + :target: https://www.telecom-paristech.fr/ -.. image:: themes/scikit-learn/static/img/telecom.png - :width: 50pt - :align: center - :target: https://www.telecom-paristech.fr/ +..................... -.. raw:: html +.. div:: sk-text-image-grid-small -
-
+ .. div:: text-box -..................... + `The Labex DigiCosme `_ funded Nicolas Goix + (2015-2016), Tom DuprÊ la Tour (2015-2016 and 2017-2018), Mathurin Massias + (2018-2019) to work part time on scikit-learn during their PhDs. It also + funded a scikit-learn coding sprint in 2015. -.. raw:: html + .. div:: image-box -
-
+ .. image:: images/digicosme.png + :target: https://digicosme.lri.fr -`The Labex DigiCosme `_ funded Nicolas Goix -(2015-2016), Tom DuprÊ la Tour (2015-2016 and 2017-2018), Mathurin Massias -(2018-2019) to work part time on scikit-learn during their PhDs. It also -funded a scikit-learn coding sprint in 2015. +..................... -.. raw:: html +.. div:: sk-text-image-grid-small -
-
+ .. div:: text-box -.. image:: themes/scikit-learn/static/img/digicosme.png - :width: 100pt - :align: center - :target: https://digicosme.lri.fr + `The Chan-Zuckerberg Initiative `_ funded Nicolas + Hug to work full-time on scikit-learn in 2020. -.. raw:: html + .. div:: image-box -
-
+ .. image:: images/czi.png + :target: https://chanzuckerberg.com ...................... The following students were sponsored by `Google -`_ to work on scikit-learn through +`_ to work on scikit-learn through the `Google Summer of Code `_ program. - 2007 - David Cournapeau - 2011 - `Vlad Niculae`_ -- 2012 - `Vlad Niculae`_, Immanuel Bayer. +- 2012 - `Vlad Niculae`_, Immanuel Bayer - 2013 - Kemal Eren, Nicolas TrÊsegnie -- 2014 - Hamzeh Alsalhi, Issam Laradji, Maheshakya Wijewardena, Manoj Kumar. +- 2014 - Hamzeh Alsalhi, Issam Laradji, Maheshakya Wijewardena, Manoj Kumar - 2015 - `Raghav RV `_, Wei Xue -- 2016 - `Nelson Liu `_, `YenChen Lin `_ +- 2016 - `Nelson Liu `_, `YenChen Lin `_ .. _Vlad Niculae: https://vene.ro/ @@ -485,68 +517,116 @@ The `NeuroDebian `_ project providing `Debian `Dr. James V. Haxby `_ (`Dartmouth College `_). -Sprints -------- +................... -The International 2019 Paris sprint was kindly hosted by `AXA `_. -Also some participants could attend thanks to the support of the `Alfred P. -Sloan Foundation `_, the `Python Software -Foundation `_ (PSF) and the `DATAIA Institute -`_. +The following organizations funded the scikit-learn consortium at Inria in +the past: -..................... +.. |msn| image:: images/microsoft.png + :target: https://www.microsoft.com/ + +.. |bcg| image:: images/bcg.png + :target: https://www.bcg.com/beyond-consulting/bcg-gamma/default.aspx + +.. |fujitsu| image:: images/fujitsu.png + :target: https://www.fujitsu.com/global/ + +.. |aphp| image:: images/logo_APHP_text.png + :target: https://aphp.fr/ + +.. |hf| image:: images/huggingface_logo-noborder.png + :target: https://huggingface.co + +.. raw:: html + + + +.. grid:: 2 2 4 4 + :class-row: image-subgrid + :gutter: 1 -The 2013 International Paris Sprint was made possible thanks to the support of -`TÊlÊcom Paristech `_, `tinyclues -`_, the `French Python Association -`_ and the `Fonds de la Recherche Scientifique -`_. + .. grid-item:: + :class: sd-text-center + :child-align: center -.............. + |msn| -The 2011 International Granada sprint was made possible thanks to the support -of the `PSF `_ and `tinyclues -`_. + .. grid-item:: + :class: sd-text-center + :child-align: center + + |bcg| + + .. grid-item:: + :class: sd-text-center + :child-align: center + + |fujitsu| + + .. grid-item:: + :class: sd-text-center + :child-align: center + + |aphp| + + .. grid-item:: + :class: sd-text-center + :child-align: center + + |hf| + +Coding Sprints +============== + +The scikit-learn project has a long history of `open source coding sprints +`_ with over 50 sprint +events from 2010 to present day. There are scores of sponsors who contributed +to costs which include venue, food, travel, developer time and more. See +`scikit-learn sprints `_ for a full +list of events. Donating to the project -....................... +======================= If you are interested in donating to the project or to one of our code-sprints, -you can use the *Paypal* button below or the `NumFOCUS Donations Page -`_ (if you use the latter, -please indicate that you are donating for the scikit-learn project). - -All donations will be handled by `NumFOCUS -`_, a non-profit-organization which is -managed by a board of `Scipy community members -`_. NumFOCUS's mission is to foster -scientific computing software, in particular in Python. As a fiscal home -of scikit-learn, it ensures that money is available when needed to keep -the project funded and available while in compliance with tax regulations. 
- -The received donations for the scikit-learn project mostly will go towards -covering travel-expenses for code sprints, as well as towards the organization -budget of the project [#f1]_. - -.. raw :: html - -

- -
+please donate via the `NumFOCUS Donations Page +`_. + +.. raw:: html + +

+ + Help us, donate! + +

+ +All donations will be handled by `NumFOCUS `_, a non-profit +organization which is managed by a board of `Scipy community members +`_. NumFOCUS's mission is to foster scientific +computing software, in particular in Python. As a fiscal home of scikit-learn, it +ensures that money is available when needed to keep the project funded and available +while in compliance with tax regulations. + +The received donations for the scikit-learn project mostly will go towards covering +travel-expenses for code sprints, as well as towards the organization budget of the +project [#f1]_. .. rubric:: Notes .. [#f1] Regarding the organization budget, in particular, we might use some of - the donated funds to pay for other project expenses such as DNS, - hosting or continuous integration services. + the donated funds to pay for other project expenses such as DNS, + hosting or continuous integration services. + Infrastructure support ----------------------- +====================== -- We would also like to thank `Microsoft Azure - `_, `Travis Cl `_, - `CircleCl `_ for free CPU time on their Continuous - Integration servers, and `Anaconda Inc. `_ for the - storage they provide for our staging and nightly builds. +We would also like to thank `Microsoft Azure `_, +`CircleCl `_ for free CPU +time on their Continuous Integration servers, and `Anaconda Inc. `_ +for the storage they provide for our staging and nightly builds. diff --git a/doc/api/deprecated.rst.template b/doc/api/deprecated.rst.template new file mode 100644 index 0000000000000..a48f0180f76ed --- /dev/null +++ b/doc/api/deprecated.rst.template @@ -0,0 +1,24 @@ +:html_theme.sidebar_secondary.remove: + +.. _api_depr_ref: + +Recently Deprecated +=================== + +.. currentmodule:: sklearn + +{% for ver, objs in DEPRECATED_API_REFERENCE %} +.. _api_depr_ref-{{ ver|replace(".", "-") }}: + +.. rubric:: To be removed in {{ ver }} + +.. autosummary:: + :nosignatures: + :toctree: ../modules/generated/ + :template: base.rst + +{% for obj in objs %} + {{ obj }} +{%- endfor %} + +{% endfor %} diff --git a/doc/api/index.rst.template b/doc/api/index.rst.template new file mode 100644 index 0000000000000..b0a3698775a94 --- /dev/null +++ b/doc/api/index.rst.template @@ -0,0 +1,77 @@ +:html_theme.sidebar_secondary.remove: + +.. _api_ref: + +============= +API Reference +============= + +This is the class and function reference of scikit-learn. Please refer to the +:ref:`full user guide ` for further details, as the raw specifications of +classes and functions may not be enough to give full guidelines on their use. For +reference on concepts repeated across the API, see :ref:`glossary`. + +.. toctree:: + :maxdepth: 2 + :hidden: + +{% for module, _ in API_REFERENCE %} + {{ module }} +{%- endfor %} +{%- if DEPRECATED_API_REFERENCE %} + deprecated +{%- endif %} + +.. list-table:: + :header-rows: 1 + :class: apisearch-table + + * - Object + - Description + +{% for module, module_info in API_REFERENCE %} +{% for section in module_info["sections"] %} +{% for obj in section["autosummary"] %} +{% set parts = obj.rsplit(".", 1) %} +{% if parts|length > 1 %} +{% set full_module = module + "." + parts[0] %} +{% else %} +{% set full_module = module %} +{% endif %} + * - :obj:`~{{ module }}.{{ obj }}` + + - .. div:: sk-apisearch-desc + + .. currentmodule:: {{ full_module }} + + .. autoshortsummary:: {{ module }}.{{ obj }} + + .. 
div:: caption + + :mod:`{{ full_module }}` +{% endfor %} +{% endfor %} +{% endfor %} + +{% for ver, objs in DEPRECATED_API_REFERENCE %} +{% for obj in objs %} +{% set parts = obj.rsplit(".", 1) %} +{% if parts|length > 1 %} +{% set full_module = "sklearn." + parts[0] %} +{% else %} +{% set full_module = "sklearn" %} +{% endif %} + * - :obj:`~sklearn.{{ obj }}` + + - .. div:: sk-apisearch-desc + + .. currentmodule:: {{ full_module }} + + .. autoshortsummary:: sklearn.{{ obj }} + + .. div:: caption + + :mod:`{{ full_module }}` + :bdg-ref-danger-line:`Deprecated in version {{ ver }} ` +{% endfor %} +{% endfor %} diff --git a/doc/api/module.rst.template b/doc/api/module.rst.template new file mode 100644 index 0000000000000..1980f27aad158 --- /dev/null +++ b/doc/api/module.rst.template @@ -0,0 +1,46 @@ +:html_theme.sidebar_secondary.remove: + +{% if module == "sklearn" -%} +{%- set module_hook = "sklearn" -%} +{%- elif module.startswith("sklearn.") -%} +{%- set module_hook = module[8:] -%} +{%- else -%} +{%- set module_hook = None -%} +{%- endif -%} + +{% if module_hook %} +.. _{{ module_hook }}_ref: +{% endif %} + +{{ module }} +{{ "=" * module|length }} + +.. automodule:: {{ module }} + +{% if module_info["description"] %} +{{ module_info["description"] }} +{% endif %} + +{% for section in module_info["sections"] %} +{% if section["title"] and module_hook %} +.. _{{ module_hook }}_ref-{{ section["title"]|lower|replace(" ", "-") }}: +{% endif %} + +{% if section["title"] %} +{{ section["title"] }} +{{ "-" * section["title"]|length }} +{% endif %} + +{% if section["description"] %} +{{ section["description"] }} +{% endif %} + +.. autosummary:: + :nosignatures: + :toctree: ../modules/generated/ + :template: base.rst + +{% for obj in section["autosummary"] %} + {{ obj }} +{%- endfor %} +{% endfor %} diff --git a/doc/api_reference.py b/doc/api_reference.py new file mode 100644 index 0000000000000..c90b115746415 --- /dev/null +++ b/doc/api_reference.py @@ -0,0 +1,1352 @@ +"""Configuration for the API reference documentation.""" + + +def _get_guide(*refs, is_developer=False): + """Get the rst to refer to user/developer guide. + + `refs` is several references that can be used in the :ref:`...` directive. + """ + if len(refs) == 1: + ref_desc = f":ref:`{refs[0]}` section" + elif len(refs) == 2: + ref_desc = f":ref:`{refs[0]}` and :ref:`{refs[1]}` sections" + else: + ref_desc = ", ".join(f":ref:`{ref}`" for ref in refs[:-1]) + ref_desc += f", and :ref:`{refs[-1]}` sections" + + guide_name = "Developer" if is_developer else "User" + return f"**{guide_name} guide.** See the {ref_desc} for further details." + + +def _get_submodule(module_name, submodule_name): + """Get the submodule docstring and automatically add the hook. + + `module_name` is e.g. `sklearn.feature_extraction`, and `submodule_name` is e.g. + `image`, so we get the docstring and hook for `sklearn.feature_extraction.image` + submodule. `module_name` is used to reset the current module because autosummary + automatically changes the current module. + """ + lines = [ + f".. automodule:: {module_name}.{submodule_name}", + f".. currentmodule:: {module_name}", + ] + return "\n\n".join(lines) + + +""" +CONFIGURING API_REFERENCE +========================= + +API_REFERENCE maps each module name to a dictionary that consists of the following +components: + +short_summary (required) + The text to be printed on the index page; it has nothing to do the API reference + page of each module. 
+description (required, `None` if not needed) + The additional description for the module to be placed under the module + docstring, before the sections start. +sections (required) + A list of sections, each of which consists of: + - title (required, `None` if not needed): the section title, commonly it should + not be `None` except for the first section of a module, + - description (optional): the optional additional description for the section, + - autosummary (required): an autosummary block, assuming current module is the + current module name. + +Essentially, the rendered page would look like the following: + +|---------------------------------------------------------------------------------| +| {{ module_name }} | +| ================= | +| {{ module_docstring }} | +| {{ description }} | +| | +| {{ section_title_1 }} <-------------- Optional if one wants the first | +| --------------------- section to directly follow | +| {{ section_description_1 }} without a second-level heading. | +| {{ section_autosummary_1 }} | +| | +| {{ section_title_2 }} | +| --------------------- | +| {{ section_description_2 }} | +| {{ section_autosummary_2 }} | +| | +| More sections... | +|---------------------------------------------------------------------------------| + +Hooks will be automatically generated for each module and each section. For a module, +e.g., `sklearn.feature_extraction`, the hook would be `feature_extraction_ref`; for a +section, e.g., "From text" under `sklearn.feature_extraction`, the hook would be +`feature_extraction_ref-from-text`. However, note that a better way is to refer using +the :mod: directive, e.g., :mod:`sklearn.feature_extraction` for the module and +:mod:`sklearn.feature_extraction.text` for the section. Only in case that a section +is not a particular submodule does the hook become useful, e.g., the "Loaders" section +under `sklearn.datasets`. 
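+
+For illustration, a minimal entry (mirroring the `sklearn.dummy` configuration
+below) would look like the following, where `_get_guide("model_evaluation")`
+expands to the "**User guide.** See the :ref:`model_evaluation` section for
+further details." line:
+
+API_REFERENCE = {
+    "sklearn.dummy": {
+        "short_summary": "Dummy estimators.",
+        "description": _get_guide("model_evaluation"),
+        "sections": [
+            {
+                "title": None,
+                "autosummary": ["DummyClassifier", "DummyRegressor"],
+            },
+        ],
+    },
+}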
+""" + +API_REFERENCE = { + "sklearn": { + "short_summary": "Settings and information tools.", + "description": None, + "sections": [ + { + "title": None, + "autosummary": [ + "config_context", + "get_config", + "set_config", + "show_versions", + ], + }, + ], + }, + "sklearn.base": { + "short_summary": "Base classes and utility functions.", + "description": None, + "sections": [ + { + "title": None, + "autosummary": [ + "BaseEstimator", + "BiclusterMixin", + "ClassNamePrefixFeaturesOutMixin", + "ClassifierMixin", + "ClusterMixin", + "DensityMixin", + "MetaEstimatorMixin", + "OneToOneFeatureMixin", + "OutlierMixin", + "RegressorMixin", + "TransformerMixin", + "clone", + "is_classifier", + "is_clusterer", + "is_regressor", + "is_outlier_detector", + ], + } + ], + }, + "sklearn.calibration": { + "short_summary": "Probability calibration.", + "description": _get_guide("calibration"), + "sections": [ + { + "title": None, + "autosummary": ["CalibratedClassifierCV", "calibration_curve"], + }, + { + "title": "Visualization", + "autosummary": ["CalibrationDisplay"], + }, + ], + }, + "sklearn.cluster": { + "short_summary": "Clustering.", + "description": _get_guide("clustering", "biclustering"), + "sections": [ + { + "title": None, + "autosummary": [ + "AffinityPropagation", + "AgglomerativeClustering", + "Birch", + "BisectingKMeans", + "DBSCAN", + "FeatureAgglomeration", + "HDBSCAN", + "KMeans", + "MeanShift", + "MiniBatchKMeans", + "OPTICS", + "SpectralBiclustering", + "SpectralClustering", + "SpectralCoclustering", + "affinity_propagation", + "cluster_optics_dbscan", + "cluster_optics_xi", + "compute_optics_graph", + "dbscan", + "estimate_bandwidth", + "k_means", + "kmeans_plusplus", + "mean_shift", + "spectral_clustering", + "ward_tree", + ], + }, + ], + }, + "sklearn.compose": { + "short_summary": "Composite estimators.", + "description": _get_guide("combining_estimators"), + "sections": [ + { + "title": None, + "autosummary": [ + "ColumnTransformer", + "TransformedTargetRegressor", + "make_column_selector", + "make_column_transformer", + ], + }, + ], + }, + "sklearn.covariance": { + "short_summary": "Covariance estimation.", + "description": _get_guide("covariance"), + "sections": [ + { + "title": None, + "autosummary": [ + "EllipticEnvelope", + "EmpiricalCovariance", + "GraphicalLasso", + "GraphicalLassoCV", + "LedoitWolf", + "MinCovDet", + "OAS", + "ShrunkCovariance", + "empirical_covariance", + "graphical_lasso", + "ledoit_wolf", + "ledoit_wolf_shrinkage", + "oas", + "shrunk_covariance", + ], + }, + ], + }, + "sklearn.cross_decomposition": { + "short_summary": "Cross decomposition.", + "description": _get_guide("cross_decomposition"), + "sections": [ + { + "title": None, + "autosummary": ["CCA", "PLSCanonical", "PLSRegression", "PLSSVD"], + }, + ], + }, + "sklearn.datasets": { + "short_summary": "Datasets.", + "description": _get_guide("datasets"), + "sections": [ + { + "title": "Loaders", + "autosummary": [ + "clear_data_home", + "dump_svmlight_file", + "fetch_20newsgroups", + "fetch_20newsgroups_vectorized", + "fetch_california_housing", + "fetch_covtype", + "fetch_file", + "fetch_kddcup99", + "fetch_lfw_pairs", + "fetch_lfw_people", + "fetch_olivetti_faces", + "fetch_openml", + "fetch_rcv1", + "fetch_species_distributions", + "get_data_home", + "load_breast_cancer", + "load_diabetes", + "load_digits", + "load_files", + "load_iris", + "load_linnerud", + "load_sample_image", + "load_sample_images", + "load_svmlight_file", + "load_svmlight_files", + "load_wine", + ], + }, + { + "title": 
"Sample generators", + "autosummary": [ + "make_biclusters", + "make_blobs", + "make_checkerboard", + "make_circles", + "make_classification", + "make_friedman1", + "make_friedman2", + "make_friedman3", + "make_gaussian_quantiles", + "make_hastie_10_2", + "make_low_rank_matrix", + "make_moons", + "make_multilabel_classification", + "make_regression", + "make_s_curve", + "make_sparse_coded_signal", + "make_sparse_spd_matrix", + "make_sparse_uncorrelated", + "make_spd_matrix", + "make_swiss_roll", + ], + }, + ], + }, + "sklearn.decomposition": { + "short_summary": "Matrix decomposition.", + "description": _get_guide("decompositions"), + "sections": [ + { + "title": None, + "autosummary": [ + "DictionaryLearning", + "FactorAnalysis", + "FastICA", + "IncrementalPCA", + "KernelPCA", + "LatentDirichletAllocation", + "MiniBatchDictionaryLearning", + "MiniBatchNMF", + "MiniBatchSparsePCA", + "NMF", + "PCA", + "SparseCoder", + "SparsePCA", + "TruncatedSVD", + "dict_learning", + "dict_learning_online", + "fastica", + "non_negative_factorization", + "sparse_encode", + ], + }, + ], + }, + "sklearn.discriminant_analysis": { + "short_summary": "Discriminant analysis.", + "description": _get_guide("lda_qda"), + "sections": [ + { + "title": None, + "autosummary": [ + "LinearDiscriminantAnalysis", + "QuadraticDiscriminantAnalysis", + ], + }, + ], + }, + "sklearn.dummy": { + "short_summary": "Dummy estimators.", + "description": _get_guide("model_evaluation"), + "sections": [ + { + "title": None, + "autosummary": ["DummyClassifier", "DummyRegressor"], + }, + ], + }, + "sklearn.ensemble": { + "short_summary": "Ensemble methods.", + "description": _get_guide("ensemble"), + "sections": [ + { + "title": None, + "autosummary": [ + "AdaBoostClassifier", + "AdaBoostRegressor", + "BaggingClassifier", + "BaggingRegressor", + "ExtraTreesClassifier", + "ExtraTreesRegressor", + "GradientBoostingClassifier", + "GradientBoostingRegressor", + "HistGradientBoostingClassifier", + "HistGradientBoostingRegressor", + "IsolationForest", + "RandomForestClassifier", + "RandomForestRegressor", + "RandomTreesEmbedding", + "StackingClassifier", + "StackingRegressor", + "VotingClassifier", + "VotingRegressor", + ], + }, + ], + }, + "sklearn.exceptions": { + "short_summary": "Exceptions and warnings.", + "description": None, + "sections": [ + { + "title": None, + "autosummary": [ + "ConvergenceWarning", + "DataConversionWarning", + "DataDimensionalityWarning", + "EfficiencyWarning", + "FitFailedWarning", + "InconsistentVersionWarning", + "NotFittedError", + "UndefinedMetricWarning", + "EstimatorCheckFailedWarning", + ], + }, + ], + }, + "sklearn.experimental": { + "short_summary": "Experimental tools.", + "description": None, + "sections": [ + { + "title": None, + "autosummary": ["enable_halving_search_cv", "enable_iterative_imputer"], + }, + ], + }, + "sklearn.feature_extraction": { + "short_summary": "Feature extraction.", + "description": _get_guide("feature_extraction"), + "sections": [ + { + "title": None, + "autosummary": ["DictVectorizer", "FeatureHasher"], + }, + { + "title": "From images", + "description": _get_submodule("sklearn.feature_extraction", "image"), + "autosummary": [ + "image.PatchExtractor", + "image.extract_patches_2d", + "image.grid_to_graph", + "image.img_to_graph", + "image.reconstruct_from_patches_2d", + ], + }, + { + "title": "From text", + "description": _get_submodule("sklearn.feature_extraction", "text"), + "autosummary": [ + "text.CountVectorizer", + "text.HashingVectorizer", + "text.TfidfTransformer", 
+ "text.TfidfVectorizer", + ], + }, + ], + }, + "sklearn.feature_selection": { + "short_summary": "Feature selection.", + "description": _get_guide("feature_selection"), + "sections": [ + { + "title": None, + "autosummary": [ + "GenericUnivariateSelect", + "RFE", + "RFECV", + "SelectFdr", + "SelectFpr", + "SelectFromModel", + "SelectFwe", + "SelectKBest", + "SelectPercentile", + "SelectorMixin", + "SequentialFeatureSelector", + "VarianceThreshold", + "chi2", + "f_classif", + "f_regression", + "mutual_info_classif", + "mutual_info_regression", + "r_regression", + ], + }, + ], + }, + "sklearn.frozen": { + "short_summary": "Frozen estimators.", + "description": None, + "sections": [ + { + "title": None, + "autosummary": ["FrozenEstimator"], + }, + ], + }, + "sklearn.gaussian_process": { + "short_summary": "Gaussian processes.", + "description": _get_guide("gaussian_process"), + "sections": [ + { + "title": None, + "autosummary": [ + "GaussianProcessClassifier", + "GaussianProcessRegressor", + ], + }, + { + "title": "Kernels", + "description": _get_submodule("sklearn.gaussian_process", "kernels"), + "autosummary": [ + "kernels.CompoundKernel", + "kernels.ConstantKernel", + "kernels.DotProduct", + "kernels.ExpSineSquared", + "kernels.Exponentiation", + "kernels.Hyperparameter", + "kernels.Kernel", + "kernels.Matern", + "kernels.PairwiseKernel", + "kernels.Product", + "kernels.RBF", + "kernels.RationalQuadratic", + "kernels.Sum", + "kernels.WhiteKernel", + ], + }, + ], + }, + "sklearn.impute": { + "short_summary": "Imputation.", + "description": _get_guide("impute"), + "sections": [ + { + "title": None, + "autosummary": [ + "IterativeImputer", + "KNNImputer", + "MissingIndicator", + "SimpleImputer", + ], + }, + ], + }, + "sklearn.inspection": { + "short_summary": "Inspection.", + "description": _get_guide("inspection"), + "sections": [ + { + "title": None, + "autosummary": ["partial_dependence", "permutation_importance"], + }, + { + "title": "Plotting", + "autosummary": ["DecisionBoundaryDisplay", "PartialDependenceDisplay"], + }, + ], + }, + "sklearn.isotonic": { + "short_summary": "Isotonic regression.", + "description": _get_guide("isotonic"), + "sections": [ + { + "title": None, + "autosummary": [ + "IsotonicRegression", + "check_increasing", + "isotonic_regression", + ], + }, + ], + }, + "sklearn.kernel_approximation": { + "short_summary": "Kernel approximation.", + "description": _get_guide("kernel_approximation"), + "sections": [ + { + "title": None, + "autosummary": [ + "AdditiveChi2Sampler", + "Nystroem", + "PolynomialCountSketch", + "RBFSampler", + "SkewedChi2Sampler", + ], + }, + ], + }, + "sklearn.kernel_ridge": { + "short_summary": "Kernel ridge regression.", + "description": _get_guide("kernel_ridge"), + "sections": [ + { + "title": None, + "autosummary": ["KernelRidge"], + }, + ], + }, + "sklearn.linear_model": { + "short_summary": "Generalized linear models.", + "description": ( + _get_guide("linear_model") + + "\n\nThe following subsections are only rough guidelines: the same " + "estimator can fall into multiple categories, depending on its parameters." 
+ ), + "sections": [ + { + "title": "Linear classifiers", + "autosummary": [ + "LogisticRegression", + "LogisticRegressionCV", + "PassiveAggressiveClassifier", + "Perceptron", + "RidgeClassifier", + "RidgeClassifierCV", + "SGDClassifier", + "SGDOneClassSVM", + ], + }, + { + "title": "Classical linear regressors", + "autosummary": ["LinearRegression", "Ridge", "RidgeCV", "SGDRegressor"], + }, + { + "title": "Regressors with variable selection", + "description": ( + "The following estimators have built-in variable selection fitting " + "procedures, but any estimator using a L1 or elastic-net penalty " + "also performs variable selection: typically " + ":class:`~linear_model.SGDRegressor` or " + ":class:`~sklearn.linear_model.SGDClassifier` with an appropriate " + "penalty." + ), + "autosummary": [ + "ElasticNet", + "ElasticNetCV", + "Lars", + "LarsCV", + "Lasso", + "LassoCV", + "LassoLars", + "LassoLarsCV", + "LassoLarsIC", + "OrthogonalMatchingPursuit", + "OrthogonalMatchingPursuitCV", + ], + }, + { + "title": "Bayesian regressors", + "autosummary": ["ARDRegression", "BayesianRidge"], + }, + { + "title": "Multi-task linear regressors with variable selection", + "description": ( + "These estimators fit multiple regression problems (or tasks)" + " jointly, while inducing sparse coefficients. While the inferred" + " coefficients may differ between the tasks, they are constrained" + " to agree on the features that are selected (non-zero" + " coefficients)." + ), + "autosummary": [ + "MultiTaskElasticNet", + "MultiTaskElasticNetCV", + "MultiTaskLasso", + "MultiTaskLassoCV", + ], + }, + { + "title": "Outlier-robust regressors", + "description": ( + "Any estimator using the Huber loss would also be robust to " + "outliers, e.g., :class:`~linear_model.SGDRegressor` with " + "``loss='huber'``." + ), + "autosummary": [ + "HuberRegressor", + "QuantileRegressor", + "RANSACRegressor", + "TheilSenRegressor", + ], + }, + { + "title": "Generalized linear models (GLM) for regression", + "description": ( + "These models allow for response variables to have error " + "distributions other than a normal distribution." 
+ ), + "autosummary": [ + "GammaRegressor", + "PoissonRegressor", + "TweedieRegressor", + ], + }, + { + "title": "Miscellaneous", + "autosummary": [ + "PassiveAggressiveRegressor", + "enet_path", + "lars_path", + "lars_path_gram", + "lasso_path", + "orthogonal_mp", + "orthogonal_mp_gram", + "ridge_regression", + ], + }, + ], + }, + "sklearn.manifold": { + "short_summary": "Manifold learning.", + "description": _get_guide("manifold"), + "sections": [ + { + "title": None, + "autosummary": [ + "Isomap", + "LocallyLinearEmbedding", + "MDS", + "SpectralEmbedding", + "TSNE", + "locally_linear_embedding", + "smacof", + "spectral_embedding", + "trustworthiness", + ], + }, + ], + }, + "sklearn.metrics": { + "short_summary": "Metrics.", + "description": _get_guide("model_evaluation", "metrics"), + "sections": [ + { + "title": "Model selection interface", + "description": _get_guide("scoring_parameter"), + "autosummary": [ + "check_scoring", + "get_scorer", + "get_scorer_names", + "make_scorer", + ], + }, + { + "title": "Classification metrics", + "description": _get_guide("classification_metrics"), + "autosummary": [ + "accuracy_score", + "auc", + "average_precision_score", + "balanced_accuracy_score", + "brier_score_loss", + "class_likelihood_ratios", + "classification_report", + "cohen_kappa_score", + "confusion_matrix", + "d2_log_loss_score", + "dcg_score", + "det_curve", + "f1_score", + "fbeta_score", + "hamming_loss", + "hinge_loss", + "jaccard_score", + "log_loss", + "matthews_corrcoef", + "multilabel_confusion_matrix", + "ndcg_score", + "precision_recall_curve", + "precision_recall_fscore_support", + "precision_score", + "recall_score", + "roc_auc_score", + "roc_curve", + "top_k_accuracy_score", + "zero_one_loss", + ], + }, + { + "title": "Regression metrics", + "description": _get_guide("regression_metrics"), + "autosummary": [ + "d2_absolute_error_score", + "d2_pinball_score", + "d2_tweedie_score", + "explained_variance_score", + "max_error", + "mean_absolute_error", + "mean_absolute_percentage_error", + "mean_gamma_deviance", + "mean_pinball_loss", + "mean_poisson_deviance", + "mean_squared_error", + "mean_squared_log_error", + "mean_tweedie_deviance", + "median_absolute_error", + "r2_score", + "root_mean_squared_error", + "root_mean_squared_log_error", + ], + }, + { + "title": "Multilabel ranking metrics", + "description": _get_guide("multilabel_ranking_metrics"), + "autosummary": [ + "coverage_error", + "label_ranking_average_precision_score", + "label_ranking_loss", + ], + }, + { + "title": "Clustering metrics", + "description": ( + _get_submodule("sklearn.metrics", "cluster") + + "\n\n" + + _get_guide("clustering_evaluation") + ), + "autosummary": [ + "adjusted_mutual_info_score", + "adjusted_rand_score", + "calinski_harabasz_score", + "cluster.contingency_matrix", + "cluster.pair_confusion_matrix", + "completeness_score", + "davies_bouldin_score", + "fowlkes_mallows_score", + "homogeneity_completeness_v_measure", + "homogeneity_score", + "mutual_info_score", + "normalized_mutual_info_score", + "rand_score", + "silhouette_samples", + "silhouette_score", + "v_measure_score", + ], + }, + { + "title": "Biclustering metrics", + "description": _get_guide("biclustering_evaluation"), + "autosummary": ["consensus_score"], + }, + { + "title": "Distance metrics", + "autosummary": ["DistanceMetric"], + }, + { + "title": "Pairwise metrics", + "description": ( + _get_submodule("sklearn.metrics", "pairwise") + + "\n\n" + + _get_guide("metrics") + ), + "autosummary": [ + 
"pairwise.additive_chi2_kernel", + "pairwise.chi2_kernel", + "pairwise.cosine_distances", + "pairwise.cosine_similarity", + "pairwise.distance_metrics", + "pairwise.euclidean_distances", + "pairwise.haversine_distances", + "pairwise.kernel_metrics", + "pairwise.laplacian_kernel", + "pairwise.linear_kernel", + "pairwise.manhattan_distances", + "pairwise.nan_euclidean_distances", + "pairwise.paired_cosine_distances", + "pairwise.paired_distances", + "pairwise.paired_euclidean_distances", + "pairwise.paired_manhattan_distances", + "pairwise.pairwise_kernels", + "pairwise.polynomial_kernel", + "pairwise.rbf_kernel", + "pairwise.sigmoid_kernel", + "pairwise_distances", + "pairwise_distances_argmin", + "pairwise_distances_argmin_min", + "pairwise_distances_chunked", + ], + }, + { + "title": "Plotting", + "description": _get_guide("visualizations"), + "autosummary": [ + "ConfusionMatrixDisplay", + "DetCurveDisplay", + "PrecisionRecallDisplay", + "PredictionErrorDisplay", + "RocCurveDisplay", + ], + }, + ], + }, + "sklearn.mixture": { + "short_summary": "Gaussian mixture models.", + "description": _get_guide("mixture"), + "sections": [ + { + "title": None, + "autosummary": ["BayesianGaussianMixture", "GaussianMixture"], + }, + ], + }, + "sklearn.model_selection": { + "short_summary": "Model selection.", + "description": _get_guide("cross_validation", "grid_search", "learning_curve"), + "sections": [ + { + "title": "Splitters", + "autosummary": [ + "GroupKFold", + "GroupShuffleSplit", + "KFold", + "LeaveOneGroupOut", + "LeaveOneOut", + "LeavePGroupsOut", + "LeavePOut", + "PredefinedSplit", + "RepeatedKFold", + "RepeatedStratifiedKFold", + "ShuffleSplit", + "StratifiedGroupKFold", + "StratifiedKFold", + "StratifiedShuffleSplit", + "TimeSeriesSplit", + "check_cv", + "train_test_split", + ], + }, + { + "title": "Hyper-parameter optimizers", + "autosummary": [ + "GridSearchCV", + "HalvingGridSearchCV", + "HalvingRandomSearchCV", + "ParameterGrid", + "ParameterSampler", + "RandomizedSearchCV", + ], + }, + { + "title": "Post-fit model tuning", + "autosummary": [ + "FixedThresholdClassifier", + "TunedThresholdClassifierCV", + ], + }, + { + "title": "Model validation", + "autosummary": [ + "cross_val_predict", + "cross_val_score", + "cross_validate", + "learning_curve", + "permutation_test_score", + "validation_curve", + ], + }, + { + "title": "Visualization", + "autosummary": ["LearningCurveDisplay", "ValidationCurveDisplay"], + }, + ], + }, + "sklearn.multiclass": { + "short_summary": "Multiclass classification.", + "description": _get_guide("multiclass_classification"), + "sections": [ + { + "title": None, + "autosummary": [ + "OneVsOneClassifier", + "OneVsRestClassifier", + "OutputCodeClassifier", + ], + }, + ], + }, + "sklearn.multioutput": { + "short_summary": "Multioutput regression and classification.", + "description": _get_guide( + "multilabel_classification", + "multiclass_multioutput_classification", + "multioutput_regression", + ), + "sections": [ + { + "title": None, + "autosummary": [ + "ClassifierChain", + "MultiOutputClassifier", + "MultiOutputRegressor", + "RegressorChain", + ], + }, + ], + }, + "sklearn.naive_bayes": { + "short_summary": "Naive Bayes.", + "description": _get_guide("naive_bayes"), + "sections": [ + { + "title": None, + "autosummary": [ + "BernoulliNB", + "CategoricalNB", + "ComplementNB", + "GaussianNB", + "MultinomialNB", + ], + }, + ], + }, + "sklearn.neighbors": { + "short_summary": "Nearest neighbors.", + "description": _get_guide("neighbors"), + "sections": [ + { + 
"title": None, + "autosummary": [ + "BallTree", + "KDTree", + "KNeighborsClassifier", + "KNeighborsRegressor", + "KNeighborsTransformer", + "KernelDensity", + "LocalOutlierFactor", + "NearestCentroid", + "NearestNeighbors", + "NeighborhoodComponentsAnalysis", + "RadiusNeighborsClassifier", + "RadiusNeighborsRegressor", + "RadiusNeighborsTransformer", + "kneighbors_graph", + "radius_neighbors_graph", + "sort_graph_by_row_values", + ], + }, + ], + }, + "sklearn.neural_network": { + "short_summary": "Neural network models.", + "description": _get_guide( + "neural_networks_supervised", "neural_networks_unsupervised" + ), + "sections": [ + { + "title": None, + "autosummary": ["BernoulliRBM", "MLPClassifier", "MLPRegressor"], + }, + ], + }, + "sklearn.pipeline": { + "short_summary": "Pipeline.", + "description": _get_guide("combining_estimators"), + "sections": [ + { + "title": None, + "autosummary": [ + "FeatureUnion", + "Pipeline", + "make_pipeline", + "make_union", + ], + }, + ], + }, + "sklearn.preprocessing": { + "short_summary": "Preprocessing and normalization.", + "description": _get_guide("preprocessing"), + "sections": [ + { + "title": None, + "autosummary": [ + "Binarizer", + "FunctionTransformer", + "KBinsDiscretizer", + "KernelCenterer", + "LabelBinarizer", + "LabelEncoder", + "MaxAbsScaler", + "MinMaxScaler", + "MultiLabelBinarizer", + "Normalizer", + "OneHotEncoder", + "OrdinalEncoder", + "PolynomialFeatures", + "PowerTransformer", + "QuantileTransformer", + "RobustScaler", + "SplineTransformer", + "StandardScaler", + "TargetEncoder", + "add_dummy_feature", + "binarize", + "label_binarize", + "maxabs_scale", + "minmax_scale", + "normalize", + "power_transform", + "quantile_transform", + "robust_scale", + "scale", + ], + }, + ], + }, + "sklearn.random_projection": { + "short_summary": "Random projection.", + "description": _get_guide("random_projection"), + "sections": [ + { + "title": None, + "autosummary": [ + "GaussianRandomProjection", + "SparseRandomProjection", + "johnson_lindenstrauss_min_dim", + ], + }, + ], + }, + "sklearn.semi_supervised": { + "short_summary": "Semi-supervised learning.", + "description": _get_guide("semi_supervised"), + "sections": [ + { + "title": None, + "autosummary": [ + "LabelPropagation", + "LabelSpreading", + "SelfTrainingClassifier", + ], + }, + ], + }, + "sklearn.svm": { + "short_summary": "Support vector machines.", + "description": _get_guide("svm"), + "sections": [ + { + "title": None, + "autosummary": [ + "LinearSVC", + "LinearSVR", + "NuSVC", + "NuSVR", + "OneClassSVM", + "SVC", + "SVR", + "l1_min_c", + ], + }, + ], + }, + "sklearn.tree": { + "short_summary": "Decision trees.", + "description": _get_guide("tree"), + "sections": [ + { + "title": None, + "autosummary": [ + "DecisionTreeClassifier", + "DecisionTreeRegressor", + "ExtraTreeClassifier", + "ExtraTreeRegressor", + ], + }, + { + "title": "Exporting", + "autosummary": ["export_graphviz", "export_text"], + }, + { + "title": "Plotting", + "autosummary": ["plot_tree"], + }, + ], + }, + "sklearn.utils": { + "short_summary": "Utilities.", + "description": _get_guide("developers-utils", is_developer=True), + "sections": [ + { + "title": None, + "autosummary": [ + "Bunch", + "_safe_indexing", + "as_float_array", + "assert_all_finite", + "deprecated", + "estimator_html_repr", + "gen_batches", + "gen_even_slices", + "indexable", + "murmurhash3_32", + "resample", + "safe_mask", + "safe_sqr", + "shuffle", + "Tags", + "InputTags", + "TargetTags", + "ClassifierTags", + "RegressorTags", + 
"TransformerTags", + "get_tags", + ], + }, + { + "title": "Input and parameter validation", + "description": _get_submodule("sklearn.utils", "validation"), + "autosummary": [ + "check_X_y", + "check_array", + "check_consistent_length", + "check_random_state", + "check_scalar", + "validation.check_is_fitted", + "validation.check_memory", + "validation.check_symmetric", + "validation.column_or_1d", + "validation.has_fit_parameter", + "validation.validate_data", + ], + }, + { + "title": "Meta-estimators", + "description": _get_submodule("sklearn.utils", "metaestimators"), + "autosummary": ["metaestimators.available_if"], + }, + { + "title": "Weight handling based on class labels", + "description": _get_submodule("sklearn.utils", "class_weight"), + "autosummary": [ + "class_weight.compute_class_weight", + "class_weight.compute_sample_weight", + ], + }, + { + "title": "Dealing with multiclass target in classifiers", + "description": _get_submodule("sklearn.utils", "multiclass"), + "autosummary": [ + "multiclass.is_multilabel", + "multiclass.type_of_target", + "multiclass.unique_labels", + ], + }, + { + "title": "Optimal mathematical operations", + "description": _get_submodule("sklearn.utils", "extmath"), + "autosummary": [ + "extmath.density", + "extmath.fast_logdet", + "extmath.randomized_range_finder", + "extmath.randomized_svd", + "extmath.safe_sparse_dot", + "extmath.weighted_mode", + ], + }, + { + "title": "Working with sparse matrices and arrays", + "description": _get_submodule("sklearn.utils", "sparsefuncs"), + "autosummary": [ + "sparsefuncs.incr_mean_variance_axis", + "sparsefuncs.inplace_column_scale", + "sparsefuncs.inplace_csr_column_scale", + "sparsefuncs.inplace_row_scale", + "sparsefuncs.inplace_swap_column", + "sparsefuncs.inplace_swap_row", + "sparsefuncs.mean_variance_axis", + ], + }, + { + "title": None, + "description": _get_submodule("sklearn.utils", "sparsefuncs_fast"), + "autosummary": [ + "sparsefuncs_fast.inplace_csr_row_normalize_l1", + "sparsefuncs_fast.inplace_csr_row_normalize_l2", + ], + }, + { + "title": "Working with graphs", + "description": _get_submodule("sklearn.utils", "graph"), + "autosummary": ["graph.single_source_shortest_path_length"], + }, + { + "title": "Random sampling", + "description": _get_submodule("sklearn.utils", "random"), + "autosummary": ["random.sample_without_replacement"], + }, + { + "title": "Auxiliary functions that operate on arrays", + "description": _get_submodule("sklearn.utils", "arrayfuncs"), + "autosummary": ["arrayfuncs.min_pos"], + }, + { + "title": "Metadata routing", + "description": ( + _get_submodule("sklearn.utils", "metadata_routing") + + "\n\n" + + _get_guide("metadata_routing") + ), + "autosummary": [ + "metadata_routing.MetadataRequest", + "metadata_routing.MetadataRouter", + "metadata_routing.MethodMapping", + "metadata_routing.get_routing_for_object", + "metadata_routing.process_routing", + ], + }, + { + "title": "Discovering scikit-learn objects", + "description": _get_submodule("sklearn.utils", "discovery"), + "autosummary": [ + "discovery.all_displays", + "discovery.all_estimators", + "discovery.all_functions", + ], + }, + { + "title": "API compatibility checkers", + "description": _get_submodule("sklearn.utils", "estimator_checks"), + "autosummary": [ + "estimator_checks.check_estimator", + "estimator_checks.parametrize_with_checks", + "estimator_checks.estimator_checks_generator", + ], + }, + { + "title": "Parallel computing", + "description": _get_submodule("sklearn.utils", "parallel"), + "autosummary": [ + 
"parallel.Parallel", + "parallel.delayed", + ], + }, + ], + }, +} + + +""" +CONFIGURING DEPRECATED_API_REFERENCE +==================================== + +DEPRECATED_API_REFERENCE maps each deprecation target version to a corresponding +autosummary block. It will be placed at the bottom of the API index page under the +"Recently deprecated" section. Essentially, the rendered section would look like the +following: + +|------------------------------------------| +| To be removed in {{ version_1 }} | +| -------------------------------- | +| {{ autosummary_1 }} | +| | +| To be removed in {{ version_2 }} | +| -------------------------------- | +| {{ autosummary_2 }} | +| | +| More versions... | +|------------------------------------------| + +Note that the autosummary here assumes that the current module is `sklearn`, i.e., if +`sklearn.utils.Memory` is deprecated, one should put `utils.Memory` in the "entries" +slot of the autosummary block. + +Example: + +DEPRECATED_API_REFERENCE = { + "0.24": [ + "model_selection.fit_grid_point", + "utils.safe_indexing", + ], +} +""" + +DEPRECATED_API_REFERENCE = {} # type: ignore[var-annotated] diff --git a/doc/authors.rst b/doc/authors.rst deleted file mode 100644 index 861f2c0a884e4..0000000000000 --- a/doc/authors.rst +++ /dev/null @@ -1,92 +0,0 @@ -.. raw :: html - - -
-   JÊrÊmie du Boisberranger
-   Joris Van den Bossche
-   Loïc Estève
-   Thomas J. Fan
-   Alexandre Gramfort
-   Olivier Grisel
-   Yaroslav Halchenko
-   Nicolas Hug
-   Adrin Jalali
-   Guillaume Lemaitre
-   Christian Lorentzen
-   Jan Hendrik Metzen
-   Andreas Mueller
-   Vlad Niculae
-   Joel Nothman
-   Hanmin Qin
-   Bertrand Thirion
-   Tom DuprÊ la Tour
-   Gael Varoquaux
-   Nelle Varoquaux
-   Roman Yurchak
\ No newline at end of file diff --git a/doc/binder/requirements.txt b/doc/binder/requirements.txt index 38619ceae0bc2..92bee596d18ce 100644 --- a/doc/binder/requirements.txt +++ b/doc/binder/requirements.txt @@ -1,5 +1,5 @@ -# A binder requirement file is required by sphinx-gallery. We don't really need -# one since the binder requirement files live in the -# scikit-learn/binder-examples repo and not in the scikit-learn.github.io repo -# that comes from the scikit-learn doc build. This file can be removed if -# 'dependencies' is made an optional key for binder in sphinx-gallery. +# A binder requirement file is required by sphinx-gallery. +# We don't really need one since our binder requirement file lives in the +# .binder directory. +# This file can be removed if 'dependencies' is made an optional key for +# binder in sphinx-gallery. diff --git a/doc/common_pitfalls.rst b/doc/common_pitfalls.rst index 6bc79fbc14c0d..129f9b3990fd5 100644 --- a/doc/common_pitfalls.rst +++ b/doc/common_pitfalls.rst @@ -1,9 +1,3 @@ -.. Places parent toc into the sidebar - -:parenttoc: True - -.. include:: includes/big_toc_css.rst - .. _common_pitfalls: ========================================= @@ -104,6 +98,26 @@ be the average of the train subset, **not** the average of all the data. If the test subset is included in the average calculation, information from the test subset is influencing the model. +How to avoid data leakage +------------------------- + +Below are some tips on avoiding data leakage: + +* Always split the data into train and test subsets first, particularly + before any preprocessing steps. +* Never include test data when using the `fit` and `fit_transform` + methods. Using all the data, e.g., `fit(X)`, can result in overly optimistic + scores. + + Conversely, the `transform` method should be used on both train and test + subsets as the same preprocessing should be applied to all the data. + This can be achieved by using `fit_transform` on the train subset and + `transform` on the test subset. +* The scikit-learn :ref:`pipeline ` is a great way to prevent data + leakage as it ensures that the appropriate method is performed on the + correct data subset. The pipeline is ideal for use in cross-validation + and hyper-parameter tuning functions. + An example of data leakage during preprocessing is detailed below. Data leakage during pre-processing @@ -146,7 +160,7 @@ much higher than expected accuracy score:: >>> from sklearn.model_selection import train_test_split >>> from sklearn.feature_selection import SelectKBest - >>> from sklearn.ensemble import GradientBoostingClassifier + >>> from sklearn.ensemble import HistGradientBoostingClassifier >>> from sklearn.metrics import accuracy_score >>> # Incorrect preprocessing: the entire data is transformed @@ -154,9 +168,9 @@ much higher than expected accuracy score:: >>> X_train, X_test, y_train, y_test = train_test_split( ... 
X_selected, y, random_state=42) - >>> gbc = GradientBoostingClassifier(random_state=1) + >>> gbc = HistGradientBoostingClassifier(random_state=1) >>> gbc.fit(X_train, y_train) - GradientBoostingClassifier(random_state=1) + HistGradientBoostingClassifier(random_state=1) >>> y_pred = gbc.predict(X_test) >>> accuracy_score(y_test, y_pred) @@ -175,14 +189,14 @@ data, close to chance:: >>> select = SelectKBest(k=25) >>> X_train_selected = select.fit_transform(X_train, y_train) - >>> gbc = GradientBoostingClassifier(random_state=1) + >>> gbc = HistGradientBoostingClassifier(random_state=1) >>> gbc.fit(X_train_selected, y_train) - GradientBoostingClassifier(random_state=1) + HistGradientBoostingClassifier(random_state=1) >>> X_test_selected = select.transform(X_test) >>> y_pred = gbc.predict(X_test_selected) >>> accuracy_score(y_test, y_pred) - 0.46 + 0.5 Here again, we recommend using a :class:`~sklearn.pipeline.Pipeline` to chain together the feature selection and model estimators. The pipeline ensures @@ -193,15 +207,15 @@ is used only for calculating the accuracy score:: >>> X_train, X_test, y_train, y_test = train_test_split( ... X, y, random_state=42) >>> pipeline = make_pipeline(SelectKBest(k=25), - ... GradientBoostingClassifier(random_state=1)) + ... HistGradientBoostingClassifier(random_state=1)) >>> pipeline.fit(X_train, y_train) Pipeline(steps=[('selectkbest', SelectKBest(k=25)), - ('gradientboostingclassifier', - GradientBoostingClassifier(random_state=1))]) + ('histgradientboostingclassifier', + HistGradientBoostingClassifier(random_state=1))]) >>> y_pred = pipeline.predict(X_test) >>> accuracy_score(y_test, y_pred) - 0.46 + 0.5 The pipeline can also be fed into a cross-validation function such as :func:`~sklearn.model_selection.cross_val_score`. @@ -211,27 +225,8 @@ method is used during fitting and predicting:: >>> from sklearn.model_selection import cross_val_score >>> scores = cross_val_score(pipeline, X, y) >>> print(f"Mean accuracy: {scores.mean():.2f}+/-{scores.std():.2f}") - Mean accuracy: 0.45+/-0.07 + Mean accuracy: 0.43+/-0.05 -How to avoid data leakage -------------------------- - -Below are some tips on avoiding data leakage: - -* Always split the data into train and test subsets first, particularly - before any preprocessing steps. -* Never include test data when using the `fit` and `fit_transform` - methods. Using all the data, e.g., `fit(X)`, can result in overly optimistic - scores. - - Conversely, the `transform` method should be used on both train and test - subsets as the same preprocessing should be applied to all the data. - This can be achieved by using `fit_transform` on the train subset and - `transform` on the test subset. -* The scikit-learn :ref:`pipeline ` is a great way to prevent data - leakage as it ensures that the appropriate method is performed on the - correct data subset. The pipeline is ideal for use in cross-validation - and hyper-parameter tuning functions. .. _randomness: @@ -243,7 +238,7 @@ Some scikit-learn objects are inherently random. These are usually estimators splitters (e.g. :class:`~sklearn.model_selection.KFold`). The randomness of these objects is controlled via their `random_state` parameter, as described in the :term:`Glossary `. This section expands on the glossary -entry, and describes good practices and common pitfalls w.r.t. to this +entry, and describes good practices and common pitfalls w.r.t. this subtle parameter. .. note:: Recommendation summary @@ -316,7 +311,7 @@ inter-dependent. 
For example, two estimators that share the same we discuss cloning. This point is important to keep in mind when debugging. If we had passed an integer to the `random_state` parameter of the -:class:`~sklearn.ensemble.RandomForestClassifier`, we would have obtained the +:class:`~sklearn.linear_model.SGDClassifier`, we would have obtained the same models, and thus the same scores each time. When we pass an integer, the same RNG is used across all calls to `fit`. What internally happens is that even though the RNG is consumed when `fit` is called, it is always reset to @@ -397,7 +392,7 @@ each case**: be the same across all folds. - Since `rf_inst` was passed a `RandomState` instance, each call to `fit` starts from a different RNG. As a result, the random subset of features - will be different for each folds. + will be different for each fold. While having a constant estimator RNG across folds isn't inherently wrong, we usually want CV results that are robust w.r.t. the estimator's randomness. As @@ -413,39 +408,40 @@ it will allow the estimator RNG to vary for each fold. illustration purpose: what matters is what we pass to the :class:`~sklearn.ensemble.RandomForestClassifier` estimator. -**Cloning** +.. dropdown:: Cloning -Another subtle side effect of passing `RandomState` instances is how -:func:`~sklearn.clone` will work:: + Another subtle side effect of passing `RandomState` instances is how + :func:`~sklearn.base.clone` will work:: - >>> from sklearn import clone - >>> from sklearn.ensemble import RandomForestClassifier - >>> import numpy as np + >>> from sklearn import clone + >>> from sklearn.ensemble import RandomForestClassifier + >>> import numpy as np + + >>> rng = np.random.RandomState(0) + >>> a = RandomForestClassifier(random_state=rng) + >>> b = clone(a) + + Since a `RandomState` instance was passed to `a`, `a` and `b` are not clones + in the strict sense, but rather clones in the statistical sense: `a` and `b` + will still be different models, even when calling `fit(X, y)` on the same + data. Moreover, `a` and `b` will influence each other since they share the + same internal RNG: calling `a.fit` will consume `b`'s RNG, and calling + `b.fit` will consume `a`'s RNG, since they are the same. This bit is true for + any estimators that share a `random_state` parameter; it is not specific to + clones. + + If an integer were passed, `a` and `b` would be exact clones and they would not + influence each other. + + .. warning:: + Even though :func:`~sklearn.base.clone` is rarely used in user code, it is + called pervasively throughout scikit-learn codebase: in particular, most + meta-estimators that accept non-fitted estimators call + :func:`~sklearn.base.clone` internally + (:class:`~sklearn.model_selection.GridSearchCV`, + :class:`~sklearn.ensemble.StackingClassifier`, + :class:`~sklearn.calibration.CalibratedClassifierCV`, etc.). - >>> rng = np.random.RandomState(0) - >>> a = RandomForestClassifier(random_state=rng) - >>> b = clone(a) - -Since a `RandomState` instance was passed to `a`, `a` and `b` are not clones -in the strict sense, but rather clones in the statistical sense: `a` and `b` -will still be different models, even when calling `fit(X, y)` on the same -data. Moreover, `a` and `b` will influence each-other since they share the -same internal RNG: calling `a.fit` will consume `b`'s RNG, and calling -`b.fit` will consume `a`'s RNG, since they are the same. This bit is true for -any estimators that share a `random_state` parameter; it is not specific to -clones. 
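A minimal sketch of the behaviour described above (the estimator and the synthetic dataset are illustrative choices, not taken from this diff): with an integer seed the RNG is re-derived at every call to `fit`, so refitting reproduces the same model, whereas a shared `RandomState` instance is consumed across calls and successive fits differ::

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import SGDClassifier

    X, y = make_classification(n_samples=200, random_state=0)

    # Integer seed: the RNG is re-created from the seed at each call to
    # `fit`, so both fits yield identical coefficients.
    clf = SGDClassifier(random_state=0)
    first = clf.fit(X, y).coef_.copy()
    second = clf.fit(X, y).coef_.copy()
    print(np.allclose(first, second))   # True

    # RandomState instance: each call to `fit` consumes the shared RNG,
    # so successive fits generally yield different coefficients.
    rng = np.random.RandomState(0)
    clf = SGDClassifier(random_state=rng)
    first = clf.fit(X, y).coef_.copy()
    second = clf.fit(X, y).coef_.copy()
    print(np.allclose(first, second))   # typically False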
- -If an integer were passed, `a` and `b` would be exact clones and they would not -influence each other. - -.. warning:: - Even though :func:`~sklearn.clone` is rarely used in user code, it is - called pervasively throughout scikit-learn codebase: in particular, most - meta-estimators that accept non-fitted estimators call - :func:`~sklearn.clone` internally - (:class:`~sklearn.model_selection.GridSearchCV`, - :class:`~sklearn.ensemble.StackingClassifier`, - :class:`~sklearn.calibration.CalibratedClassifierCV`, etc.). CV splitters ............ @@ -553,18 +549,18 @@ When we evaluate a randomized estimator performance by cross-validation, we want to make sure that the estimator can yield accurate predictions for new data, but we also want to make sure that the estimator is robust w.r.t. its random initialization. For example, we would like the random weights -initialization of a :class:`~sklearn.linear_model.SGDCLassifier` to be +initialization of an :class:`~sklearn.linear_model.SGDClassifier` to be consistently good across all folds: otherwise, when we train that estimator on new data, we might get unlucky and the random initialization may lead to -bad performance. Similarly, we want a random forest to be robust w.r.t the +bad performance. Similarly, we want a random forest to be robust w.r.t. the set of randomly selected features that each tree will be using. For these reasons, it is preferable to evaluate the cross-validation -preformance by letting the estimator use a different RNG on each fold. This +performance by letting the estimator use a different RNG on each fold. This is done by passing a `RandomState` instance (or `None`) to the estimator initialization. -When we pass an integer, the estimator will use the same RNG on each fold: if +When we pass an integer, the estimator will use the same RNG on each fold: if the estimator performs well (or bad), as evaluated by CV, it might just be because we got lucky (or unlucky) with that specific seed. Passing instances leads to more robust CV results, and makes the comparison between various diff --git a/doc/communication_team.rst b/doc/communication_team.rst new file mode 100644 index 0000000000000..fb9666f0b42f7 --- /dev/null +++ b/doc/communication_team.rst @@ -0,0 +1,16 @@ +.. raw :: html + + +
+    <!-- markup for the communication team member cards -->
+    <p>Lauren Burke-McCarthy</p>
+    <p>François Goupil</p>
diff --git a/doc/communication_team_emeritus.rst b/doc/communication_team_emeritus.rst new file mode 100644 index 0000000000000..d5ef7df59238e --- /dev/null +++ b/doc/communication_team_emeritus.rst @@ -0,0 +1 @@ +- Reshama Shaikh diff --git a/doc/computing.rst b/doc/computing.rst index 6732b754918b0..9f166432006b2 100644 --- a/doc/computing.rst +++ b/doc/computing.rst @@ -1,13 +1,7 @@ -.. Places parent toc into the sidebar - -:parenttoc: True - ============================ Computing with scikit-learn ============================ -.. include:: includes/big_toc_css.rst - .. toctree:: :maxdepth: 2 diff --git a/doc/computing/computational_performance.rst b/doc/computing/computational_performance.rst index 32a485e21a2a5..4af79206dae1c 100644 --- a/doc/computing/computational_performance.rst +++ b/doc/computing/computational_performance.rst @@ -1,7 +1,3 @@ -.. Places parent toc into the sidebar - -:parenttoc: True - .. _computational_performance: .. currentmodule:: sklearn @@ -19,9 +15,9 @@ scikit-learn estimators in different contexts and provide some tips and tricks for overcoming performance bottlenecks. Prediction latency is measured as the elapsed time necessary to make a -prediction (e.g. in micro-seconds). Latency is often viewed as a distribution +prediction (e.g. in microseconds). Latency is often viewed as a distribution and operations engineers often focus on the latency at a given percentile of -this distribution (e.g. the 90 percentile). +this distribution (e.g. the 90th percentile). Prediction throughput is defined as the number of predictions the software can deliver in a given amount of time (e.g. in predictions per second). @@ -34,15 +30,16 @@ to take into account the same exact properties of the data as more complex ones. Prediction Latency ------------------ -One of the most straight-forward concerns one may have when using/choosing a +One of the most straightforward concerns one may have when using/choosing a machine learning toolkit is the latency at which predictions can be made in a production environment. The main factors that influence the prediction latency are - 1. Number of features - 2. Input data representation and sparsity - 3. Model complexity - 4. Feature extraction + +1. Number of features +2. Input data representation and sparsity +3. Model complexity +4. Feature extraction A last major parameter is also the possibility to do predictions in bulk or one-at-a-time mode. @@ -128,7 +125,7 @@ by quite a bit as only the non zero valued features impact the dot product and thus the model predictions. Hence if you have 100 non zeros in 1e6 dimensional space, you only need 100 multiply and add operation instead of 1e6. -Calculation over a dense representation, however, may leverage highly optimised +Calculation over a dense representation, however, may leverage highly optimized vector operations and multithreading in BLAS, and tends to result in fewer CPU cache misses. So the sparsity should typically be quite high (10% non-zeros max, to be checked depending on the hardware) for the sparse input @@ -195,7 +192,7 @@ support vectors. .. centered:: |nusvr_model_complexity| For :mod:`sklearn.ensemble` of trees (e.g. RandomForest, GBT, -ExtraTrees etc) the number of trees and their depth play the most +ExtraTrees, etc.) the number of trees and their depth play the most important role. Latency and throughput should scale linearly with the number of trees. In this case we used directly the ``n_estimators`` parameter of :class:`~ensemble.GradientBoostingRegressor`. 
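To make the latency discussion above concrete, here is a small timing sketch (not part of the patch; the synthetic data, the number of timed points and the `n_estimators` grid are arbitrary) estimating the 90th-percentile atomic prediction latency of a :class:`~ensemble.GradientBoostingRegressor` for a few ensemble sizes::

    import time
    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.ensemble import GradientBoostingRegressor

    X, y = make_regression(n_samples=1000, n_features=20, random_state=0)
    X_test = X[:100]

    for n_estimators in (10, 100, 500):
        model = GradientBoostingRegressor(
            n_estimators=n_estimators, random_state=0
        ).fit(X, y)
        # One-at-a-time (atomic) prediction latency, in microseconds.
        timings_us = []
        for x in X_test:
            tic = time.perf_counter()
            model.predict(x.reshape(1, -1))
            timings_us.append((time.perf_counter() - tic) * 1e6)
        print(f"n_estimators={n_estimators:4d}: "
              f"p90 latency = {np.percentile(timings_us, 90):.0f} us")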
@@ -224,9 +221,9 @@ files, tokenizing the text and hashing it into a common vector space) is taking 100 to 500 times more time than the actual prediction code, depending on the chosen model. - .. |prediction_time| image:: ../auto_examples/applications/images/sphx_glr_plot_out_of_core_classification_004.png - :target: ../auto_examples/applications/plot_out_of_core_classification.html - :scale: 80 +.. |prediction_time| image:: ../auto_examples/applications/images/sphx_glr_plot_out_of_core_classification_004.png + :target: ../auto_examples/applications/plot_out_of_core_classification.html + :scale: 80 .. centered:: |prediction_time| @@ -278,21 +275,20 @@ BLAS implementation and lead to orders of magnitude speedup over a non-optimized BLAS. You can display the BLAS / LAPACK implementation used by your NumPy / SciPy / -scikit-learn install with the following commands:: +scikit-learn install with the following command:: - from numpy.distutils.system_info import get_info - print(get_info('blas_opt')) - print(get_info('lapack_opt')) + python -c "import sklearn; sklearn.show_versions()" Optimized BLAS / LAPACK implementations include: - - Atlas (need hardware specific tuning by rebuilding on the target machine) - - OpenBLAS - - MKL - - Apple Accelerate and vecLib frameworks (OSX only) -More information can be found on the `Scipy install page `_ +- Atlas (need hardware specific tuning by rebuilding on the target machine) +- OpenBLAS +- MKL +- Apple Accelerate and vecLib frameworks (OSX only) + +More information can be found on the `NumPy install page `_ and in this -`blog post `_ +`blog post `_ from Daniel Nouri which has some nice step by step install instructions for Debian / Ubuntu. @@ -356,7 +352,7 @@ feature selection components in a pipeline once we know which features to keep from a previous run. Finally, it can help reduce processing time and I/O usage upstream in the data access and feature extraction layers by not collecting and building features that are discarded by the model. For instance -if the raw data come from a database, it can make it possible to write simpler +if the raw data come from a database, it is possible to write simpler and faster queries or reduce I/O usage by making the queries return lighter records. At the moment, reshaping needs to be performed manually in scikit-learn. @@ -366,5 +362,5 @@ sufficient to not generate the relevant features, leaving their columns empty. Links ...... - - :ref:`scikit-learn developer performance documentation ` - - `Scipy sparse matrix formats documentation `_ +- :ref:`scikit-learn developer performance documentation ` +- `Scipy sparse matrix formats documentation `_ diff --git a/doc/computing/parallelism.rst b/doc/computing/parallelism.rst index 8605650e8eec5..d2ff106aec3be 100644 --- a/doc/computing/parallelism.rst +++ b/doc/computing/parallelism.rst @@ -1,7 +1,3 @@ -.. Places parent toc into the sidebar - -:parenttoc: True - Parallelism, resource management, and configuration =================================================== @@ -10,22 +6,29 @@ Parallelism, resource management, and configuration Parallelism ----------- -Some scikit-learn estimators and utilities can parallelize costly operations -using multiple CPU cores, thanks to the following components: +Some scikit-learn estimators and utilities parallelize costly operations +using multiple CPU cores. + +Depending on the type of estimator and sometimes the values of the +constructor parameters, this is either done: -- via the `joblib `_ library. 
In - this case the number of threads or processes can be controlled with the - ``n_jobs`` parameter. -- via OpenMP, used in C or Cython code. +- with higher-level parallelism via `joblib `_. +- with lower-level parallelism via OpenMP, used in C or Cython code. +- with lower-level parallelism via BLAS, used by NumPy and SciPy for generic operations + on arrays. -In addition, some of the numpy routines that are used internally by -scikit-learn may also be parallelized if numpy is installed with specific -numerical libraries such as MKL, OpenBLAS, or BLIS. +The `n_jobs` parameters of estimators always controls the amount of parallelism +managed by joblib (processes or threads depending on the joblib backend). +The thread-level parallelism managed by OpenMP in scikit-learn's own Cython code +or by BLAS & LAPACK libraries used by NumPy and SciPy operations used in scikit-learn +is always controlled by environment variables or `threadpoolctl` as explained below. +Note that some estimators can leverage all three kinds of parallelism at different +points of their training and prediction methods. -We describe these 3 scenarios in the following subsections. +We describe these 3 types of parallelism in the following subsections in more details. -Joblib-based parallelism -........................ +Higher-level parallelism with joblib +.................................... When the underlying implementation uses joblib, the number of workers (threads or processes) that are spawned in parallel can be controlled via the @@ -33,15 +36,16 @@ When the underlying implementation uses joblib, the number of workers .. note:: - Where (and how) parallelization happens in the estimators is currently - poorly documented. Please help us by improving our docs and tackle `issue - 14228 `_! + Where (and how) parallelization happens in the estimators using joblib by + specifying `n_jobs` is currently poorly documented. + Please help us by improving our docs and tackle `issue 14228 + `_! Joblib is able to support both multi-processing and multi-threading. Whether joblib chooses to spawn a thread or a process depends on the **backend** that it's using. -Scikit-learn generally relies on the ``loky`` backend, which is joblib's +scikit-learn generally relies on the ``loky`` backend, which is joblib's default backend. Loky is a multi-processing backend. When doing multi-processing, in order to avoid duplicating the memory in each process (which isn't reasonable with big datasets), joblib will create a `memmap @@ -68,44 +72,63 @@ In practice, whether parallelism is helpful at improving runtime depends on many factors. It is usually a good idea to experiment rather than assuming that increasing the number of workers is always a good thing. In some cases it can be highly detrimental to performance to run multiple copies of some -estimators or functions in parallel (see oversubscription below). +estimators or functions in parallel (see :ref:`oversubscription` below). -OpenMP-based parallelism -........................ +Lower-level parallelism with OpenMP +................................... OpenMP is used to parallelize code written in Cython or C, relying on -multi-threading exclusively. By default (and unless joblib is trying to -avoid oversubscription), the implementation will use as many threads as -possible. +multi-threading exclusively. By default, the implementations using OpenMP +will use as many threads as possible, i.e. as many threads as logical cores. 
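As a complement to the thread-control mechanisms discussed in this hunk, a short sketch of how `threadpoolctl` (which the updated text relies on) can inspect and temporarily cap the OpenMP and BLAS thread pools; the limit of 2 threads and the matrix size are arbitrary choices for the example::

    import numpy as np
    from threadpoolctl import threadpool_info, threadpool_limits

    # List the OpenMP / BLAS thread pools loaded in the current process.
    for pool in threadpool_info():
        print(pool["user_api"], pool["num_threads"], pool["filepath"])

    # Temporarily cap all native thread pools (OpenMP and BLAS) to 2 threads.
    with threadpool_limits(limits=2):
        a = np.random.rand(2000, 2000)
        _ = a @ a   # BLAS-backed matmul now uses at most 2 threads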
-You can control the exact number of threads that are used via the -``OMP_NUM_THREADS`` environment variable: +You can control the exact number of threads that are used either: -.. prompt:: bash $ +- via the ``OMP_NUM_THREADS`` environment variable, for instance when: + running a python script: + + .. prompt:: bash $ - OMP_NUM_THREADS=4 python my_script.py + OMP_NUM_THREADS=4 python my_script.py -Parallel Numpy routines from numerical libraries -................................................ +- or via `threadpoolctl` as explained by `this piece of documentation + `_. -Scikit-learn relies heavily on NumPy and SciPy, which internally call -multi-threaded linear algebra routines implemented in libraries such as MKL, -OpenBLAS or BLIS. +Parallel NumPy and SciPy routines from numerical libraries +.......................................................... -The number of threads used by the OpenBLAS, MKL or BLIS libraries can be set -via the ``MKL_NUM_THREADS``, ``OPENBLAS_NUM_THREADS``, and -``BLIS_NUM_THREADS`` environment variables. +scikit-learn relies heavily on NumPy and SciPy, which internally call +multi-threaded linear algebra routines (BLAS & LAPACK) implemented in libraries +such as MKL, OpenBLAS or BLIS. + +You can control the exact number of threads used by BLAS for each library +using environment variables, namely: + +- ``MKL_NUM_THREADS`` sets the number of threads MKL uses, +- ``OPENBLAS_NUM_THREADS`` sets the number of threads OpenBLAS uses +- ``BLIS_NUM_THREADS`` sets the number of threads BLIS uses + +Note that BLAS & LAPACK implementations can also be impacted by +`OMP_NUM_THREADS`. To check whether this is the case in your environment, +you can inspect how the number of threads effectively used by those libraries +is affected when running the following command in a bash or zsh terminal +for different values of `OMP_NUM_THREADS`: + +.. prompt:: bash $ -Please note that scikit-learn has no direct control over these -implementations. Scikit-learn solely relies on Numpy and Scipy. + OMP_NUM_THREADS=2 python -m threadpoolctl -i numpy scipy .. note:: - At the time of writing (2019), NumPy and SciPy packages distributed on - pypi.org (used by ``pip``) and on the conda-forge channel are linked - with OpenBLAS, while conda packages shipped on the "defaults" channel - from anaconda.org are linked by default with MKL. + At the time of writing (2022), NumPy and SciPy packages which are + distributed on pypi.org (i.e. the ones installed via ``pip install``) + and on the conda-forge channel (i.e. the ones installed via + ``conda install --channel conda-forge``) are linked with OpenBLAS, while + NumPy and SciPy packages shipped on the ``defaults`` conda + channel from Anaconda.org (i.e. the ones installed via ``conda install``) + are linked by default with MKL. +.. _oversubscription: + Oversubscription: spawning too many threads ........................................... @@ -120,8 +143,8 @@ with ``n_jobs=8`` over a OpenMP). Each instance of :class:`~sklearn.ensemble.HistGradientBoostingClassifier` will spawn 8 threads (since you have 8 CPUs). That's a total of ``8 * 8 = 64`` threads, which -leads to oversubscription of physical CPU resources and to scheduling -overhead. +leads to oversubscription of threads for physical CPU resources and thus +to scheduling overhead. Oversubscription can arise in the exact same fashion with parallelized routines from MKL, OpenBLAS or BLIS that are nested in joblib calls. @@ -146,71 +169,170 @@ Note that: only use ``_NUM_THREADS``. 
Joblib exposes a context manager for finer control over the number of threads in its workers (see joblib docs linked below). -- Joblib is currently unable to avoid oversubscription in a - multi-threading context. It can only do so with the ``loky`` backend - (which spawns processes). +- When joblib is configured to use the ``threading`` backend, there is no + mechanism to avoid oversubscriptions when calling into parallel native + libraries in the joblib-managed threads. +- All scikit-learn estimators that explicitly rely on OpenMP in their Cython code + always use `threadpoolctl` internally to automatically adapt the numbers of + threads used by OpenMP and potentially nested BLAS calls so as to avoid + oversubscription. You will find additional details about joblib mitigation of oversubscription in `joblib documentation -`_. +`_. +You will find additional details about parallelism in numerical python libraries +in `this document from Thomas J. Fan `_. Configuration switches ----------------------- -Python runtime -.............. +Python API +.......... -:func:`sklearn.set_config` controls the following behaviors: +:func:`sklearn.set_config` and :func:`sklearn.config_context` can be used to change +parameters of the configuration which control aspect of parallelism. -:assume_finite: +.. _environment_variable: - used to skip validation, which enables faster computations but may - lead to segmentation faults if the data contains NaNs. +Environment variables +..................... -:working_memory: +These environment variables should be set before importing scikit-learn. - the optimal size of temporary arrays used by some algorithms. +`SKLEARN_ASSUME_FINITE` +~~~~~~~~~~~~~~~~~~~~~~~ -.. _environment_variable: +Sets the default value for the `assume_finite` argument of +:func:`sklearn.set_config`. -Environment variables -...................... +`SKLEARN_WORKING_MEMORY` +~~~~~~~~~~~~~~~~~~~~~~~~ -These environment variables should be set before importing scikit-learn. +Sets the default value for the `working_memory` argument of +:func:`sklearn.set_config`. + +`SKLEARN_SEED` +~~~~~~~~~~~~~~ + +Sets the seed of the global random generator when running the tests, for +reproducibility. + +Note that scikit-learn tests are expected to run deterministically with +explicit seeding of their own independent RNG instances instead of relying on +the numpy or Python standard library RNG singletons to make sure that test +results are independent of the test execution order. However some tests might +forget to use explicit seeding and this variable is a way to control the initial +state of the aforementioned singletons. + +`SKLEARN_TESTS_GLOBAL_RANDOM_SEED` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Controls the seeding of the random number generator used in tests that rely on +the `global_random_seed` fixture. + +All tests that use this fixture accept the contract that they should +deterministically pass for any seed value from 0 to 99 included. + +In nightly CI builds, the `SKLEARN_TESTS_GLOBAL_RANDOM_SEED` environment +variable is drawn randomly in the above range and all fixtured tests will run +for that specific seed. The goal is to ensure that, over time, our CI will run +all tests with different seeds while keeping the test duration of a single run +of the full test suite limited. This will check that the assertions of tests +written to use this fixture are not dependent on a specific seed value. 
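For illustration, a hypothetical test following the contract described above: the `global_random_seed` fixture provides an integer between 0 and 99, and the assertion is written loosely enough to hold for every admissible seed (the sample size and tolerance below are arbitrary choices for this sketch)::

    import numpy as np

    def test_sample_mean_is_seed_insensitive(global_random_seed):
        # Seed the test's own RNG from the fixture instead of relying on
        # the global numpy RNG singleton.
        rng = np.random.RandomState(global_random_seed)
        X = rng.normal(loc=0.0, scale=1.0, size=100_000)
        # With 100_000 samples the standard error is about 0.003, so this
        # bound holds comfortably for every seed in [0, 99].
        assert abs(X.mean()) < 0.05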
+ +The range of admissible seed values is limited to [0, 99] because it is often +not possible to write a test that can work for any possible seed and we want to +avoid having tests that randomly fail on the CI. + +Valid values for `SKLEARN_TESTS_GLOBAL_RANDOM_SEED`: + +- `SKLEARN_TESTS_GLOBAL_RANDOM_SEED="42"`: run tests with a fixed seed of 42 +- `SKLEARN_TESTS_GLOBAL_RANDOM_SEED="40-42"`: run the tests with all seeds + between 40 and 42 included +- `SKLEARN_TESTS_GLOBAL_RANDOM_SEED="all"`: run the tests with all seeds + between 0 and 99 included. This can take a long time: only use for individual + tests, not the full test suite! + +If the variable is not set, then 42 is used as the global seed in a +deterministic manner. This ensures that, by default, the scikit-learn test +suite is as deterministic as possible to avoid disrupting our friendly +third-party package maintainers. Similarly, this variable should not be set in +the CI config of pull-requests to make sure that our friendly contributors are +not the first people to encounter a seed-sensitivity regression in a test +unrelated to the changes of their own PR. Only the scikit-learn maintainers who +watch the results of the nightly builds are expected to be annoyed by this. + +When writing a new test function that uses this fixture, please use the +following command to make sure that it passes deterministically for all +admissible seeds on your local machine: + +.. prompt:: bash $ + + SKLEARN_TESTS_GLOBAL_RANDOM_SEED="all" pytest -v -k test_your_test_name + +`SKLEARN_SKIP_NETWORK_TESTS` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When this environment variable is set to a non zero value, the tests that need +network access are skipped. When this environment variable is not set then +network tests are skipped. + +`SKLEARN_RUN_FLOAT32_TESTS` +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When this environment variable is set to '1', the tests using the +`global_dtype` fixture are also run on float32 data. +When this environment variable is not set, the tests are only run on +float64 data. + +`SKLEARN_ENABLE_DEBUG_CYTHON_DIRECTIVES` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When this environment variable is set to a non zero value, the `Cython` +derivative, `boundscheck` is set to `True`. This is useful for finding +segfaults. -:SKLEARN_SITE_JOBLIB: +`SKLEARN_BUILD_ENABLE_DEBUG_SYMBOLS` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - When this environment variable is set to a non zero value, - scikit-learn uses the site joblib rather than its vendored version. - Consequently, joblib must be installed for scikit-learn to run. - Note that using the site joblib is at your own risks: the versions of - scikit-learn and joblib need to be compatible. Currently, joblib 0.11+ - is supported. In addition, dumps from joblib.Memory might be incompatible, - and you might loose some caches and have to redownload some datasets. +When this environment variable is set to a non zero value, the debug symbols +will be included in the compiled C extensions. Only debug symbols for POSIX +systems are configured. - .. deprecated:: 0.21 +`SKLEARN_PAIRWISE_DIST_CHUNK_SIZE` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - As of version 0.21 this parameter has no effect, vendored joblib was - removed and site joblib is always used. +This sets the size of chunk to be used by the underlying `PairwiseDistancesReductions` +implementations. The default value is `256` which has been showed to be adequate on +most machines. 
-:SKLEARN_ASSUME_FINITE: +Users looking for the best performance might want to tune this variable using +powers of 2 so as to get the best parallelism behavior for their hardware, +especially with respect to their caches' sizes. - Sets the default value for the `assume_finite` argument of - :func:`sklearn.set_config`. +`SKLEARN_WARNINGS_AS_ERRORS` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:SKLEARN_WORKING_MEMORY: +This environment variable is used to turn warnings into errors in tests and +documentation build. - Sets the default value for the `working_memory` argument of - :func:`sklearn.set_config`. +Some CI (Continuous Integration) builds set `SKLEARN_WARNINGS_AS_ERRORS=1`, for +example to make sure that we catch deprecation warnings from our dependencies +and that we adapt our code. -:SKLEARN_SEED: +To locally run with the same "warnings as errors" setting as in these CI builds +you can set `SKLEARN_WARNINGS_AS_ERRORS=1`. - Sets the seed of the global random generator when running the tests, - for reproducibility. +By default, warnings are not turned into errors. This is the case if +`SKLEARN_WARNINGS_AS_ERRORS` is unset, or `SKLEARN_WARNINGS_AS_ERRORS=0`. -:SKLEARN_SKIP_NETWORK_TESTS: +This environment variable uses specific warning filters to ignore some warnings, +since sometimes warnings originate from third-party libraries and there is not +much we can do about it. You can see the warning filters in the +`_get_warnings_filters_info_list` function in `sklearn/utils/_testing.py`. - When this environment variable is set to a non zero value, the tests - that need network access are skipped. When this environment variable is - not set then network tests are skipped. +Note that for documentation build, `SKLEARN_WARNING_AS_ERRORS=1` is checking +that the documentation build, in particular running examples, does not produce +any warnings. This is different from the `-W` `sphinx-build` argument that +catches syntax warnings in the rst files. diff --git a/doc/computing/scaling_strategies.rst b/doc/computing/scaling_strategies.rst index 5eee5728e4b9a..286a1e79d0a8c 100644 --- a/doc/computing/scaling_strategies.rst +++ b/doc/computing/scaling_strategies.rst @@ -1,7 +1,3 @@ -.. Places parent toc into the sidebar - -:parenttoc: True - .. _scaling_strategies: Strategies to scale computationally: bigger data @@ -20,9 +16,9 @@ data that cannot fit in a computer's main memory (RAM). Here is a sketch of a system designed to achieve this goal: - 1. a way to stream instances - 2. a way to extract features from instances - 3. an incremental algorithm +1. a way to stream instances +2. a way to extract features from instances +3. an incremental algorithm Streaming instances .................... @@ -62,28 +58,29 @@ balances relevancy and memory footprint could involve some tuning [1]_. 
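To illustrate the incremental-algorithm ingredient mentioned above, a minimal out-of-core sketch (not from the diff; the random mini-batch generator stands in for instances streamed from disk or a database) using `partial_fit` with one of the incremental classifiers listed just below::

    import numpy as np
    from sklearn.linear_model import SGDClassifier

    clf = SGDClassifier(random_state=0)
    classes = np.array([0, 1])   # all classes must be declared on the first call

    rng = np.random.RandomState(0)
    for _ in range(10):   # stand-in for 10 mini-batches streamed from storage
        X_batch = rng.normal(size=(100, 20))
        y_batch = (X_batch[:, 0] > 0).astype(int)
        clf.partial_fit(X_batch, y_batch, classes=classes)

    print(clf.score(X_batch, y_batch))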
Here is a list of incremental estimators for different tasks: - - Classification - + :class:`sklearn.naive_bayes.MultinomialNB` - + :class:`sklearn.naive_bayes.BernoulliNB` - + :class:`sklearn.linear_model.Perceptron` - + :class:`sklearn.linear_model.SGDClassifier` - + :class:`sklearn.linear_model.PassiveAggressiveClassifier` - + :class:`sklearn.neural_network.MLPClassifier` - - Regression - + :class:`sklearn.linear_model.SGDRegressor` - + :class:`sklearn.linear_model.PassiveAggressiveRegressor` - + :class:`sklearn.neural_network.MLPRegressor` - - Clustering - + :class:`sklearn.cluster.MiniBatchKMeans` - + :class:`sklearn.cluster.Birch` - - Decomposition / feature Extraction - + :class:`sklearn.decomposition.MiniBatchDictionaryLearning` - + :class:`sklearn.decomposition.IncrementalPCA` - + :class:`sklearn.decomposition.LatentDirichletAllocation` - - Preprocessing - + :class:`sklearn.preprocessing.StandardScaler` - + :class:`sklearn.preprocessing.MinMaxScaler` - + :class:`sklearn.preprocessing.MaxAbsScaler` +- Classification + + :class:`sklearn.naive_bayes.MultinomialNB` + + :class:`sklearn.naive_bayes.BernoulliNB` + + :class:`sklearn.linear_model.Perceptron` + + :class:`sklearn.linear_model.SGDClassifier` + + :class:`sklearn.linear_model.PassiveAggressiveClassifier` + + :class:`sklearn.neural_network.MLPClassifier` +- Regression + + :class:`sklearn.linear_model.SGDRegressor` + + :class:`sklearn.linear_model.PassiveAggressiveRegressor` + + :class:`sklearn.neural_network.MLPRegressor` +- Clustering + + :class:`sklearn.cluster.MiniBatchKMeans` + + :class:`sklearn.cluster.Birch` +- Decomposition / feature Extraction + + :class:`sklearn.decomposition.MiniBatchDictionaryLearning` + + :class:`sklearn.decomposition.IncrementalPCA` + + :class:`sklearn.decomposition.LatentDirichletAllocation` + + :class:`sklearn.decomposition.MiniBatchNMF` +- Preprocessing + + :class:`sklearn.preprocessing.StandardScaler` + + :class:`sklearn.preprocessing.MinMaxScaler` + + :class:`sklearn.preprocessing.MaxAbsScaler` For classification, a somewhat important thing to note is that although a stateless feature extraction routine may be able to cope with new/unseen diff --git a/doc/conf.py b/doc/conf.py index 6768aab208a99..71c9ec5bb60c3 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -# # scikit-learn documentation build configuration file, created by # sphinx-quickstart on Fri Jan 8 09:13:42 2010. # @@ -12,81 +10,141 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys +import json import os -import warnings import re +import sys +import warnings from datetime import datetime -from packaging.version import parse from pathlib import Path -from io import StringIO +from urllib.request import urlopen + +from sklearn.externals._packaging.version import parse +from sklearn.utils._testing import turn_warnings_into_errors # If extensions (or modules to document with autodoc) are in another # directory, add these directories to sys.path here. If the directory # is relative to the documentation root, use os.path.abspath to make it # absolute, like shown here. 
-sys.path.insert(0, os.path.abspath('sphinxext')) +sys.path.insert(0, os.path.abspath(".")) +sys.path.insert(0, os.path.abspath("sphinxext")) -from github_link import make_linkcode_resolve +import jinja2 import sphinx_gallery +from github_link import make_linkcode_resolve +from sphinx.util.logging import getLogger +from sphinx_gallery.notebook import add_code_cell, add_markdown_cell +from sphinx_gallery.sorting import ExampleTitleSortKey + +logger = getLogger(__name__) + +try: + # Configure plotly to integrate its output into the HTML pages generated by + # sphinx-gallery. + import plotly.io as pio + + pio.renderers.default = "sphinx_gallery" +except ImportError: + # Make it possible to render the doc when not running the examples + # that need plotly. + pass # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ - 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', - 'numpydoc', - 'sphinx.ext.linkcode', 'sphinx.ext.doctest', - 'sphinx.ext.intersphinx', - 'sphinx.ext.imgconverter', - 'sphinx_gallery.gen_gallery', - 'sphinx_issues', - 'add_toctree_functions', - 'sphinx-prompt', + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "numpydoc", + "sphinx.ext.linkcode", + "sphinx.ext.doctest", + "sphinx.ext.intersphinx", + "sphinx.ext.imgconverter", + "sphinx_gallery.gen_gallery", + "sphinx-prompt", + "sphinx_copybutton", + "sphinxext.opengraph", + "matplotlib.sphinxext.plot_directive", + "sphinxcontrib.sass", + "sphinx_remove_toctrees", + "sphinx_design", + # See sphinxext/ + "allow_nan_estimators", + "autoshortsummary", + "doi_role", + "dropdown_anchors", + "override_pst_pagetoc", + "sphinx_issues", ] -# this is needed for some reason... -# see https://github.com/numpy/numpydoc/issues/69 +# Specify how to identify the prompt when copying code snippets +copybutton_prompt_text = r">>> |\.\.\. " +copybutton_prompt_is_regexp = True +copybutton_exclude = "style" + +try: + import jupyterlite_sphinx # noqa: F401 + + extensions.append("jupyterlite_sphinx") + with_jupyterlite = True +except ImportError: + # In some cases we don't want to require jupyterlite_sphinx to be installed, + # e.g. the doc-min-dependencies build + warnings.warn( + "jupyterlite_sphinx is not installed, you need to install it " + "if you want JupyterLite links to appear in each example" + ) + with_jupyterlite = False + +# Produce `plot::` directives for examples that contain `import matplotlib` or +# `from matplotlib import`. 
+numpydoc_use_plots = True + +# Options for the `::plot` directive: +# https://matplotlib.org/stable/api/sphinxext_plot_directive_api.html +plot_formats = ["png"] +plot_include_source = True +plot_html_show_formats = False +plot_html_show_source_link = False + +# We do not need the table of class members because `sphinxext/override_pst_pagetoc.py` +# will show them in the secondary sidebar +numpydoc_show_class_members = False +numpydoc_show_inherited_class_members = False + +# We want in-page toc of class members instead of a separate page for each entry numpydoc_class_members_toctree = False # For maths, use mathjax by default and svg if NO_MATHJAX env variable is set # (useful for viewing the doc offline) -if os.environ.get('NO_MATHJAX'): - extensions.append('sphinx.ext.imgmath') - imgmath_image_format = 'svg' - mathjax_path = '' +if os.environ.get("NO_MATHJAX"): + extensions.append("sphinx.ext.imgmath") + imgmath_image_format = "svg" + mathjax_path = "" else: - extensions.append('sphinx.ext.mathjax') - mathjax_path = ('https://cdn.jsdelivr.net/npm/mathjax@3/es5/' - 'tex-chtml.js') - -autodoc_default_options = { - 'members': True, - 'inherited-members': True -} + extensions.append("sphinx.ext.mathjax") + mathjax_path = "https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml.js" # Add any paths that contain templates here, relative to this directory. -templates_path = ['templates'] +templates_path = ["templates"] # generate autosummary even if no references autosummary_generate = True # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. -#source_encoding = 'utf-8' +source_encoding = "utf-8" # The main toctree document. -main_doc = 'contents' +root_doc = "index" # General information about the project. -project = 'scikit-learn' -copyright = ( - f'2007 - {datetime.now().year}, scikit-learn developers (BSD License)' -) +project = "scikit-learn" +copyright = f"2007 - {datetime.now().year}, scikit-learn developers (BSD License)" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -94,6 +152,7 @@ # # The short X.Y version. import sklearn + parsed_version = parse(sklearn.__version__) version = ".".join(parsed_version.base_version.split(".")[:2]) # The full version, including alpha/beta/rc tags. @@ -105,89 +164,229 @@ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build', 'templates', 'includes', 'themes'] +exclude_patterns = [ + "_build", + "templates", + "includes", + "**/sg_execution_times.rst", + "whats_new/upcoming_changes", +] # The reST default role (used for this markup: `text`) to use for all # documents. -default_role = 'literal' +default_role = "literal" # If true, '()' will be appended to :func: etc. cross-reference text. add_function_parentheses = False # If true, the current module name will be prepended to all description # unit titles (such as .. function::). 
-#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +# show_authors = False # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. Major themes that come with # Sphinx are currently 'default' and 'sphinxdoc'. -html_theme = 'scikit-learn-modern' +html_theme = "pydata_sphinx_theme" + +# This config option is used to generate the canonical links in the header +# of every page. The canonical link is needed to prevent search engines from +# returning results pointing to old scikit-learn versions. +html_baseurl = "https://scikit-learn.org/stable/" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -html_theme_options = {'google_analytics': True, - 'mathjax_path': mathjax_path} +html_theme_options = { + # -- General configuration ------------------------------------------------ + "sidebar_includehidden": True, + "use_edit_page_button": True, + "external_links": [], + "icon_links_label": "Icon Links", + "icon_links": [ + { + "name": "GitHub", + "url": "https://github.com/scikit-learn/scikit-learn", + "icon": "fa-brands fa-square-github", + "type": "fontawesome", + }, + ], + "analytics": { + "plausible_analytics_domain": "scikit-learn.org", + "plausible_analytics_url": "https://views.scientific-python.org/js/script.js", + }, + # If "prev-next" is included in article_footer_items, then setting show_prev_next + # to True would repeat prev and next links. See + # https://github.com/pydata/pydata-sphinx-theme/blob/b731dc230bc26a3d1d1bb039c56c977a9b3d25d8/src/pydata_sphinx_theme/theme/pydata_sphinx_theme/layout.html#L118-L129 + "show_prev_next": False, + "search_bar_text": "Search the docs ...", + "navigation_with_keys": False, + "collapse_navigation": False, + "navigation_depth": 2, + "show_nav_level": 1, + "show_toc_level": 1, + "navbar_align": "left", + "header_links_before_dropdown": 5, + "header_dropdown_text": "More", + # The switcher requires a JSON file with the list of documentation versions, which + # is generated by the script `build_tools/circle/list_versions.py` and placed under + # the `js/` static directory; it will then be copied to the `_static` directory in + # the built documentation + "switcher": { + "json_url": "https://scikit-learn.org/dev/_static/versions.json", + "version_match": release, + }, + # check_switcher may be set to False if docbuild pipeline fails. 
See + # https://pydata-sphinx-theme.readthedocs.io/en/stable/user_guide/version-dropdown.html#configure-switcher-json-url + "check_switcher": True, + "pygments_light_style": "tango", + "pygments_dark_style": "monokai", + "logo": { + "alt_text": "scikit-learn homepage", + "image_relative": "logos/scikit-learn-logo-small.png", + "image_light": "logos/scikit-learn-logo-small.png", + "image_dark": "logos/scikit-learn-logo-small.png", + }, + "surface_warnings": True, + # -- Template placement in theme layouts ---------------------------------- + "navbar_start": ["navbar-logo"], + # Note that the alignment of navbar_center is controlled by navbar_align + "navbar_center": ["navbar-nav"], + "navbar_end": ["theme-switcher", "navbar-icon-links", "version-switcher"], + # navbar_persistent is persistent right (even when on mobiles) + "navbar_persistent": ["search-button"], + "article_header_start": ["breadcrumbs"], + "article_header_end": [], + "article_footer_items": ["prev-next"], + "content_footer_items": [], + # Use html_sidebars that map page patterns to list of sidebar templates + "primary_sidebar_end": [], + "footer_start": ["copyright"], + "footer_center": [], + "footer_end": [], + # When specified as a dictionary, the keys should follow glob-style patterns, as in + # https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-exclude_patterns + # In particular, "**" specifies the default for all pages + # Use :html_theme.sidebar_secondary.remove: for file-wide removal + "secondary_sidebar_items": { + "**": [ + "page-toc", + "sourcelink", + # Sphinx-Gallery-specific sidebar components + # https://sphinx-gallery.github.io/stable/advanced.html#using-sphinx-gallery-sidebar-components + "sg_download_links", + "sg_launcher_links", + ], + }, + "show_version_warning_banner": True, + "announcement": None, +} # Add any paths that contain custom themes here, relative to this directory. -html_theme_path = ['themes'] - +# html_theme_path = ["themes"] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -html_short_title = 'scikit-learn' - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -html_logo = 'logos/scikit-learn-logo-small.png' +html_short_title = "scikit-learn" # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -html_favicon = 'logos/favicon.ico' +html_favicon = "logos/favicon.ico" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['images'] +html_static_path = ["images", "css", "js"] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # Custom sidebar templates, maps document names to template names. 
-#html_sidebars = {} +# Workaround for removing the left sidebar on pages without TOC +# A better solution would be to follow the merge of: +# https://github.com/pydata/pydata-sphinx-theme/pull/1682 +html_sidebars = { + "install": [], + "getting_started": [], + "glossary": [], + "faq": [], + "support": [], + "related_projects": [], + "roadmap": [], + "governance": [], + "about": [], +} # Additional templates that should be rendered to pages, maps page names to # template names. -html_additional_pages = { - 'index': 'index.html', - 'documentation': 'documentation.html'} # redirects to index +html_additional_pages = {"index": "index.html"} + +# Additional files to copy +# html_extra_path = [] + +# Additional JS files +html_js_files = [ + "scripts/dropdown.js", + "scripts/version-switcher.js", + "scripts/sg_plotly_resize.js", +] + +# Compile scss files into css files using sphinxcontrib-sass +sass_src_dir, sass_out_dir = "scss", "css/styles" +sass_targets = { + f"{file.stem}.scss": f"{file.stem}.css" + for file in Path(sass_src_dir).glob("*.scss") +} + +# Additional CSS files, should be subset of the values of `sass_targets` +html_css_files = ["styles/colors.css", "styles/custom.css"] + + +def add_js_css_files(app, pagename, templatename, context, doctree): + """Load additional JS and CSS files only for certain pages. + + Note that `html_js_files` and `html_css_files` are included in all pages and + should be used for the ones that are used by multiple pages. All page-specific + JS and CSS files should be added here instead. + """ + if pagename == "api/index": + # External: jQuery and DataTables + app.add_js_file("https://code.jquery.com/jquery-3.7.0.js") + app.add_js_file("https://cdn.datatables.net/2.0.0/js/dataTables.min.js") + app.add_css_file( + "https://cdn.datatables.net/2.0.0/css/dataTables.dataTables.min.css" + ) + # Internal: API search initialization and styling + app.add_js_file("scripts/api-search.js") + app.add_css_file("styles/api-search.css") + elif pagename == "index": + app.add_css_file("styles/index.css") + elif pagename.startswith("modules/generated/"): + app.add_css_file("styles/api.css") + # If false, no module index is generated. html_domain_indices = False @@ -196,21 +395,21 @@ html_use_index = False # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = '' +# html_file_suffix = '' # Output file base name for HTML help builder. -htmlhelp_basename = 'scikit-learndoc' +htmlhelp_basename = "scikit-learndoc" # If true, the reST sources are included in the HTML build as _sources/name. 
html_copy_source = True @@ -221,27 +420,111 @@ # index.html release_highlights_dir = Path("..") / "examples" / "release_highlights" # Finds the highlight with the latest version number -latest_highlights = sorted(release_highlights_dir.glob( - "plot_release_highlights_*.py"))[-1] -latest_highlights = latest_highlights.with_suffix('').name -html_context["release_highlights"] = \ +latest_highlights = sorted(release_highlights_dir.glob("plot_release_highlights_*.py"))[ + -1 +] +latest_highlights = latest_highlights.with_suffix("").name +html_context["release_highlights"] = ( f"auto_examples/release_highlights/{latest_highlights}" +) -# get version from higlight name assuming highlights have the form +# get version from highlight name assuming highlights have the form # plot_release_highlights_0_22_0 highlight_version = ".".join(latest_highlights.split("_")[-3:-1]) html_context["release_highlights_version"] = highlight_version + +# redirects dictionary maps from old links to new links +redirects = { + "documentation": "index", + "contents": "index", + "preface": "index", + "modules/classes": "api/index", + "tutorial/machine_learning_map/index": "machine_learning_map", + "auto_examples/feature_selection/plot_permutation_test_for_classification": ( + "auto_examples/model_selection/plot_permutation_tests_for_classification" + ), + "modules/model_persistence": "model_persistence", + "auto_examples/linear_model/plot_bayesian_ridge": ( + "auto_examples/linear_model/plot_ard" + ), + "auto_examples/model_selection/grid_search_text_feature_extraction": ( + "auto_examples/model_selection/plot_grid_search_text_feature_extraction" + ), + "auto_examples/model_selection/plot_validation_curve": ( + "auto_examples/model_selection/plot_train_error_vs_test_error" + ), + "auto_examples/datasets/plot_digits_last_image": ( + "auto_examples/exercises/plot_digits_classification_exercises" + ), + "auto_examples/datasets/plot_random_dataset": ( + "auto_examples/classification/plot_classifier_comparison" + ), + "auto_examples/miscellaneous/plot_changed_only_pprint_parameter": ( + "auto_examples/miscellaneous/plot_estimator_representation" + ), + "auto_examples/decomposition/plot_beta_divergence": ( + "auto_examples/applications/plot_topics_extraction_with_nmf_lda" + ), + "auto_examples/svm/plot_svm_nonlinear": "auto_examples/svm/plot_svm_kernels", + "auto_examples/ensemble/plot_adaboost_hastie_10_2": ( + "auto_examples/ensemble/plot_adaboost_multiclass" + ), + "auto_examples/decomposition/plot_pca_3d": ( + "auto_examples/decomposition/plot_pca_iris" + ), + "auto_examples/exercises/plot_cv_digits": ( + "auto_examples/model_selection/plot_nested_cross_validation_iris" + ), + "auto_examples/linear_model/plot_lasso_lars": ( + "auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path" + ), + "auto_examples/linear_model/plot_lasso_coordinate_descent_path": ( + "auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path" + ), + "auto_examples/cluster/plot_color_quantization": ( + "auto_examples/cluster/plot_face_compress" + ), + "auto_examples/cluster/plot_cluster_iris": ( + "auto_examples/cluster/plot_kmeans_assumptions" + ), + "auto_examples/ensemble/plot_forest_importances_faces": ( + "auto_examples/ensemble/plot_forest_importances" + ), + "auto_examples/ensemble/plot_voting_probas": ( + "auto_examples/ensemble/plot_voting_decision_regions" + ), + "auto_examples/datasets/plot_iris_dataset": ( + "auto_examples/decomposition/plot_pca_iris" + ), + "auto_examples/linear_model/plot_iris_logistic": ( + 
"auto_examples/linear_model/plot_logistic_multinomial" + ), + "auto_examples/linear_model/plot_ols_3d": ("auto_examples/linear_model/plot_ols"), + "auto_examples/linear_model/plot_ols": "auto_examples/linear_model/plot_ols_ridge", + "auto_examples/linear_model/plot_ols_ridge_variance": ( + "auto_examples/linear_model/plot_ols_ridge" + ), + "auto_examples/linear_model/plot_sgd_comparison": ( + "auto_examples/linear_model/plot_sgd_loss_functions" + ), +} +html_context["redirects"] = redirects +for old_link in redirects: + html_additional_pages[old_link] = "redirects.html" + +# See https://github.com/scikit-learn/scikit-learn/pull/22550 +html_context["is_devrelease"] = parsed_version.is_devrelease + + # -- Options for LaTeX output ------------------------------------------------ latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. - 'preamble': r""" + "preamble": r""" \usepackage{amsmath}\usepackage{amsfonts}\usepackage{bm} \usepackage{morefloats}\usepackage{enumitem} \setlistdepth{10} \let\oldhref\href @@ -252,8 +535,15 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass # [howto/manual]). -latex_documents = [('contents', 'user_guide.tex', 'scikit-learn user guide', - 'scikit-learn developers', 'manual'), ] +latex_documents = [ + ( + "contents", + "user_guide.tex", + "scikit-learn user guide", + "scikit-learn developers", + "manual", + ), +] # The name of an image file (relative to this directory) to place at the top of # the title page. @@ -269,27 +559,27 @@ # intersphinx configuration intersphinx_mapping = { - 'python': ('https://docs.python.org/{.major}'.format( - sys.version_info), None), - 'numpy': ('https://numpy.org/doc/stable', None), - 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), - 'matplotlib': ('https://matplotlib.org/', None), - 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None), - 'joblib': ('https://joblib.readthedocs.io/en/latest/', None), - 'seaborn': ('https://seaborn.pydata.org/', None), + "python": ("https://docs.python.org/{.major}".format(sys.version_info), None), + "numpy": ("https://numpy.org/doc/stable", None), + "scipy": ("https://docs.scipy.org/doc/scipy/", None), + "matplotlib": ("https://matplotlib.org/", None), + "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), + "joblib": ("https://joblib.readthedocs.io/en/latest/", None), + "seaborn": ("https://seaborn.pydata.org/", None), + "skops": ("https://skops.readthedocs.io/en/stable/", None), } v = parse(release) if v.release is None: raise ValueError( - 'Ill-formed version: {!r}. Version should follow ' - 'PEP440'.format(version)) + "Ill-formed version: {!r}. Version should follow PEP440".format(version) + ) if v.is_devrelease: - binder_branch = 'main' + binder_branch = "main" else: major, minor = v.release[:2] - binder_branch = '{}.{}.X'.format(major, minor) + binder_branch = "{}.{}.X".format(major, minor) class SubSectionTitleOrder: @@ -298,12 +588,13 @@ class SubSectionTitleOrder: Assumes README.txt exists for all subsections and uses the subsection with dashes, '---', as the adornment. 
""" + def __init__(self, src_dir): self.src_dir = src_dir self.regex = re.compile(r"^([\w ]+)\n-", re.MULTILINE) def __repr__(self): - return '<%s>' % (self.__class__.__name__,) + return "<%s>" % (self.__class__.__name__,) def __call__(self, directory): src_path = os.path.normpath(os.path.join(self.src_dir, directory)) @@ -315,7 +606,7 @@ def __call__(self, directory): readme = os.path.join(src_path, "README.txt") try: - with open(readme, 'r') as f: + with open(readme, "r") as f: content = f.read() except FileNotFoundError: return directory @@ -326,54 +617,178 @@ def __call__(self, directory): return directory +class SKExampleTitleSortKey(ExampleTitleSortKey): + """Sorts release highlights based on version number.""" + + def __call__(self, filename): + title = super().__call__(filename) + prefix = "plot_release_highlights_" + + # Use title to sort if not a release highlight + if not str(filename).startswith(prefix): + return title + + major_minor = filename[len(prefix) :].split("_")[:2] + version_float = float(".".join(major_minor)) + + # negate to place the newest version highlights first + return -version_float + + +def notebook_modification_function(notebook_content, notebook_filename): + notebook_content_str = str(notebook_content) + warning_template = "\n".join( + [ + "
", + "", + "# JupyterLite warning", + "", + "{message}", + "
", + ] + ) + + message_class = "warning" + message = ( + "Running the scikit-learn examples in JupyterLite is experimental and you may" + " encounter some unexpected behavior.\n\nThe main difference is that imports" + " will take a lot longer than usual, for example the first `import sklearn` can" + " take roughly 10-20s.\n\nIf you notice problems, feel free to open an" + " [issue](https://github.com/scikit-learn/scikit-learn/issues/new/choose)" + " about it." + ) + + markdown = warning_template.format(message_class=message_class, message=message) + + dummy_notebook_content = {"cells": []} + add_markdown_cell(dummy_notebook_content, markdown) + + code_lines = [] + + if "seaborn" in notebook_content_str: + code_lines.append("%pip install seaborn") + if "plotly.express" in notebook_content_str: + code_lines.append("%pip install plotly nbformat") + if "skimage" in notebook_content_str: + code_lines.append("%pip install scikit-image") + if "polars" in notebook_content_str: + code_lines.append("%pip install polars") + if "fetch_" in notebook_content_str: + code_lines.extend( + [ + "%pip install pyodide-http", + "import pyodide_http", + "pyodide_http.patch_all()", + ] + ) + # always import matplotlib and pandas to avoid Pyodide limitation with + # imports inside functions + code_lines.extend(["import matplotlib", "import pandas"]) + + # Work around https://github.com/jupyterlite/pyodide-kernel/issues/166 + # and https://github.com/pyodide/micropip/issues/223 by installing the + # dependencies first, and then scikit-learn from Anaconda.org. + if "dev" in release: + dev_docs_specific_code = [ + "import piplite", + "import joblib", + "import threadpoolctl", + "import scipy", + "await piplite.install(\n" + f" 'scikit-learn=={release}',\n" + " index_urls='https://pypi.anaconda.org/scientific-python-nightly-wheels/simple',\n" + ")", + ] + + code_lines.extend(dev_docs_specific_code) + + if code_lines: + code_lines = ["# JupyterLite-specific code"] + code_lines + code = "\n".join(code_lines) + add_code_cell(dummy_notebook_content, code) + + notebook_content["cells"] = ( + dummy_notebook_content["cells"] + notebook_content["cells"] + ) + + +default_global_config = sklearn.get_config() + + +def reset_sklearn_config(gallery_conf, fname): + """Reset sklearn config to default values.""" + sklearn.set_config(**default_global_config) + + +sg_examples_dir = "../examples" +sg_gallery_dir = "auto_examples" sphinx_gallery_conf = { - 'doc_module': 'sklearn', - 'backreferences_dir': os.path.join('modules', 'generated'), - 'show_memory': False, - 'reference_url': { - 'sklearn': None}, - 'examples_dirs': ['../examples'], - 'gallery_dirs': ['auto_examples'], - 'subsection_order': SubSectionTitleOrder('../examples'), - 'binder': { - 'org': 'scikit-learn', - 'repo': 'scikit-learn', - 'binderhub_url': 'https://mybinder.org', - 'branch': binder_branch, - 'dependencies': './binder/requirements.txt', - 'use_jupyter_lab': True + "doc_module": "sklearn", + "backreferences_dir": os.path.join("modules", "generated"), + "show_memory": False, + "reference_url": {"sklearn": None}, + "examples_dirs": [sg_examples_dir], + "gallery_dirs": [sg_gallery_dir], + "subsection_order": SubSectionTitleOrder(sg_examples_dir), + "within_subsection_order": SKExampleTitleSortKey, + "binder": { + "org": "scikit-learn", + "repo": "scikit-learn", + "binderhub_url": "https://mybinder.org", + "branch": binder_branch, + "dependencies": "./binder/requirements.txt", + "use_jupyter_lab": True, }, # avoid generating too many cross links - 
'inspect_global_variables': False, - 'remove_config_comments': True, + "inspect_global_variables": False, + "remove_config_comments": True, + "plot_gallery": "True", + "recommender": {"enable": True, "n_examples": 4, "min_df": 12}, + "reset_modules": ("matplotlib", "seaborn", reset_sklearn_config), } +if with_jupyterlite: + sphinx_gallery_conf["jupyterlite"] = { + "notebook_modification_function": notebook_modification_function + } + +# For the index page of the gallery and each nested section, we hide the secondary +# sidebar by specifying an empty list (no components), because there is no meaningful +# in-page toc for these pages, and they are generated so "sourcelink" is not useful +# either. +html_theme_options["secondary_sidebar_items"][f"{sg_gallery_dir}/index"] = [] +for sub_sg_dir in (Path(".") / sg_examples_dir).iterdir(): + if sub_sg_dir.is_dir(): + html_theme_options["secondary_sidebar_items"][ + f"{sg_gallery_dir}/{sub_sg_dir.name}/index" + ] = [] # The following dictionary contains the information used to create the # thumbnails for the front page of the scikit-learn home page. # key: first image in set # values: (number of plot in set, height of thumbnail) -carousel_thumbs = {'sphx_glr_plot_classifier_comparison_001.png': 600} +carousel_thumbs = {"sphx_glr_plot_classifier_comparison_001.png": 600} # enable experimental module so that experimental estimators can be # discovered properly by sphinx -from sklearn.experimental import enable_hist_gradient_boosting # noqa -from sklearn.experimental import enable_iterative_imputer # noqa -from sklearn.experimental import enable_halving_search_cv # noqa +from sklearn.experimental import ( # noqa: F401 + enable_halving_search_cv, + enable_iterative_imputer, +) def make_carousel_thumbs(app, exception): """produces the final resized carousel images""" if exception is not None: return - print('Preparing carousel images') + print("Preparing carousel images") - image_dir = os.path.join(app.builder.outdir, '_images') + image_dir = os.path.join(app.builder.outdir, "_images") for glr_plot, max_width in carousel_thumbs.items(): image = os.path.join(image_dir, glr_plot) if os.path.exists(image): - c_thumb = os.path.join(image_dir, glr_plot[:-4] + '_carousel.png') + c_thumb = os.path.join(image_dir, glr_plot[:-4] + "_carousel.png") sphinx_gallery.gen_rst.scale_image(image, c_thumb, max_width, 190) @@ -382,129 +797,77 @@ def filter_search_index(app, exception): return # searchindex only exist when generating html - if app.builder.name != 'html': + if app.builder.name != "html": return - print('Removing methods from search index') + print("Removing methods from search index") - searchindex_path = os.path.join(app.builder.outdir, 'searchindex.js') - with open(searchindex_path, 'r') as f: + searchindex_path = os.path.join(app.builder.outdir, "searchindex.js") + with open(searchindex_path, "r") as f: searchindex_text = f.read() - searchindex_text = re.sub(r'{__init__.+?}', '{}', searchindex_text) - searchindex_text = re.sub(r'{__call__.+?}', '{}', searchindex_text) + searchindex_text = re.sub(r"{__init__.+?}", "{}", searchindex_text) + searchindex_text = re.sub(r"{__call__.+?}", "{}", searchindex_text) - with open(searchindex_path, 'w') as f: + with open(searchindex_path, "w") as f: f.write(searchindex_text) -def generate_min_dependency_table(app): - """Generate min dependency table for docs.""" - from sklearn._min_dependencies import dependent_packages - - # get length of header - package_header_len = max(len(package) - for package in 
dependent_packages) + 4 - version_header_len = len('Minimum Version') + 4 - tags_header_len = max(len(tags) - for _, tags in dependent_packages.values()) + 4 - - output = StringIO() - output.write(' '.join(['=' * package_header_len, - '=' * version_header_len, - '=' * tags_header_len])) - output.write('\n') - dependency_title = "Dependency" - version_title = "Minimum Version" - tags_title = "Purpose" - - output.write(f'{dependency_title:<{package_header_len}} ' - f'{version_title:<{version_header_len}} ' - f'{tags_title}\n') - - output.write(' '.join(['=' * package_header_len, - '=' * version_header_len, - '=' * tags_header_len])) - output.write('\n') - - for package, (version, tags) in dependent_packages.items(): - output.write(f'{package:<{package_header_len}} ' - f'{version:<{version_header_len}} ' - f'{tags}\n') - - output.write(' '.join(['=' * package_header_len, - '=' * version_header_len, - '=' * tags_header_len])) - output.write('\n') - output = output.getvalue() - - with (Path('.') / 'min_dependency_table.rst').open('w') as f: - f.write(output) - - -def generate_min_dependency_substitutions(app): - """Generate min dependency substitutions for docs.""" - from sklearn._min_dependencies import dependent_packages - - output = StringIO() - - for package, (version, _) in dependent_packages.items(): - package = package.capitalize() - output.write(f'.. |{package}MinVersion| replace:: {version}') - output.write('\n') - - output = output.getvalue() - - with (Path('.') / 'min_dependency_substitutions.rst').open('w') as f: - f.write(output) - - # Config for sphinx_issues # we use the issues path for PRs since the issues URL will forward -issues_github_path = 'scikit-learn/scikit-learn' +issues_github_path = "scikit-learn/scikit-learn" -# Hack to get kwargs to appear in docstring #18434 -# TODO: Remove when https://github.com/sphinx-doc/sphinx/pull/8234 gets -# merged -from sphinx.util import inspect # noqa -from sphinx.ext.autodoc import ClassDocumenter # noqa +def disable_plot_gallery_for_linkcheck(app): + if app.builder.name == "linkcheck": + sphinx_gallery_conf["plot_gallery"] = "False" -class PatchedClassDocumenter(ClassDocumenter): - def _get_signature(self): - old_signature = inspect.signature +def skip_properties(app, what, name, obj, skip, options): + """Skip properties that are fitted attributes""" + if isinstance(obj, property): + if name.endswith("_") and not name.startswith("_"): + return True - def patch_signature(subject, bound_method=False, follow_wrapped=True): - # changes the default of follow_wrapped to True - return old_signature(subject, bound_method=bound_method, - follow_wrapped=follow_wrapped) - inspect.signature = patch_signature - result = super()._get_signature() - inspect.signature = old_signature - return result + return skip def setup(app): - app.registry.documenters['class'] = PatchedClassDocumenter - app.connect('builder-inited', generate_min_dependency_table) - app.connect('builder-inited', generate_min_dependency_substitutions) - # to hide/show the prompt in code examples: - app.connect('build-finished', make_carousel_thumbs) - app.connect('build-finished', filter_search_index) + # do not run the examples when using linkcheck by using a small priority + # (default priority is 500 and sphinx-gallery using builder-inited event too) + app.connect("builder-inited", disable_plot_gallery_for_linkcheck, priority=50) + # triggered just before the HTML for an individual page is created + app.connect("html-page-context", add_js_css_files) -# The following is used by 
sphinx.ext.linkcode to provide links to github -linkcode_resolve = make_linkcode_resolve('sklearn', - 'https://github.com/scikit-learn/' - 'scikit-learn/blob/{revision}/' - '{package}/{path}#L{lineno}') + # to hide/show the prompt in code examples + app.connect("build-finished", make_carousel_thumbs) + app.connect("build-finished", filter_search_index) + + app.connect("autodoc-skip-member", skip_properties) -warnings.filterwarnings("ignore", category=UserWarning, - message='Matplotlib is currently using agg, which is a' - ' non-GUI backend, so cannot show the figure.') +# The following is used by sphinx.ext.linkcode to provide links to github +linkcode_resolve = make_linkcode_resolve( + "sklearn", + ( + "https://github.com/scikit-learn/" + "scikit-learn/blob/{revision}/" + "{package}/{path}#L{lineno}" + ), +) + +warnings.filterwarnings( + "ignore", + category=UserWarning, + message=( + "Matplotlib is currently using agg, which is a" + " non-GUI backend, so cannot show the figure." + ), +) +if os.environ.get("SKLEARN_WARNINGS_AS_ERRORS", "0") != "0": + turn_warnings_into_errors() # maps functions with a class name that is indistinguishable when case is # ignore to another filename @@ -513,3 +876,217 @@ def setup(app): "sklearn.covariance.oas": "oas-function", "sklearn.decomposition.fastica": "fastica-function", } + + +# Config for sphinxext.opengraph + +ogp_site_url = "https://scikit-learn/stable/" +ogp_image = "https://scikit-learn.org/stable/_static/scikit-learn-logo-small.png" +ogp_use_first_image = True +ogp_site_name = "scikit-learn" + +# Config for linkcheck that checks the documentation for broken links + +# ignore all links in 'whats_new' to avoid doing many github requests and +# hitting the github rate threshold that makes linkcheck take a lot of time +linkcheck_exclude_documents = [r"whats_new/.*"] + +# default timeout to make some sites links fail faster +linkcheck_timeout = 10 + +# Allow redirects from doi.org +linkcheck_allowed_redirects = {r"https://doi.org/.+": r".*"} +linkcheck_ignore = [ + # ignore links to local html files e.g. 
in image directive :target: field + r"^..?/", + # ignore links to specific pdf pages because linkcheck does not handle them + # ('utf-8' codec can't decode byte error) + r"http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=.*", + ( + "https://www.fordfoundation.org/media/2976/roads-and-bridges" + "-the-unseen-labor-behind-our-digital-infrastructure.pdf#page=.*" + ), + # links falsely flagged as broken + ( + "https://www.researchgate.net/publication/" + "233096619_A_Dendrite_Method_for_Cluster_Analysis" + ), + ( + "https://www.researchgate.net/publication/221114584_Random_Fourier" + "_Approximations_for_Skewed_Multiplicative_Histogram_Kernels" + ), + ( + "https://www.researchgate.net/publication/4974606_" + "Hedonic_housing_prices_and_the_demand_for_clean_air" + ), + ( + "https://www.researchgate.net/profile/Anh-Huy-Phan/publication/220241471_Fast_" + "Local_Algorithms_for_Large_Scale_Nonnegative_Matrix_and_Tensor_Factorizations" + ), + "https://doi.org/10.13140/RG.2.2.35280.02565", + ( + "https://www.microsoft.com/en-us/research/uploads/prod/2006/01/" + "Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf" + ), + "https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-99-87.pdf", + "https://microsoft.com/", + "https://www.jstor.org/stable/2984099", + "https://stat.uw.edu/sites/default/files/files/reports/2000/tr371.pdf", + # Broken links from testimonials + "http://www.bestofmedia.com", + "http://www.data-publica.com/", + "https://livelovely.com", + "https://www.mars.com/global", + "https://www.yhat.com", + # Ignore some dynamically created anchors. See + # https://github.com/sphinx-doc/sphinx/issues/9016 for more details about + # the github example + r"https://github.com/conda-forge/miniforge#miniforge", + r"https://github.com/joblib/threadpoolctl/" + "#setting-the-maximum-size-of-thread-pools", + r"https://stackoverflow.com/questions/5836335/" + "consistently-create-same-random-numpy-array/5837352#comment6712034_5837352", +] + +# Use a browser-like user agent to avoid some "403 Client Error: Forbidden for +# url" errors. This is taken from the variable navigator.userAgent inside a +# browser console. 
+user_agent = ( + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0" +) + +# Use Github token from environment variable to avoid Github rate limits when +# checking Github links +github_token = os.getenv("GITHUB_TOKEN") + +if github_token is None: + linkcheck_request_headers = {} +else: + linkcheck_request_headers = { + "https://github.com/": {"Authorization": f"token {github_token}"}, + } + + +def infer_next_release_versions(): + """Infer the most likely next release versions to make.""" + all_version_full = {"rc": "0.99.0rc1", "final": "0.99.0", "bf": "0.98.1"} + all_version_short = {"rc": "0.99", "final": "0.99", "bf": "0.98"} + all_previous_tag = {"rc": "unused", "final": "0.98.33", "bf": "0.97.22"} + + try: + # Fetch the version switcher JSON; see `html_theme_options` for more details + versions_json = json.loads( + urlopen(html_theme_options["switcher"]["json_url"], timeout=10).read() + ) + + # See `build_tools/circle/list_versions.py`, stable is always the second entry + stable_version = parse(versions_json[1]["version"]) + last_stable_version = parse(versions_json[2]["version"]) + next_major_minor = f"{stable_version.major}.{stable_version.minor + 1}" + + # RC + all_version_full["rc"] = f"{next_major_minor}.0rc1" + all_version_short["rc"] = next_major_minor + + # Major/Minor final + all_version_full["final"] = f"{next_major_minor}.0" + all_version_short["final"] = next_major_minor + all_previous_tag["final"] = stable_version.base_version + + # Bug-fix + all_version_full["bf"] = ( + f"{stable_version.major}.{stable_version.minor}.{stable_version.micro + 1}" + ) + all_version_short["bf"] = f"{stable_version.major}.{stable_version.minor}" + all_previous_tag["bf"] = last_stable_version.base_version + except Exception as e: + logger.warning( + "Failed to infer all possible next release versions because of " + f"{type(e).__name__}: {e}" + ) + + return { + "version_full": all_version_full, + "version_short": all_version_short, + "previous_tag": all_previous_tag, + } + + +# -- Convert .rst.template files to .rst --------------------------------------- + +from api_reference import API_REFERENCE, DEPRECATED_API_REFERENCE + +from sklearn._min_dependencies import dependent_packages + +# If development build, link to local page in the top navbar; otherwise link to the +# development version; see https://github.com/scikit-learn/scikit-learn/pull/22550 +if parsed_version.is_devrelease: + development_link = "developers/index" +else: + development_link = "https://scikit-learn.org/dev/developers/index.html" + +# Define the templates and target files for conversion +# Each entry is in the format (template name, file name, kwargs for rendering) +rst_templates = [ + ("index", "index", {"development_link": development_link}), + ( + "developers/maintainer", + "developers/maintainer", + {"inferred": infer_next_release_versions()}, + ), + ( + "min_dependency_table", + "min_dependency_table", + {"dependent_packages": dependent_packages}, + ), + ( + "min_dependency_substitutions", + "min_dependency_substitutions", + {"dependent_packages": dependent_packages}, + ), + ( + "api/index", + "api/index", + { + "API_REFERENCE": sorted(API_REFERENCE.items(), key=lambda x: x[0]), + "DEPRECATED_API_REFERENCE": sorted( + DEPRECATED_API_REFERENCE.items(), key=lambda x: x[0], reverse=True + ), + }, + ), +] + +# Convert each module API reference page +for module in API_REFERENCE: + rst_templates.append( + ( + "api/module", + f"api/{module}", + {"module": module, "module_info": 
API_REFERENCE[module]}, + ) + ) + +# Convert the deprecated API reference page (if there exists any) +if DEPRECATED_API_REFERENCE: + rst_templates.append( + ( + "api/deprecated", + "api/deprecated", + { + "DEPRECATED_API_REFERENCE": sorted( + DEPRECATED_API_REFERENCE.items(), key=lambda x: x[0], reverse=True + ) + }, + ) + ) + +for rst_template_name, rst_target_name, kwargs in rst_templates: + # Read the corresponding template file into jinja2 + with (Path(".") / f"{rst_template_name}.rst.template").open( + "r", encoding="utf-8" + ) as f: + t = jinja2.Template(f.read()) + + # Render the template and write to the target + with (Path(".") / f"{rst_target_name}.rst").open("w", encoding="utf-8") as f: + f.write(t.render(**kwargs)) diff --git a/doc/conftest.py b/doc/conftest.py index 5468184bf5509..ad8d6eb8cfb62 100644 --- a/doc/conftest.py +++ b/doc/conftest.py @@ -1,20 +1,20 @@ import os -from os.path import exists -from os.path import join from os import environ -import warnings +from os.path import exists, join + +import pytest +from _pytest.doctest import DoctestItem -from sklearn.utils import IS_PYPY -from sklearn.utils._testing import SkipTest -from sklearn.utils._testing import check_skip_network from sklearn.datasets import get_data_home from sklearn.datasets._base import _pkl_filepath from sklearn.datasets._twenty_newsgroups import CACHE_NAME +from sklearn.utils._testing import SkipTest, check_skip_network +from sklearn.utils.fixes import np_base_version, parse_version, sp_version def setup_labeled_faces(): data_home = get_data_home() - if not exists(join(data_home, 'lfw_home')): + if not exists(join(data_home, "lfw_home")): raise SkipTest("Skipping dataset loading doctests") @@ -27,15 +27,12 @@ def setup_rcv1(): def setup_twenty_newsgroups(): - data_home = get_data_home() cache_path = _pkl_filepath(get_data_home(), CACHE_NAME) if not exists(cache_path): raise SkipTest("Skipping dataset loading doctests") def setup_working_with_text_data(): - if IS_PYPY and os.environ.get('CI', None): - raise SkipTest('Skipping too slow test with PyPy on CI') check_skip_network() cache_path = _pkl_filepath(get_data_home(), CACHE_NAME) if not exists(cache_path): @@ -44,80 +41,139 @@ def setup_working_with_text_data(): def setup_loading_other_datasets(): try: - import pandas # noqa + import pandas # noqa: F401 except ImportError: - raise SkipTest("Skipping loading_other_datasets.rst, " - "pandas not installed") + raise SkipTest("Skipping loading_other_datasets.rst, pandas not installed") # checks SKLEARN_SKIP_NETWORK_TESTS to see if test should run - run_network_tests = environ.get("SKLEARN_SKIP_NETWORK_TESTS", '1') == "0" + run_network_tests = environ.get("SKLEARN_SKIP_NETWORK_TESTS", "1") == "0" if not run_network_tests: - raise SkipTest("Skipping loading_other_datasets.rst, tests can be " - "enabled by settting SKLEARN_SKIP_NETWORK_TESTS=0") + raise SkipTest( + "Skipping loading_other_datasets.rst, tests can be " + "enabled by setting SKLEARN_SKIP_NETWORK_TESTS=0" + ) def setup_compose(): try: - import pandas # noqa + import pandas # noqa: F401 except ImportError: raise SkipTest("Skipping compose.rst, pandas not installed") def setup_impute(): try: - import pandas # noqa + import pandas # noqa: F401 except ImportError: raise SkipTest("Skipping impute.rst, pandas not installed") def setup_grid_search(): try: - import pandas # noqa + import pandas # noqa: F401 except ImportError: raise SkipTest("Skipping grid_search.rst, pandas not installed") def setup_preprocessing(): try: - import pandas # noqa + 
import pandas # noqa: F401 except ImportError: raise SkipTest("Skipping preprocessing.rst, pandas not installed") -def setup_unsupervised_learning(): +def skip_if_matplotlib_not_installed(fname): try: - import skimage # noqa + import matplotlib # noqa: F401 except ImportError: - raise SkipTest("Skipping unsupervised_learning.rst, scikit-image " - "not installed") - # ignore deprecation warnings from scipy.misc.face - warnings.filterwarnings('ignore', 'The binary mode of fromstring', - DeprecationWarning) + basename = os.path.basename(fname) + raise SkipTest(f"Skipping doctests for {basename}, matplotlib not installed") + + +def skip_if_cupy_not_installed(fname): + try: + import cupy # noqa: F401 + except ImportError: + basename = os.path.basename(fname) + raise SkipTest(f"Skipping doctests for {basename}, cupy not installed") def pytest_runtest_setup(item): fname = item.fspath.strpath - is_index = fname.endswith('datasets/index.rst') - if fname.endswith('datasets/labeled_faces.rst') or is_index: + # normalize filename to use forward slashes on Windows for easier handling + # later + fname = fname.replace(os.sep, "/") + + is_index = fname.endswith("datasets/index.rst") + if fname.endswith("datasets/labeled_faces.rst") or is_index: setup_labeled_faces() - elif fname.endswith('datasets/rcv1.rst') or is_index: + elif fname.endswith("datasets/rcv1.rst") or is_index: setup_rcv1() - elif fname.endswith('datasets/twenty_newsgroups.rst') or is_index: + elif fname.endswith("datasets/twenty_newsgroups.rst") or is_index: setup_twenty_newsgroups() - elif fname.endswith('tutorial/text_analytics/working_with_text_data.rst')\ - or is_index: - setup_working_with_text_data() - elif fname.endswith('modules/compose.rst') or is_index: + elif fname.endswith("modules/compose.rst") or is_index: setup_compose() - elif IS_PYPY and fname.endswith('modules/feature_extraction.rst'): - raise SkipTest('FeatureHasher is not compatible with PyPy') - elif fname.endswith('datasets/loading_other_datasets.rst'): + elif fname.endswith("datasets/loading_other_datasets.rst"): setup_loading_other_datasets() - elif fname.endswith('modules/impute.rst'): + elif fname.endswith("modules/impute.rst"): setup_impute() - elif fname.endswith('modules/grid_search.rst'): + elif fname.endswith("modules/grid_search.rst"): setup_grid_search() - elif fname.endswith('modules/preprocessing.rst'): + elif fname.endswith("modules/preprocessing.rst"): setup_preprocessing() - elif fname.endswith('statistical_inference/unsupervised_learning.rst'): - setup_unsupervised_learning() + + rst_files_requiring_matplotlib = [ + "modules/partial_dependence.rst", + "modules/tree.rst", + ] + for each in rst_files_requiring_matplotlib: + if fname.endswith(each): + skip_if_matplotlib_not_installed(fname) + + if fname.endswith("array_api.rst"): + skip_if_cupy_not_installed(fname) + + +def pytest_configure(config): + # Use matplotlib agg backend during the tests including doctests + try: + import matplotlib + + matplotlib.use("agg") + except ImportError: + pass + + +def pytest_collection_modifyitems(config, items): + """Called after collect is completed. + + Parameters + ---------- + config : pytest config + items : list of collected items + """ + skip_doctests = False + if np_base_version < parse_version("2"): + # TODO: configure numpy to output scalar arrays as regular Python scalars + # once possible to improve readability of the tests docstrings. 
+ # https://numpy.org/neps/nep-0051-scalar-representation.html#implementation + reason = "Due to NEP 51 numpy scalar repr has changed in numpy 2" + skip_doctests = True + + if sp_version < parse_version("1.14"): + reason = "Scipy sparse matrix repr has changed in scipy 1.14" + skip_doctests = True + + # Normally doctest has the entire module's scope. Here we set globs to an empty dict + # to remove the module's scope: + # https://docs.python.org/3/library/doctest.html#what-s-the-execution-context + for item in items: + if isinstance(item, DoctestItem): + item.dtest.globs = {} + + if skip_doctests: + skip_marker = pytest.mark.skip(reason=reason) + + for item in items: + if isinstance(item, DoctestItem): + item.add_marker(skip_marker) diff --git a/doc/contents.rst b/doc/contents.rst deleted file mode 100644 index a28634621d558..0000000000000 --- a/doc/contents.rst +++ /dev/null @@ -1,24 +0,0 @@ -.. include:: includes/big_toc_css.rst -.. include:: tune_toc.rst - -.. Places global toc into the sidebar - -:globalsidebartoc: True - -================= -Table Of Contents -================= - -.. Define an order for the Table of Contents: - -.. toctree:: - :maxdepth: 2 - - preface - tutorial/index - getting_started - user_guide - glossary - auto_examples/index - modules/classes - developers/index diff --git a/doc/contributor_experience_team.rst b/doc/contributor_experience_team.rst new file mode 100644 index 0000000000000..73ccd668b20cd --- /dev/null +++ b/doc/contributor_experience_team.rst @@ -0,0 +1,52 @@ +.. raw :: html + + +
[contributor cards rendered as raw HTML: an avatar image and a GitHub profile link for
each of Virgil Chan, Juan Carlos Alfaro Jiménez, Lucy Liu, Maxwell Liu,
Juan Martin Loyola, Sylvain Marié, Norbert Preining, Stefanie Senger, Reshama Shaikh,
Albert Thomas, and Maren Westermann]
diff --git a/doc/contributor_experience_team_emeritus.rst b/doc/contributor_experience_team_emeritus.rst new file mode 100644 index 0000000000000..a833907dd5e4a --- /dev/null +++ b/doc/contributor_experience_team_emeritus.rst @@ -0,0 +1 @@ +- Chiara Marmo diff --git a/doc/themes/scikit-learn/static/css/examples.css b/doc/css/.gitkeep similarity index 100% rename from doc/themes/scikit-learn/static/css/examples.css rename to doc/css/.gitkeep diff --git a/doc/data_transforms.rst b/doc/data_transforms.rst index 084214cb094f5..536539ec97007 100644 --- a/doc/data_transforms.rst +++ b/doc/data_transforms.rst @@ -1,9 +1,3 @@ -.. Places parent toc into the sidebar - -:parenttoc: True - -.. include:: includes/big_toc_css.rst - .. _data-transforms: Dataset transformations diff --git a/doc/datasets.rst b/doc/datasets.rst index b9484a02ce84c..f12e5095cc6a8 100644 --- a/doc/datasets.rst +++ b/doc/datasets.rst @@ -1,9 +1,3 @@ -.. Places parent toc into the sidebar - -:parenttoc: True - -.. include:: includes/big_toc_css.rst - .. _datasets: ========================= @@ -12,12 +6,9 @@ Dataset loading utilities .. currentmodule:: sklearn.datasets -The ``sklearn.datasets`` package embeds some small toy datasets -as introduced in the :ref:`Getting Started ` section. - -This package also features helpers to fetch larger datasets commonly -used by the machine learning community to benchmark algorithms on data -that comes from the 'real world'. +The ``sklearn.datasets`` package embeds some small toy datasets and provides helpers +to fetch larger datasets commonly used by the machine learning community to benchmark +algorithms on data that comes from the 'real world'. To evaluate the impact of the scale of the dataset (``n_samples`` and ``n_features``) while controlling the statistical properties of the data @@ -42,7 +33,7 @@ length ``n_samples``, containing the target values, with key ``target``. The Bunch object is a dictionary that exposes its keys as attributes. For more information about Bunch object, see :class:`~sklearn.utils.Bunch`. -It's also possible for almost all of these function to constrain the output +It's also possible for almost all of these functions to constrain the output to be a tuple containing only the data and the target, by setting the ``return_X_y`` parameter to ``True``. diff --git a/doc/datasets/loading_other_datasets.rst b/doc/datasets/loading_other_datasets.rst index 131d6ca9757f5..84d042f64c9d3 100644 --- a/doc/datasets/loading_other_datasets.rst +++ b/doc/datasets/loading_other_datasets.rst @@ -1,7 +1,3 @@ -.. Places parent toc into the sidebar - -:parenttoc: True - .. _loading_other_datasets: Loading other datasets @@ -23,24 +19,29 @@ and pipelines on 2D data. load_sample_images load_sample_image -.. image:: ../auto_examples/cluster/images/sphx_glr_plot_color_quantization_001.png - :target: ../auto_examples/cluster/plot_color_quantization.html +.. plot:: + :context: close-figs :scale: 30 :align: right + :include-source: False + import matplotlib.pyplot as plt + from sklearn.datasets import load_sample_image + + china = load_sample_image("china.jpg") + plt.imshow(china) + plt.axis('off') + plt.tight_layout() + plt.show() .. warning:: The default coding of images is based on the ``uint8`` dtype to spare memory. Often machine learning algorithms work best if the input is converted to a floating point representation first. 
Also, - if you plan to use ``matplotlib.pyplpt.imshow``, don't forget to scale to the range + if you plan to use ``matplotlib.pyplot.imshow``, don't forget to scale to the range 0 - 1 as done in the following example. -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_cluster_plot_color_quantization.py` - .. _libsvm_loader: Datasets in svmlight / libsvm format @@ -52,7 +53,7 @@ takes the form ``
- -Note that in order to avoid potential conflicts with other packages it is -strongly recommended to use a `virtual environment (venv) -`_ or a `conda environment -`_. - -Using such an isolated environment makes it possible to install a specific -version of scikit-learn with pip or conda and its dependencies independently of -any previously installed Python packages. In particular under Linux is it -discouraged to install pip packages alongside the packages managed by the + .. tab-item:: pip + :class-label: tab-6 + :sync: package-manager-pip + + Python 3 is usually installed by default on most Linux distributions. To + check if you have it installed, try: + + .. prompt:: bash + + python3 --version + pip3 --version + + If you don't have Python 3 installed, please install `python3` and + `python3-pip` from your distribution's package manager. + + Now create a `virtual environment (venv) + `_ and install scikit-learn. + Note that the virtual environment is optional but strongly recommended, in + order to avoid potential conflicts with other packages. + + .. prompt:: bash + + python3 -m venv sklearn-env + source sklearn-env/bin/activate # activate + pip3 install -U scikit-learn + + In order to check your installation, you can use: + + .. prompt:: bash + + python3 -m pip show scikit-learn # show scikit-learn version and location + python3 -m pip freeze # show all installed packages in the environment + python3 -c "import sklearn; sklearn.show_versions()" + + .. tab-item:: conda + :class-label: tab-6 + :sync: package-manager-conda + + .. include:: ./install_instructions_conda.rst + + +Using an isolated environment such as pip venv or conda makes it possible to +install a specific version of scikit-learn with pip or conda and its dependencies +independently of any previously installed Python packages. In particular under Linux +it is discouraged to install pip packages alongside the packages managed by the package manager of the distribution (apt, dnf, pacman...). Note that you should always remember to activate the environment of your choice @@ -128,11 +191,10 @@ and NumPy and SciPy are not recompiled from source, which can happen when using particular configurations of operating system and hardware (such as Linux on a Raspberry Pi). - -Scikit-learn plotting capabilities (i.e., functions start with "plot\_" -and classes end with "Display") require Matplotlib. The examples require +Scikit-learn plotting capabilities (i.e., functions starting with `plot\_` +and classes ending with `Display`) require Matplotlib. The examples require Matplotlib and some examples require scikit-image, pandas, or seaborn. The -minimum version of Scikit-learn dependencies are listed below along with its +minimum version of scikit-learn dependencies are listed below along with its purpose. .. include:: min_dependency_table.rst @@ -140,37 +202,24 @@ purpose. .. warning:: Scikit-learn 0.20 was the last version to support Python 2.7 and Python 3.4. - Scikit-learn 0.21 supported Python 3.5-3.7. - Scikit-learn 0.22 supported Python 3.5-3.8. - Scikit-learn now requires Python 3.6 or newer. - -.. note:: + Scikit-learn 0.21 supported Python 3.5—3.7. - For installing on PyPy, PyPy3-v5.10+, Numpy 1.14.0+, and scipy 1.1.0+ - are required. + Scikit-learn 0.22 supported Python 3.5—3.8. -.. _install_on_apple_silicon_m1: + Scikit-learn 0.23 required Python 3.6—3.8. -Installing on Apple Silicon M1 hardware -======================================= + Scikit-learn 0.24 required Python 3.6—3.9. 
-The recently introduced `macos/arm64` platform (sometimes also known as -`macos/aarch64`) requires the open source community to upgrade the build -configuation and automation to properly support it. + Scikit-learn 1.0 supported Python 3.7—3.10. -At the time of writing (January 2021), the only way to get a working -installation of scikit-learn on this hardware is to install scikit-learn and its -dependencies from the conda-forge distribution, for instance using the miniforge -installers: + Scikit-learn 1.1, 1.2 and 1.3 supported Python 3.8—3.12. -https://github.com/conda-forge/miniforge + Scikit-learn 1.4 and 1.5 supported Python 3.9—3.12. -The following issue tracks progress on making it possible to install -scikit-learn from PyPI with pip: - -https://github.com/scikit-learn/scikit-learn/issues/19137 + Scikit-learn 1.6 supported Python 3.9—3.13. + Scikit-learn 1.7 requires Python 3.10 or newer. .. _install_by_distribution: @@ -187,6 +236,19 @@ dependencies (numpy, scipy) that scikit-learn requires. The following is an incomplete list of OS and python distributions that provide their own version of scikit-learn. +Alpine Linux +------------ + +Alpine Linux's package is provided through the `official repositories +`__ as +``py3-scikit-learn`` for Python. +It can be installed by typing the following command: + +.. prompt:: bash + + sudo apk add py3-scikit-learn + + Arch Linux ---------- @@ -195,7 +257,7 @@ Arch Linux's package is provided through the `official repositories ``python-scikit-learn`` for Python. It can be installed by typing the following command: -.. prompt:: bash $ +.. prompt:: bash sudo pacman -S python-scikit-learn @@ -203,26 +265,26 @@ It can be installed by typing the following command: Debian/Ubuntu ------------- -The Debian/Ubuntu package is splitted in three different packages called +The Debian/Ubuntu package is split in three different packages called ``python3-sklearn`` (python modules), ``python3-sklearn-lib`` (low-level -implementations and bindings), ``python3-sklearn-doc`` (documentation). -Only the Python 3 version is available in the Debian Buster (the more recent -Debian distribution). +implementations and bindings), ``python-sklearn-doc`` (documentation). +Note that scikit-learn requires Python 3, hence the need to use the `python3-` +suffixed package names. Packages can be installed using ``apt-get``: -.. prompt:: bash $ +.. prompt:: bash - sudo apt-get install python3-sklearn python3-sklearn-lib python3-sklearn-doc + sudo apt-get install python3-sklearn python3-sklearn-lib python-sklearn-doc Fedora ------ The Fedora package is called ``python3-scikit-learn`` for the python 3 version, -the only one available in Fedora30. +the only one available in Fedora. It can be installed using ``dnf``: -.. prompt:: bash $ +.. prompt:: bash sudo dnf install python3-scikit-learn @@ -230,10 +292,8 @@ It can be installed using ``dnf``: NetBSD ------ -scikit-learn is available via `pkgsrc-wip -`_: - - http://pkgsrc.se/math/py-scikit-learn +scikit-learn is available via `pkgsrc-wip `_: +https://pkgsrc.se/math/py-scikit-learn MacPorts for Mac OSX @@ -244,9 +304,9 @@ where ``XY`` denotes the Python version. It can be installed by typing the following command: -.. prompt:: bash $ +.. prompt:: bash - sudo port install py36-scikit-learn + sudo port install py312-scikit-learn Anaconda and Enthought Deployment Manager for all supported platforms @@ -260,30 +320,39 @@ python library for Windows, Mac OSX and Linux. Anaconda offers scikit-learn as part of its free distribution. 
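+
+If more than one of these installations ends up on the same machine (for example a
+distribution package next to a ``pip`` installed copy in a virtual environment), it
+can help to check which build Python actually imports. A minimal check, valid for any
+of the setups above, is to print the version and the location of the imported package:
+
+.. prompt:: bash
+
+    python3 -c "import sklearn; print(sklearn.__version__, sklearn.__file__)"
+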
-Intel conda channel -------------------- +Intel Extension for Scikit-learn +-------------------------------- -Intel maintains a dedicated conda channel that ships scikit-learn: +Intel maintains an optimized x86_64 package, available in PyPI (via `pip`), +and in the `main`, `conda-forge` and `intel` conda channels: -.. prompt:: bash $ +.. prompt:: bash - conda install -c intel scikit-learn + conda install scikit-learn-intelex -This version of scikit-learn comes with alternative solvers for some common -estimators. Those solvers come from the DAAL C++ library and are optimized for -multi-core Intel CPUs. +This package has an Intel optimized version of many estimators. Whenever +an alternative implementation doesn't exist, scikit-learn implementation +is used as a fallback. Those optimized solvers come from the oneDAL +C++ library and are optimized for the x86_64 architecture, and are +optimized for multi-core Intel CPUs. Note that those solvers are not enabled by default, please refer to the -`daal4py `_ documentation -for more details. +`scikit-learn-intelex `_ +documentation for more details on usage scenarios. Direct export example: + +.. prompt:: python >>> + + from sklearnex.neighbors import NearestNeighbors Compatibility with the standard scikit-learn solvers is checked by running the full scikit-learn test suite via automated continuous integration as reported -on https://github.com/IntelPython/daal4py. +on https://github.com/intel/scikit-learn-intelex. If you observe any issue +with `scikit-learn-intelex`, please report the issue on their +`issue tracker `__. WinPython for Windows ------------------------ +--------------------- The `WinPython `_ project distributes scikit-learn as an additional plugin. @@ -292,6 +361,10 @@ scikit-learn as an additional plugin. Troubleshooting =============== +If you encounter unexpected failures when installing scikit-learn, you may submit +an issue to the `issue tracker `_. +Before that, please also make sure to check the following common issues. + .. _windows_longpath: Error caused by file path length limit on Windows @@ -305,7 +378,7 @@ size limit of Windows if Python is installed in a nested location such as the Collecting scikit-learn ... Installing collected packages: scikit-learn - ERROR: Could not install packages due to an EnvironmentError: [Errno 2] No such file or directory: 'C:\\Users\\username\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.7_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python37\\site-packages\\sklearn\\datasets\\tests\\data\\openml\\292\\api-v1-json-data-list-data_name-australian-limit-2-data_version-1-status-deactivated.json.gz' + ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'C:\\Users\\username\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.7_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python37\\site-packages\\sklearn\\datasets\\tests\\data\\openml\\292\\api-v1-json-data-list-data_name-australian-limit-2-data_version-1-status-deactivated.json.gz' In this case it is possible to lift that limit in the Windows registry by using the ``regedit`` tool: @@ -321,6 +394,6 @@ using the ``regedit`` tool: #. Reinstall scikit-learn (ignoring the previous broken installation): -.. prompt:: python $ + .. 
prompt:: powershell - pip install --exists-action=i scikit-learn + pip install --exists-action=i scikit-learn diff --git a/doc/install_instructions_conda.rst b/doc/install_instructions_conda.rst new file mode 100644 index 0000000000000..0b5a57b747021 --- /dev/null +++ b/doc/install_instructions_conda.rst @@ -0,0 +1,16 @@ +Install conda using the +`conda-forge installers `__ (no +administrator permission required). Then run: + +.. prompt:: bash + + conda create -n sklearn-env -c conda-forge scikit-learn + conda activate sklearn-env + +In order to check your installation, you can use: + +.. prompt:: bash + + conda list scikit-learn # show scikit-learn version and location + conda list # show all installed packages in the environment + python -c "import sklearn; sklearn.show_versions()" diff --git a/doc/js/scripts/api-search.js b/doc/js/scripts/api-search.js new file mode 100644 index 0000000000000..2148e0c429aaa --- /dev/null +++ b/doc/js/scripts/api-search.js @@ -0,0 +1,12 @@ +/** + * This script is for initializing the search table on the API index page. See + * DataTables documentation for more information: https://datatables.net/ + */ + +document.addEventListener("DOMContentLoaded", function () { + new DataTable("table.apisearch-table", { + order: [], // Keep original order + lengthMenu: [10, 25, 50, 100, { label: "All", value: -1 }], + pageLength: -1, // Show all entries by default + }); +}); diff --git a/doc/js/scripts/dropdown.js b/doc/js/scripts/dropdown.js new file mode 100644 index 0000000000000..d74d138773eed --- /dev/null +++ b/doc/js/scripts/dropdown.js @@ -0,0 +1,63 @@ +/** + * This script is used to add the functionality of collapsing/expanding all dropdowns + * on the page to the sphinx-design dropdowns. This is because some browsers cannot + * search into collapsed
(such as Firefox). + * + * The reason why the buttons are added to the page with JS (dynamic) instead of with + * sphinx (static) is that the button will not work without JS activated, so we do not + * want them to show up in that case. + */ + +document.addEventListener("DOMContentLoaded", () => { + // Get all sphinx-design dropdowns + const allDropdowns = document.querySelectorAll("details.sd-dropdown"); + + allDropdowns.forEach((dropdown) => { + // Get the summary element of the dropdown, where we will place the buttons + const summaryTitle = dropdown.querySelector("summary.sd-summary-title"); + + // The state marker with the toggle all icon inside + const newStateMarker = document.createElement("span"); + const newIcon = document.createElement("i"); + newIcon.classList.add("fa-solid", "fa-angles-right"); + newStateMarker.appendChild(newIcon); + + // Classes for styling; `sd-summary-state-marker` and `sd-summary-chevron-right` are + // implemented by sphinx-design; `sk-toggle-all` is implemented by us + newStateMarker.classList.add( + "sd-summary-state-marker", + "sd-summary-chevron-right", + "sk-toggle-all" + ); + + // Bootstrap tooltip configurations + newStateMarker.setAttribute("data-bs-toggle", "tooltip"); + newStateMarker.setAttribute("data-bs-placement", "top"); + newStateMarker.setAttribute("data-bs-offset", "0,10"); + newStateMarker.setAttribute("data-bs-title", "Toggle all dropdowns"); + // Enable the tooltip + new bootstrap.Tooltip(newStateMarker); + + // Assign the collapse/expand action to the state marker + newStateMarker.addEventListener("click", () => { + if (dropdown.open) { + console.log("[SK] Collapsing all dropdowns..."); + allDropdowns.forEach((node) => { + if (node !== dropdown) { + node.removeAttribute("open"); + } + }); + } else { + console.log("[SK] Expanding all dropdowns..."); + allDropdowns.forEach((node) => { + if (node !== dropdown) { + node.setAttribute("open", ""); + } + }); + } + }); + + // Append the state marker to the summary element + summaryTitle.insertBefore(newStateMarker, summaryTitle.lastElementChild); + }); +}); diff --git a/doc/js/scripts/sg_plotly_resize.js b/doc/js/scripts/sg_plotly_resize.js new file mode 100644 index 0000000000000..2d2611910db78 --- /dev/null +++ b/doc/js/scripts/sg_plotly_resize.js @@ -0,0 +1,10 @@ +// Related to https://github.com/scikit-learn/scikit-learn/issues/30279 +// There an interaction between plotly and bootstrap/pydata-sphinx-theme +// that causes plotly figures to not detect the right-hand sidebar width + +// Plotly figures are responsive, this triggers a resize event once the DOM has +// finished loading so that they resize themselves. + +document.addEventListener("DOMContentLoaded", () => { + window.dispatchEvent(new Event("resize")); +}); diff --git a/doc/js/scripts/vendor/svg-pan-zoom.min.js b/doc/js/scripts/vendor/svg-pan-zoom.min.js new file mode 100644 index 0000000000000..bde44a689bfe1 --- /dev/null +++ b/doc/js/scripts/vendor/svg-pan-zoom.min.js @@ -0,0 +1,31 @@ +/** + * svg-pan-zoom v3.6.2 + * + * https://github.com/bumbu/svg-pan-zoom + * + * Copyright 2009-2010 Andrea Leofreddi + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. 
+ * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +!function s(r,a,l){function u(e,t){if(!a[e]){if(!r[e]){var o="function"==typeof require&&require;if(!t&&o)return o(e,!0);if(h)return h(e,!0);var n=new Error("Cannot find module '"+e+"'");throw n.code="MODULE_NOT_FOUND",n}var i=a[e]={exports:{}};r[e][0].call(i.exports,function(t){return u(r[e][1][t]||t)},i,i.exports,s,r,a,l)}return a[e].exports}for(var h="function"==typeof require&&require,t=0;tthis.options.maxZoom*n.zoom&&(t=this.options.maxZoom*n.zoom/this.getZoom());var i=this.viewport.getCTM(),s=e.matrixTransform(i.inverse()),r=this.svg.createSVGMatrix().translate(s.x,s.y).scale(t).translate(-s.x,-s.y),a=i.multiply(r);a.a!==i.a&&this.viewport.setCTM(a)},i.prototype.zoom=function(t,e){this.zoomAtPoint(t,a.getSvgCenterPoint(this.svg,this.width,this.height),e)},i.prototype.publicZoom=function(t,e){e&&(t=this.computeFromRelativeZoom(t)),this.zoom(t,e)},i.prototype.publicZoomAtPoint=function(t,e,o){if(o&&(t=this.computeFromRelativeZoom(t)),"SVGPoint"!==r.getType(e)){if(!("x"in e&&"y"in e))throw new Error("Given point is invalid");e=a.createSVGPoint(this.svg,e.x,e.y)}this.zoomAtPoint(t,e,o)},i.prototype.getZoom=function(){return this.viewport.getZoom()},i.prototype.getRelativeZoom=function(){return this.viewport.getRelativeZoom()},i.prototype.computeFromRelativeZoom=function(t){return t*this.viewport.getOriginalState().zoom},i.prototype.resetZoom=function(){var t=this.viewport.getOriginalState();this.zoom(t.zoom,!0)},i.prototype.resetPan=function(){this.pan(this.viewport.getOriginalState())},i.prototype.reset=function(){this.resetZoom(),this.resetPan()},i.prototype.handleDblClick=function(t){var e;if((this.options.preventMouseEventsDefault&&(t.preventDefault?t.preventDefault():t.returnValue=!1),this.options.controlIconsEnabled)&&-1<(t.target.getAttribute("class")||"").indexOf("svg-pan-zoom-control"))return!1;e=t.shiftKey?1/(2*(1+this.options.zoomScaleSensitivity)):2*(1+this.options.zoomScaleSensitivity);var 
o=a.getEventPoint(t,this.svg).matrixTransform(this.svg.getScreenCTM().inverse());this.zoomAtPoint(e,o)},i.prototype.handleMouseDown=function(t,e){this.options.preventMouseEventsDefault&&(t.preventDefault?t.preventDefault():t.returnValue=!1),r.mouseAndTouchNormalize(t,this.svg),this.options.dblClickZoomEnabled&&r.isDblClick(t,e)?this.handleDblClick(t):(this.state="pan",this.firstEventCTM=this.viewport.getCTM(),this.stateOrigin=a.getEventPoint(t,this.svg).matrixTransform(this.firstEventCTM.inverse()))},i.prototype.handleMouseMove=function(t){if(this.options.preventMouseEventsDefault&&(t.preventDefault?t.preventDefault():t.returnValue=!1),"pan"===this.state&&this.options.panEnabled){var e=a.getEventPoint(t,this.svg).matrixTransform(this.firstEventCTM.inverse()),o=this.firstEventCTM.translate(e.x-this.stateOrigin.x,e.y-this.stateOrigin.y);this.viewport.setCTM(o)}},i.prototype.handleMouseUp=function(t){this.options.preventMouseEventsDefault&&(t.preventDefault?t.preventDefault():t.returnValue=!1),"pan"===this.state&&(this.state="none")},i.prototype.fit=function(){var t=this.viewport.getViewBox(),e=Math.min(this.width/t.width,this.height/t.height);this.zoom(e,!0)},i.prototype.contain=function(){var t=this.viewport.getViewBox(),e=Math.max(this.width/t.width,this.height/t.height);this.zoom(e,!0)},i.prototype.center=function(){var t=this.viewport.getViewBox(),e=.5*(this.width-(t.width+2*t.x)*this.getZoom()),o=.5*(this.height-(t.height+2*t.y)*this.getZoom());this.getPublicInstance().pan({x:e,y:o})},i.prototype.updateBBox=function(){this.viewport.simpleViewBoxCache()},i.prototype.pan=function(t){var e=this.viewport.getCTM();e.e=t.x,e.f=t.y,this.viewport.setCTM(e)},i.prototype.panBy=function(t){var e=this.viewport.getCTM();e.e+=t.x,e.f+=t.y,this.viewport.setCTM(e)},i.prototype.getPan=function(){var t=this.viewport.getState();return{x:t.x,y:t.y}},i.prototype.resize=function(){var t=a.getBoundingClientRectNormalized(this.svg);this.width=t.width,this.height=t.height;var e=this.viewport;e.options.width=this.width,e.options.height=this.height,e.processCTM(),this.options.controlIconsEnabled&&(this.getPublicInstance().disableControlIcons(),this.getPublicInstance().enableControlIcons())},i.prototype.destroy=function(){var e=this;for(var t in this.beforeZoom=null,this.onZoom=null,this.beforePan=null,this.onPan=null,(this.onUpdatedCTM=null)!=this.options.customEventsHandler&&this.options.customEventsHandler.destroy({svgElement:this.svg,eventsListenerElement:this.options.eventsListenerElement,instance:this.getPublicInstance()}),this.eventListeners)(this.options.eventsListenerElement||this.svg).removeEventListener(t,this.eventListeners[t],!this.options.preventMouseEventsDefault&&h);this.disableMouseWheelZoom(),this.getPublicInstance().disableControlIcons(),this.reset(),c=c.filter(function(t){return t.svg!==e.svg}),delete this.options,delete this.viewport,delete this.publicInstance,delete this.pi,this.getPublicInstance=function(){return null}},i.prototype.getPublicInstance=function(){var o=this;return this.publicInstance||(this.publicInstance=this.pi={enablePan:function(){return o.options.panEnabled=!0,o.pi},disablePan:function(){return o.options.panEnabled=!1,o.pi},isPanEnabled:function(){return!!o.options.panEnabled},pan:function(t){return o.pan(t),o.pi},panBy:function(t){return o.panBy(t),o.pi},getPan:function(){return o.getPan()},setBeforePan:function(t){return o.options.beforePan=null===t?null:r.proxy(t,o.publicInstance),o.pi},setOnPan:function(t){return 
o.options.onPan=null===t?null:r.proxy(t,o.publicInstance),o.pi},enableZoom:function(){return o.options.zoomEnabled=!0,o.pi},disableZoom:function(){return o.options.zoomEnabled=!1,o.pi},isZoomEnabled:function(){return!!o.options.zoomEnabled},enableControlIcons:function(){return o.options.controlIconsEnabled||(o.options.controlIconsEnabled=!0,s.enable(o)),o.pi},disableControlIcons:function(){return o.options.controlIconsEnabled&&(o.options.controlIconsEnabled=!1,s.disable(o)),o.pi},isControlIconsEnabled:function(){return!!o.options.controlIconsEnabled},enableDblClickZoom:function(){return o.options.dblClickZoomEnabled=!0,o.pi},disableDblClickZoom:function(){return o.options.dblClickZoomEnabled=!1,o.pi},isDblClickZoomEnabled:function(){return!!o.options.dblClickZoomEnabled},enableMouseWheelZoom:function(){return o.enableMouseWheelZoom(),o.pi},disableMouseWheelZoom:function(){return o.disableMouseWheelZoom(),o.pi},isMouseWheelZoomEnabled:function(){return!!o.options.mouseWheelZoomEnabled},setZoomScaleSensitivity:function(t){return o.options.zoomScaleSensitivity=t,o.pi},setMinZoom:function(t){return o.options.minZoom=t,o.pi},setMaxZoom:function(t){return o.options.maxZoom=t,o.pi},setBeforeZoom:function(t){return o.options.beforeZoom=null===t?null:r.proxy(t,o.publicInstance),o.pi},setOnZoom:function(t){return o.options.onZoom=null===t?null:r.proxy(t,o.publicInstance),o.pi},zoom:function(t){return o.publicZoom(t,!0),o.pi},zoomBy:function(t){return o.publicZoom(t,!1),o.pi},zoomAtPoint:function(t,e){return o.publicZoomAtPoint(t,e,!0),o.pi},zoomAtPointBy:function(t,e){return o.publicZoomAtPoint(t,e,!1),o.pi},zoomIn:function(){return this.zoomBy(1+o.options.zoomScaleSensitivity),o.pi},zoomOut:function(){return this.zoomBy(1/(1+o.options.zoomScaleSensitivity)),o.pi},getZoom:function(){return o.getRelativeZoom()},setOnUpdatedCTM:function(t){return o.options.onUpdatedCTM=null===t?null:r.proxy(t,o.publicInstance),o.pi},resetZoom:function(){return o.resetZoom(),o.pi},resetPan:function(){return o.resetPan(),o.pi},reset:function(){return o.reset(),o.pi},fit:function(){return o.fit(),o.pi},contain:function(){return o.contain(),o.pi},center:function(){return o.center(),o.pi},updateBBox:function(){return o.updateBBox(),o.pi},resize:function(){return o.resize(),o.pi},getSizes:function(){return{width:o.width,height:o.height,realZoom:o.getZoom(),viewBox:o.viewport.getViewBox()}},destroy:function(){return o.destroy(),o.pi}}),this.publicInstance};var c=[];e.exports=function(t,e){var o=r.getSvg(t);if(null===o)return null;for(var n=c.length-1;0<=n;n--)if(c[n].svg===o)return c[n].instance.getPublicInstance();return c.push({svg:o,instance:new i(o,e)}),c[c.length-1].instance.getPublicInstance()}},{"./control-icons":1,"./shadow-viewport":2,"./svg-utilities":5,"./uniwheel":6,"./utilities":7}],5:[function(t,e,o){var l=t("./utilities"),s="unknown";document.documentMode&&(s="ie"),e.exports={svgNS:"http://www.w3.org/2000/svg",xmlNS:"http://www.w3.org/XML/1998/namespace",xmlnsNS:"http://www.w3.org/2000/xmlns/",xlinkNS:"http://www.w3.org/1999/xlink",evNS:"http://www.w3.org/2001/xml-events",getBoundingClientRectNormalized:function(t){if(t.clientWidth&&t.clientHeight)return{width:t.clientWidth,height:t.clientHeight};if(t.getBoundingClientRect())return t.getBoundingClientRect();throw new Error("Cannot get BoundingClientRect for SVG.")},getOrCreateViewport:function(t,e){var o=null;if(!(o=l.isElement(e)?e:t.querySelector(e))){var 
n=Array.prototype.slice.call(t.childNodes||t.children).filter(function(t){return"defs"!==t.nodeName&&"#text"!==t.nodeName});1===n.length&&"g"===n[0].nodeName&&null===n[0].getAttribute("transform")&&(o=n[0])}if(!o){var i="viewport-"+(new Date).toISOString().replace(/\D/g,"");(o=document.createElementNS(this.svgNS,"g")).setAttribute("id",i);var s=t.childNodes||t.children;if(s&&0 + +## Brand Name +The official name of the package is __scikit-learn__. Do not abbreviate or otherwise alter the name. Always spell ‘scikit’ with a lowercase ‘s’. + + +## Color Palette + +![#29ABE2 Cyan](brand_colors/colorswatch_29ABE2_cyan.png) `RGB 41/171/226 | HEX #29ABE2 | scikit-learn Cyan` | More info: [#29ABE2](https://www.color-hex.com/color/29abe2) + +![#F7931E Orange](brand_colors/colorswatch_F7931E_orange.png) `RGB 247/147/30 | HEX #F7931E | scikit-learn Orange` | More info: [#F7931E](https://www.color-hex.com/color/f7931e) + +![#9B4600 Brown](brand_colors/colorswatch_9B4600_brown.png) `RGB 155/70/0| HEX #9B4600 | scikit-learn Brown` | More info: [#9B4600](https://www.color-hex.com/color/9b4600) + + +## Typography +The following typeface is used in the logo: +- "scikit": Helvetica Neue +- "learn": Script MT + + +## Logos +You may highlight or reference your work with scikit-learn by using one of the logos provided below. Any use must abide by the Logo Integrity Standards defined below. + +| | | +| - | - | +| | __Logo 1__
File type: PNG <br> File size: 49 KB (1280 x 689 px) <br> File name: [1280px-scikit-learn-logo.png](https://github.com/scikit-learn/scikit-learn/blob/main/doc/logos/1280px-scikit-learn-logo.png) |
+| | __Logo 2__ <br> File type: ICO <br> File size: 2 KB (32 x 32 px) <br> File name: [favicon.ico](https://github.com/scikit-learn/scikit-learn/blob/main/doc/logos/favicon.ico) |
+| | __Logo 3__ <br> File type: SVG <br> File size: 5 KB <br> File name: [scikit-learn-logo-without-subtitle.svg](https://github.com/scikit-learn/scikit-learn/blob/main/doc/logos/scikit-learn-logo-without-subtitle.svg) |
+| | __Logo 4__ <br> File type: SVG <br> File size: 4.59 KB <br> File name: [scikit-learn-logo.svg](https://github.com/scikit-learn/scikit-learn/blob/main/doc/logos/scikit-learn-logo.svg) |
+
+
+ + +### Logo Integrity Standards + +- __Minimum Size:__ For consistent legibility, please do not display the scikit-learn logo at less than 50px wide. +- __Scale:__ Ensure any logos used are scaled proportionally. Stretched, compressed, or otherwise distorted versions of the logo should not be displayed. + +- __Clear Space:__ To ensure the logo is clearly visible in all uses, surround it with a sufficient amount of clear space that is free of type, graphics, and other elements that might cause visual clutter. Do not overlap or obscure the logo with text, images, or other elements. The image below demonstrates the suggested amount of clear space margins to use around the logo.
+ +- __Colors:__ Only use logos in the approved color palette defined above. Do not recolor the logo. +- __Typeface:__ Do not change the typeface used in the logo. +- __No Modification:__ Do not attempt recreate or otherwise modify the scikit-learn logo. + + + +--- + +## Reference +- [color-hex](https://www.color-hex.com): Glossary of Color Palettes + +## Other +You can find more variations of the logos here: https://github.com/scikit-learn/blog/tree/main/assets/images diff --git a/doc/logos/brand_colors/colorswatch_29ABE2_cyan.png b/doc/logos/brand_colors/colorswatch_29ABE2_cyan.png new file mode 100644 index 0000000000000..b014a859dd4b9 Binary files /dev/null and b/doc/logos/brand_colors/colorswatch_29ABE2_cyan.png differ diff --git a/doc/logos/brand_colors/colorswatch_9B4600_brown.png b/doc/logos/brand_colors/colorswatch_9B4600_brown.png new file mode 100644 index 0000000000000..379400786ef56 Binary files /dev/null and b/doc/logos/brand_colors/colorswatch_9B4600_brown.png differ diff --git a/doc/logos/brand_colors/colorswatch_F7931E_orange.png b/doc/logos/brand_colors/colorswatch_F7931E_orange.png new file mode 100644 index 0000000000000..5b22b575ac411 Binary files /dev/null and b/doc/logos/brand_colors/colorswatch_F7931E_orange.png differ diff --git a/doc/logos/brand_guidelines/scikitlearn_logo_clearspace_updated.png b/doc/logos/brand_guidelines/scikitlearn_logo_clearspace_updated.png new file mode 100644 index 0000000000000..e10bff7a54c63 Binary files /dev/null and b/doc/logos/brand_guidelines/scikitlearn_logo_clearspace_updated.png differ diff --git a/doc/logos/scikit-learn-logo-without-subtitle.svg b/doc/logos/scikit-learn-logo-without-subtitle.svg new file mode 100644 index 0000000000000..932d418672034 --- /dev/null +++ b/doc/logos/scikit-learn-logo-without-subtitle.svg @@ -0,0 +1 @@ +scikit diff --git a/doc/logos/scikit-learn-logo.svg b/doc/logos/scikit-learn-logo.svg index 523a656943772..362542602e0ae 100644 --- a/doc/logos/scikit-learn-logo.svg +++ b/doc/logos/scikit-learn-logo.svg @@ -1,110 +1 @@ - - - -image/svg+xml - - - - - - - - - - - - - - -scikit - - -machine learning in Python - - \ No newline at end of file +scikitmachine learning in Python diff --git a/doc/machine_learning_map.rst b/doc/machine_learning_map.rst new file mode 100644 index 0000000000000..e63ab1b1ddce6 --- /dev/null +++ b/doc/machine_learning_map.rst @@ -0,0 +1,76 @@ +:html_theme.sidebar_secondary.remove: + +.. _ml_map: + +Choosing the right estimator +============================ + +Often the hardest part of solving a machine learning problem can be finding the right +estimator for the job. Different estimators are better suited for different types of +data and different problems. + +The flowchart below is designed to give users a bit of a rough guide on how to approach +problems with regard to which estimators to try on your data. Click on any estimator in +the chart below to see its documentation. The **Try next** orange arrows are to be read as +"if this estimator does not achieve the desired outcome, then follow the arrow and try +the next one". Use scroll wheel to zoom in and out, and click and drag to pan around. +You can also download the chart: :download:`ml_map.svg `. + +.. raw:: html + + + + + + +
+ +.. raw:: html + :file: images/ml_map.svg + +.. raw:: html + +
diff --git a/doc/maintainers.rst b/doc/maintainers.rst new file mode 100644 index 0000000000000..6b4f3a25c0ddc --- /dev/null +++ b/doc/maintainers.rst @@ -0,0 +1,84 @@ +.. raw :: html + + +
+ +
+
+

Jérémie du Boisberranger

+
+
+
+

Loïc Estève

+
+
+
+

Thomas J. Fan

+
+
+
+

Alexandre Gramfort

+
+
+
+

Olivier Grisel

+
+
+
+

Tim Head

+
+
+
+

Nicolas Hug

+
+
+
+

Adrin Jalali

+
+
+
+

Julien Jerphanion

+
+
+
+

Guillaume Lemaitre

+
+
+
+

Adam Li

+
+
+
+

Lucy Liu

+
+
+
+

Christian Lorentzen

+
+
+
+

Andreas Mueller

+
+
+
+

Joel Nothman

+
+
+
+

Omar Salman

+
+
+
+

Gael Varoquaux

+
+
+
+

Yao Xiao

+
+
+
+

Meekail Zain

+
+
diff --git a/doc/authors_emeritus.rst b/doc/maintainers_emeritus.rst similarity index 73% rename from doc/authors_emeritus.rst rename to doc/maintainers_emeritus.rst index bcfd7d7d0514c..f5640ab2caf31 100644 --- a/doc/authors_emeritus.rst +++ b/doc/maintainers_emeritus.rst @@ -1,9 +1,9 @@ - Mathieu Blondel +- Joris Van den Bossche - Matthieu Brucher - Lars Buitinck - David Cournapeau - Noel Dawe -- Shiqiao Du - Vincent Dubourg - Edouard Duchesnay - Alexander Fabisch @@ -12,6 +12,7 @@ - Angel Soler Gollonet - Chris Gorgolewski - Jaques Grobler +- Yaroslav Halchenko - Brian Holt - Arnaud Joly - Thouis (Ray) Jones @@ -21,13 +22,21 @@ - Wei Li - Paolo Losi - Gilles Louppe +- Jan Hendrik Metzen - Vincent Michel - Jarrod Millman +- Vlad Niculae - Alexandre Passos - Fabian Pedregosa - Peter Prettenhofer +- Hanmin Qin - (Venkat) Raghav, Rajagopalan - Jacob Schreiber +- 杜世抋 Du Shiqiao +- Bertrand Thirion +- Tom DuprÊ la Tour - Jake Vanderplas +- Nelle Varoquaux - David Warde-Farley -- Ron Weiss \ No newline at end of file +- Ron Weiss +- Roman Yurchak diff --git a/doc/make.bat b/doc/make.bat index fa8e7171ea7e6..2a32bcb678f62 100644 --- a/doc/make.bat +++ b/doc/make.bat @@ -9,7 +9,7 @@ if NOT "%PAPER%" == "" ( set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% ) -if "%1" == "" goto help +if "%1" == "" goto html-noplot if "%1" == "help" ( :help @@ -29,8 +29,30 @@ if "%1" == "help" ( ) if "%1" == "clean" ( - for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i - del /q /s %BUILDDIR%\* + if exist %BUILDDIR%\ ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s "%%i" + del /q /s %BUILDDIR%\* + echo. Removed %BUILDDIR%\* + ) + if exist auto_examples\ ( + rmdir /q /s auto_examples + echo. Removed auto_examples\ + ) + if exist generated\ ( + for /d %%i in (generated\*) do rmdir /q /s "%%i" + del /q /s generated\* + echo. Removed generated\* + ) + if exist modules\generated\ ( + rmdir /q /s modules\generated + echo. Removed modules\generated\ + ) + if exist css\styles\ ( + rmdir /q /s css\styles + echo. Removed css\styles\ + ) + for %%i in (api\*.rst) do del /q "%%i" + echo. Removed api\*.rst goto end ) @@ -42,9 +64,11 @@ if "%1" == "html" ( ) if "%1" == "html-noplot" ( + :html-noplot %SPHINXBUILD% -D plot_gallery=0 -b html %ALLSPHINXOPTS% %BUILDDIR%/html echo. echo.Build finished. The HTML pages are in %BUILDDIR%/html + goto end ) if "%1" == "dirhtml" ( diff --git a/doc/metadata_routing.rst b/doc/metadata_routing.rst new file mode 100644 index 0000000000000..d302b84c5de68 --- /dev/null +++ b/doc/metadata_routing.rst @@ -0,0 +1,326 @@ +.. currentmodule:: sklearn + +.. _metadata_routing: + +Metadata Routing +================ + +.. note:: + The Metadata Routing API is experimental, and is not yet implemented for all + estimators. Please refer to the :ref:`list of supported and unsupported + models ` for more information. It may change without + the usual deprecation cycle. By default this feature is not enabled. You can + enable it by setting the ``enable_metadata_routing`` flag to + ``True``:: + + >>> import sklearn + >>> sklearn.set_config(enable_metadata_routing=True) + + Note that the methods and requirements introduced in this document are only + relevant if you want to pass :term:`metadata` (e.g. ``sample_weight``) to a method. + If you're only passing ``X`` and ``y`` and no other parameter / metadata to + methods such as :term:`fit`, :term:`transform`, etc., then you don't need to set + anything. 
+ +This guide demonstrates how :term:`metadata` can be routed and passed between objects in +scikit-learn. If you are developing a scikit-learn compatible estimator or +meta-estimator, you can check our related developer guide: +:ref:`sphx_glr_auto_examples_miscellaneous_plot_metadata_routing.py`. + +Metadata is data that an estimator, scorer, or CV splitter takes into account if the +user explicitly passes it as a parameter. For instance, :class:`~cluster.KMeans` accepts +`sample_weight` in its `fit()` method and considers it to calculate its centroids. +`classes` are consumed by some classifiers and `groups` are used in some splitters, but +any data that is passed into an object's methods apart from X and y can be considered as +metadata. Prior to scikit-learn version 1.3, there was no single API for passing +metadata like that if these objects were used in conjunction with other objects, e.g. a +scorer accepting `sample_weight` inside a :class:`~model_selection.GridSearchCV`. + +With the Metadata Routing API, we can transfer metadata to estimators, scorers, and CV +splitters using :term:`meta-estimators` (such as :class:`~pipeline.Pipeline` or +:class:`~model_selection.GridSearchCV`) or functions such as +:func:`~model_selection.cross_validate` which route data to other objects. In order to +pass metadata to a method like ``fit`` or ``score``, the object consuming the metadata, +must *request* it. This is done via `set_{method}_request()` methods, where `{method}` +is substituted by the name of the method that requests the metadata. For instance, +estimators that use the metadata in their `fit()` method would use `set_fit_request()`, +and scorers would use `set_score_request()`. These methods allow us to specify which +metadata to request, for instance `set_fit_request(sample_weight=True)`. + +For grouped splitters such as :class:`~model_selection.GroupKFold`, a +``groups`` parameter is requested by default. This is best demonstrated by the +following examples. + +Usage Examples +************** +Here we present a few examples to show some common use-cases. Our goal is to pass +`sample_weight` and `groups` through :func:`~model_selection.cross_validate`, which +routes the metadata to :class:`~linear_model.LogisticRegressionCV` and to a custom scorer +made with :func:`~metrics.make_scorer`, both of which *can* use the metadata in their +methods. In these examples we want to individually set whether to use the metadata +within the different :term:`consumers `. + +The examples in this section require the following imports and data:: + + >>> import numpy as np + >>> from sklearn.metrics import make_scorer, accuracy_score + >>> from sklearn.linear_model import LogisticRegressionCV, LogisticRegression + >>> from sklearn.model_selection import cross_validate, GridSearchCV, GroupKFold + >>> from sklearn.feature_selection import SelectKBest + >>> from sklearn.pipeline import make_pipeline + >>> n_samples, n_features = 100, 4 + >>> rng = np.random.RandomState(42) + >>> X = rng.rand(n_samples, n_features) + >>> y = rng.randint(0, 2, size=n_samples) + >>> my_groups = rng.randint(0, 10, size=n_samples) + >>> my_weights = rng.rand(n_samples) + >>> my_other_weights = rng.rand(n_samples) + +Weighted scoring and fitting +---------------------------- + +The splitter used internally in :class:`~linear_model.LogisticRegressionCV`, +:class:`~model_selection.GroupKFold`, requests ``groups`` by default. 
However, we need +to explicitly request `sample_weight` for it and for our custom scorer by specifying +`sample_weight=True` in :class:`~linear_model.LogisticRegressionCV`'s `set_fit_request()` +method and in :func:`~metrics.make_scorer`'s `set_score_request()` method. Both +:term:`consumers ` know how to use ``sample_weight`` in their `fit()` or +`score()` methods. We can then pass the metadata in +:func:`~model_selection.cross_validate` which will route it to any active consumers:: + + >>> weighted_acc = make_scorer(accuracy_score).set_score_request(sample_weight=True) + >>> lr = LogisticRegressionCV( + ... cv=GroupKFold(), + ... scoring=weighted_acc + ... ).set_fit_request(sample_weight=True) + >>> cv_results = cross_validate( + ... lr, + ... X, + ... y, + ... params={"sample_weight": my_weights, "groups": my_groups}, + ... cv=GroupKFold(), + ... scoring=weighted_acc, + ... ) + +Note that in this example, :func:`~model_selection.cross_validate` routes ``my_weights`` +to both the scorer and :class:`~linear_model.LogisticRegressionCV`. + +If we would pass `sample_weight` in the params of +:func:`~model_selection.cross_validate`, but not set any object to request it, +`UnsetMetadataPassedError` would be raised, hinting to us that we need to explicitly set +where to route it. The same applies if ``params={"sample_weights": my_weights, ...}`` +were passed (note the typo, i.e. ``weights`` instead of ``weight``), since +``sample_weights`` was not requested by any of its underlying objects. + +Weighted scoring and unweighted fitting +--------------------------------------- + +When passing metadata such as ``sample_weight`` into a :term:`router` +(:term:`meta-estimators` or routing function), all ``sample_weight`` :term:`consumers +` require weights to be either explicitly requested or explicitly not +requested (i.e. ``True`` or ``False``). Thus, to perform an unweighted fit, we need to +configure :class:`~linear_model.LogisticRegressionCV` to not request sample weights, so +that :func:`~model_selection.cross_validate` does not pass the weights along:: + + >>> weighted_acc = make_scorer(accuracy_score).set_score_request(sample_weight=True) + >>> lr = LogisticRegressionCV( + ... cv=GroupKFold(), scoring=weighted_acc, + ... ).set_fit_request(sample_weight=False) + >>> cv_results = cross_validate( + ... lr, + ... X, + ... y, + ... cv=GroupKFold(), + ... params={"sample_weight": my_weights, "groups": my_groups}, + ... scoring=weighted_acc, + ... ) + +If :meth:`linear_model.LogisticRegressionCV.set_fit_request` had not been called, +:func:`~model_selection.cross_validate` would raise an error because ``sample_weight`` +is passed but :class:`~linear_model.LogisticRegressionCV` would not be explicitly +configured to recognize the weights. + +Unweighted feature selection +---------------------------- + +Routing metadata is only possible if the object's method knows how to use the metadata, +which in most cases means they have it as an explicit parameter. Only then we can set +request values for metadata using `set_fit_request(sample_weight=True)`, for instance. +This makes the object a :term:`consumer `. + +Unlike :class:`~linear_model.LogisticRegressionCV`, +:class:`~feature_selection.SelectKBest` can't consume weights and therefore no request +value for ``sample_weight`` on its instance is set and ``sample_weight`` is not routed +to it:: + + >>> weighted_acc = make_scorer(accuracy_score).set_score_request(sample_weight=True) + >>> lr = LogisticRegressionCV( + ... 
cv=GroupKFold(), scoring=weighted_acc, + ... ).set_fit_request(sample_weight=True) + >>> sel = SelectKBest(k=2) + >>> pipe = make_pipeline(sel, lr) + >>> cv_results = cross_validate( + ... pipe, + ... X, + ... y, + ... cv=GroupKFold(), + ... params={"sample_weight": my_weights, "groups": my_groups}, + ... scoring=weighted_acc, + ... ) + +Different scoring and fitting weights +------------------------------------- + +Despite :func:`~metrics.make_scorer` and +:class:`~linear_model.LogisticRegressionCV` both expecting the key +``sample_weight``, we can use aliases to pass different weights to different +consumers. In this example, we pass ``scoring_weight`` to the scorer, and +``fitting_weight`` to :class:`~linear_model.LogisticRegressionCV`:: + + >>> weighted_acc = make_scorer(accuracy_score).set_score_request( + ... sample_weight="scoring_weight" + ... ) + >>> lr = LogisticRegressionCV( + ... cv=GroupKFold(), scoring=weighted_acc, + ... ).set_fit_request(sample_weight="fitting_weight") + >>> cv_results = cross_validate( + ... lr, + ... X, + ... y, + ... cv=GroupKFold(), + ... params={ + ... "scoring_weight": my_weights, + ... "fitting_weight": my_other_weights, + ... "groups": my_groups, + ... }, + ... scoring=weighted_acc, + ... ) + +API Interface +************* + +A :term:`consumer` is an object (estimator, meta-estimator, scorer, splitter) which +accepts and uses some :term:`metadata` in at least one of its methods (for instance +``fit``, ``predict``, ``inverse_transform``, ``transform``, ``score``, ``split``). +Meta-estimators which only forward the metadata to other objects (child estimators, +scorers, or splitters) and don't use the metadata themselves are not consumers. +(Meta-)Estimators which route metadata to other objects are :term:`routers `. +A(n) (meta-)estimator can be a :term:`consumer` and a :term:`router` at the same time. +(Meta-)Estimators and splitters expose a `set_{method}_request` method for each method +which accepts at least one metadata. For instance, if an estimator supports +``sample_weight`` in ``fit`` and ``score``, it exposes +``estimator.set_fit_request(sample_weight=value)`` and +``estimator.set_score_request(sample_weight=value)``. Here ``value`` can be: + +- ``True``: method requests a ``sample_weight``. This means if the metadata is provided, + it will be used, otherwise no error is raised. +- ``False``: method does not request a ``sample_weight``. +- ``None``: router will raise an error if ``sample_weight`` is passed. This is in almost + all cases the default value when an object is instantiated and ensures the user sets + the metadata requests explicitly when a metadata is passed. The only exception are + ``Group*Fold`` splitters. +- ``"param_name"``: alias for ``sample_weight`` if we want to pass different weights to + different consumers. If aliasing is used the meta-estimator should not forward + ``"param_name"`` to the consumer, but ``sample_weight`` instead, because the consumer + will expect a param called ``sample_weight``. This means the mapping between the + metadata required by the object, e.g. ``sample_weight`` and the variable name provided + by the user, e.g. ``my_weights`` is done at the router level, and not by the consuming + object itself. + +Metadata are requested in the same way for scorers using ``set_score_request``. + +If a metadata, e.g. 
``sample_weight``, is passed by the user, the metadata request for +all objects which potentially can consume ``sample_weight`` should be set by the user, +otherwise an error is raised by the router object. For example, the following code +raises an error, since it hasn't been explicitly specified whether ``sample_weight`` +should be passed to the estimator's scorer or not:: + + >>> param_grid = {"C": [0.1, 1]} + >>> lr = LogisticRegression().set_fit_request(sample_weight=True) + >>> try: + ... GridSearchCV( + ... estimator=lr, param_grid=param_grid + ... ).fit(X, y, sample_weight=my_weights) + ... except ValueError as e: + ... print(e) + [sample_weight] are passed but are not explicitly set as requested or not + requested for LogisticRegression.score, which is used within GridSearchCV.fit. + Call `LogisticRegression.set_score_request({metadata}=True/False)` for each metadata + you want to request/ignore. See the Metadata Routing User guide + for more information. + +The issue can be fixed by explicitly setting the request value:: + + >>> lr = LogisticRegression().set_fit_request( + ... sample_weight=True + ... ).set_score_request(sample_weight=False) + +At the end of the **Usage Examples** section, we disable the configuration flag for +metadata routing:: + + >>> sklearn.set_config(enable_metadata_routing=False) + +.. _metadata_routing_models: + +Metadata Routing Support Status +******************************* +All consumers (i.e. simple estimators which only consume metadata and don't +route them) support metadata routing, meaning they can be used inside +meta-estimators which support metadata routing. However, development of support +for metadata routing for meta-estimators is in progress, and here is a list of +meta-estimators and tools which support and don't yet support metadata routing. 
+ + +Meta-estimators and functions supporting metadata routing: + +- :class:`sklearn.calibration.CalibratedClassifierCV` +- :class:`sklearn.compose.ColumnTransformer` +- :class:`sklearn.compose.TransformedTargetRegressor` +- :class:`sklearn.covariance.GraphicalLassoCV` +- :class:`sklearn.ensemble.StackingClassifier` +- :class:`sklearn.ensemble.StackingRegressor` +- :class:`sklearn.ensemble.VotingClassifier` +- :class:`sklearn.ensemble.VotingRegressor` +- :class:`sklearn.ensemble.BaggingClassifier` +- :class:`sklearn.ensemble.BaggingRegressor` +- :class:`sklearn.feature_selection.RFE` +- :class:`sklearn.feature_selection.RFECV` +- :class:`sklearn.feature_selection.SelectFromModel` +- :class:`sklearn.feature_selection.SequentialFeatureSelector` +- :class:`sklearn.impute.IterativeImputer` +- :class:`sklearn.linear_model.ElasticNetCV` +- :class:`sklearn.linear_model.LarsCV` +- :class:`sklearn.linear_model.LassoCV` +- :class:`sklearn.linear_model.LassoLarsCV` +- :class:`sklearn.linear_model.LogisticRegressionCV` +- :class:`sklearn.linear_model.MultiTaskElasticNetCV` +- :class:`sklearn.linear_model.MultiTaskLassoCV` +- :class:`sklearn.linear_model.OrthogonalMatchingPursuitCV` +- :class:`sklearn.linear_model.RANSACRegressor` +- :class:`sklearn.linear_model.RidgeClassifierCV` +- :class:`sklearn.linear_model.RidgeCV` +- :class:`sklearn.model_selection.GridSearchCV` +- :class:`sklearn.model_selection.HalvingGridSearchCV` +- :class:`sklearn.model_selection.HalvingRandomSearchCV` +- :class:`sklearn.model_selection.RandomizedSearchCV` +- :class:`sklearn.model_selection.permutation_test_score` +- :func:`sklearn.model_selection.cross_validate` +- :func:`sklearn.model_selection.cross_val_score` +- :func:`sklearn.model_selection.cross_val_predict` +- :class:`sklearn.model_selection.learning_curve` +- :class:`sklearn.model_selection.validation_curve` +- :class:`sklearn.multiclass.OneVsOneClassifier` +- :class:`sklearn.multiclass.OneVsRestClassifier` +- :class:`sklearn.multiclass.OutputCodeClassifier` +- :class:`sklearn.multioutput.ClassifierChain` +- :class:`sklearn.multioutput.MultiOutputClassifier` +- :class:`sklearn.multioutput.MultiOutputRegressor` +- :class:`sklearn.multioutput.RegressorChain` +- :class:`sklearn.pipeline.FeatureUnion` +- :class:`sklearn.pipeline.Pipeline` +- :class:`sklearn.semi_supervised.SelfTrainingClassifier` + +Meta-estimators and tools not supporting metadata routing yet: + +- :class:`sklearn.ensemble.AdaBoostClassifier` +- :class:`sklearn.ensemble.AdaBoostRegressor` diff --git a/doc/min_dependency_substitutions.rst.template b/doc/min_dependency_substitutions.rst.template new file mode 100644 index 0000000000000..946de84902b3b --- /dev/null +++ b/doc/min_dependency_substitutions.rst.template @@ -0,0 +1,3 @@ +{% for package, (version, _) in dependent_packages.items() -%} +.. |{{ package|capitalize }}MinVersion| replace:: {{ version }} +{% endfor %} diff --git a/doc/min_dependency_table.rst.template b/doc/min_dependency_table.rst.template new file mode 100644 index 0000000000000..fbe58633e913a --- /dev/null +++ b/doc/min_dependency_table.rst.template @@ -0,0 +1,13 @@ +.. list-table:: + :header-rows: 1 + + * - Dependency + - Minimum Version + - Purpose + + {% for package, (version, tags) in dependent_packages.items() -%} + * - {{ package }} + - {{ version }} + - {{ tags }} + + {% endfor %} diff --git a/doc/model_persistence.rst b/doc/model_persistence.rst new file mode 100644 index 0000000000000..21d6934a48730 --- /dev/null +++ b/doc/model_persistence.rst @@ -0,0 +1,394 @@ +.. 
_model_persistence: + +================= +Model persistence +================= + +.. list-table:: Summary of model persistence methods + :widths: 25 50 50 + :header-rows: 1 + + * - Persistence method + - Pros + - Risks / Cons + * - :ref:`ONNX ` + - * Serve models without a Python environment + * Serving and training environments independent of one another + * Most secure option + - * Not all scikit-learn models are supported + * Custom estimators require more work to support + * Original Python object is lost and cannot be reconstructed + * - :ref:`skops_persistence` + - * More secure than `pickle` based formats + * Contents can be partly validated without loading + - * Not as fast as `pickle` based formats + * Supports less types than `pickle` based formats + * Requires the same environment as the training environment + * - :mod:`pickle` + - * Native to Python + * Can serialize most Python objects + * Efficient memory usage with `protocol=5` + - * Loading can execute arbitrary code + * Requires the same environment as the training environment + * - :mod:`joblib` + - * Efficient memory usage + * Supports memory mapping + * Easy shortcuts for compression and decompression + - * Pickle based format + * Loading can execute arbitrary code + * Requires the same environment as the training environment + * - `cloudpickle`_ + - * Can serialize non-packaged, custom Python code + * Comparable loading efficiency as :mod:`pickle` with `protocol=5` + - * Pickle based format + * Loading can execute arbitrary code + * No forward compatibility guarantees + * Requires the same environment as the training environment + +After training a scikit-learn model, it is desirable to have a way to persist +the model for future use without having to retrain. Based on your use-case, +there are a few different ways to persist a scikit-learn model, and here we +help you decide which one suits you best. In order to make a decision, you need +to answer the following questions: + +1. Do you need the Python object after persistence, or do you only need to + persist in order to serve the model and get predictions out of it? + +If you only need to serve the model and no further investigation on the Python +object itself is required, then :ref:`ONNX ` might be the +best fit for you. Note that not all models are supported by ONNX. + +In case ONNX is not suitable for your use-case, the next question is: + +2. Do you absolutely trust the source of the model, or are there any security + concerns regarding where the persisted model comes from? + +If you have security concerns, then you should consider using :ref:`skops.io +` which gives you back the Python object, but unlike +`pickle` based persistence solutions, loading the persisted model doesn't +automatically allow arbitrary code execution. Note that this requires manual +investigation of the persisted file, which :mod:`skops.io` allows you to do. + +The other solutions assume you absolutely trust the source of the file to be +loaded, as they are all susceptible to arbitrary code execution upon loading +the persisted file since they all use the pickle protocol under the hood. + +3. Do you care about the performance of loading the model, and sharing it + between processes where a memory mapped object on disk is beneficial? + +If yes, then you can consider using :ref:`joblib `. If this +is not a major concern for you, then you can use the built-in :mod:`pickle` +module. + +4. Did you try :mod:`pickle` or :mod:`joblib` and found that the model cannot + be persisted? 
It can happen for instance when you have user defined + functions in your model. + +If yes, then you can use `cloudpickle`_ which can serialize certain objects +which cannot be serialized by :mod:`pickle` or :mod:`joblib`. + + +Workflow Overview +----------------- + +In a typical workflow, the first step is to train the model using scikit-learn +and scikit-learn compatible libraries. Note that support for scikit-learn and +third party estimators varies across the different persistence methods. + +Train and Persist the Model +........................... + +Creating an appropriate model depends on your use-case. As an example, here we +train a :class:`sklearn.ensemble.HistGradientBoostingClassifier` on the iris +dataset:: + + >>> from sklearn import ensemble + >>> from sklearn import datasets + >>> clf = ensemble.HistGradientBoostingClassifier() + >>> X, y = datasets.load_iris(return_X_y=True) + >>> clf.fit(X, y) + HistGradientBoostingClassifier() + +Once the model is trained, you can persist it using your desired method, and +then you can load the model in a separate environment and get predictions from +it given input data. Here there are two major paths depending on how you +persist and plan to serve the model: + +- :ref:`ONNX `: You need an `ONNX` runtime and an environment + with appropriate dependencies installed to load the model and use the runtime + to get predictions. This environment can be minimal and does not necessarily + even require Python to be installed to load the model and compute + predictions. Also note that `onnxruntime` typically requires much less RAM + than Python to compute predictions from small models. + +- :mod:`skops.io`, :mod:`pickle`, :mod:`joblib`, `cloudpickle`_: You need a + Python environment with the appropriate dependencies installed to load the + model and get predictions from it. This environment should have the same + **packages** and the same **versions** as the environment where the model was + trained. Note that none of these methods support loading a model trained with + a different version of scikit-learn, and possibly different versions of other + dependencies such as `numpy` and `scipy`. Another concern would be running + the persisted model on a different hardware, and in most cases you should be + able to load your persisted model on a different hardware. + + +.. _onnx_persistence: + +ONNX +---- + +`ONNX`, or `Open Neural Network Exchange `__ format is best +suitable in use-cases where one needs to persist the model and then use the +persisted artifact to get predictions without the need to load the Python +object itself. It is also useful in cases where the serving environment needs +to be lean and minimal, since the `ONNX` runtime does not require `python`. + +`ONNX` is a binary serialization of the model. It has been developed to improve +the usability of the interoperable representation of data models. It aims to +facilitate the conversion of the data models between different machine learning +frameworks, and to improve their portability on different computing +architectures. More details are available from the `ONNX tutorial +`__. To convert scikit-learn model to `ONNX` +`sklearn-onnx `__ has been developed. However, +not all scikit-learn models are supported, and it is limited to the core +scikit-learn and does not support most third party estimators. One can write a +custom converter for third party or custom estimators, but the documentation to +do that is sparse and it might be challenging to do so. + +.. 
dropdown:: Using ONNX + + To convert the model to `ONNX` format, you need to give the converter some + information about the input as well, about which you can read more `here + `__:: + + from skl2onnx import to_onnx + onx = to_onnx(clf, X[:1].astype(numpy.float32), target_opset=12) + with open("filename.onnx", "wb") as f: + f.write(onx.SerializeToString()) + + You can load the model in Python and use the `ONNX` runtime to get + predictions:: + + from onnxruntime import InferenceSession + with open("filename.onnx", "rb") as f: + onx = f.read() + sess = InferenceSession(onx, providers=["CPUExecutionProvider"]) + pred_ort = sess.run(None, {"X": X_test.astype(numpy.float32)})[0] + +.. _skops_persistence: + +`skops.io` +---------- + +:mod:`skops.io` avoids using :mod:`pickle` and only loads files which have types +and references to functions which are trusted either by default or by the user. +Therefore it provides a more secure format than :mod:`pickle`, :mod:`joblib`, +and `cloudpickle`_. + + +.. dropdown:: Using skops + + The API is very similar to :mod:`pickle`, and you can persist your models as + explained in the `documentation + `__ using + :func:`skops.io.dump` and :func:`skops.io.dumps`:: + + import skops.io as sio + obj = sio.dump(clf, "filename.skops") + + And you can load them back using :func:`skops.io.load` and + :func:`skops.io.loads`. However, you need to specify the types which are + trusted by you. You can get existing unknown types in a dumped object / file + using :func:`skops.io.get_untrusted_types`, and after checking its contents, + pass it to the load function:: + + unknown_types = sio.get_untrusted_types(file="filename.skops") + # investigate the contents of unknown_types, and only load if you trust + # everything you see. + clf = sio.load("filename.skops", trusted=unknown_types) + + Please report issues and feature requests related to this format on the `skops + issue tracker `__. + + +.. _pickle_persistence: + +`pickle`, `joblib`, and `cloudpickle` +------------------------------------- + +These three modules / packages, use the `pickle` protocol under the hood, but +come with slight variations: + +- :mod:`pickle` is a module from the Python Standard Library. It can serialize + and deserialize any Python object, including custom Python classes and + objects. +- :mod:`joblib` is more efficient than `pickle` when working with large machine + learning models or large numpy arrays. +- `cloudpickle`_ can serialize certain objects which cannot be serialized by + :mod:`pickle` or :mod:`joblib`, such as user defined functions and lambda + functions. This can happen for instance, when using a + :class:`~sklearn.preprocessing.FunctionTransformer` and using a custom + function to transform the data. + +.. dropdown:: Using `pickle`, `joblib`, or `cloudpickle` + + Depending on your use-case, you can choose one of these three methods to + persist and load your scikit-learn model, and they all follow the same API:: + + # Here you can replace pickle with joblib or cloudpickle + from pickle import dump + with open("filename.pkl", "wb") as f: + dump(clf, f, protocol=5) + + Using `protocol=5` is recommended to reduce memory usage and make it faster to + store and load any large NumPy array stored as a fitted attribute in the model. + You can alternatively pass `protocol=pickle.HIGHEST_PROTOCOL` which is + equivalent to `protocol=5` in Python 3.8 and later (at the time of writing). 
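As a hedged sketch of the joblib-specific shortcuts mentioned earlier (the file names
are arbitrary placeholders, and `clf` is the classifier trained above), compression and
memory mapping are requested directly through `joblib.dump` and `joblib.load`. Note
that joblib cannot memory-map a compressed file, so the two options are shown
separately::

    import joblib

    # Compressed artifact: smaller on disk, but cannot be memory-mapped.
    joblib.dump(clf, "filename_compressed.joblib", compress=3)

    # Uncompressed artifact: can be memory-mapped ("mmap_mode") so that several
    # processes share the same large fitted arrays without copying them.
    joblib.dump(clf, "filename.joblib")
    clf_loaded = joblib.load("filename.joblib", mmap_mode="r")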
+ + And later when needed, you can load the same object from the persisted file:: + + # Here you can replace pickle with joblib or cloudpickle + from pickle import load + with open("filename.pkl", "rb") as f: + clf = load(f) + +.. _persistence_limitations: + +Security & Maintainability Limitations +-------------------------------------- + +:mod:`pickle` (and :mod:`joblib` and :mod:`cloudpickle` by extension), has +many documented security vulnerabilities by design and should only be used if +the artifact, i.e. the pickle-file, is coming from a trusted and verified +source. You should never load a pickle file from an untrusted source, similarly +to how you should never execute code from an untrusted source. + +Also note that arbitrary computations can be represented using the `ONNX` +format, and it is therefore recommended to serve models using `ONNX` in a +sandboxed environment to safeguard against computational and memory exploits. + +Also note that there are no supported ways to load a model trained with a +different version of scikit-learn. While using :mod:`skops.io`, :mod:`joblib`, +:mod:`pickle`, or `cloudpickle`_, models saved using one version of +scikit-learn might load in other versions, however, this is entirely +unsupported and inadvisable. It should also be kept in mind that operations +performed on such data could give different and unexpected results, or even +crash your Python process. + +In order to rebuild a similar model with future versions of scikit-learn, +additional metadata should be saved along the pickled model: + +* The training data, e.g. a reference to an immutable snapshot +* The Python source code used to generate the model +* The versions of scikit-learn and its dependencies +* The cross validation score obtained on the training data + +This should make it possible to check that the cross-validation score is in the +same range as before. + +Aside for a few exceptions, persisted models should be portable across +operating systems and hardware architectures assuming the same versions of +dependencies and Python are used. If you encounter an estimator that is not +portable, please open an issue on GitHub. Persisted models are often deployed +in production using containers like Docker, in order to freeze the environment +and dependencies. + +If you want to know more about these issues, please refer to these talks: + +- `Adrin Jalali: Let's exploit pickle, and skops to the rescue! | PyData + Amsterdam 2023 `__. +- `Alex Gaynor: Pickles are for Delis, not Software - PyCon 2014 + `__. + + +.. _serving_environment: + +Replicating the training environment in production +.................................................. + +If the versions of the dependencies used may differ from training to +production, it may result in unexpected behaviour and errors while using the +trained model. To prevent such situations it is recommended to use the same +dependencies and versions in both the training and production environment. +These transitive dependencies can be pinned with the help of package management +tools like `pip`, `mamba`, `conda`, `poetry`, `conda-lock`, `pixi`, etc. + +It is not always possible to load a model trained with older versions of the +scikit-learn library and its dependencies in an updated software environment. +Instead, you might need to retrain the model with the new versions of all +the libraries. So when training a model, it is important to record the training +recipe (e.g. 
a Python script) and training set information, and metadata about +all the dependencies to be able to automatically reconstruct the same training +environment for the updated software. + +.. dropdown:: InconsistentVersionWarning + + When an estimator is loaded with a scikit-learn version that is inconsistent + with the version the estimator was pickled with, an + :class:`~sklearn.exceptions.InconsistentVersionWarning` is raised. This warning + can be caught to obtain the original version the estimator was pickled with:: + + from sklearn.exceptions import InconsistentVersionWarning + warnings.simplefilter("error", InconsistentVersionWarning) + + try: + with open("model_from_previous_version.pickle", "rb") as f: + est = pickle.load(f) + except InconsistentVersionWarning as w: + print(w.original_sklearn_version) + + +Serving the model artifact +.......................... + +The last step after training a scikit-learn model is serving the model. +Once the trained model is successfully loaded, it can be served to manage +different prediction requests. This can involve deploying the model as a +web service using containerization, or other model deployment strategies, +according to the specifications. + + +Summarizing the key points +-------------------------- + +Based on the different approaches for model persistence, the key points for +each approach can be summarized as follows: + +* `ONNX`: It provides a uniform format for persisting any machine learning or + deep learning model (other than scikit-learn) and is useful for model + inference (predictions). It can however, result in compatibility issues with + different frameworks. +* :mod:`skops.io`: Trained scikit-learn models can be easily shared and put + into production using :mod:`skops.io`. It is more secure compared to + alternate approaches based on :mod:`pickle` because it does not load + arbitrary code unless explicitly asked for by the user. Such code needs to be + packaged and importable in the target Python environment. +* :mod:`joblib`: Efficient memory mapping techniques make it faster when using + the same persisted model in multiple Python processes when using + `mmap_mode="r"`. It also gives easy shortcuts to compress and decompress the + persisted object without the need for extra code. However, it may trigger the + execution of malicious code when loading a model from an untrusted source as + any other pickle-based persistence mechanism. +* :mod:`pickle`: It is native to Python and most Python objects can be + serialized and deserialized using :mod:`pickle`, including custom Python + classes and functions as long as they are defined in a package that can be + imported in the target environment. While :mod:`pickle` can be used to easily + save and load scikit-learn models, it may trigger the execution of malicious + code while loading a model from an untrusted source. :mod:`pickle` can also + be very efficient memorywise if the model was persisted with `protocol=5` but + it does not support memory mapping. +* `cloudpickle`_: It has comparable loading efficiency as :mod:`pickle` and + :mod:`joblib` (without memory mapping), but offers additional flexibility to + serialize custom Python code such as lambda expressions and interactively + defined functions and classes. 
It might be a last resort to persist pipelines + with custom Python components such as a + :class:`sklearn.preprocessing.FunctionTransformer` that wraps a function + defined in the training script itself or more generally outside of any + importable Python package. Note that `cloudpickle`_ offers no forward + compatibility guarantees and you might need the same version of + `cloudpickle`_ to load the persisted model along with the same version of all + the libraries used to define the model. As the other pickle-based persistence + mechanisms, it may trigger the execution of malicious code while loading + a model from an untrusted source. + +.. _cloudpickle: https://github.com/cloudpipe/cloudpickle diff --git a/doc/model_selection.rst b/doc/model_selection.rst index 25cd2b655ccc5..b78c9ff4c3aa8 100644 --- a/doc/model_selection.rst +++ b/doc/model_selection.rst @@ -1,9 +1,3 @@ -.. Places parent toc into the sidebar - -:parenttoc: True - -.. include:: includes/big_toc_css.rst - .. _model_selection: Model selection and evaluation @@ -14,5 +8,6 @@ Model selection and evaluation modules/cross_validation modules/grid_search + modules/classification_threshold modules/model_evaluation modules/learning_curve diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst new file mode 100644 index 0000000000000..ee049937f5ce0 --- /dev/null +++ b/doc/modules/array_api.rst @@ -0,0 +1,266 @@ +.. _array_api: + +================================ +Array API support (experimental) +================================ + +.. currentmodule:: sklearn + +The `Array API `_ specification defines +a standard API for all array manipulation libraries with a NumPy-like API. +Scikit-learn vendors pinned copies of +`array-api-compat `__ +and `array-api-extra `__. + +Scikit-learn's support for the array API standard requires the environment variable +`SCIPY_ARRAY_API` to be set to `1` before importing `scipy` and `scikit-learn`: + +.. prompt:: bash $ + + export SCIPY_ARRAY_API=1 + +Please note that this environment variable is intended for temporary use. +For more details, refer to SciPy's `Array API documentation +`_. + +Some scikit-learn estimators that primarily rely on NumPy (as opposed to using +Cython) to implement the algorithmic logic of their `fit`, `predict` or +`transform` methods can be configured to accept any Array API compatible input +data structures and automatically dispatch operations to the underlying namespace +instead of relying on NumPy. + +At this stage, this support is **considered experimental** and must be enabled +explicitly as explained in the following. + +.. note:: + Currently, only `array-api-strict`, `cupy`, and `PyTorch` are known to work + with scikit-learn's estimators. + +The following video provides an overview of the standard's design principles +and how it facilitates interoperability between array libraries: + +- `Scikit-learn on GPUs with Array API `_ + by :user:`Thomas Fan ` at PyData NYC 2023. + +Example usage +============= + +Here is an example code snippet to demonstrate how to use `CuPy +`_ to run +:class:`~discriminant_analysis.LinearDiscriminantAnalysis` on a GPU:: + + >>> from sklearn.datasets import make_classification + >>> from sklearn import config_context + >>> from sklearn.discriminant_analysis import LinearDiscriminantAnalysis + >>> import cupy + + >>> X_np, y_np = make_classification(random_state=0) + >>> X_cu = cupy.asarray(X_np) + >>> y_cu = cupy.asarray(y_np) + >>> X_cu.device + + + >>> with config_context(array_api_dispatch=True): + ... 
lda = LinearDiscriminantAnalysis() + ... X_trans = lda.fit_transform(X_cu, y_cu) + >>> X_trans.device + + +After the model is trained, fitted attributes that are arrays will also be +from the same Array API namespace as the training data. For example, if CuPy's +Array API namespace was used for training, then fitted attributes will be on the +GPU. We provide an experimental `_estimator_with_converted_arrays` utility that +transfers an estimator attributes from Array API to a ndarray:: + + >>> from sklearn.utils._array_api import _estimator_with_converted_arrays + >>> cupy_to_ndarray = lambda array : array.get() + >>> lda_np = _estimator_with_converted_arrays(lda, cupy_to_ndarray) + >>> X_trans = lda_np.transform(X_np) + >>> type(X_trans) + + +PyTorch Support +--------------- + +PyTorch Tensors are supported by setting `array_api_dispatch=True` and passing in +the tensors directly:: + + >>> import torch + >>> X_torch = torch.asarray(X_np, device="cuda", dtype=torch.float32) + >>> y_torch = torch.asarray(y_np, device="cuda", dtype=torch.float32) + + >>> with config_context(array_api_dispatch=True): + ... lda = LinearDiscriminantAnalysis() + ... X_trans = lda.fit_transform(X_torch, y_torch) + >>> type(X_trans) + + >>> X_trans.device.type + 'cuda' + +.. _array_api_supported: + +Support for `Array API`-compatible inputs +========================================= + +Estimators and other tools in scikit-learn that support Array API compatible inputs. + +Estimators +---------- + +- :class:`decomposition.PCA` (with `svd_solver="full"`, + `svd_solver="randomized"` and `power_iteration_normalizer="QR"`) +- :class:`linear_model.Ridge` (with `solver="svd"`) +- :class:`discriminant_analysis.LinearDiscriminantAnalysis` (with `solver="svd"`) +- :class:`preprocessing.Binarizer` +- :class:`preprocessing.KernelCenterer` +- :class:`preprocessing.LabelEncoder` +- :class:`preprocessing.MaxAbsScaler` +- :class:`preprocessing.MinMaxScaler` +- :class:`preprocessing.Normalizer` + +Meta-estimators +--------------- + +Meta-estimators that accept Array API inputs conditioned on the fact that the +base estimator also does: + +- :class:`model_selection.GridSearchCV` +- :class:`model_selection.RandomizedSearchCV` +- :class:`model_selection.HalvingGridSearchCV` +- :class:`model_selection.HalvingRandomSearchCV` + +Metrics +------- + +- :func:`sklearn.metrics.cluster.entropy` +- :func:`sklearn.metrics.accuracy_score` +- :func:`sklearn.metrics.d2_tweedie_score` +- :func:`sklearn.metrics.explained_variance_score` +- :func:`sklearn.metrics.f1_score` +- :func:`sklearn.metrics.fbeta_score` +- :func:`sklearn.metrics.hamming_loss` +- :func:`sklearn.metrics.jaccard_score` +- :func:`sklearn.metrics.max_error` +- :func:`sklearn.metrics.mean_absolute_error` +- :func:`sklearn.metrics.mean_absolute_percentage_error` +- :func:`sklearn.metrics.mean_gamma_deviance` +- :func:`sklearn.metrics.mean_pinball_loss` +- :func:`sklearn.metrics.mean_poisson_deviance` (requires `enabling array API support for SciPy `_) +- :func:`sklearn.metrics.mean_squared_error` +- :func:`sklearn.metrics.mean_squared_log_error` +- :func:`sklearn.metrics.mean_tweedie_deviance` +- :func:`sklearn.metrics.median_absolute_error` +- :func:`sklearn.metrics.multilabel_confusion_matrix` +- :func:`sklearn.metrics.pairwise.additive_chi2_kernel` +- :func:`sklearn.metrics.pairwise.chi2_kernel` +- :func:`sklearn.metrics.pairwise.cosine_similarity` +- :func:`sklearn.metrics.pairwise.cosine_distances` +- :func:`sklearn.metrics.pairwise.euclidean_distances` (see 
:ref:`device_support_for_float64`) +- :func:`sklearn.metrics.pairwise.linear_kernel` +- :func:`sklearn.metrics.pairwise.paired_cosine_distances` +- :func:`sklearn.metrics.pairwise.paired_euclidean_distances` +- :func:`sklearn.metrics.pairwise.polynomial_kernel` +- :func:`sklearn.metrics.pairwise.rbf_kernel` (see :ref:`device_support_for_float64`) +- :func:`sklearn.metrics.pairwise.sigmoid_kernel` +- :func:`sklearn.metrics.precision_score` +- :func:`sklearn.metrics.precision_recall_fscore_support` +- :func:`sklearn.metrics.r2_score` +- :func:`sklearn.metrics.recall_score` +- :func:`sklearn.metrics.root_mean_squared_error` +- :func:`sklearn.metrics.root_mean_squared_log_error` +- :func:`sklearn.metrics.zero_one_loss` + +Tools +----- + +- :func:`model_selection.train_test_split` +- :func:`utils.check_consistent_length` + +Coverage is expected to grow over time. Please follow the dedicated `meta-issue on GitHub +`_ to track progress. + +Type of return values and fitted attributes +------------------------------------------- + +When calling functions or methods with Array API compatible inputs, the +convention is to return array values of the same array container type and +device as the input data. + +Similarly, when an estimator is fitted with Array API compatible inputs, the +fitted attributes will be arrays from the same library as the input and stored +on the same device. The `predict` and `transform` method subsequently expect +inputs from the same array library and device as the data passed to the `fit` +method. + +Note however that scoring functions that return scalar values return Python +scalars (typically a `float` instance) instead of an array scalar value. + +Common estimator checks +======================= + +Add the `array_api_support` tag to an estimator's set of tags to indicate that +it supports the Array API. This will enable dedicated checks as part of the +common tests to verify that the estimators' results are the same when using +vanilla NumPy and Array API inputs. + +To run these checks you need to install +`array-api-strict `_ in your +test environment. This allows you to run checks without having a +GPU. To run the full set of checks you also need to install +`PyTorch `_, `CuPy `_ and have +a GPU. Checks that can not be executed or have missing dependencies will be +automatically skipped. Therefore it's important to run the tests with the +`-v` flag to see which checks are skipped: + +.. prompt:: bash $ + + pip install array-api-strict # and other libraries as needed + pytest -k "array_api" -v + +Running the scikit-learn tests against `array-api-strict` should help reveal +most code problems related to handling multiple device inputs via the use of +simulated non-CPU devices. This allows for fast iterative development and debugging of +array API related code. + +However, to ensure full handling of PyTorch or CuPy inputs allocated on actual GPU +devices, it is necessary to run the tests against those libraries and hardware. +This can either be achieved by using +`Google Colab `_ +or leveraging our CI infrastructure on pull requests (manually triggered by maintainers +for cost reasons). + +.. _mps_support: + +Note on MPS device support +-------------------------- + +On macOS, PyTorch can use the Metal Performance Shaders (MPS) to access +hardware accelerators (e.g. the internal GPU component of the M1 or M2 chips). +However, the MPS device support for PyTorch is incomplete at the time of +writing. 
See the following github issue for more details: + +- https://github.com/pytorch/pytorch/issues/77764 + +To enable the MPS support in PyTorch, set the environment variable +`PYTORCH_ENABLE_MPS_FALLBACK=1` before running the tests: + +.. prompt:: bash $ + + PYTORCH_ENABLE_MPS_FALLBACK=1 pytest -k "array_api" -v + +At the time of writing all scikit-learn tests should pass, however, the +computational speed is not necessarily better than with the CPU device. + +.. _device_support_for_float64: + +Note on device support for ``float64`` +-------------------------------------- + +Certain operations within scikit-learn will automatically perform operations +on floating-point values with `float64` precision to prevent overflows and ensure +correctness (e.g., :func:`metrics.pairwise.euclidean_distances`). However, +certain combinations of array namespaces and devices, such as `PyTorch on MPS` +(see :ref:`mps_support`) do not support the `float64` data type. In these cases, +scikit-learn will revert to using the `float32` data type instead. This can result in +different behavior (typically numerically unstable results) compared to not using array +API dispatching or using a device with `float64` support. diff --git a/doc/modules/biclustering.rst b/doc/modules/biclustering.rst index 7ec175883d4cd..41c2316c753ad 100644 --- a/doc/modules/biclustering.rst +++ b/doc/modules/biclustering.rst @@ -4,8 +4,7 @@ Biclustering ============ -Biclustering can be performed with the module -:mod:`sklearn.cluster.bicluster`. Biclustering algorithms simultaneously +Biclustering algorithms simultaneously cluster rows and columns of a data matrix. These clusters of rows and columns are known as biclusters. Each determines a submatrix of the original data matrix with some desired properties. @@ -82,7 +81,7 @@ diagonal and checkerboard bicluster structures. these alternate names. -.. currentmodule:: sklearn.cluster.bicluster +.. currentmodule:: sklearn.cluster .. _spectral_coclustering: @@ -148,21 +147,21 @@ Then the rows of :math:`Z` are clustered using :ref:`k-means and the remaining ``n_columns`` labels provide the column partitioning. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_bicluster_plot_spectral_coclustering.py`: A simple example - showing how to generate a data matrix with biclusters and apply - this method to it. +* :ref:`sphx_glr_auto_examples_bicluster_plot_spectral_coclustering.py`: A simple example + showing how to generate a data matrix with biclusters and apply + this method to it. - * :ref:`sphx_glr_auto_examples_bicluster_plot_bicluster_newsgroups.py`: An example of finding - biclusters in the twenty newsgroup dataset. +* :ref:`sphx_glr_auto_examples_bicluster_plot_bicluster_newsgroups.py`: An example of finding + biclusters in the twenty newsgroup dataset. -.. topic:: References: +.. rubric:: References - * Dhillon, Inderjit S, 2001. `Co-clustering documents and words using - bipartite spectral graph partitioning - `__. +* Dhillon, Inderjit S, 2001. :doi:`Co-clustering documents and words using + bipartite spectral graph partitioning + <10.1145/502512.502550>` .. _spectral_biclustering: @@ -221,7 +220,7 @@ Given these singular vectors, they are ranked according to which can be best approximated by a piecewise-constant vector. The approximations for each vector are found using one-dimensional k-means and scored using the Euclidean distance. Some subset of the best left -and right singular vector are selected. 
Next, the data is projected to +and right singular vectors are selected. Next, the data is projected to this best subset of singular vectors and clustered. For instance, if :math:`p` singular vectors were calculated, the @@ -235,17 +234,17 @@ Similarly, projecting the columns to :math:`A^{\top} * U` and clustering this :math:`n \times q` matrix yields the column labels. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_bicluster_plot_spectral_biclustering.py`: a simple example - showing how to generate a checkerboard matrix and bicluster it. +* :ref:`sphx_glr_auto_examples_bicluster_plot_spectral_biclustering.py`: a simple example + showing how to generate a checkerboard matrix and bicluster it. -.. topic:: References: +.. rubric:: References - * Kluger, Yuval, et. al., 2003. `Spectral biclustering of microarray - data: coclustering genes and conditions - `__. +* Kluger, Yuval, et. al., 2003. :doi:`Spectral biclustering of microarray + data: coclustering genes and conditions + <10.1101/gr.648603>` .. _biclustering_evaluation: @@ -277,7 +276,7 @@ now, only the Jaccard index is implemented: where :math:`A` and :math:`B` are biclusters, :math:`|A \cap B|` is the number of elements in their intersection. The Jaccard index -achieves its minimum of 0 when the biclusters to not overlap at all +achieves its minimum of 0 when the biclusters do not overlap at all and its maximum of 1 when they are identical. Several methods have been developed to compare two sets of biclusters. @@ -289,7 +288,8 @@ available: 2. Assign biclusters from one set to another in a one-to-one fashion to maximize the sum of their similarities. This step is performed - using the Hungarian algorithm. + using :func:`scipy.optimize.linear_sum_assignment`, which uses a + modified Jonker-Volgenant algorithm. 3. The final sum of similarities is divided by the size of the larger set. @@ -299,8 +299,8 @@ are totally dissimilar. The maximum score, 1, occurs when both sets are identical. -.. topic:: References: +.. rubric:: References - * Hochreiter, Bodenhofer, et. al., 2010. `FABIA: factor analysis - for bicluster acquisition - `__. +* Hochreiter, Bodenhofer, et. al., 2010. `FABIA: factor analysis + for bicluster acquisition + `__. diff --git a/doc/modules/calibration.rst b/doc/modules/calibration.rst index d0a9737dac612..e8e6aa8b9953a 100644 --- a/doc/modules/calibration.rst +++ b/doc/modules/calibration.rst @@ -20,21 +20,53 @@ prediction. Well calibrated classifiers are probabilistic classifiers for which the output of the :term:`predict_proba` method can be directly interpreted as a confidence level. -For instance, a well calibrated (binary) classifier should classify the samples -such that among the samples to which it gave a :term:`predict_proba` value -close to 0.8, -approximately 80% actually belong to the positive class. +For instance, a well calibrated (binary) classifier should classify the samples such +that among the samples to which it gave a :term:`predict_proba` value close to, say, +0.8, approximately 80% actually belong to the positive class. + +Before we show how to re-calibrate a classifier, we first need a way to detect how +good a classifier is calibrated. + +.. note:: + Strictly proper scoring rules for probabilistic predictions like + :func:`sklearn.metrics.brier_score_loss` and + :func:`sklearn.metrics.log_loss` assess calibration (reliability) and + discriminative power (resolution) of a model, as well as the randomness of the data + (uncertainty) at the same time. 
This follows from the well-known Brier score + decomposition of Murphy [1]_. As it is not clear which term dominates, the score is + of limited use for assessing calibration alone (unless one computes each term of + the decomposition). A lower Brier loss, for instance, does not necessarily + mean a better calibrated model, it could also mean a worse calibrated model with much + more discriminatory power, e.g. using many more features. .. _calibration_curve: Calibration curves ------------------ -The following plot compares how well the probabilistic predictions of -different classifiers are calibrated, using :func:`calibration_curve`. -The x axis represents the average predicted probability in each bin. The -y axis is the *fraction of positives*, i.e. the proportion of samples whose -class is the positive class (in each bin). +Calibration curves, also referred to as *reliability diagrams* (Wilks 1995 [2]_), +compare how well the probabilistic predictions of a binary classifier are calibrated. +It plots the frequency of the positive label (to be more precise, an estimation of the +*conditional event probability* :math:`P(Y=1|\text{predict_proba})`) on the y-axis +against the predicted probability :term:`predict_proba` of a model on the x-axis. +The tricky part is to get values for the y-axis. +In scikit-learn, this is accomplished by binning the predictions such that the x-axis +represents the average predicted probability in each bin. +The y-axis is then the *fraction of positives* given the predictions of that bin, i.e. +the proportion of samples whose class is the positive class (in each bin). + +The top calibration curve plot is created with +:func:`CalibrationDisplay.from_estimator`, which uses :func:`calibration_curve` to +calculate the per bin average predicted probabilities and fraction of positives. +:func:`CalibrationDisplay.from_estimator` +takes as input a fitted classifier, which is used to calculate the predicted +probabilities. The classifier thus must have :term:`predict_proba` method. For +the few classifiers that do not have a :term:`predict_proba` method, it is +possible to use :class:`CalibratedClassifierCV` to calibrate the classifier +outputs to probabilities. + +The bottom histogram gives some insight into the behavior of each classifier +by showing the number of samples in each predicted probability bin. .. figure:: ../auto_examples/calibration/images/sphx_glr_plot_compare_calibration_001.png :target: ../auto_examples/calibration/plot_compare_calibration.html @@ -42,13 +74,20 @@ class is the positive class (in each bin). .. currentmodule:: sklearn.linear_model -:class:`LogisticRegression` returns well calibrated predictions by default as it directly -optimizes :ref:`log_loss`. In contrast, the other methods return biased probabilities; -with different biases per method: +:class:`LogisticRegression` is more likely to return well calibrated predictions by itself as it has a +canonical link function for its loss, i.e. the logit-link for the :ref:`log_loss`. +In the unpenalized case, this leads to the so-called **balance property**, see [8]_ and :ref:`Logistic_regression`. +In the plot above, data is generated according to a linear mechanism, which is +consistent with the :class:`LogisticRegression` model (the model is 'well specified'), +and the value of the regularization parameter `C` is tuned to be +appropriate (neither too strong nor too low). As a consequence, this model returns +accurate predictions from its `predict_proba` method. 
+In contrast to that, the other shown models return biased probabilities; with +different biases per model. .. currentmodule:: sklearn.naive_bayes -:class:`GaussianNB` tends to push probabilities to 0 or 1 (note the counts +:class:`GaussianNB` (Naive Bayes) tends to push probabilities to 0 or 1 (note the counts in the histograms). This is mainly because it makes the assumption that features are conditionally independent given the class, which is not the case in this dataset which contains 2 redundant features. @@ -56,33 +95,31 @@ case in this dataset which contains 2 redundant features. .. currentmodule:: sklearn.ensemble :class:`RandomForestClassifier` shows the opposite behavior: the histograms -show peaks at approximately 0.2 and 0.9 probability, while probabilities +show peaks at probabilities approximately 0.2 and 0.9, while probabilities close to 0 or 1 are very rare. An explanation for this is given by -Niculescu-Mizil and Caruana [1]_: "Methods such as bagging and random +Niculescu-Mizil and Caruana [3]_: "Methods such as bagging and random forests that average predictions from a base set of models can have difficulty making predictions near 0 and 1 because variance in the underlying base models will bias predictions that should be near zero or one away from these values. Because predictions are restricted to the interval [0,1], errors caused by variance tend to be one-sided near zero and one. For -example, if a model should predict p = 0 for a case, the only way bagging +example, if a model should predict :math:`p = 0` for a case, the only way bagging can achieve this is if all bagged trees predict zero. If we add noise to the trees that bagging is averaging over, this noise will cause some trees to predict values larger than 0 for this case, thus moving the average prediction of the bagged ensemble away from 0. We observe this effect most strongly with random forests because the base-level trees trained with random forests have relatively high variance due to feature subsetting." As -a result, the calibration curve also referred to as the reliability diagram -(Wilks 1995 [2]_) shows a characteristic sigmoid shape, indicating that the -classifier could trust its "intuition" more and return probabilities closer +a result, the calibration curve shows a characteristic sigmoid shape, indicating that +the classifier could trust its "intuition" more and return probabilities closer to 0 or 1 typically. .. currentmodule:: sklearn.svm -Linear Support Vector Classification (:class:`LinearSVC`) shows an even more -sigmoid curve than :class:`~sklearn.ensemble.RandomForestClassifier`, which is -typical for maximum-margin methods (compare Niculescu-Mizil and Caruana [1]_), -which focus on difficult to classify samples that are close to the decision -boundary (the support vectors). +:class:`LinearSVC` (SVC) shows an even more sigmoid curve than the random forest, which +is typical for maximum-margin methods (compare Niculescu-Mizil and Caruana [3]_), which +focus on difficult to classify samples that are close to the decision boundary (the +support vectors). Calibrating a classifier ------------------------ @@ -93,10 +130,11 @@ Calibrating a classifier consists of fitting a regressor (called a *calibrator*) that maps the output of the classifier (as given by :term:`decision_function` or :term:`predict_proba`) to a calibrated probability in [0, 1]. Denoting the output of the classifier for a given sample by :math:`f_i`, -the calibrator tries to predict :math:`p(y_i = 1 | f_i)`. 
+the calibrator tries to predict the conditional event probability +:math:`P(y_i = 1 | f_i)`. -The samples that are used to fit the calibrator should not be the same -samples used to fit the classifier, as this would introduce bias. +Ideally, the calibrator is fit on a dataset independent of the training data used to +fit the classifier in the first place. This is because performance of the classifier on its training data would be better than for novel data. Using the classifier output of training data to fit the calibrator would thus result in a biased calibrator that maps to @@ -108,22 +146,36 @@ Usage The :class:`CalibratedClassifierCV` class is used to calibrate a classifier. :class:`CalibratedClassifierCV` uses a cross-validation approach to ensure -unbiased data is always used to fit the calibrator. The data is split into k +unbiased data is always used to fit the calibrator. The data is split into :math:`k` `(train_set, test_set)` couples (as determined by `cv`). When `ensemble=True` (default), the following procedure is repeated independently for each -cross-validation split: a clone of `base_estimator` is first trained on the -train subset. Then its predictions on the test subset are used to fit a -calibrator (either a sigmoid or isotonic regressor). This results in an -ensemble of k `(classifier, calibrator)` couples where each calibrator maps +cross-validation split: + +1. a clone of `base_estimator` is trained on the train subset +2. the trained `base_estimator` makes predictions on the test subset +3. the predictions are used to fit a calibrator (either a sigmoid or isotonic + regressor) (when the data is multiclass, a calibrator is fit for every class) + +This results in an +ensemble of :math:`k` `(classifier, calibrator)` couples where each calibrator maps the output of its corresponding classifier into [0, 1]. Each couple is exposed in the `calibrated_classifiers_` attribute, where each entry is a calibrated classifier with a :term:`predict_proba` method that outputs calibrated probabilities. The output of :term:`predict_proba` for the main :class:`CalibratedClassifierCV` instance corresponds to the average of the -predicted probabilities of the `k` estimators in the `calibrated_classifiers_` +predicted probabilities of the :math:`k` estimators in the `calibrated_classifiers_` list. The output of :term:`predict` is the class that has the highest probability. +It is important to choose `cv` carefully when using `ensemble=True`. +All classes should be present in both train and test subsets for every split. +When a class is absent in the train subset, the predicted probability for that +class will default to 0 for the `(classifier, calibrator)` couple of that split. +This skews the :term:`predict_proba` as it averages across all couples. +When a class is absent in the test subset, the calibrator for that class +(within the `(classifier, calibrator)` couple of that split) is +fit on data with no positive class. This results in ineffective calibration. + When `ensemble=False`, cross-validation is used to obtain 'unbiased' predictions for all the data, via :func:`~sklearn.model_selection.cross_val_predict`. @@ -141,33 +193,24 @@ The main advantage of using `ensemble=False` is computational: it reduces the overall fit time by training only a single base classifier and calibrator pair, decreases the final model size and increases prediction speed. -Alternatively an already fitted classifier can be calibrated by setting -`cv="prefit"`. 
In this case, the data is not split and all of it is used to -fit the regressor. It is up to the user to -make sure that the data used for fitting the classifier is disjoint from the -data used for fitting the regressor. - -:func:`sklearn.metrics.brier_score_loss` may be used to assess how -well a classifier is calibrated. However, this metric should be used with care -because a lower Brier score does not always mean a better calibrated model. -This is because the Brier score metric is a combination of calibration loss -and refinement loss. Calibration loss is defined as the mean squared deviation -from empirical probabilities derived from the slope of ROC segments. -Refinement loss can be defined as the expected optimal loss as measured by the -area under the optimal cost curve. As refinement loss can change -independently from calibration loss, a lower Brier score does not necessarily -mean a better calibrated model. - -:class:`CalibratedClassifierCV` supports the use of two 'calibration' -regressors: 'sigmoid' and 'isotonic'. +Alternatively an already fitted classifier can be calibrated by using a +:class:`~sklearn.frozen.FrozenEstimator` as +``CalibratedClassifierCV(estimator=FrozenEstimator(estimator))``. +It is up to the user to make sure that the data used for fitting the classifier +is disjoint from the data used for fitting the regressor. + +:class:`CalibratedClassifierCV` supports the use of two regression techniques +for calibration via the `method` parameter: `"sigmoid"` and `"isotonic"`. + +.. _sigmoid_regressor: Sigmoid ^^^^^^^ -The sigmoid regressor is based on Platt's logistic model [3]_: +The sigmoid regressor, `method="sigmoid"` is based on Platt's logistic model [4]_: .. math:: - p(y_i = 1 | f_i) = \frac{1}{1 + \exp(A f_i + B)} + p(y_i = 1 | f_i) = \frac{1}{1 + \exp(A f_i + B)} \,, where :math:`y_i` is the true label of sample :math:`i` and :math:`f_i` is the output of the un-calibrated classifier for sample :math:`i`. :math:`A` @@ -178,37 +221,46 @@ The sigmoid method assumes the :ref:`calibration curve ` can be corrected by applying a sigmoid function to the raw predictions. This assumption has been empirically justified in the case of :ref:`svm` with common kernel functions on various benchmark datasets in section 2.1 of Platt -1999 [3]_ but does not necessarily hold in general. Additionally, the +1999 [4]_ but does not necessarily hold in general. Additionally, the logistic model works best if the calibration error is symmetrical, meaning the classifier output for each binary class is normally distributed with -the same variance [6]_. This can be a problem for highly imbalanced +the same variance [7]_. This can be a problem for highly imbalanced classification problems, where outputs do not have equal variance. -In general this method is most effective when the un-calibrated model is -under-confident and has similar calibration errors for both high and low -outputs. +In general this method is most effective for small sample sizes or when the +un-calibrated model is under-confident and has similar calibration errors for both +high and low outputs. Isotonic ^^^^^^^^ -The 'isotonic' method fits a non-parametric isotonic regressor, which outputs -a step-wise non-decreasing function (see :mod:`sklearn.isotonic`). It -minimizes: +The `method="isotonic"` fits a non-parametric isotonic regressor, which outputs +a step-wise non-decreasing function, see :mod:`sklearn.isotonic`. It minimizes: .. 
math:: \sum_{i=1}^{n} (y_i - \hat{f}_i)^2 -subject to :math:`\hat{f}_i >= \hat{f}_j` whenever -:math:`f_i >= f_j`. :math:`y_i` is the true +subject to :math:`\hat{f}_i \geq \hat{f}_j` whenever +:math:`f_i \geq f_j`. :math:`y_i` is the true label of sample :math:`i` and :math:`\hat{f}_i` is the output of the calibrated classifier for sample :math:`i` (i.e., the calibrated probability). -This method is more general when compared to 'sigmoid' as the only restriction +This method is more general when compared to `'sigmoid'` as the only restriction is that the mapping function is monotonically increasing. It is thus more powerful as it can correct any monotonic distortion of the un-calibrated model. -However, it is more prone to overfitting, especially on small datasets [5]_. +However, it is more prone to overfitting, especially on small datasets [6]_. -Overall, 'isotonic' will perform as well as or better than 'sigmoid' when -there is enough data (greater than ~ 1000 samples) to avoid overfitting [1]_. +Overall, `'isotonic'` will perform as well as or better than `'sigmoid'` when +there is enough data (greater than ~ 1000 samples) to avoid overfitting [3]_. + +.. note:: Impact on ranking metrics like AUC + + It is generally expected that calibration does not affect ranking metrics such as + ROC-AUC. However, these metrics might differ after calibration when using + `method="isotonic"` since isotonic regression introduces ties in the predicted + probabilities. This can be seen as within the uncertainty of the model predictions. + In case, you strictly want to keep the ranking and thus AUC scores, use + `method="sigmoid"` which is a strictly monotonic transformation and thus keeps + the ranking. Multiclass support ^^^^^^^^^^^^^^^^^^ @@ -218,46 +270,57 @@ support 1-dimensional data (e.g., binary classification output) but are extended for multiclass classification if the `base_estimator` supports multiclass predictions. For multiclass predictions, :class:`CalibratedClassifierCV` calibrates for -each class separately in a :ref:`ovr_classification` fashion [4]_. When +each class separately in a :ref:`ovr_classification` fashion [5]_. When predicting probabilities, the calibrated probabilities for each class are predicted separately. As those probabilities do not necessarily sum to one, a postprocessing is performed to normalize them. -.. topic:: Examples: +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_calibration_plot_calibration_curve.py` +* :ref:`sphx_glr_auto_examples_calibration_plot_calibration_multiclass.py` +* :ref:`sphx_glr_auto_examples_calibration_plot_calibration.py` +* :ref:`sphx_glr_auto_examples_calibration_plot_compare_calibration.py` + +.. rubric:: References + +.. [1] Allan H. Murphy (1973). + :doi:`"A New Vector Partition of the Probability Score" + <10.1175/1520-0450(1973)012%3C0595:ANVPOT%3E2.0.CO;2>` + Journal of Applied Meteorology and Climatology - * :ref:`sphx_glr_auto_examples_calibration_plot_calibration_curve.py` - * :ref:`sphx_glr_auto_examples_calibration_plot_calibration_multiclass.py` - * :ref:`sphx_glr_auto_examples_calibration_plot_calibration.py` - * :ref:`sphx_glr_auto_examples_calibration_plot_compare_calibration.py` +.. [2] `On the combination of forecast probabilities for + consecutive precipitation periods. + `_ + Wea. Forecasting, 5, 640–650., Wilks, D. S., 1990a -.. topic:: References: +.. [3] `Predicting Good Probabilities with Supervised Learning + `_, + A. Niculescu-Mizil & R. Caruana, ICML 2005 - .. 
[1] `Predicting Good Probabilities with Supervised Learning - `_, - A. Niculescu-Mizil & R. Caruana, ICML 2005 - .. [2] `On the combination of forecast probabilities for - consecutive precipitation periods. - `_ - Wea. Forecasting, 5, 640–650., Wilks, D. S., 1990a +.. [4] `Probabilistic Outputs for Support Vector Machines and Comparisons + to Regularized Likelihood Methods. + `_ + J. Platt, (1999) - .. [3] `Probabilistic Outputs for Support Vector Machines and Comparisons - to Regularized Likelihood Methods. - `_ - J. Platt, (1999) +.. [5] `Transforming Classifier Scores into Accurate Multiclass + Probability Estimates. + `_ + B. Zadrozny & C. Elkan, (KDD 2002) - .. [4] `Transforming Classifier Scores into Accurate Multiclass - Probability Estimates. - `_ - B. Zadrozny & C. Elkan, (KDD 2002) +.. [6] `Predicting accurate probabilities with a ranking loss. + `_ + Menon AK, Jiang XJ, Vembu S, Elkan C, Ohno-Machado L. + Proc Int Conf Mach Learn. 2012;2012:703-710 - .. [5] `Predicting accurate probabilities with a ranking loss. - `_ - Menon AK, Jiang XJ, Vembu S, Elkan C, Ohno-Machado L. - Proc Int Conf Mach Learn. 2012;2012:703-710 +.. [7] `Beyond sigmoids: How to obtain well-calibrated probabilities from + binary classifiers with beta calibration + `_ + Kull, M., Silva Filho, T. M., & Flach, P. (2017). - .. [6] `Beyond sigmoids: How to obtain well-calibrated probabilities from - binary classifiers with beta calibration - `_ - Kull, M., Silva Filho, T. M., & Flach, P. (2017). +.. [8] Mario V. WÃŧthrich, Michael Merz (2023). + :doi:`"Statistical Foundations of Actuarial Learning and its Applications" + <10.1007/978-3-031-12409-9>` + Springer Actuarial diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst deleted file mode 100644 index c658bc6b12452..0000000000000 --- a/doc/modules/classes.rst +++ /dev/null @@ -1,1643 +0,0 @@ -.. _api_ref: - -============= -API Reference -============= - -This is the class and function reference of scikit-learn. Please refer to -the :ref:`full user guide ` for further details, as the class and -function raw specifications may not be enough to give full guidelines on their -uses. -For reference on concepts repeated across the API, see :ref:`glossary`. - - -:mod:`sklearn.base`: Base classes and utility functions -======================================================= - -.. automodule:: sklearn.base - :no-members: - :no-inherited-members: - -Base classes ------------- -.. currentmodule:: sklearn - -.. autosummary:: - :nosignatures: - :toctree: generated/ - :template: class.rst - - base.BaseEstimator - base.BiclusterMixin - base.ClassifierMixin - base.ClusterMixin - base.DensityMixin - base.RegressorMixin - base.TransformerMixin - feature_selection.SelectorMixin - -Functions ---------- -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - base.clone - base.is_classifier - base.is_regressor - config_context - get_config - set_config - show_versions - -.. _calibration_ref: - -:mod:`sklearn.calibration`: Probability Calibration -=================================================== - -.. automodule:: sklearn.calibration - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`calibration` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - calibration.CalibratedClassifierCV - - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - calibration.calibration_curve - -.. 
_cluster_ref: - -:mod:`sklearn.cluster`: Clustering -================================== - -.. automodule:: sklearn.cluster - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`clustering` and :ref:`biclustering` sections for -further details. - -Classes -------- -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - cluster.AffinityPropagation - cluster.AgglomerativeClustering - cluster.Birch - cluster.DBSCAN - cluster.FeatureAgglomeration - cluster.KMeans - cluster.MiniBatchKMeans - cluster.MeanShift - cluster.OPTICS - cluster.SpectralClustering - cluster.SpectralBiclustering - cluster.SpectralCoclustering - -Functions ---------- -.. autosummary:: - :toctree: generated/ - :template: function.rst - - cluster.affinity_propagation - cluster.cluster_optics_dbscan - cluster.cluster_optics_xi - cluster.compute_optics_graph - cluster.dbscan - cluster.estimate_bandwidth - cluster.k_means - cluster.kmeans_plusplus - cluster.mean_shift - cluster.spectral_clustering - cluster.ward_tree - -.. _compose_ref: - -:mod:`sklearn.compose`: Composite Estimators -============================================ - -.. automodule:: sklearn.compose - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`combining_estimators` section for further -details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated - :template: class.rst - - compose.ColumnTransformer - compose.TransformedTargetRegressor - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - compose.make_column_transformer - compose.make_column_selector - -.. _covariance_ref: - -:mod:`sklearn.covariance`: Covariance Estimators -================================================ - -.. automodule:: sklearn.covariance - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`covariance` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - covariance.EmpiricalCovariance - covariance.EllipticEnvelope - covariance.GraphicalLasso - covariance.GraphicalLassoCV - covariance.LedoitWolf - covariance.MinCovDet - covariance.OAS - covariance.ShrunkCovariance - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - covariance.empirical_covariance - covariance.graphical_lasso - covariance.ledoit_wolf - covariance.oas - covariance.shrunk_covariance - -.. _cross_decomposition_ref: - -:mod:`sklearn.cross_decomposition`: Cross decomposition -======================================================= - -.. automodule:: sklearn.cross_decomposition - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`cross_decomposition` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - cross_decomposition.CCA - cross_decomposition.PLSCanonical - cross_decomposition.PLSRegression - cross_decomposition.PLSSVD - -.. _datasets_ref: - -:mod:`sklearn.datasets`: Datasets -================================= - -.. automodule:: sklearn.datasets - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`datasets` section for further details. - -Loaders -------- - -.. currentmodule:: sklearn - -.. 
autosummary:: - :toctree: generated/ - :template: function.rst - - datasets.clear_data_home - datasets.dump_svmlight_file - datasets.fetch_20newsgroups - datasets.fetch_20newsgroups_vectorized - datasets.fetch_california_housing - datasets.fetch_covtype - datasets.fetch_kddcup99 - datasets.fetch_lfw_pairs - datasets.fetch_lfw_people - datasets.fetch_olivetti_faces - datasets.fetch_openml - datasets.fetch_rcv1 - datasets.fetch_species_distributions - datasets.get_data_home - datasets.load_boston - datasets.load_breast_cancer - datasets.load_diabetes - datasets.load_digits - datasets.load_files - datasets.load_iris - datasets.load_linnerud - datasets.load_sample_image - datasets.load_sample_images - datasets.load_svmlight_file - datasets.load_svmlight_files - datasets.load_wine - -Samples generator ------------------ - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - datasets.make_biclusters - datasets.make_blobs - datasets.make_checkerboard - datasets.make_circles - datasets.make_classification - datasets.make_friedman1 - datasets.make_friedman2 - datasets.make_friedman3 - datasets.make_gaussian_quantiles - datasets.make_hastie_10_2 - datasets.make_low_rank_matrix - datasets.make_moons - datasets.make_multilabel_classification - datasets.make_regression - datasets.make_s_curve - datasets.make_sparse_coded_signal - datasets.make_sparse_spd_matrix - datasets.make_sparse_uncorrelated - datasets.make_spd_matrix - datasets.make_swiss_roll - - -.. _decomposition_ref: - -:mod:`sklearn.decomposition`: Matrix Decomposition -================================================== - -.. automodule:: sklearn.decomposition - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`decompositions` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - decomposition.DictionaryLearning - decomposition.FactorAnalysis - decomposition.FastICA - decomposition.IncrementalPCA - decomposition.KernelPCA - decomposition.LatentDirichletAllocation - decomposition.MiniBatchDictionaryLearning - decomposition.MiniBatchSparsePCA - decomposition.NMF - decomposition.PCA - decomposition.SparsePCA - decomposition.SparseCoder - decomposition.TruncatedSVD - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - decomposition.dict_learning - decomposition.dict_learning_online - decomposition.fastica - decomposition.non_negative_factorization - decomposition.sparse_encode - -.. _lda_ref: - -:mod:`sklearn.discriminant_analysis`: Discriminant Analysis -=========================================================== - -.. automodule:: sklearn.discriminant_analysis - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`lda_qda` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated - :template: class.rst - - discriminant_analysis.LinearDiscriminantAnalysis - discriminant_analysis.QuadraticDiscriminantAnalysis - -.. _dummy_ref: - -:mod:`sklearn.dummy`: Dummy estimators -====================================== - -.. automodule:: sklearn.dummy - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`model_evaluation` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - dummy.DummyClassifier - dummy.DummyRegressor - -.. autosummary:: - :toctree: generated/ - :template: function.rst - -.. 
_ensemble_ref: - -:mod:`sklearn.ensemble`: Ensemble Methods -========================================= - -.. automodule:: sklearn.ensemble - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`ensemble` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - ensemble.AdaBoostClassifier - ensemble.AdaBoostRegressor - ensemble.BaggingClassifier - ensemble.BaggingRegressor - ensemble.ExtraTreesClassifier - ensemble.ExtraTreesRegressor - ensemble.GradientBoostingClassifier - ensemble.GradientBoostingRegressor - ensemble.IsolationForest - ensemble.RandomForestClassifier - ensemble.RandomForestRegressor - ensemble.RandomTreesEmbedding - ensemble.StackingClassifier - ensemble.StackingRegressor - ensemble.VotingClassifier - ensemble.VotingRegressor - ensemble.HistGradientBoostingRegressor - ensemble.HistGradientBoostingClassifier - - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - -.. _exceptions_ref: - -:mod:`sklearn.exceptions`: Exceptions and warnings -================================================== - -.. automodule:: sklearn.exceptions - :no-members: - :no-inherited-members: - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - exceptions.ConvergenceWarning - exceptions.DataConversionWarning - exceptions.DataDimensionalityWarning - exceptions.EfficiencyWarning - exceptions.FitFailedWarning - exceptions.NotFittedError - exceptions.UndefinedMetricWarning - - -:mod:`sklearn.experimental`: Experimental -========================================= - -.. automodule:: sklearn.experimental - :no-members: - :no-inherited-members: - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - - experimental.enable_hist_gradient_boosting - experimental.enable_iterative_imputer - experimental.enable_halving_search_cv - - -.. _feature_extraction_ref: - -:mod:`sklearn.feature_extraction`: Feature Extraction -===================================================== - -.. automodule:: sklearn.feature_extraction - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`feature_extraction` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - feature_extraction.DictVectorizer - feature_extraction.FeatureHasher - -From images ------------ - -.. automodule:: sklearn.feature_extraction.image - :no-members: - :no-inherited-members: - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - feature_extraction.image.extract_patches_2d - feature_extraction.image.grid_to_graph - feature_extraction.image.img_to_graph - feature_extraction.image.reconstruct_from_patches_2d - - :template: class.rst - - feature_extraction.image.PatchExtractor - -.. _text_feature_extraction_ref: - -From text ---------- - -.. automodule:: sklearn.feature_extraction.text - :no-members: - :no-inherited-members: - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - feature_extraction.text.CountVectorizer - feature_extraction.text.HashingVectorizer - feature_extraction.text.TfidfTransformer - feature_extraction.text.TfidfVectorizer - - -.. _feature_selection_ref: - -:mod:`sklearn.feature_selection`: Feature Selection -=================================================== - -.. 
automodule:: sklearn.feature_selection - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`feature_selection` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - feature_selection.GenericUnivariateSelect - feature_selection.SelectPercentile - feature_selection.SelectKBest - feature_selection.SelectFpr - feature_selection.SelectFdr - feature_selection.SelectFromModel - feature_selection.SelectFwe - feature_selection.SequentialFeatureSelector - feature_selection.RFE - feature_selection.RFECV - feature_selection.VarianceThreshold - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - feature_selection.chi2 - feature_selection.f_classif - feature_selection.f_regression - feature_selection.mutual_info_classif - feature_selection.mutual_info_regression - - -.. _gaussian_process_ref: - -:mod:`sklearn.gaussian_process`: Gaussian Processes -=================================================== - -.. automodule:: sklearn.gaussian_process - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`gaussian_process` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - gaussian_process.GaussianProcessClassifier - gaussian_process.GaussianProcessRegressor - -Kernels: - -.. autosummary:: - :toctree: generated/ - :template: class_with_call.rst - - gaussian_process.kernels.CompoundKernel - gaussian_process.kernels.ConstantKernel - gaussian_process.kernels.DotProduct - gaussian_process.kernels.ExpSineSquared - gaussian_process.kernels.Exponentiation - gaussian_process.kernels.Hyperparameter - gaussian_process.kernels.Kernel - gaussian_process.kernels.Matern - gaussian_process.kernels.PairwiseKernel - gaussian_process.kernels.Product - gaussian_process.kernels.RBF - gaussian_process.kernels.RationalQuadratic - gaussian_process.kernels.Sum - gaussian_process.kernels.WhiteKernel - - -.. _impute_ref: - -:mod:`sklearn.impute`: Impute -============================= - -.. automodule:: sklearn.impute - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`Impute` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - impute.SimpleImputer - impute.IterativeImputer - impute.MissingIndicator - impute.KNNImputer - - -.. _inspection_ref: - -:mod:`sklearn.inspection`: Inspection -===================================== - -.. automodule:: sklearn.inspection - :no-members: - :no-inherited-members: - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - inspection.partial_dependence - inspection.permutation_importance - -Plotting --------- - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - inspection.PartialDependenceDisplay - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - inspection.plot_partial_dependence - -.. _isotonic_ref: - -:mod:`sklearn.isotonic`: Isotonic regression -============================================ - -.. automodule:: sklearn.isotonic - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`isotonic` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - isotonic.IsotonicRegression - -.. 
autosummary:: - :toctree: generated - :template: function.rst - - isotonic.check_increasing - isotonic.isotonic_regression - - -.. _kernel_approximation_ref: - -:mod:`sklearn.kernel_approximation`: Kernel Approximation -========================================================= - -.. automodule:: sklearn.kernel_approximation - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`kernel_approximation` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - kernel_approximation.AdditiveChi2Sampler - kernel_approximation.Nystroem - kernel_approximation.PolynomialCountSketch - kernel_approximation.RBFSampler - kernel_approximation.SkewedChi2Sampler - -.. _kernel_ridge_ref: - -:mod:`sklearn.kernel_ridge`: Kernel Ridge Regression -==================================================== - -.. automodule:: sklearn.kernel_ridge - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`kernel_ridge` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - kernel_ridge.KernelRidge - -.. _linear_model_ref: - -:mod:`sklearn.linear_model`: Linear Models -========================================== - -.. automodule:: sklearn.linear_model - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`linear_model` section for further details. - -The following subsections are only rough guidelines: the same estimator can -fall into multiple categories, depending on its parameters. - -.. currentmodule:: sklearn - -Linear classifiers ------------------- -.. autosummary:: - :toctree: generated/ - :template: class.rst - - linear_model.LogisticRegression - linear_model.LogisticRegressionCV - linear_model.PassiveAggressiveClassifier - linear_model.Perceptron - linear_model.RidgeClassifier - linear_model.RidgeClassifierCV - linear_model.SGDClassifier - -Classical linear regressors ---------------------------- - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - linear_model.LinearRegression - linear_model.Ridge - linear_model.RidgeCV - linear_model.SGDRegressor - -Regressors with variable selection ----------------------------------- - -The following estimators have built-in variable selection fitting -procedures, but any estimator using a L1 or elastic-net penalty also -performs variable selection: typically :class:`~linear_model.SGDRegressor` -or :class:`~sklearn.linear_model.SGDClassifier` with an appropriate penalty. - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - linear_model.ElasticNet - linear_model.ElasticNetCV - linear_model.Lars - linear_model.LarsCV - linear_model.Lasso - linear_model.LassoCV - linear_model.LassoLars - linear_model.LassoLarsCV - linear_model.LassoLarsIC - linear_model.OrthogonalMatchingPursuit - linear_model.OrthogonalMatchingPursuitCV - -Bayesian regressors -------------------- - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - linear_model.ARDRegression - linear_model.BayesianRidge - -Multi-task linear regressors with variable selection ----------------------------------------------------- - -These estimators fit multiple regression problems (or tasks) jointly, while -inducing sparse coefficients. While the inferred coefficients may differ -between the tasks, they are constrained to agree on the features that are -selected (non-zero coefficients). - -.. 
autosummary:: - :toctree: generated/ - :template: class.rst - - linear_model.MultiTaskElasticNet - linear_model.MultiTaskElasticNetCV - linear_model.MultiTaskLasso - linear_model.MultiTaskLassoCV - -Outlier-robust regressors -------------------------- - -Any estimator using the Huber loss would also be robust to outliers, e.g. -:class:`~linear_model.SGDRegressor` with ``loss='huber'``. - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - linear_model.HuberRegressor - linear_model.RANSACRegressor - linear_model.TheilSenRegressor - -Generalized linear models (GLM) for regression ----------------------------------------------- - -These models allow for response variables to have error distributions other -than a normal distribution: - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - linear_model.PoissonRegressor - linear_model.TweedieRegressor - linear_model.GammaRegressor - - -Miscellaneous -------------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - linear_model.PassiveAggressiveRegressor - linear_model.enet_path - linear_model.lars_path - linear_model.lars_path_gram - linear_model.lasso_path - linear_model.orthogonal_mp - linear_model.orthogonal_mp_gram - linear_model.ridge_regression - - -.. _manifold_ref: - -:mod:`sklearn.manifold`: Manifold Learning -========================================== - -.. automodule:: sklearn.manifold - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`manifold` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated - :template: class.rst - - manifold.Isomap - manifold.LocallyLinearEmbedding - manifold.MDS - manifold.SpectralEmbedding - manifold.TSNE - -.. autosummary:: - :toctree: generated - :template: function.rst - - manifold.locally_linear_embedding - manifold.smacof - manifold.spectral_embedding - manifold.trustworthiness - - -.. _metrics_ref: - -:mod:`sklearn.metrics`: Metrics -=============================== - -See the :ref:`model_evaluation` section and the :ref:`metrics` section of the -user guide for further details. - -.. automodule:: sklearn.metrics - :no-members: - :no-inherited-members: - -.. currentmodule:: sklearn - -Model Selection Interface -------------------------- -See the :ref:`scoring_parameter` section of the user guide for further -details. - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - metrics.check_scoring - metrics.get_scorer - metrics.make_scorer - -Classification metrics ----------------------- - -See the :ref:`classification_metrics` section of the user guide for further -details. - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - metrics.accuracy_score - metrics.auc - metrics.average_precision_score - metrics.balanced_accuracy_score - metrics.brier_score_loss - metrics.classification_report - metrics.cohen_kappa_score - metrics.confusion_matrix - metrics.dcg_score - metrics.det_curve - metrics.f1_score - metrics.fbeta_score - metrics.hamming_loss - metrics.hinge_loss - metrics.jaccard_score - metrics.log_loss - metrics.matthews_corrcoef - metrics.multilabel_confusion_matrix - metrics.ndcg_score - metrics.precision_recall_curve - metrics.precision_recall_fscore_support - metrics.precision_score - metrics.recall_score - metrics.roc_auc_score - metrics.roc_curve - metrics.top_k_accuracy_score - metrics.zero_one_loss - -Regression metrics ------------------- - -See the :ref:`regression_metrics` section of the user guide for further -details. - -.. 
autosummary:: - :toctree: generated/ - :template: function.rst - - metrics.explained_variance_score - metrics.max_error - metrics.mean_absolute_error - metrics.mean_squared_error - metrics.mean_squared_log_error - metrics.median_absolute_error - metrics.mean_absolute_percentage_error - metrics.r2_score - metrics.mean_poisson_deviance - metrics.mean_gamma_deviance - metrics.mean_tweedie_deviance - metrics.mean_pinball_loss - -Multilabel ranking metrics --------------------------- -See the :ref:`multilabel_ranking_metrics` section of the user guide for further -details. - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - metrics.coverage_error - metrics.label_ranking_average_precision_score - metrics.label_ranking_loss - - -Clustering metrics ------------------- - -See the :ref:`clustering_evaluation` section of the user guide for further -details. - -.. automodule:: sklearn.metrics.cluster - :no-members: - :no-inherited-members: - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - metrics.adjusted_mutual_info_score - metrics.adjusted_rand_score - metrics.calinski_harabasz_score - metrics.davies_bouldin_score - metrics.completeness_score - metrics.cluster.contingency_matrix - metrics.cluster.pair_confusion_matrix - metrics.fowlkes_mallows_score - metrics.homogeneity_completeness_v_measure - metrics.homogeneity_score - metrics.mutual_info_score - metrics.normalized_mutual_info_score - metrics.rand_score - metrics.silhouette_score - metrics.silhouette_samples - metrics.v_measure_score - -Biclustering metrics --------------------- - -See the :ref:`biclustering_evaluation` section of the user guide for -further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - metrics.consensus_score - - -Pairwise metrics ----------------- - -See the :ref:`metrics` section of the user guide for further details. - -.. automodule:: sklearn.metrics.pairwise - :no-members: - :no-inherited-members: - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - metrics.pairwise.additive_chi2_kernel - metrics.pairwise.chi2_kernel - metrics.pairwise.cosine_similarity - metrics.pairwise.cosine_distances - metrics.pairwise.distance_metrics - metrics.pairwise.euclidean_distances - metrics.pairwise.haversine_distances - metrics.pairwise.kernel_metrics - metrics.pairwise.laplacian_kernel - metrics.pairwise.linear_kernel - metrics.pairwise.manhattan_distances - metrics.pairwise.nan_euclidean_distances - metrics.pairwise.pairwise_kernels - metrics.pairwise.polynomial_kernel - metrics.pairwise.rbf_kernel - metrics.pairwise.sigmoid_kernel - metrics.pairwise.paired_euclidean_distances - metrics.pairwise.paired_manhattan_distances - metrics.pairwise.paired_cosine_distances - metrics.pairwise.paired_distances - metrics.pairwise_distances - metrics.pairwise_distances_argmin - metrics.pairwise_distances_argmin_min - metrics.pairwise_distances_chunked - - -Plotting --------- - -See the :ref:`visualizations` section of the user guide for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - metrics.plot_confusion_matrix - metrics.plot_det_curve - metrics.plot_precision_recall_curve - metrics.plot_roc_curve - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - metrics.ConfusionMatrixDisplay - metrics.DetCurveDisplay - metrics.PrecisionRecallDisplay - metrics.RocCurveDisplay - - -.. 
_mixture_ref: - -:mod:`sklearn.mixture`: Gaussian Mixture Models -=============================================== - -.. automodule:: sklearn.mixture - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`mixture` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - mixture.BayesianGaussianMixture - mixture.GaussianMixture - -.. _modelselection_ref: - -:mod:`sklearn.model_selection`: Model Selection -=============================================== - -.. automodule:: sklearn.model_selection - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`cross_validation`, :ref:`grid_search` and -:ref:`learning_curve` sections for further details. - -Splitter Classes ----------------- - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - model_selection.GroupKFold - model_selection.GroupShuffleSplit - model_selection.KFold - model_selection.LeaveOneGroupOut - model_selection.LeavePGroupsOut - model_selection.LeaveOneOut - model_selection.LeavePOut - model_selection.PredefinedSplit - model_selection.RepeatedKFold - model_selection.RepeatedStratifiedKFold - model_selection.ShuffleSplit - model_selection.StratifiedKFold - model_selection.StratifiedShuffleSplit - model_selection.TimeSeriesSplit - -Splitter Functions ------------------- - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - model_selection.check_cv - model_selection.train_test_split - -.. _hyper_parameter_optimizers: - -Hyper-parameter optimizers --------------------------- - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - model_selection.GridSearchCV - model_selection.HalvingGridSearchCV - model_selection.ParameterGrid - model_selection.ParameterSampler - model_selection.RandomizedSearchCV - model_selection.HalvingRandomSearchCV - - -Model validation ----------------- - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - model_selection.cross_validate - model_selection.cross_val_predict - model_selection.cross_val_score - model_selection.learning_curve - model_selection.permutation_test_score - model_selection.validation_curve - -.. _multiclass_ref: - -:mod:`sklearn.multiclass`: Multiclass classification -==================================================== - -.. automodule:: sklearn.multiclass - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`multiclass_classification` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated - :template: class.rst - - multiclass.OneVsRestClassifier - multiclass.OneVsOneClassifier - multiclass.OutputCodeClassifier - -.. _multioutput_ref: - -:mod:`sklearn.multioutput`: Multioutput regression and classification -===================================================================== - -.. automodule:: sklearn.multioutput - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`multilabel_classification`, -:ref:`multiclass_multioutput_classification`, and -:ref:`multioutput_regression` sections for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated - :template: class.rst - - multioutput.ClassifierChain - multioutput.MultiOutputRegressor - multioutput.MultiOutputClassifier - multioutput.RegressorChain - -.. 
_naive_bayes_ref: - -:mod:`sklearn.naive_bayes`: Naive Bayes -======================================= - -.. automodule:: sklearn.naive_bayes - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`naive_bayes` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - naive_bayes.BernoulliNB - naive_bayes.CategoricalNB - naive_bayes.ComplementNB - naive_bayes.GaussianNB - naive_bayes.MultinomialNB - - -.. _neighbors_ref: - -:mod:`sklearn.neighbors`: Nearest Neighbors -=========================================== - -.. automodule:: sklearn.neighbors - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`neighbors` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - neighbors.BallTree - neighbors.DistanceMetric - neighbors.KDTree - neighbors.KernelDensity - neighbors.KNeighborsClassifier - neighbors.KNeighborsRegressor - neighbors.KNeighborsTransformer - neighbors.LocalOutlierFactor - neighbors.RadiusNeighborsClassifier - neighbors.RadiusNeighborsRegressor - neighbors.RadiusNeighborsTransformer - neighbors.NearestCentroid - neighbors.NearestNeighbors - neighbors.NeighborhoodComponentsAnalysis - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - neighbors.kneighbors_graph - neighbors.radius_neighbors_graph - -.. _neural_network_ref: - -:mod:`sklearn.neural_network`: Neural network models -==================================================== - -.. automodule:: sklearn.neural_network - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`neural_networks_supervised` and :ref:`neural_networks_unsupervised` sections for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - neural_network.BernoulliRBM - neural_network.MLPClassifier - neural_network.MLPRegressor - -.. _pipeline_ref: - -:mod:`sklearn.pipeline`: Pipeline -================================= - -.. automodule:: sklearn.pipeline - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`combining_estimators` section for further -details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - pipeline.FeatureUnion - pipeline.Pipeline - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - pipeline.make_pipeline - pipeline.make_union - -.. _preprocessing_ref: - -:mod:`sklearn.preprocessing`: Preprocessing and Normalization -============================================================= - -.. automodule:: sklearn.preprocessing - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`preprocessing` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - preprocessing.Binarizer - preprocessing.FunctionTransformer - preprocessing.KBinsDiscretizer - preprocessing.KernelCenterer - preprocessing.LabelBinarizer - preprocessing.LabelEncoder - preprocessing.MultiLabelBinarizer - preprocessing.MaxAbsScaler - preprocessing.MinMaxScaler - preprocessing.Normalizer - preprocessing.OneHotEncoder - preprocessing.OrdinalEncoder - preprocessing.PolynomialFeatures - preprocessing.PowerTransformer - preprocessing.QuantileTransformer - preprocessing.RobustScaler - preprocessing.SplineTransformer - preprocessing.StandardScaler - -.. 
autosummary:: - :toctree: generated/ - :template: function.rst - - preprocessing.add_dummy_feature - preprocessing.binarize - preprocessing.label_binarize - preprocessing.maxabs_scale - preprocessing.minmax_scale - preprocessing.normalize - preprocessing.quantile_transform - preprocessing.robust_scale - preprocessing.scale - preprocessing.power_transform - - -.. _random_projection_ref: - -:mod:`sklearn.random_projection`: Random projection -=================================================== - -.. automodule:: sklearn.random_projection - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`random_projection` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - random_projection.GaussianRandomProjection - random_projection.SparseRandomProjection - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - random_projection.johnson_lindenstrauss_min_dim - - -.. _semi_supervised_ref: - -:mod:`sklearn.semi_supervised`: Semi-Supervised Learning -======================================================== - -.. automodule:: sklearn.semi_supervised - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`semi_supervised` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - semi_supervised.LabelPropagation - semi_supervised.LabelSpreading - semi_supervised.SelfTrainingClassifier - - -.. _svm_ref: - -:mod:`sklearn.svm`: Support Vector Machines -=========================================== - -.. automodule:: sklearn.svm - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`svm` section for further details. - -Estimators ----------- - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - svm.LinearSVC - svm.LinearSVR - svm.NuSVC - svm.NuSVR - svm.OneClassSVM - svm.SVC - svm.SVR - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - svm.l1_min_c - -.. _tree_ref: - -:mod:`sklearn.tree`: Decision Trees -=================================== - -.. automodule:: sklearn.tree - :no-members: - :no-inherited-members: - -**User guide:** See the :ref:`tree` section for further details. - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - tree.DecisionTreeClassifier - tree.DecisionTreeRegressor - tree.ExtraTreeClassifier - tree.ExtraTreeRegressor - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - tree.export_graphviz - tree.export_text - -Plotting --------- - -.. currentmodule:: sklearn - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - tree.plot_tree - -.. _utils_ref: - -:mod:`sklearn.utils`: Utilities -=============================== - -.. automodule:: sklearn.utils - :no-members: - :no-inherited-members: - -**Developer guide:** See the :ref:`developers-utils` page for further details. - -.. currentmodule:: sklearn - -.. 
autosummary:: - :toctree: generated/ - :template: function.rst - - utils.arrayfuncs.min_pos - utils.as_float_array - utils.assert_all_finite - utils.Bunch - utils.check_X_y - utils.check_array - utils.check_scalar - utils.check_consistent_length - utils.check_random_state - utils.class_weight.compute_class_weight - utils.class_weight.compute_sample_weight - utils.deprecated - utils.estimator_checks.check_estimator - utils.estimator_checks.parametrize_with_checks - utils.estimator_html_repr - utils.extmath.safe_sparse_dot - utils.extmath.randomized_range_finder - utils.extmath.randomized_svd - utils.extmath.fast_logdet - utils.extmath.density - utils.extmath.weighted_mode - utils.gen_even_slices - utils.graph.single_source_shortest_path_length - utils.graph_shortest_path.graph_shortest_path - utils.indexable - utils.metaestimators.if_delegate_has_method - utils.multiclass.type_of_target - utils.multiclass.is_multilabel - utils.multiclass.unique_labels - utils.murmurhash3_32 - utils.resample - utils._safe_indexing - utils.safe_mask - utils.safe_sqr - utils.shuffle - utils.sparsefuncs.incr_mean_variance_axis - utils.sparsefuncs.inplace_column_scale - utils.sparsefuncs.inplace_row_scale - utils.sparsefuncs.inplace_swap_row - utils.sparsefuncs.inplace_swap_column - utils.sparsefuncs.mean_variance_axis - utils.sparsefuncs.inplace_csr_column_scale - utils.sparsefuncs_fast.inplace_csr_row_normalize_l1 - utils.sparsefuncs_fast.inplace_csr_row_normalize_l2 - utils.random.sample_without_replacement - utils.validation.check_is_fitted - utils.validation.check_memory - utils.validation.check_symmetric - utils.validation.column_or_1d - utils.validation.has_fit_parameter - utils.all_estimators - -Utilities from joblib: - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - utils.parallel_backend - utils.register_parallel_backend - - -Recently deprecated -=================== - -To be removed in 1.0 (renaming of 0.25) ---------------------------------------- diff --git a/doc/modules/classification_threshold.rst b/doc/modules/classification_threshold.rst new file mode 100644 index 0000000000000..ee7028f469b5f --- /dev/null +++ b/doc/modules/classification_threshold.rst @@ -0,0 +1,159 @@ +.. currentmodule:: sklearn.model_selection + +.. _TunedThresholdClassifierCV: + +================================================== +Tuning the decision threshold for class prediction +================================================== + +Classification is best divided into two parts: + +* the statistical problem of learning a model to predict, ideally, class probabilities; +* the decision problem to take concrete action based on those probability predictions. + +Let's take a straightforward example related to weather forecasting: the first point is +related to answering "what is the chance that it will rain tomorrow?" while the second +point is related to answering "should I take an umbrella tomorrow?". + +When it comes to the scikit-learn API, the first point is addressed by providing scores +using :term:`predict_proba` or :term:`decision_function`. The former returns conditional +probability estimates :math:`P(y|X)` for each class, while the latter returns a decision +score for each class. + +The decision corresponding to the labels is obtained with :term:`predict`. In binary +classification, a decision rule or action is then defined by thresholding the scores, +leading to the prediction of a single class label for each sample. 
For binary +classification in scikit-learn, class labels predictions are obtained by hard-coded +cut-off rules: a positive class is predicted when the conditional probability +:math:`P(y|X)` is greater than 0.5 (obtained with :term:`predict_proba`) or if the +decision score is greater than 0 (obtained with :term:`decision_function`). + +Here, we show an example that illustrates the relation between conditional +probability estimates :math:`P(y|X)` and class labels:: + + >>> from sklearn.datasets import make_classification + >>> from sklearn.tree import DecisionTreeClassifier + >>> X, y = make_classification(random_state=0) + >>> classifier = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y) + >>> classifier.predict_proba(X[:4]) + array([[0.94 , 0.06 ], + [0.94 , 0.06 ], + [0.0416, 0.9583], + [0.0416, 0.9583]]) + >>> classifier.predict(X[:4]) + array([0, 0, 1, 1]) + +While these hard-coded rules might at first seem reasonable as default behavior, they +are most certainly not ideal for most use cases. Let's illustrate with an example. + +Consider a scenario where a predictive model is being deployed to assist +physicians in detecting tumors. In this setting, physicians will most likely be +interested in identifying all patients with cancer and not missing anyone with cancer so +that they can provide them with the right treatment. In other words, physicians +prioritize achieving a high recall rate. This emphasis on recall comes, of course, with +the trade-off of potentially more false-positive predictions, reducing the precision of +the model. That is a risk physicians are willing to take because the cost of a missed +cancer is much higher than the cost of further diagnostic tests. Consequently, when it +comes to deciding whether to classify a patient as having cancer or not, it may be more +beneficial to classify them as positive for cancer when the conditional probability +estimate is much lower than 0.5. + +Post-tuning the decision threshold +================================== + +One solution to address the problem stated in the introduction is to tune the decision +threshold of the classifier once the model has been trained. The +:class:`~sklearn.model_selection.TunedThresholdClassifierCV` tunes this threshold using +an internal cross-validation. The optimum threshold is chosen to maximize a given +metric. + +The following image illustrates the tuning of the decision threshold for a gradient +boosting classifier. While the vanilla and tuned classifiers provide the same +:term:`predict_proba` outputs and thus the same Receiver Operating Characteristic (ROC) +and Precision-Recall curves, the class label predictions differ because of the tuned +decision threshold. The vanilla classifier predicts the class of interest for a +conditional probability greater than 0.5 while the tuned classifier predicts the class +of interest for a very low probability (around 0.02). This decision threshold optimizes +a utility metric defined by the business (in this case an insurance company). + +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cost_sensitive_learning_002.png + :target: ../auto_examples/model_selection/plot_cost_sensitive_learning.html + :align: center + +Options to tune the decision threshold +-------------------------------------- + +The decision threshold can be tuned through different strategies controlled by the +parameter `scoring`. + +One way to tune the threshold is by maximizing a pre-defined scikit-learn metric. 
These +metrics can be found by calling the function :func:`~sklearn.metrics.get_scorer_names`. +By default, the balanced accuracy is the metric used but be aware that one should choose +a meaningful metric for their use case. + +.. note:: + + It is important to notice that these metrics come with default parameters, notably + the label of the class of interest (i.e. `pos_label`). Thus, if this label is not + the right one for your application, you need to define a scorer and pass the right + `pos_label` (and additional parameters) using the + :func:`~sklearn.metrics.make_scorer`. Refer to :ref:`scoring_callable` to get + information to define your own scoring function. For instance, we show how to pass + the information to the scorer that the label of interest is `0` when maximizing the + :func:`~sklearn.metrics.f1_score`:: + + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import TunedThresholdClassifierCV + >>> from sklearn.metrics import make_scorer, f1_score + >>> X, y = make_classification( + ... n_samples=1_000, weights=[0.1, 0.9], random_state=0) + >>> pos_label = 0 + >>> scorer = make_scorer(f1_score, pos_label=pos_label) + >>> base_model = LogisticRegression() + >>> model = TunedThresholdClassifierCV(base_model, scoring=scorer) + >>> scorer(model.fit(X, y), X, y) + 0.88 + >>> # compare it with the internal score found by cross-validation + >>> model.best_score_ + np.float64(0.86) + +Important notes regarding the internal cross-validation +------------------------------------------------------- + +By default :class:`~sklearn.model_selection.TunedThresholdClassifierCV` uses a 5-fold +stratified cross-validation to tune the decision threshold. The parameter `cv` allows to +control the cross-validation strategy. It is possible to bypass cross-validation by +setting `cv="prefit"` and providing a fitted classifier. In this case, the decision +threshold is tuned on the data provided to the `fit` method. + +However, you should be extremely careful when using this option. You should never use +the same data for training the classifier and tuning the decision threshold due to the +risk of overfitting. Refer to the following example section for more details (cf. +:ref:`TunedThresholdClassifierCV_no_cv`). If you have limited resources, consider using +a float number for `cv` to limit to an internal single train-test split. + +The option `cv="prefit"` should only be used when the provided classifier was already +trained, and you just want to find the best decision threshold using a new validation +set. + +.. _FixedThresholdClassifier: + +Manually setting the decision threshold +--------------------------------------- + +The previous sections discussed strategies to find an optimal decision threshold. It is +also possible to manually set the decision threshold using the class +:class:`~sklearn.model_selection.FixedThresholdClassifier`. In case that you don't want +to refit the model when calling `fit`, wrap your sub-estimator with a +:class:`~sklearn.frozen.FrozenEstimator` and do +``FixedThresholdClassifier(FrozenEstimator(estimator), ...)``. + +Examples +-------- + +- See the example entitled + :ref:`sphx_glr_auto_examples_model_selection_plot_tuned_decision_threshold.py`, + to get insights on the post-tuning of the decision threshold. +- See the example entitled + :ref:`sphx_glr_auto_examples_model_selection_plot_cost_sensitive_learning.py`, + to learn about cost-sensitive learning and decision threshold tuning. 
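As a minimal sketch of the manual thresholding pattern described above, the snippet
below wraps an already fitted classifier with :class:`~sklearn.frozen.FrozenEstimator`
and lowers the cut-off from the default 0.5 to 0.1. The 0.1 value is purely
illustrative, and the ``threshold`` and ``response_method="predict_proba"`` constructor
parameters are assumed here for :class:`~sklearn.model_selection.FixedThresholdClassifier`::

    >>> from sklearn.datasets import make_classification
    >>> from sklearn.frozen import FrozenEstimator
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.model_selection import FixedThresholdClassifier
    >>> X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)
    >>> classifier = LogisticRegression(random_state=0).fit(X, y)
    >>> # Freeze the fitted classifier so that `fit` below does not retrain it, and
    >>> # predict the positive class whenever P(y=1|X) exceeds 0.1 instead of 0.5.
    >>> model = FixedThresholdClassifier(
    ...     FrozenEstimator(classifier), threshold=0.1, response_method="predict_proba"
    ... ).fit(X, y)
    >>> # A lower cut-off can only increase the number of positive predictions.
    >>> bool((model.predict(X) == 1).sum() >= (classifier.predict(X) == 1).sum())
    True

A fixed threshold like this is appropriate when the cut-off is dictated by the
application (for instance a domain-specific rule) rather than optimized on data, which
is the role of :class:`~sklearn.model_selection.TunedThresholdClassifierCV` described
earlier.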
diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 17ae9eb2651c6..cdf8421a103e3 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -89,14 +89,22 @@ Overview of clustering methods * - :ref:`DBSCAN ` - neighborhood size - Very large ``n_samples``, medium ``n_clusters`` - - Non-flat geometry, uneven cluster sizes, transductive + - Non-flat geometry, uneven cluster sizes, outlier removal, + transductive + - Distances between nearest points + + * - :ref:`HDBSCAN ` + - minimum cluster membership, minimum point neighbors + - large ``n_samples``, medium ``n_clusters`` + - Non-flat geometry, uneven cluster sizes, outlier removal, + transductive, hierarchical, variable cluster density - Distances between nearest points * - :ref:`OPTICS ` - minimum cluster membership - Very large ``n_samples``, large ``n_clusters`` - Non-flat geometry, uneven cluster sizes, variable cluster density, - transductive + outlier removal, transductive - Distances between points * - :ref:`Gaussian mixtures ` @@ -111,6 +119,13 @@ Overview of clustering methods - Large dataset, outlier removal, data reduction, inductive - Euclidean distance between points + * - :ref:`Bisecting K-Means ` + - number of clusters + - Very large ``n_samples``, medium ``n_clusters`` + - General-purpose, even cluster size, flat geometry, + no empty clusters, inductive, hierarchical + - Distances between points + Non-flat geometry clustering is useful when the clusters have a specific shape, i.e. a non-flat manifold, and the standard euclidean distance is not the right metric. This case arises in the two top rows of the figure @@ -125,6 +140,11 @@ model with equal covariance per component. :term:`inductive` clustering methods) are not designed to be applied to new, unseen data. +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_cluster_plot_inductive_clustering.py`: An example + of an inductive clustering model for handling new data. + .. _k_means: K-means @@ -133,7 +153,7 @@ K-means The :class:`KMeans` algorithm clusters data by trying to separate samples in n groups of equal variance, minimizing a criterion known as the *inertia* or within-cluster sum-of-squares (see below). This algorithm requires the number -of clusters to be specified. It scales well to large number of samples and has +of clusters to be specified. It scales well to large numbers of samples and has been used across a large range of application areas in many different fields. The k-means algorithm divides a set of :math:`N` samples :math:`X` into @@ -162,11 +182,15 @@ It suffers from various drawbacks: k-means clustering can alleviate this problem and speed up the computations. -.. image:: ../auto_examples/cluster/images/sphx_glr_plot_kmeans_assumptions_001.png +.. image:: ../auto_examples/cluster/images/sphx_glr_plot_kmeans_assumptions_002.png :target: ../auto_examples/cluster/plot_kmeans_assumptions.html :align: center :scale: 50 +For more detailed descriptions of the issues shown above and how to address them, +refer to the examples :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_assumptions.py` +and :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`. + K-means is often referred to as Lloyd's algorithm. In basic terms, the algorithm has three steps. The first step chooses the initial centroids, with the most basic method being to choose :math:`k` samples from the dataset @@ -202,10 +226,13 @@ As a result, the computation is often done several times, with different initializations of the centroids. 
One method to help address this issue is the k-means++ initialization scheme, which has been implemented in scikit-learn (use the ``init='k-means++'`` parameter). This initializes the centroids to be -(generally) distant from each other, leading to provably better results than -random initialization, as shown in the reference. +(generally) distant from each other, leading to probably better results than +random initialization, as shown in the reference. For detailed examples of +comparing different initialization schemes, refer to +:ref:`sphx_glr_auto_examples_cluster_plot_kmeans_digits.py` and +:ref:`sphx_glr_auto_examples_cluster_plot_kmeans_stability_low_dim_dense.py`. -K-means++ can also be called independently to select seeds for other +K-means++ can also be called independently to select seeds for other clustering algorithms, see :func:`sklearn.cluster.kmeans_plusplus` for details and example usage. @@ -215,8 +242,13 @@ computing cluster centers and values of inertia. For example, assigning a weight of 2 to a sample is equivalent to adding a duplicate of that sample to the dataset :math:`X`. -K-means can be used for vector quantization. This is achieved using the -transform method of a trained model of :class:`KMeans`. +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`: Document clustering + using :class:`KMeans` and :class:`MiniBatchKMeans` based on sparse data + +* :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_plusplus.py`: Using K-means++ + to select seeds for other clustering algorithms. Low-level parallelism --------------------- @@ -226,19 +258,20 @@ chunks of data (256 samples) are processed in parallel, which in addition yields a low memory footprint. For more details on how to control the number of threads, please refer to our :ref:`parallelism` notes. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_assumptions.py`: Demonstrating when - k-means performs intuitively and when it does not - * :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_digits.py`: Clustering handwritten digits +* :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_assumptions.py`: Demonstrating when + k-means performs intuitively and when it does not +* :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_digits.py`: Clustering handwritten digits -.. topic:: References: +.. dropdown:: References + + * `"k-means++: The advantages of careful seeding" + `_ + Arthur, David, and Sergei Vassilvitskii, + *Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete + algorithms*, Society for Industrial and Applied Mathematics (2007) - * `"k-means++: The advantages of careful seeding" - `_ - Arthur, David, and Sergei Vassilvitskii, - *Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete - algorithms*, Society for Industrial and Applied Mathematics (2007) .. _mini_batch_kmeans: @@ -274,23 +307,22 @@ small, as shown in the example and cited reference. :scale: 100 -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_cluster_plot_mini_batch_kmeans.py`: Comparison of KMeans and - MiniBatchKMeans +.. 
rubric:: Examples - * :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`: Document clustering using sparse - MiniBatchKMeans +* :ref:`sphx_glr_auto_examples_cluster_plot_mini_batch_kmeans.py`: Comparison of + :class:`KMeans` and :class:`MiniBatchKMeans` - * :ref:`sphx_glr_auto_examples_cluster_plot_dict_face_patches.py` +* :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`: Document clustering + using :class:`KMeans` and :class:`MiniBatchKMeans` based on sparse data +* :ref:`sphx_glr_auto_examples_cluster_plot_dict_face_patches.py` -.. topic:: References: +.. dropdown:: References - * `"Web Scale K-Means clustering" - `_ - D. Sculley, *Proceedings of the 19th international conference on World - wide web* (2010) + * `"Web Scale K-Means clustering" + `_ + D. Sculley, *Proceedings of the 19th international conference on World + wide web* (2010) .. _affinity_propagation: @@ -327,52 +359,51 @@ convergence. Further, the memory complexity is of the order sparse similarity matrix is used. This makes Affinity Propagation most appropriate for small to medium sized datasets. -.. topic:: Examples: +.. dropdown:: Algorithm description - * :ref:`sphx_glr_auto_examples_cluster_plot_affinity_propagation.py`: Affinity - Propagation on a synthetic 2D datasets with 3 classes. + The messages sent between points belong to one of two categories. The first is + the responsibility :math:`r(i, k)`, which is the accumulated evidence that + sample :math:`k` should be the exemplar for sample :math:`i`. The second is the + availability :math:`a(i, k)` which is the accumulated evidence that sample + :math:`i` should choose sample :math:`k` to be its exemplar, and considers the + values for all other samples that :math:`k` should be an exemplar. In this way, + exemplars are chosen by samples if they are (1) similar enough to many samples + and (2) chosen by many samples to be representative of themselves. - * :ref:`sphx_glr_auto_examples_applications_plot_stock_market.py` Affinity Propagation on - Financial time series to find groups of companies + More formally, the responsibility of a sample :math:`k` to be the exemplar of + sample :math:`i` is given by: + .. math:: -**Algorithm description:** -The messages sent between points belong to one of two categories. The first is -the responsibility :math:`r(i, k)`, -which is the accumulated evidence that sample :math:`k` -should be the exemplar for sample :math:`i`. -The second is the availability :math:`a(i, k)` -which is the accumulated evidence that sample :math:`i` -should choose sample :math:`k` to be its exemplar, -and considers the values for all other samples that :math:`k` should -be an exemplar. In this way, exemplars are chosen by samples if they are (1) -similar enough to many samples and (2) chosen by many samples to be -representative of themselves. + r(i, k) \leftarrow s(i, k) - max [ a(i, k') + s(i, k') \forall k' \neq k ] -More formally, the responsibility of a sample :math:`k` -to be the exemplar of sample :math:`i` is given by: + Where :math:`s(i, k)` is the similarity between samples :math:`i` and :math:`k`. + The availability of sample :math:`k` to be the exemplar of sample :math:`i` is + given by: -.. math:: + .. math:: - r(i, k) \leftarrow s(i, k) - max [ a(i, k') + s(i, k') \forall k' \neq k ] + a(i, k) \leftarrow min [0, r(k, k) + \sum_{i'~s.t.~i' \notin \{i, k\}}{r(i', + k)}] -Where :math:`s(i, k)` is the similarity between samples :math:`i` and :math:`k`. 
-The availability of sample :math:`k` -to be the exemplar of sample :math:`i` is given by: + To begin with, all values for :math:`r` and :math:`a` are set to zero, and the + calculation of each iterates until convergence. As discussed above, in order to + avoid numerical oscillations when updating the messages, the damping factor + :math:`\lambda` is introduced to iteration process: -.. math:: + .. math:: r_{t+1}(i, k) = \lambda\cdot r_{t}(i, k) + (1-\lambda)\cdot r_{t+1}(i, k) + .. math:: a_{t+1}(i, k) = \lambda\cdot a_{t}(i, k) + (1-\lambda)\cdot a_{t+1}(i, k) - a(i, k) \leftarrow min [0, r(k, k) + \sum_{i'~s.t.~i' \notin \{i, k\}}{r(i', k)}] + where :math:`t` indicates the iteration times. -To begin with, all values for :math:`r` and :math:`a` are set to zero, -and the calculation of each iterates until convergence. -As discussed above, in order to avoid numerical oscillations when updating the -messages, the damping factor :math:`\lambda` is introduced to iteration process: -.. math:: r_{t+1}(i, k) = \lambda\cdot r_{t}(i, k) + (1-\lambda)\cdot r_{t+1}(i, k) -.. math:: a_{t+1}(i, k) = \lambda\cdot a_{t}(i, k) + (1-\lambda)\cdot a_{t+1}(i, k) +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_cluster_plot_affinity_propagation.py`: Affinity + Propagation on a synthetic 2D datasets with 3 classes +* :ref:`sphx_glr_auto_examples_applications_plot_stock_market.py` Affinity Propagation + on financial time series to find groups of companies -where :math:`t` indicates the iteration times. .. _mean_shift: @@ -384,22 +415,40 @@ for centroids to be the mean of the points within a given region. These candidates are then filtered in a post-processing stage to eliminate near-duplicates to form the final set of centroids. -Given a candidate centroid :math:`x_i` for iteration :math:`t`, the candidate -is updated according to the following equation: +.. dropdown:: Mathematical details -.. math:: + The position of centroid candidates is iteratively adjusted using a technique + called hill climbing, which finds local maxima of the estimated probability + density. Given a candidate centroid :math:`x` for iteration :math:`t`, the + candidate is updated according to the following equation: - x_i^{t+1} = m(x_i^t) + .. math:: -Where :math:`N(x_i)` is the neighborhood of samples within a given distance -around :math:`x_i` and :math:`m` is the *mean shift* vector that is computed for each -centroid that points towards a region of the maximum increase in the density of points. -This is computed using the following equation, effectively updating a centroid -to be the mean of the samples within its neighborhood: + x^{t+1} = x^t + m(x^t) -.. math:: + Where :math:`m` is the *mean shift* vector that is computed for each centroid + that points towards a region of the maximum increase in the density of points. + To compute :math:`m` we define :math:`N(x)` as the neighborhood of samples + within a given distance around :math:`x`. Then :math:`m` is computed using the + following equation, effectively updating a centroid to be the mean of the + samples within its neighborhood: + + .. math:: + + m(x) = \frac{1}{|N(x)|} \sum_{x_j \in N(x)}x_j - x + + In general, the equation for :math:`m` depends on a kernel used for density + estimation. The generic formula is: + + .. math:: + + m(x) = \frac{\sum_{x_j \in N(x)}K(x_j - x)x_j}{\sum_{x_j \in N(x)}K(x_j - + x)} - x + + In our implementation, :math:`K(x)` is equal to 1 if :math:`x` is small enough + and is equal to 0 otherwise. 
Effectively :math:`K(y - x)` indicates whether + :math:`y` is in the neighborhood of :math:`x`. - m(x_i) = \frac{\sum_{x_j \in N(x_i)}K(x_j - x_i)x_j}{\sum_{x_j \in N(x_i)}K(x_j - x_i)} The algorithm automatically sets the number of clusters, instead of relying on a parameter ``bandwidth``, which dictates the size of the region to search through. @@ -421,16 +470,16 @@ given sample. :scale: 50 -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_cluster_plot_mean_shift.py`: Mean Shift clustering - on a synthetic 2D datasets with 3 classes. +* :ref:`sphx_glr_auto_examples_cluster_plot_mean_shift.py`: Mean Shift clustering + on a synthetic 2D datasets with 3 classes. -.. topic:: References: +.. dropdown:: References - * `"Mean shift: A robust approach toward feature space analysis." - `_ - D. Comaniciu and P. Meer, *IEEE Transactions on Pattern Analysis and Machine Intelligence* (2002) + * :doi:`"Mean shift: A robust approach toward feature space analysis" + <10.1109/34.1000236>` D. Comaniciu and P. Meer, *IEEE Transactions on Pattern + Analysis and Machine Intelligence* (2002) .. _spectral_clustering: @@ -450,7 +499,7 @@ to be specified in advance. It works well for a small number of clusters, but is not advised for many clusters. For two clusters, SpectralClustering solves a convex relaxation of the -`normalised cuts `_ +`normalized cuts `_ problem on the similarity graph: cutting the graph in two so that the weight of the edges cut is small compared to the weights of the edges inside each cluster. This criteria is especially interesting when working on images, where @@ -481,21 +530,26 @@ computed using a function of a gradient of the image. See the examples for such an application. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_cluster_plot_segmentation_toy.py`: Segmenting objects - from a noisy background using spectral clustering. +* :ref:`sphx_glr_auto_examples_cluster_plot_segmentation_toy.py`: Segmenting objects + from a noisy background using spectral clustering. +* :ref:`sphx_glr_auto_examples_cluster_plot_coin_segmentation.py`: Spectral clustering + to split the image of coins in regions. - * :ref:`sphx_glr_auto_examples_cluster_plot_coin_segmentation.py`: Spectral clustering - to split the image of coins in regions. .. |coin_kmeans| image:: ../auto_examples/cluster/images/sphx_glr_plot_coin_segmentation_001.png - :target: ../auto_examples/cluster/plot_coin_segmentation.html - :scale: 65 + :target: ../auto_examples/cluster/plot_coin_segmentation.html + :scale: 35 .. |coin_discretize| image:: ../auto_examples/cluster/images/sphx_glr_plot_coin_segmentation_002.png - :target: ../auto_examples/cluster/plot_coin_segmentation.html - :scale: 65 + :target: ../auto_examples/cluster/plot_coin_segmentation.html + :scale: 35 + +.. |coin_cluster_qr| image:: ../auto_examples/cluster/images/sphx_glr_plot_coin_segmentation_003.png + :target: ../auto_examples/cluster/plot_coin_segmentation.html + :scale: 35 + Different label assignment strategies ------------------------------------- @@ -507,12 +561,27 @@ In particular, unless you control the ``random_state``, it may not be reproducible from run-to-run, as it depends on random initialization. The alternative ``"discretize"`` strategy is 100% reproducible, but tends to create parcels of fairly even and geometrical shape. +The recently added ``"cluster_qr"`` option is a deterministic alternative that +tends to create the visually best partitioning on the example application +below. 
+ +================================ ================================ ================================ + ``assign_labels="kmeans"`` ``assign_labels="discretize"`` ``assign_labels="cluster_qr"`` +================================ ================================ ================================ +|coin_kmeans| |coin_discretize| |coin_cluster_qr| +================================ ================================ ================================ + +.. dropdown:: References + + * `"Multiclass spectral clustering" + `_ + Stella X. Yu, Jianbo Shi, 2003 -===================================== ===================================== - ``assign_labels="kmeans"`` ``assign_labels="discretize"`` -===================================== ===================================== -|coin_kmeans| |coin_discretize| -===================================== ===================================== + * :doi:`"Simple, direct, and efficient multi-way spectral clustering"<10.1093/imaiai/iay008>` + Anil Damle, Victor Minden, Lexing Ying, 2019 + + +.. _spectral_clustering_graph: Spectral Clustering Graphs -------------------------- @@ -526,28 +595,25 @@ graph, and SpectralClustering is initialized with `affinity='precomputed'`:: ... assign_labels='discretize') >>> sc.fit_predict(adjacency_matrix) # doctest: +SKIP -.. topic:: References: +.. dropdown:: References - * `"A Tutorial on Spectral Clustering" - `_ - Ulrike von Luxburg, 2007 + * :doi:`"A Tutorial on Spectral Clustering" <10.1007/s11222-007-9033-z>` Ulrike + von Luxburg, 2007 - * `"Normalized cuts and image segmentation" - `_ - Jianbo Shi, Jitendra Malik, 2000 + * :doi:`"Normalized cuts and image segmentation" <10.1109/34.868688>` Jianbo + Shi, Jitendra Malik, 2000 - * `"A Random Walks View of Spectral Segmentation" - `_ - Marina Meila, Jianbo Shi, 2001 + * `"A Random Walks View of Spectral Segmentation" + `_ + Marina Meila, Jianbo Shi, 2001 - * `"On Spectral Clustering: Analysis and an algorithm" - `_ - Andrew Y. Ng, Michael I. Jordan, Yair Weiss, 2001 + * `"On Spectral Clustering: Analysis and an algorithm" + `_ + Andrew Y. Ng, Michael I. Jordan, Yair Weiss, 2001 + + * :arxiv:`"Preconditioned Spectral Clustering for Stochastic Block Partition + Streaming Graph Challenge" <1708.07481>` David Zhuzhunashvili, Andrew Knyazev - * `"Preconditioned Spectral Clustering for Stochastic - Block Partition Streaming Graph Challenge" - `_ - David Zhuzhunashvili, Andrew Knyazev .. _hierarchical_clustering: @@ -608,10 +674,14 @@ while not robust to noisy data, can be computed very efficiently and can therefore be useful to provide hierarchical clustering of larger datasets. Single linkage can also perform well on non-globular data. -.. topic:: Examples: +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_cluster_plot_digits_linkage.py`: exploration of the + different linkage strategies in a real dataset. + + * :ref:`sphx_glr_auto_examples_cluster_plot_linkage_comparison.py`: exploration of + the different linkage strategies in toy datasets. - * :ref:`sphx_glr_auto_examples_cluster_plot_digits_linkage.py`: exploration of the - different linkage strategies in a real dataset. Visualization of cluster hierarchy ---------------------------------- @@ -624,6 +694,9 @@ of the data, though more so in the case of small sample sizes. :target: ../auto_examples/cluster/plot_agglomerative_dendrogram.html :scale: 42 +.. 
rubric:: Examples + +* :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_dendrogram.py` Adding connectivity constraints @@ -665,21 +738,6 @@ using :func:`sklearn.feature_extraction.image.grid_to_graph` to enable only merging of neighboring pixels on an image, as in the :ref:`coin ` example. -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_cluster_plot_coin_ward_segmentation.py`: Ward clustering - to split the image of coins in regions. - - * :ref:`sphx_glr_auto_examples_cluster_plot_ward_structured_vs_unstructured.py`: Example of - Ward algorithm on a swiss-roll, comparison of structured approaches - versus unstructured approaches. - - * :ref:`sphx_glr_auto_examples_cluster_plot_feature_agglomeration_vs_univariate_selection.py`: - Example of dimensionality reduction with feature agglomeration based on - Ward hierarchical clustering. - - * :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_clustering.py` - .. warning:: **Connectivity constraints with single, average and complete linkage** Connectivity constraints and single, complete or average linkage can enhance @@ -707,6 +765,21 @@ enable only merging of neighboring pixels on an image, as in the :target: ../auto_examples/cluster/plot_agglomerative_clustering.html :scale: 38 +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_cluster_plot_coin_ward_segmentation.py`: Ward + clustering to split the image of coins in regions. + +* :ref:`sphx_glr_auto_examples_cluster_plot_ward_structured_vs_unstructured.py`: Example + of Ward algorithm on a swiss-roll, comparison of structured approaches + versus unstructured approaches. + +* :ref:`sphx_glr_auto_examples_cluster_plot_feature_agglomeration_vs_univariate_selection.py`: Example + of dimensionality reduction with feature agglomeration based on Ward + hierarchical clustering. + +* :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_clustering.py` + Varying the metric ------------------- @@ -739,9 +812,68 @@ each class. :target: ../auto_examples/cluster/plot_agglomerative_clustering_metrics.html :scale: 32 -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_clustering_metrics.py` +* :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_clustering_metrics.py` + + +Bisecting K-Means +----------------- + +.. _bisect_k_means: + +The :class:`BisectingKMeans` is an iterative variant of :class:`KMeans`, using +divisive hierarchical clustering. Instead of creating all centroids at once, centroids +are picked progressively based on a previous clustering: a cluster is split into two +new clusters repeatedly until the target number of clusters is reached. + +:class:`BisectingKMeans` is more efficient than :class:`KMeans` when the number of +clusters is large since it only works on a subset of the data at each bisection +while :class:`KMeans` always works on the entire dataset. + +Although :class:`BisectingKMeans` can't benefit from the advantages of the `"k-means++"` +initialization by design, it will still produce comparable results than +`KMeans(init="k-means++")` in terms of inertia at cheaper computational costs, and will +likely produce better results than `KMeans` with a random initialization. + +This variant is more efficient to agglomerative clustering if the number of clusters is +small compared to the number of data points. + +This variant also does not produce empty clusters. 
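As a quick, illustrative sketch of the API (synthetic data and arbitrary parameter
values only), :class:`BisectingKMeans` is used like :class:`KMeans`; the
``bisecting_strategy`` value refers to one of the two selection strategies described
just below::

    >>> from sklearn.cluster import BisectingKMeans
    >>> from sklearn.datasets import make_blobs
    >>> X, _ = make_blobs(n_samples=1_000, centers=8, random_state=0)
    >>> # Split the cluster with the most points at each bisection
    >>> # (see the two selection strategies described below).
    >>> bisect_means = BisectingKMeans(
    ...     n_clusters=8, bisecting_strategy="largest_cluster", random_state=0
    ... ).fit(X)
    >>> bisect_means.labels_.shape
    (1000,)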
+ +There exist two strategies for selecting the cluster to split: + - ``bisecting_strategy="largest_cluster"`` selects the cluster having the most points + - ``bisecting_strategy="biggest_inertia"`` selects the cluster with biggest inertia + (cluster with biggest Sum of Squared Errors within) + +Picking by largest amount of data points in most cases produces result as +accurate as picking by inertia and is faster (especially for larger amount of data +points, where calculating error may be costly). + +Picking by largest amount of data points will also likely produce clusters of similar +sizes while `KMeans` is known to produce clusters of different sizes. + +Difference between Bisecting K-Means and regular K-Means can be seen on example +:ref:`sphx_glr_auto_examples_cluster_plot_bisect_kmeans.py`. +While the regular K-Means algorithm tends to create non-related clusters, +clusters from Bisecting K-Means are well ordered and create quite a visible hierarchy. + +.. dropdown:: References + + * `"A Comparison of Document Clustering Techniques" + `_ Michael + Steinbach, George Karypis and Vipin Kumar, Department of Computer Science and + Egineering, University of Minnesota (June 2000) + * `"Performance Analysis of K-Means and Bisecting K-Means Algorithms in Weblog + Data" + `_ + K.Abirami and Dr.P.Mayilvahanan, International Journal of Emerging + Technologies in Engineering Research (IJETER) Volume 4, Issue 8, (August 2016) + * `"Bisecting K-means Algorithm Based on K-valued Self-determining and + Clustering Center Optimization" + `_ Jian Di, Xinyue Gou School + of Control and Computer Engineering,North China Electric Power University, + Baoding, Hebei, China (August 2017) .. _dbscan: @@ -795,71 +927,179 @@ indicating core samples found by the algorithm. Smaller circles are non-core samples that are still part of a cluster. Moreover, the outliers are indicated by black points below. -.. |dbscan_results| image:: ../auto_examples/cluster/images/sphx_glr_plot_dbscan_001.png - :target: ../auto_examples/cluster/plot_dbscan.html - :scale: 50 +.. |dbscan_results| image:: ../auto_examples/cluster/images/sphx_glr_plot_dbscan_002.png + :target: ../auto_examples/cluster/plot_dbscan.html + :scale: 50 .. centered:: |dbscan_results| -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_cluster_plot_dbscan.py` - -.. topic:: Implementation - - The DBSCAN algorithm is deterministic, always generating the same clusters - when given the same data in the same order. However, the results can differ when - data is provided in a different order. First, even though the core samples - will always be assigned to the same clusters, the labels of those clusters - will depend on the order in which those samples are encountered in the data. - Second and more importantly, the clusters to which non-core samples are assigned - can differ depending on the data order. This would happen when a non-core sample - has a distance lower than ``eps`` to two core samples in different clusters. By the - triangular inequality, those two core samples must be more distant than - ``eps`` from each other, or they would be in the same cluster. The non-core - sample is assigned to whichever cluster is generated first in a pass - through the data, and so the results will depend on the data ordering. - - The current implementation uses ball trees and kd-trees - to determine the neighborhood of points, - which avoids calculating the full distance matrix - (as was done in scikit-learn versions before 0.14). 
- The possibility to use custom metrics is retained; - for details, see :class:`NearestNeighbors`. - -.. topic:: Memory consumption for large sample sizes - - This implementation is by default not memory efficient because it constructs - a full pairwise similarity matrix in the case where kd-trees or ball-trees cannot - be used (e.g., with sparse matrices). This matrix will consume :math:`n^2` floats. - A couple of mechanisms for getting around this are: - - - Use :ref:`OPTICS ` clustering in conjunction with the - `extract_dbscan` method. OPTICS clustering also calculates the full - pairwise matrix, but only keeps one row in memory at a time (memory - complexity n). - - - A sparse radius neighborhood graph (where missing entries are presumed to - be out of eps) can be precomputed in a memory-efficient way and dbscan - can be run over this with ``metric='precomputed'``. See - :meth:`sklearn.neighbors.NearestNeighbors.radius_neighbors_graph`. - - - The dataset can be compressed, either by removing exact duplicates if - these occur in your data, or by using BIRCH. Then you only have a - relatively small number of representatives for a large number of points. - You can then provide a ``sample_weight`` when fitting DBSCAN. - -.. topic:: References: - - * "A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases - with Noise" - Ester, M., H. P. Kriegel, J. Sander, and X. Xu, - In Proceedings of the 2nd International Conference on Knowledge Discovery - and Data Mining, Portland, OR, AAAI Press, pp. 226–231. 1996 - - * "DBSCAN revisited, revisited: why and how you should (still) use DBSCAN. - Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017). - In ACM Transactions on Database Systems (TODS), 42(3), 19. +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_cluster_plot_dbscan.py` + +.. dropdown:: Implementation + + The DBSCAN algorithm is deterministic, always generating the same clusters when + given the same data in the same order. However, the results can differ when + data is provided in a different order. First, even though the core samples will + always be assigned to the same clusters, the labels of those clusters will + depend on the order in which those samples are encountered in the data. Second + and more importantly, the clusters to which non-core samples are assigned can + differ depending on the data order. This would happen when a non-core sample + has a distance lower than ``eps`` to two core samples in different clusters. By + the triangular inequality, those two core samples must be more distant than + ``eps`` from each other, or they would be in the same cluster. The non-core + sample is assigned to whichever cluster is generated first in a pass through the + data, and so the results will depend on the data ordering. + + The current implementation uses ball trees and kd-trees to determine the + neighborhood of points, which avoids calculating the full distance matrix (as + was done in scikit-learn versions before 0.14). The possibility to use custom + metrics is retained; for details, see :class:`NearestNeighbors`. + +.. dropdown:: Memory consumption for large sample sizes + + This implementation is by default not memory efficient because it constructs a + full pairwise similarity matrix in the case where kd-trees or ball-trees cannot + be used (e.g., with sparse matrices). This matrix will consume :math:`n^2` + floats. 
A couple of mechanisms for getting around this are: + + - Use :ref:`OPTICS ` clustering in conjunction with the `extract_dbscan` + method. OPTICS clustering also calculates the full pairwise matrix, but only + keeps one row in memory at a time (memory complexity n). + + - A sparse radius neighborhood graph (where missing entries are presumed to be + out of eps) can be precomputed in a memory-efficient way and dbscan can be run + over this with ``metric='precomputed'``. See + :meth:`sklearn.neighbors.NearestNeighbors.radius_neighbors_graph`. + + - The dataset can be compressed, either by removing exact duplicates if these + occur in your data, or by using BIRCH. Then you only have a relatively small + number of representatives for a large number of points. You can then provide a + ``sample_weight`` when fitting DBSCAN. + +.. dropdown:: References + +* `A Density-Based Algorithm for Discovering Clusters in Large Spatial + Databases with Noise `_ + Ester, M., H. P. Kriegel, J. Sander, and X. Xu, In Proceedings of the 2nd + International Conference on Knowledge Discovery and Data Mining, Portland, OR, + AAAI Press, pp. 226-231. 1996 + +* :doi:`DBSCAN revisited, revisited: why and how you should (still) use DBSCAN. + <10.1145/3068335>` Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, + X. (2017). In ACM Transactions on Database Systems (TODS), 42(3), 19. + + +.. _hdbscan: + +HDBSCAN +======= + +The :class:`HDBSCAN` algorithm can be seen as an extension of :class:`DBSCAN` +and :class:`OPTICS`. Specifically, :class:`DBSCAN` assumes that the clustering +criterion (i.e. density requirement) is *globally homogeneous*. +In other words, :class:`DBSCAN` may struggle to successfully capture clusters +with different densities. +:class:`HDBSCAN` alleviates this assumption and explores all possible density +scales by building an alternative representation of the clustering problem. + +.. note:: + + This implementation is adapted from the original implementation of HDBSCAN, + `scikit-learn-contrib/hdbscan `_ based on [LJ2017]_. + +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_cluster_plot_hdbscan.py` + +Mutual Reachability Graph +------------------------- + +HDBSCAN first defines :math:`d_c(x_p)`, the *core distance* of a sample :math:`x_p`, as the +distance to its `min_samples` th-nearest neighbor, counting itself. For example, +if `min_samples=5` and :math:`x_*` is the 5th-nearest neighbor of :math:`x_p` +then the core distance is: + +.. math:: d_c(x_p)=d(x_p, x_*). + +Next it defines :math:`d_m(x_p, x_q)`, the *mutual reachability distance* of two points +:math:`x_p, x_q`, as: + +.. math:: d_m(x_p, x_q) = \max\{d_c(x_p), d_c(x_q), d(x_p, x_q)\} + +These two notions allow us to construct the *mutual reachability graph* +:math:`G_{ms}` defined for a fixed choice of `min_samples` by associating each +sample :math:`x_p` with a vertex of the graph, and thus edges between points +:math:`x_p, x_q` are the mutual reachability distance :math:`d_m(x_p, x_q)` +between them. We may build subsets of this graph, denoted as +:math:`G_{ms,\varepsilon}`, by removing any edges with value greater than :math:`\varepsilon`: +from the original graph. Any points whose core distance is less than :math:`\varepsilon`: +are at this staged marked as noise. The remaining points are then clustered by +finding the connected components of this trimmed graph. + +.. 
note:: + + Taking the connected components of a trimmed graph :math:`G_{ms,\varepsilon}` is + equivalent to running DBSCAN* with `min_samples` and :math:`\varepsilon`. DBSCAN* is a + slightly modified version of DBSCAN mentioned in [CM2013]_. + +Hierarchical Clustering +----------------------- +HDBSCAN can be seen as an algorithm which performs DBSCAN* clustering across all +values of :math:`\varepsilon`. As mentioned prior, this is equivalent to finding the connected +components of the mutual reachability graphs for all values of :math:`\varepsilon`. To do this +efficiently, HDBSCAN first extracts a minimum spanning tree (MST) from the fully +-connected mutual reachability graph, then greedily cuts the edges with highest +weight. An outline of the HDBSCAN algorithm is as follows: + +1. Extract the MST of :math:`G_{ms}`. +2. Extend the MST by adding a "self edge" for each vertex, with weight equal + to the core distance of the underlying sample. +3. Initialize a single cluster and label for the MST. +4. Remove the edge with the greatest weight from the MST (ties are + removed simultaneously). +5. Assign cluster labels to the connected components which contain the + end points of the now-removed edge. If the component does not have at least + one edge it is instead assigned a "null" label marking it as noise. +6. Repeat 4-5 until there are no more connected components. + +HDBSCAN is therefore able to obtain all possible partitions achievable by +DBSCAN* for a fixed choice of `min_samples` in a hierarchical fashion. +Indeed, this allows HDBSCAN to perform clustering across multiple densities +and as such it no longer needs :math:`\varepsilon` to be given as a hyperparameter. Instead +it relies solely on the choice of `min_samples`, which tends to be a more robust +hyperparameter. + +.. |hdbscan_ground_truth| image:: ../auto_examples/cluster/images/sphx_glr_plot_hdbscan_005.png + :target: ../auto_examples/cluster/plot_hdbscan.html + :scale: 75 +.. |hdbscan_results| image:: ../auto_examples/cluster/images/sphx_glr_plot_hdbscan_007.png + :target: ../auto_examples/cluster/plot_hdbscan.html + :scale: 75 + +.. centered:: |hdbscan_ground_truth| +.. centered:: |hdbscan_results| + +HDBSCAN can be smoothed with an additional hyperparameter `min_cluster_size` +which specifies that during the hierarchical clustering, components with fewer +than `minimum_cluster_size` many samples are considered noise. In practice, one +can set `minimum_cluster_size = min_samples` to couple the parameters and +simplify the hyperparameter space. + +.. rubric:: References + +.. [CM2013] Campello, R.J.G.B., Moulavi, D., Sander, J. (2013). Density-Based + Clustering Based on Hierarchical Density Estimates. In: Pei, J., Tseng, V.S., + Cao, L., Motoda, H., Xu, G. (eds) Advances in Knowledge Discovery and Data + Mining. PAKDD 2013. Lecture Notes in Computer Science(), vol 7819. Springer, + Berlin, Heidelberg. :doi:`Density-Based Clustering Based on Hierarchical + Density Estimates <10.1007/978-3-642-37456-2_14>` + +.. [LJ2017] L. McInnes and J. Healy, (2017). Accelerated Hierarchical Density + Based Clustering. In: IEEE International Conference on Data Mining Workshops + (ICDMW), 2017, pp. 33-42. :doi:`Accelerated Hierarchical Density Based + Clustering <10.1109/ICDMW.2017.12>` .. _optics: @@ -905,47 +1145,48 @@ the linear segment clusters of the reachability plot. Note that the blue and red clusters are adjacent in the reachability plot, and can be hierarchically represented as children of a larger parent cluster. -.. 
topic:: Examples: +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_cluster_plot_optics.py` - * :ref:`sphx_glr_auto_examples_cluster_plot_optics.py` +.. dropdown:: Comparison with DBSCAN -.. topic:: Comparison with DBSCAN + The results from OPTICS ``cluster_optics_dbscan`` method and DBSCAN are very + similar, but not always identical; specifically, labeling of periphery and noise + points. This is in part because the first samples of each dense area processed + by OPTICS have a large reachability value while being close to other points in + their area, and will thus sometimes be marked as noise rather than periphery. + This affects adjacent points when they are considered as candidates for being + marked as either periphery or noise. - The results from OPTICS ``cluster_optics_dbscan`` method and DBSCAN are - very similar, but not always identical; specifically, labeling of periphery - and noise points. This is in part because the first samples of each dense - area processed by OPTICS have a large reachability value while being close - to other points in their area, and will thus sometimes be marked as noise - rather than periphery. This affects adjacent points when they are - considered as candidates for being marked as either periphery or noise. + Note that for any single value of ``eps``, DBSCAN will tend to have a shorter + run time than OPTICS; however, for repeated runs at varying ``eps`` values, a + single run of OPTICS may require less cumulative runtime than DBSCAN. It is also + important to note that OPTICS' output is close to DBSCAN's only if ``eps`` and + ``max_eps`` are close. - Note that for any single value of ``eps``, DBSCAN will tend to have a - shorter run time than OPTICS; however, for repeated runs at varying ``eps`` - values, a single run of OPTICS may require less cumulative runtime than - DBSCAN. It is also important to note that OPTICS' output is close to - DBSCAN's only if ``eps`` and ``max_eps`` are close. +.. dropdown:: Computational Complexity -.. topic:: Computational Complexity + Spatial indexing trees are used to avoid calculating the full distance matrix, + and allow for efficient memory usage on large sets of samples. Different + distance metrics can be supplied via the ``metric`` keyword. - Spatial indexing trees are used to avoid calculating the full distance - matrix, and allow for efficient memory usage on large sets of samples. - Different distance metrics can be supplied via the ``metric`` keyword. + For large datasets, similar (but not identical) results can be obtained via + :class:`HDBSCAN`. The HDBSCAN implementation is multithreaded, and has better + algorithmic runtime complexity than OPTICS, at the cost of worse memory scaling. + For extremely large datasets that exhaust system memory using HDBSCAN, OPTICS + will maintain :math:`n` (as opposed to :math:`n^2`) memory scaling; however, + tuning of the ``max_eps`` parameter will likely need to be used to give a + solution in a reasonable amount of wall time. - For large datasets, similar (but not identical) results can be obtained via - `HDBSCAN `_. The HDBSCAN implementation is - multithreaded, and has better algorithmic runtime complexity than OPTICS, - at the cost of worse memory scaling. For extremely large datasets that - exhaust system memory using HDBSCAN, OPTICS will maintain :math:`n` (as opposed - to :math:`n^2`) memory scaling; however, tuning of the ``max_eps`` parameter - will likely need to be used to give a solution in a reasonable amount of - wall time. -.. 
topic:: References: +.. dropdown:: References + + * "OPTICS: ordering points to identify the clustering structure." Ankerst, + Mihael, Markus M. Breunig, Hans-Peter Kriegel, and JÃļrg Sander. In ACM Sigmod + Record, vol. 28, no. 2, pp. 49-60. ACM, 1999. - * "OPTICS: ordering points to identify the clustering structure." - Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel, and JÃļrg Sander. - In ACM Sigmod Record, vol. 28, no. 2, pp. 49-60. ACM, 1999. .. _birch: @@ -981,60 +1222,60 @@ If ``n_clusters`` is set to None, the subclusters from the leaves are directly read off, otherwise a global clustering step labels these subclusters into global clusters (labels) and the samples are mapped to the global label of the nearest subcluster. -**Algorithm description:** - -- A new sample is inserted into the root of the CF Tree which is a CF Node. - It is then merged with the subcluster of the root, that has the smallest - radius after merging, constrained by the threshold and branching factor conditions. - If the subcluster has any child node, then this is done repeatedly till it reaches - a leaf. After finding the nearest subcluster in the leaf, the properties of this - subcluster and the parent subclusters are recursively updated. - -- If the radius of the subcluster obtained by merging the new sample and the - nearest subcluster is greater than the square of the threshold and if the - number of subclusters is greater than the branching factor, then a space is temporarily - allocated to this new sample. The two farthest subclusters are taken and - the subclusters are divided into two groups on the basis of the distance - between these subclusters. - -- If this split node has a parent subcluster and there is room - for a new subcluster, then the parent is split into two. If there is no room, - then this node is again split into two and the process is continued - recursively, till it reaches the root. - -**BIRCH or MiniBatchKMeans?** - - - BIRCH does not scale very well to high dimensional data. As a rule of thumb if - ``n_features`` is greater than twenty, it is generally better to use MiniBatchKMeans. - - If the number of instances of data needs to be reduced, or if one wants a - large number of subclusters either as a preprocessing step or otherwise, - BIRCH is more useful than MiniBatchKMeans. - +.. dropdown:: Algorithm description + + - A new sample is inserted into the root of the CF Tree which is a CF Node. It + is then merged with the subcluster of the root, that has the smallest radius + after merging, constrained by the threshold and branching factor conditions. + If the subcluster has any child node, then this is done repeatedly till it + reaches a leaf. After finding the nearest subcluster in the leaf, the + properties of this subcluster and the parent subclusters are recursively + updated. + + - If the radius of the subcluster obtained by merging the new sample and the + nearest subcluster is greater than the square of the threshold and if the + number of subclusters is greater than the branching factor, then a space is + temporarily allocated to this new sample. The two farthest subclusters are + taken and the subclusters are divided into two groups on the basis of the + distance between these subclusters. + + - If this split node has a parent subcluster and there is room for a new + subcluster, then the parent is split into two. If there is no room, then this + node is again split into two and the process is continued recursively, till it + reaches the root. + +.. 
dropdown:: BIRCH or MiniBatchKMeans? + + - BIRCH does not scale very well to high dimensional data. As a rule of thumb if + ``n_features`` is greater than twenty, it is generally better to use MiniBatchKMeans. + - If the number of instances of data needs to be reduced, or if one wants a + large number of subclusters either as a preprocessing step or otherwise, + BIRCH is more useful than MiniBatchKMeans. + + .. image:: ../auto_examples/cluster/images/sphx_glr_plot_birch_vs_minibatchkmeans_001.png + :target: ../auto_examples/cluster/plot_birch_vs_minibatchkmeans.html -**How to use partial_fit?** +.. dropdown:: How to use partial_fit? -To avoid the computation of global clustering, for every call of ``partial_fit`` -the user is advised + To avoid the computation of global clustering, for every call of ``partial_fit`` + the user is advised: - 1. To set ``n_clusters=None`` initially - 2. Train all data by multiple calls to partial_fit. - 3. Set ``n_clusters`` to a required value using - ``brc.set_params(n_clusters=n_clusters)``. - 4. Call ``partial_fit`` finally with no arguments, i.e. ``brc.partial_fit()`` - which performs the global clustering. + 1. To set ``n_clusters=None`` initially. + 2. Train all data by multiple calls to partial_fit. + 3. Set ``n_clusters`` to a required value using + ``brc.set_params(n_clusters=n_clusters)``. + 4. Call ``partial_fit`` finally with no arguments, i.e. ``brc.partial_fit()`` + which performs the global clustering. -.. image:: ../auto_examples/cluster/images/sphx_glr_plot_birch_vs_minibatchkmeans_001.png - :target: ../auto_examples/cluster/plot_birch_vs_minibatchkmeans.html +.. dropdown:: References -.. topic:: References: + * Tian Zhang, Raghu Ramakrishnan, Maron Livny BIRCH: An efficient data + clustering method for large databases. + https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf - * Tian Zhang, Raghu Ramakrishnan, Maron Livny - BIRCH: An efficient data clustering method for large databases. - https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf + * Roberto Perdisci JBirch - Java implementation of BIRCH clustering algorithm + https://code.google.com/archive/p/jbirch - * Roberto Perdisci - JBirch - Java implementation of BIRCH clustering algorithm - https://code.google.com/archive/p/jbirch .. _clustering_evaluation: @@ -1069,32 +1310,32 @@ ignoring permutations:: >>> labels_true = [0, 0, 0, 1, 1, 1] >>> labels_pred = [0, 0, 1, 1, 2, 2] >>> metrics.rand_score(labels_true, labels_pred) - 0.66... + 0.66 The Rand index does not ensure to obtain a value close to 0.0 for a random labelling. The adjusted Rand index **corrects for chance** and will give such a baseline. >>> metrics.adjusted_rand_score(labels_true, labels_pred) - 0.24... + 0.24 As with all clustering metrics, one can permute 0 and 1 in the predicted labels, rename 2 to 3, and get the same score:: >>> labels_pred = [1, 1, 0, 0, 3, 3] >>> metrics.rand_score(labels_true, labels_pred) - 0.66... + 0.66 >>> metrics.adjusted_rand_score(labels_true, labels_pred) - 0.24... + 0.24 -Furthermore, both :func:`rand_score` :func:`adjusted_rand_score` are +Furthermore, both :func:`rand_score` and :func:`adjusted_rand_score` are **symmetric**: swapping the argument does not change the scores. They can thus be used as **consensus measures**:: >>> metrics.rand_score(labels_pred, labels_true) - 0.66... + 0.66 >>> metrics.adjusted_rand_score(labels_pred, labels_true) - 0.24... 
+ 0.24 Perfect labeling is scored 1.0:: @@ -1107,114 +1348,104 @@ Perfect labeling is scored 1.0:: Poorly agreeing labels (e.g. independent labelings) have lower scores, and for the adjusted Rand index the score will be negative or close to zero. However, for the unadjusted Rand index the score, while lower, -will not necessarily be close to zero.:: +will not necessarily be close to zero:: >>> labels_true = [0, 0, 0, 0, 0, 0, 1, 1] >>> labels_pred = [0, 1, 2, 3, 4, 5, 5, 6] >>> metrics.rand_score(labels_true, labels_pred) - 0.39... + 0.39 >>> metrics.adjusted_rand_score(labels_true, labels_pred) - -0.07... - + -0.072 -Advantages -~~~~~~~~~~ -- **Interpretability**: The unadjusted Rand index is proportional - to the number of sample pairs whose labels are the same in both - `labels_pred` and `labels_true`, or are different in both. +.. topic:: Advantages: -- **Random (uniform) label assignments have an adjusted Rand index - score close to 0.0** for any value of ``n_clusters`` and - ``n_samples`` (which is not the case for the unadjusted Rand index - or the V-measure for instance). + - **Interpretability**: The unadjusted Rand index is proportional to the + number of sample pairs whose labels are the same in both `labels_pred` and + `labels_true`, or are different in both. -- **Bounded range**: Lower values indicate different labelings, - similar clusterings have a high (adjusted or unadjusted) Rand index, - 1.0 is the perfect match score. The score range is [0, 1] for the - unadjusted Rand index and [-1, 1] for the adjusted Rand index. + - **Random (uniform) label assignments have an adjusted Rand index score close + to 0.0** for any value of ``n_clusters`` and ``n_samples`` (which is not the + case for the unadjusted Rand index or the V-measure for instance). -- **No assumption is made on the cluster structure**: The (adjusted or - unadjusted) Rand index can be used to compare all kinds of - clustering algorithms, and can be used to compare clustering - algorithms such as k-means which assumes isotropic blob shapes with - results of spectral clustering algorithms which can find cluster - with "folded" shapes. + - **Bounded range**: Lower values indicate different labelings, similar + clusterings have a high (adjusted or unadjusted) Rand index, 1.0 is the + perfect match score. The score range is [0, 1] for the unadjusted Rand index + and [-0.5, 1] for the adjusted Rand index. + - **No assumption is made on the cluster structure**: The (adjusted or + unadjusted) Rand index can be used to compare all kinds of clustering + algorithms, and can be used to compare clustering algorithms such as k-means + which assumes isotropic blob shapes with results of spectral clustering + algorithms which can find cluster with "folded" shapes. -Drawbacks -~~~~~~~~~ +.. topic:: Drawbacks: -- Contrary to inertia, the **(adjusted or unadjusted) Rand index - requires knowledge of the ground truth classes** which is almost - never available in practice or requires manual assignment by human - annotators (as in the supervised learning setting). + - Contrary to inertia, the **(adjusted or unadjusted) Rand index requires + knowledge of the ground truth classes** which is almost never available in + practice or requires manual assignment by human annotators (as in the + supervised learning setting). - However (adjusted or unadjusted) Rand index can also be useful in a - purely unsupervised setting as a building block for a Consensus - Index that can be used for clustering model selection (TODO). 
+ However (adjusted or unadjusted) Rand index can also be useful in a purely + unsupervised setting as a building block for a Consensus Index that can be + used for clustering model selection (TODO). -- The **unadjusted Rand index is often close to 1.0** even if the - clusterings themselves differ significantly. This can be understood - when interpreting the Rand index as the accuracy of element pair - labeling resulting from the clusterings: In practice there often is - a majority of element pairs that are assigned the ``different`` pair - label under both the predicted and the ground truth clustering - resulting in a high proportion of pair labels that agree, which - leads subsequently to a high score. + - The **unadjusted Rand index is often close to 1.0** even if the clusterings + themselves differ significantly. This can be understood when interpreting + the Rand index as the accuracy of element pair labeling resulting from the + clusterings: In practice there often is a majority of element pairs that are + assigned the ``different`` pair label under both the predicted and the + ground truth clustering resulting in a high proportion of pair labels that + agree, which leads subsequently to a high score. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_cluster_plot_adjusted_for_chance_measures.py`: - Analysis of the impact of the dataset size on the value of - clustering measures for random assignments. +* :ref:`sphx_glr_auto_examples_cluster_plot_adjusted_for_chance_measures.py`: + Analysis of the impact of the dataset size on the value of + clustering measures for random assignments. +.. dropdown:: Mathematical formulation -Mathematical formulation -~~~~~~~~~~~~~~~~~~~~~~~~ + If C is a ground truth class assignment and K the clustering, let us define + :math:`a` and :math:`b` as: -If C is a ground truth class assignment and K the clustering, let us -define :math:`a` and :math:`b` as: + - :math:`a`, the number of pairs of elements that are in the same set in C and + in the same set in K -- :math:`a`, the number of pairs of elements that are in the same set - in C and in the same set in K + - :math:`b`, the number of pairs of elements that are in different sets in C and + in different sets in K -- :math:`b`, the number of pairs of elements that are in different sets - in C and in different sets in K + The unadjusted Rand index is then given by: -The unadjusted Rand index is then given by: + .. math:: \text{RI} = \frac{a + b}{C_2^{n_{samples}}} -.. math:: \text{RI} = \frac{a + b}{C_2^{n_{samples}}} + where :math:`C_2^{n_{samples}}` is the total number of possible pairs in the + dataset. It does not matter if the calculation is performed on ordered pairs or + unordered pairs as long as the calculation is performed consistently. -where :math:`C_2^{n_{samples}}` is the total number of possible pairs -in the dataset. It does not matter if the calculation is performed on -ordered pairs or unordered pairs as long as the calculation is -performed consistently. + However, the Rand index does not guarantee that random label assignments will + get a value close to zero (esp. if the number of clusters is in the same order + of magnitude as the number of samples). -However, the Rand index does not guarantee that random label assignments -will get a value close to zero (esp. if the number of clusters is in -the same order of magnitude as the number of samples). 
+ To counter this effect we can discount the expected RI :math:`E[\text{RI}]` of + random labelings by defining the adjusted Rand index as follows: -To counter this effect we can discount the expected RI :math:`E[\text{RI}]` of -random labelings by defining the adjusted Rand index as follows: + .. math:: \text{ARI} = \frac{\text{RI} - E[\text{RI}]}{\max(\text{RI}) - E[\text{RI}]} -.. math:: \text{ARI} = \frac{\text{RI} - E[\text{RI}]}{\max(\text{RI}) - E[\text{RI}]} +.. dropdown:: References -.. topic:: References + * `Comparing Partitions + `_ L. Hubert and P. + Arabie, Journal of Classification 1985 - * `Comparing Partitions - `_ - L. Hubert and P. Arabie, Journal of Classification 1985 + * `Properties of the Hubert-Arabie adjusted Rand index + `_ D. Steinley, Psychological + Methods 2004 - * `Properties of the Hubert-Arabie adjusted Rand index - `_ - D. Steinley, Psychological Methods 2004 + * `Wikipedia entry for the Rand index + `_ - * `Wikipedia entry for the Rand index - `_ - - * `Wikipedia entry for the adjusted Rand index - `_ + * :doi:`Minimum adjusted Rand index for two clusterings of a given size, 2022, J. E. ChacÃŗn and A. I. Rastrojo <10.1007/s11634-022-00491-w>` .. _mutual_info_score: @@ -1235,21 +1466,21 @@ proposed more recently and is **normalized against chance**:: >>> labels_pred = [0, 0, 1, 1, 2, 2] >>> metrics.adjusted_mutual_info_score(labels_true, labels_pred) # doctest: +SKIP - 0.22504... + 0.22504 One can permute 0 and 1 in the predicted labels, rename 2 to 3 and get the same score:: >>> labels_pred = [1, 1, 0, 0, 3, 3] >>> metrics.adjusted_mutual_info_score(labels_true, labels_pred) # doctest: +SKIP - 0.22504... + 0.22504 All, :func:`mutual_info_score`, :func:`adjusted_mutual_info_score` and :func:`normalized_mutual_info_score` are symmetric: swapping the argument does not change the score. Thus they can be used as a **consensus measure**:: >>> metrics.adjusted_mutual_info_score(labels_pred, labels_true) # doctest: +SKIP - 0.22504... + 0.22504 Perfect labeling is scored 1.0:: @@ -1263,144 +1494,134 @@ Perfect labeling is scored 1.0:: This is not true for ``mutual_info_score``, which is therefore harder to judge:: >>> metrics.mutual_info_score(labels_true, labels_pred) # doctest: +SKIP - 0.69... + 0.69 Bad (e.g. independent labelings) have non-positive scores:: >>> labels_true = [0, 1, 2, 0, 3, 4, 5, 1] >>> labels_pred = [1, 1, 0, 0, 2, 2, 2, 2] >>> metrics.adjusted_mutual_info_score(labels_true, labels_pred) # doctest: +SKIP - -0.10526... - + -0.10526 -Advantages -~~~~~~~~~~ -- **Random (uniform) label assignments have a AMI score close to 0.0** - for any value of ``n_clusters`` and ``n_samples`` (which is not the - case for raw Mutual Information or the V-measure for instance). +.. topic:: Advantages: -- **Upper bound of 1**: Values close to zero indicate two label - assignments that are largely independent, while values close to one - indicate significant agreement. Further, an AMI of exactly 1 indicates - that the two label assignments are equal (with or without permutation). + - **Random (uniform) label assignments have a AMI score close to 0.0** for any + value of ``n_clusters`` and ``n_samples`` (which is not the case for raw + Mutual Information or the V-measure for instance). + - **Upper bound of 1**: Values close to zero indicate two label assignments + that are largely independent, while values close to one indicate significant + agreement. Further, an AMI of exactly 1 indicates that the two label + assignments are equal (with or without permutation). 
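A minimal sketch of the chance adjustment at work, assuming two independent,
uniformly random labelings (the seed and label counts below are arbitrary
illustrative choices); the adjusted score is expected to stay close to zero,
while the unadjusted normalized score is biased upwards for such random
labelings::

    >>> import numpy as np
    >>> from sklearn import metrics
    >>> rng = np.random.RandomState(0)
    >>> # two labelings drawn independently, each over 10 "clusters"
    >>> labels_a = rng.randint(0, 10, size=100)
    >>> labels_b = rng.randint(0, 10, size=100)
    >>> ami = metrics.adjusted_mutual_info_score(labels_a, labels_b)
    >>> nmi = metrics.normalized_mutual_info_score(labels_a, labels_b)
    >>> # the adjustment discounts the agreement expected by chance, so the
    >>> # adjusted score ends up below the unadjusted one for these labelings
    >>> bool(ami < nmi)
    True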
-Drawbacks -~~~~~~~~~ +.. topic:: Drawbacks: -- Contrary to inertia, **MI-based measures require the knowledge - of the ground truth classes** while almost never available in practice or - requires manual assignment by human annotators (as in the supervised learning - setting). + - Contrary to inertia, **MI-based measures require the knowledge of the ground + truth classes** while almost never available in practice or requires manual + assignment by human annotators (as in the supervised learning setting). - However MI-based measures can also be useful in purely unsupervised setting as a - building block for a Consensus Index that can be used for clustering - model selection. + However MI-based measures can also be useful in purely unsupervised setting + as a building block for a Consensus Index that can be used for clustering + model selection. -- NMI and MI are not adjusted against chance. + - NMI and MI are not adjusted against chance. +.. rubric:: Examples -.. topic:: Examples: +* :ref:`sphx_glr_auto_examples_cluster_plot_adjusted_for_chance_measures.py`: Analysis + of the impact of the dataset size on the value of clustering measures for random + assignments. This example also includes the Adjusted Rand Index. - * :ref:`sphx_glr_auto_examples_cluster_plot_adjusted_for_chance_measures.py`: Analysis of - the impact of the dataset size on the value of clustering measures - for random assignments. This example also includes the Adjusted Rand - Index. +.. dropdown:: Mathematical formulation + Assume two label assignments (of the same N objects), :math:`U` and :math:`V`. + Their entropy is the amount of uncertainty for a partition set, defined by: -Mathematical formulation -~~~~~~~~~~~~~~~~~~~~~~~~ + .. math:: H(U) = - \sum_{i=1}^{|U|}P(i)\log(P(i)) -Assume two label assignments (of the same N objects), :math:`U` and :math:`V`. -Their entropy is the amount of uncertainty for a partition set, defined by: + where :math:`P(i) = |U_i| / N` is the probability that an object picked at + random from :math:`U` falls into class :math:`U_i`. Likewise for :math:`V`: -.. math:: H(U) = - \sum_{i=1}^{|U|}P(i)\log(P(i)) + .. math:: H(V) = - \sum_{j=1}^{|V|}P'(j)\log(P'(j)) -where :math:`P(i) = |U_i| / N` is the probability that an object picked at -random from :math:`U` falls into class :math:`U_i`. Likewise for :math:`V`: + With :math:`P'(j) = |V_j| / N`. The mutual information (MI) between :math:`U` + and :math:`V` is calculated by: -.. math:: H(V) = - \sum_{j=1}^{|V|}P'(j)\log(P'(j)) + .. math:: \text{MI}(U, V) = \sum_{i=1}^{|U|}\sum_{j=1}^{|V|}P(i, j)\log\left(\frac{P(i,j)}{P(i)P'(j)}\right) -With :math:`P'(j) = |V_j| / N`. The mutual information (MI) between :math:`U` -and :math:`V` is calculated by: + where :math:`P(i, j) = |U_i \cap V_j| / N` is the probability that an object + picked at random falls into both classes :math:`U_i` and :math:`V_j`. -.. math:: \text{MI}(U, V) = \sum_{i=1}^{|U|}\sum_{j=1}^{|V|}P(i, j)\log\left(\frac{P(i,j)}{P(i)P'(j)}\right) + It also can be expressed in set cardinality formulation: -where :math:`P(i, j) = |U_i \cap V_j| / N` is the probability that an object -picked at random falls into both classes :math:`U_i` and :math:`V_j`. + .. math:: \text{MI}(U, V) = \sum_{i=1}^{|U|} \sum_{j=1}^{|V|} \frac{|U_i \cap V_j|}{N}\log\left(\frac{N|U_i \cap V_j|}{|U_i||V_j|}\right) -It also can be expressed in set cardinality formulation: + The normalized mutual information is defined as -.. 
math:: \text{MI}(U, V) = \sum_{i=1}^{|U|} \sum_{j=1}^{|V|} \frac{|U_i \cap V_j|}{N}\log\left(\frac{N|U_i \cap V_j|}{|U_i||V_j|}\right) + .. math:: \text{NMI}(U, V) = \frac{\text{MI}(U, V)}{\text{mean}(H(U), H(V))} -The normalized mutual information is defined as + This value of the mutual information and also the normalized variant is not + adjusted for chance and will tend to increase as the number of different labels + (clusters) increases, regardless of the actual amount of "mutual information" + between the label assignments. -.. math:: \text{NMI}(U, V) = \frac{\text{MI}(U, V)}{\text{mean}(H(U), H(V))} + The expected value for the mutual information can be calculated using the + following equation [VEB2009]_. In this equation, :math:`a_i = |U_i|` (the number + of elements in :math:`U_i`) and :math:`b_j = |V_j|` (the number of elements in + :math:`V_j`). -This value of the mutual information and also the normalized variant is not -adjusted for chance and will tend to increase as the number of different labels -(clusters) increases, regardless of the actual amount of "mutual information" -between the label assignments. + .. math:: E[\text{MI}(U,V)]=\sum_{i=1}^{|U|} \sum_{j=1}^{|V|} \sum_{n_{ij}=(a_i+b_j-N)^+ + }^{\min(a_i, b_j)} \frac{n_{ij}}{N}\log \left( \frac{ N.n_{ij}}{a_i b_j}\right) + \frac{a_i!b_j!(N-a_i)!(N-b_j)!}{N!n_{ij}!(a_i-n_{ij})!(b_j-n_{ij})! + (N-a_i-b_j+n_{ij})!} -The expected value for the mutual information can be calculated using the -following equation [VEB2009]_. In this equation, -:math:`a_i = |U_i|` (the number of elements in :math:`U_i`) and -:math:`b_j = |V_j|` (the number of elements in :math:`V_j`). + Using the expected value, the adjusted mutual information can then be calculated + using a similar form to that of the adjusted Rand index: + .. math:: \text{AMI} = \frac{\text{MI} - E[\text{MI}]}{\text{mean}(H(U), H(V)) - E[\text{MI}]} -.. math:: E[\text{MI}(U,V)]=\sum_{i=1}^{|U|} \sum_{j=1}^{|V|} \sum_{n_{ij}=(a_i+b_j-N)^+ - }^{\min(a_i, b_j)} \frac{n_{ij}}{N}\log \left( \frac{ N.n_{ij}}{a_i b_j}\right) - \frac{a_i!b_j!(N-a_i)!(N-b_j)!}{N!n_{ij}!(a_i-n_{ij})!(b_j-n_{ij})! - (N-a_i-b_j+n_{ij})!} + For normalized mutual information and adjusted mutual information, the + normalizing value is typically some *generalized* mean of the entropies of each + clustering. Various generalized means exist, and no firm rules exist for + preferring one over the others. The decision is largely a field-by-field basis; + for instance, in community detection, the arithmetic mean is most common. Each + normalizing method provides "qualitatively similar behaviours" [YAT2016]_. In + our implementation, this is controlled by the ``average_method`` parameter. -Using the expected value, the adjusted mutual information can then be -calculated using a similar form to that of the adjusted Rand index: + Vinh et al. (2010) named variants of NMI and AMI by their averaging method + [VEB2010]_. Their 'sqrt' and 'sum' averages are the geometric and arithmetic + means; we use these more broadly common names. -.. math:: \text{AMI} = \frac{\text{MI} - E[\text{MI}]}{\text{mean}(H(U), H(V)) - E[\text{MI}]} + .. rubric:: References -For normalized mutual information and adjusted mutual information, the normalizing -value is typically some *generalized* mean of the entropies of each clustering. -Various generalized means exist, and no firm rules exist for preferring one over the -others. 
The decision is largely a field-by-field basis; for instance, in community -detection, the arithmetic mean is most common. Each -normalizing method provides "qualitatively similar behaviours" [YAT2016]_. In our -implementation, this is controlled by the ``average_method`` parameter. + * Strehl, Alexander, and Joydeep Ghosh (2002). "Cluster ensembles - a + knowledge reuse framework for combining multiple partitions". Journal of + Machine Learning Research 3: 583-617. `doi:10.1162/153244303321897735 + `_. -Vinh et al. (2010) named variants of NMI and AMI by their averaging method [VEB2010]_. Their -'sqrt' and 'sum' averages are the geometric and arithmetic means; we use these -more broadly common names. + * `Wikipedia entry for the (normalized) Mutual Information + `_ -.. topic:: References + * `Wikipedia entry for the Adjusted Mutual Information + `_ - * Strehl, Alexander, and Joydeep Ghosh (2002). "Cluster ensembles – a - knowledge reuse framework for combining multiple partitions". Journal of - Machine Learning Research 3: 583–617. - `doi:10.1162/153244303321897735 `_. + .. [VEB2009] Vinh, Epps, and Bailey, (2009). "Information theoretic measures + for clusterings comparison". Proceedings of the 26th Annual International + Conference on Machine Learning - ICML '09. `doi:10.1145/1553374.1553511 + `_. ISBN + 9781605585161. - * `Wikipedia entry for the (normalized) Mutual Information - `_ + .. [VEB2010] Vinh, Epps, and Bailey, (2010). "Information Theoretic Measures + for Clusterings Comparison: Variants, Properties, Normalization and + Correction for Chance". JMLR + - * `Wikipedia entry for the Adjusted Mutual Information - `_ - - .. [VEB2009] Vinh, Epps, and Bailey, (2009). "Information theoretic measures - for clusterings comparison". Proceedings of the 26th Annual International - Conference on Machine Learning - ICML '09. - `doi:10.1145/1553374.1553511 `_. - ISBN 9781605585161. + .. [YAT2016] Yang, Algesheimer, and Tessone, (2016). "A comparative analysis + of community detection algorithms on artificial networks". Scientific + Reports 6: 30750. `doi:10.1038/srep30750 + `_. - .. [VEB2010] Vinh, Epps, and Bailey, (2010). "Information Theoretic Measures for - Clusterings Comparison: Variants, Properties, Normalization and - Correction for Chance". JMLR - - - .. [YAT2016] Yang, Algesheimer, and Tessone, (2016). "A comparative analysis of - community - detection algorithms on artificial networks". Scientific Reports 6: 30750. - `doi:10.1038/srep30750 `_. - - .. _homogeneity_completeness: @@ -1428,16 +1649,16 @@ We can turn those concept as scores :func:`homogeneity_score` and >>> labels_pred = [0, 0, 1, 1, 2, 2] >>> metrics.homogeneity_score(labels_true, labels_pred) - 0.66... + 0.66 >>> metrics.completeness_score(labels_true, labels_pred) - 0.42... + 0.42 Their harmonic mean called **V-measure** is computed by :func:`v_measure_score`:: >>> metrics.v_measure_score(labels_true, labels_pred) - 0.51... + 0.516 This function's formula is as follows: @@ -1446,12 +1667,12 @@ This function's formula is as follows: `beta` defaults to a value of 1.0, but for using a value less than 1 for beta:: >>> metrics.v_measure_score(labels_true, labels_pred, beta=0.6) - 0.54... + 0.547 more weight will be attributed to homogeneity, and using a value greater than 1:: >>> metrics.v_measure_score(labels_true, labels_pred, beta=1.8) - 0.48... + 0.48 more weight will be attributed to completeness. 
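Since ``beta`` defaults to 1.0, the value returned by default coincides with the
harmonic mean of homogeneity and completeness; a small sanity-check sketch,
reusing the ``labels_true`` / ``labels_pred`` pair from the example above::

    >>> h = metrics.homogeneity_score(labels_true, labels_pred)
    >>> c = metrics.completeness_score(labels_true, labels_pred)
    >>> v = metrics.v_measure_score(labels_true, labels_pred)
    >>> # with the default beta, the V-measure is the harmonic mean of h and c
    >>> bool(abs(v - 2 * h * c / (h + c)) < 1e-12)
    True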
@@ -1462,14 +1683,14 @@ Homogeneity, completeness and V-measure can be computed at once using :func:`homogeneity_completeness_v_measure` as follows:: >>> metrics.homogeneity_completeness_v_measure(labels_true, labels_pred) - (0.66..., 0.42..., 0.51...) + (0.67, 0.42, 0.52) The following clustering assignment is slightly better, since it is homogeneous but not complete:: >>> labels_pred = [0, 0, 0, 1, 2, 2] >>> metrics.homogeneity_completeness_v_measure(labels_true, labels_pred) - (1.0, 0.68..., 0.81...) + (1.0, 0.68, 0.81) .. note:: @@ -1482,114 +1703,114 @@ homogeneous but not complete:: homogeneity_score(a, b) == completeness_score(b, a) -Advantages -~~~~~~~~~~ +.. topic:: Advantages: -- **Bounded scores**: 0.0 is as bad as it can be, 1.0 is a perfect score. + - **Bounded scores**: 0.0 is as bad as it can be, 1.0 is a perfect score. -- Intuitive interpretation: clustering with bad V-measure can be - **qualitatively analyzed in terms of homogeneity and completeness** - to better feel what 'kind' of mistakes is done by the assignment. + - Intuitive interpretation: clustering with bad V-measure can be + **qualitatively analyzed in terms of homogeneity and completeness** to + better feel what 'kind' of mistakes is done by the assignment. -- **No assumption is made on the cluster structure**: can be used - to compare clustering algorithms such as k-means which assumes isotropic - blob shapes with results of spectral clustering algorithms which can - find cluster with "folded" shapes. + - **No assumption is made on the cluster structure**: can be used to compare + clustering algorithms such as k-means which assumes isotropic blob shapes + with results of spectral clustering algorithms which can find cluster with + "folded" shapes. +.. topic:: Drawbacks: -Drawbacks -~~~~~~~~~ + - The previously introduced metrics are **not normalized with regards to + random labeling**: this means that depending on the number of samples, + clusters and ground truth classes, a completely random labeling will not + always yield the same values for homogeneity, completeness and hence + v-measure. In particular **random labeling won't yield zero scores + especially when the number of clusters is large**. -- The previously introduced metrics are **not normalized with regards to - random labeling**: this means that depending on the number of samples, - clusters and ground truth classes, a completely random labeling will - not always yield the same values for homogeneity, completeness and - hence v-measure. In particular **random labeling won't yield zero - scores especially when the number of clusters is large**. + This problem can safely be ignored when the number of samples is more than a + thousand and the number of clusters is less than 10. **For smaller sample + sizes or larger number of clusters it is safer to use an adjusted index such + as the Adjusted Rand Index (ARI)**. - This problem can safely be ignored when the number of samples is more - than a thousand and the number of clusters is less than 10. **For - smaller sample sizes or larger number of clusters it is safer to use - an adjusted index such as the Adjusted Rand Index (ARI)**. - -.. 
figure:: ../auto_examples/cluster/images/sphx_glr_plot_adjusted_for_chance_measures_001.png - :target: ../auto_examples/cluster/plot_adjusted_for_chance_measures.html - :align: center - :scale: 100 - -- These metrics **require the knowledge of the ground truth classes** while - almost never available in practice or requires manual assignment by - human annotators (as in the supervised learning setting). + .. figure:: ../auto_examples/cluster/images/sphx_glr_plot_adjusted_for_chance_measures_001.png + :target: ../auto_examples/cluster/plot_adjusted_for_chance_measures.html + :align: center + :scale: 100 + - These metrics **require the knowledge of the ground truth classes** while + almost never available in practice or requires manual assignment by human + annotators (as in the supervised learning setting). -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_cluster_plot_adjusted_for_chance_measures.py`: Analysis of - the impact of the dataset size on the value of clustering measures - for random assignments. +* :ref:`sphx_glr_auto_examples_cluster_plot_adjusted_for_chance_measures.py`: Analysis + of the impact of the dataset size on the value of clustering measures for + random assignments. +.. dropdown:: Mathematical formulation -Mathematical formulation -~~~~~~~~~~~~~~~~~~~~~~~~ + Homogeneity and completeness scores are formally given by: -Homogeneity and completeness scores are formally given by: + .. math:: h = 1 - \frac{H(C|K)}{H(C)} -.. math:: h = 1 - \frac{H(C|K)}{H(C)} + .. math:: c = 1 - \frac{H(K|C)}{H(K)} -.. math:: c = 1 - \frac{H(K|C)}{H(K)} + where :math:`H(C|K)` is the **conditional entropy of the classes given the + cluster assignments** and is given by: -where :math:`H(C|K)` is the **conditional entropy of the classes given -the cluster assignments** and is given by: + .. math:: H(C|K) = - \sum_{c=1}^{|C|} \sum_{k=1}^{|K|} \frac{n_{c,k}}{n} + \cdot \log\left(\frac{n_{c,k}}{n_k}\right) -.. math:: H(C|K) = - \sum_{c=1}^{|C|} \sum_{k=1}^{|K|} \frac{n_{c,k}}{n} - \cdot \log\left(\frac{n_{c,k}}{n_k}\right) + and :math:`H(C)` is the **entropy of the classes** and is given by: -and :math:`H(C)` is the **entropy of the classes** and is given by: + .. math:: H(C) = - \sum_{c=1}^{|C|} \frac{n_c}{n} \cdot \log\left(\frac{n_c}{n}\right) -.. math:: H(C) = - \sum_{c=1}^{|C|} \frac{n_c}{n} \cdot \log\left(\frac{n_c}{n}\right) + with :math:`n` the total number of samples, :math:`n_c` and :math:`n_k` the + number of samples respectively belonging to class :math:`c` and cluster + :math:`k`, and finally :math:`n_{c,k}` the number of samples from class + :math:`c` assigned to cluster :math:`k`. -with :math:`n` the total number of samples, :math:`n_c` and :math:`n_k` -the number of samples respectively belonging to class :math:`c` and -cluster :math:`k`, and finally :math:`n_{c,k}` the number of samples -from class :math:`c` assigned to cluster :math:`k`. + The **conditional entropy of clusters given class** :math:`H(K|C)` and the + **entropy of clusters** :math:`H(K)` are defined in a symmetric manner. -The **conditional entropy of clusters given class** :math:`H(K|C)` and the -**entropy of clusters** :math:`H(K)` are defined in a symmetric manner. + Rosenberg and Hirschberg further define **V-measure** as the **harmonic mean of + homogeneity and completeness**: -Rosenberg and Hirschberg further define **V-measure** as the **harmonic -mean of homogeneity and completeness**: + .. math:: v = 2 \cdot \frac{h \cdot c}{h + c} -.. 
math:: v = 2 \cdot \frac{h \cdot c}{h + c} +.. rubric:: References -.. topic:: References +* `V-Measure: A conditional entropy-based external cluster evaluation measure + `_ Andrew Rosenberg and Julia + Hirschberg, 2007 - * `V-Measure: A conditional entropy-based external cluster evaluation - measure `_ - Andrew Rosenberg and Julia Hirschberg, 2007 +.. [B2011] `Identification and Characterization of Events in Social Media + `_, Hila + Becker, PhD Thesis. - .. [B2011] `Identication and Characterization of Events in Social Media - `_, Hila - Becker, PhD Thesis. .. _fowlkes_mallows_scores: Fowlkes-Mallows scores ---------------------- -The Fowlkes-Mallows index (:func:`sklearn.metrics.fowlkes_mallows_score`) can be -used when the ground truth class assignments of the samples is known. The -Fowlkes-Mallows score FMI is defined as the geometric mean of the -pairwise precision and recall: +The original Fowlkes-Mallows index (FMI) was intended to measure the similarity +between two clustering results, which is inherently an unsupervised comparison. +The supervised adaptation of the Fowlkes-Mallows index +(as implemented in :func:`sklearn.metrics.fowlkes_mallows_score`) can be used +when the ground truth class assignments of the samples are known. +The FMI is defined as the geometric mean of the pairwise precision and recall: .. math:: \text{FMI} = \frac{\text{TP}}{\sqrt{(\text{TP} + \text{FP}) (\text{TP} + \text{FN})}} -Where ``TP`` is the number of **True Positive** (i.e. the number of pair -of points that belong to the same clusters in both the true labels and the -predicted labels), ``FP`` is the number of **False Positive** (i.e. the number -of pair of points that belong to the same clusters in the true labels and not -in the predicted labels) and ``FN`` is the number of **False Negative** (i.e the -number of pair of points that belongs in the same clusters in the predicted -labels and not in the true labels). +In the above formula: + +* ``TP`` (**True Positive**): The number of pairs of points that are clustered together + both in the true labels and in the predicted labels. + +* ``FP`` (**False Positive**): The number of pairs of points that are clustered together + in the predicted labels but not in the true labels. + +* ``FN`` (**False Negative**): The number of pairs of points that are clustered together + in the true labels but not in the predicted labels. The score ranges from 0 to 1. A high value indicates a good similarity between two clusters. @@ -1599,7 +1820,7 @@ between two clusters. >>> labels_pred = [0, 0, 1, 1, 2, 2] >>> metrics.fowlkes_mallows_score(labels_true, labels_pred) - 0.47140... + 0.47140 One can permute 0 and 1 in the predicted labels, rename 2 to 3 and get the same score:: @@ -1607,7 +1828,7 @@ the same score:: >>> labels_pred = [1, 1, 0, 0, 3, 3] >>> metrics.fowlkes_mallows_score(labels_true, labels_pred) - 0.47140... + 0.47140 Perfect labeling is scored 1.0:: @@ -1622,42 +1843,40 @@ Bad (e.g. independent labelings) have zero scores:: >>> metrics.fowlkes_mallows_score(labels_true, labels_pred) 0.0 -Advantages -~~~~~~~~~~ - -- **Random (uniform) label assignments have a FMI score close to 0.0** - for any value of ``n_clusters`` and ``n_samples`` (which is not the - case for raw Mutual Information or the V-measure for instance). +.. topic:: Advantages: -- **Upper-bounded at 1**: Values close to zero indicate two label - assignments that are largely independent, while values close to one - indicate significant agreement. 
Further, values of exactly 0 indicate - **purely** independent label assignments and a FMI of exactly 1 indicates - that the two label assignments are equal (with or without permutation). + - **Random (uniform) label assignments have a FMI score close to 0.0** for any + value of ``n_clusters`` and ``n_samples`` (which is not the case for raw + Mutual Information or the V-measure for instance). -- **No assumption is made on the cluster structure**: can be used - to compare clustering algorithms such as k-means which assumes isotropic - blob shapes with results of spectral clustering algorithms which can - find cluster with "folded" shapes. + - **Upper-bounded at 1**: Values close to zero indicate two label assignments + that are largely independent, while values close to one indicate significant + agreement. Further, values of exactly 0 indicate **purely** independent + label assignments and a FMI of exactly 1 indicates that the two label + assignments are equal (with or without permutation). + - **No assumption is made on the cluster structure**: can be used to compare + clustering algorithms such as k-means which assumes isotropic blob shapes + with results of spectral clustering algorithms which can find cluster with + "folded" shapes. -Drawbacks -~~~~~~~~~ +.. topic:: Drawbacks: -- Contrary to inertia, **FMI-based measures require the knowledge - of the ground truth classes** while almost never available in practice or - requires manual assignment by human annotators (as in the supervised learning - setting). + - Contrary to inertia, **FMI-based measures require the knowledge of the + ground truth classes** while almost never available in practice or requires + manual assignment by human annotators (as in the supervised learning + setting). -.. topic:: References +.. dropdown:: References * E. B. Fowkles and C. L. Mallows, 1983. "A method for comparing two hierarchical clusterings". Journal of the American Statistical Association. - http://wildfire.stat.ucla.edu/pdflibrary/fowlkes.pdf + https://www.tandfonline.com/doi/abs/10.1080/01621459.1983.10478008 * `Wikipedia entry for the Fowlkes-Mallows Index `_ + .. _silhouette_coefficient: Silhouette Coefficient @@ -1698,37 +1917,33 @@ cluster analysis. >>> kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X) >>> labels = kmeans_model.labels_ >>> metrics.silhouette_score(X, labels, metric='euclidean') - 0.55... - -.. topic:: References + 0.55 - * Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the - Interpretation and Validation of Cluster Analysis". Computational - and Applied Mathematics 20: 53–65. - `doi:10.1016/0377-0427(87)90125-7 `_. +.. topic:: Advantages: + - The score is bounded between -1 for incorrect clustering and +1 for highly + dense clustering. Scores around zero indicate overlapping clusters. -Advantages -~~~~~~~~~~ + - The score is higher when clusters are dense and well separated, which + relates to a standard concept of a cluster. -- The score is bounded between -1 for incorrect clustering and +1 for highly - dense clustering. Scores around zero indicate overlapping clusters. +.. topic:: Drawbacks: -- The score is higher when clusters are dense and well separated, which relates - to a standard concept of a cluster. + - The Silhouette Coefficient is generally higher for convex clusters than + other concepts of clusters, such as density based clusters like those + obtained through DBSCAN. +.. 
rubric:: Examples -Drawbacks -~~~~~~~~~ +* :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py` : In + this example the silhouette analysis is used to choose an optimal value for + n_clusters. -- The Silhouette Coefficient is generally higher for convex clusters than other - concepts of clusters, such as density based clusters like those obtained - through DBSCAN. +.. dropdown:: References -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py` : In this example - the silhouette analysis is used to choose an optimal value for n_clusters. + * Peter J. Rousseeuw (1987). :doi:`"Silhouettes: a Graphical Aid to the + Interpretation and Validation of Cluster Analysis"<10.1016/0377-0427(87)90125-7>`. + Computational and Applied Mathematics 20: 53-65. .. _calinski_harabasz_index: @@ -1738,8 +1953,8 @@ Calinski-Harabasz Index If the ground truth labels are not known, the Calinski-Harabasz index -(:func:`sklearn.metrics.calinski_harabasz_score`) - also known as the Variance -Ratio Criterion - can be used to evaluate the model, where a higher +(:func:`sklearn.metrics.calinski_harabasz_score`) - also known as the Variance +Ratio Criterion - can be used to evaluate the model, where a higher Calinski-Harabasz score relates to a model with better defined clusters. The index is the ratio of the sum of between-clusters dispersion and of @@ -1759,53 +1974,50 @@ cluster analysis: >>> kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X) >>> labels = kmeans_model.labels_ >>> metrics.calinski_harabasz_score(X, labels) - 561.62... + 561.59 -Advantages -~~~~~~~~~~ -- The score is higher when clusters are dense and well separated, which relates - to a standard concept of a cluster. +.. topic:: Advantages: -- The score is fast to compute. + - The score is higher when clusters are dense and well separated, which + relates to a standard concept of a cluster. + - The score is fast to compute. -Drawbacks -~~~~~~~~~ +.. topic:: Drawbacks: -- The Calinski-Harabasz index is generally higher for convex clusters than other - concepts of clusters, such as density based clusters like those obtained - through DBSCAN. + - The Calinski-Harabasz index is generally higher for convex clusters than + other concepts of clusters, such as density based clusters like those + obtained through DBSCAN. -Mathematical formulation -~~~~~~~~~~~~~~~~~~~~~~~~ +.. dropdown:: Mathematical formulation -For a set of data :math:`E` of size :math:`n_E` which has been clustered into -:math:`k` clusters, the Calinski-Harabasz score :math:`s` is defined as the -ratio of the between-clusters dispersion mean and the within-cluster dispersion: + For a set of data :math:`E` of size :math:`n_E` which has been clustered into + :math:`k` clusters, the Calinski-Harabasz score :math:`s` is defined as the + ratio of the between-clusters dispersion mean and the within-cluster + dispersion: -.. math:: - s = \frac{\mathrm{tr}(B_k)}{\mathrm{tr}(W_k)} \times \frac{n_E - k}{k - 1} + .. math:: + s = \frac{\mathrm{tr}(B_k)}{\mathrm{tr}(W_k)} \times \frac{n_E - k}{k - 1} -where :math:`\mathrm{tr}(B_k)` is trace of the between group dispersion matrix -and :math:`\mathrm{tr}(W_k)` is the trace of the within-cluster dispersion -matrix defined by: + where :math:`\mathrm{tr}(B_k)` is trace of the between group dispersion matrix + and :math:`\mathrm{tr}(W_k)` is the trace of the within-cluster dispersion + matrix defined by: -.. math:: W_k = \sum_{q=1}^k \sum_{x \in C_q} (x - c_q) (x - c_q)^T + .. 
math:: W_k = \sum_{q=1}^k \sum_{x \in C_q} (x - c_q) (x - c_q)^T -.. math:: B_k = \sum_{q=1}^k n_q (c_q - c_E) (c_q - c_E)^T + .. math:: B_k = \sum_{q=1}^k n_q (c_q - c_E) (c_q - c_E)^T -with :math:`C_q` the set of points in cluster :math:`q`, :math:`c_q` the center -of cluster :math:`q`, :math:`c_E` the center of :math:`E`, and :math:`n_q` the -number of points in cluster :math:`q`. + with :math:`C_q` the set of points in cluster :math:`q`, :math:`c_q` the + center of cluster :math:`q`, :math:`c_E` the center of :math:`E`, and + :math:`n_q` the number of points in cluster :math:`q`. -.. topic:: References +.. dropdown:: References - * Caliński, T., & Harabasz, J. (1974). - `"A Dendrite Method for Cluster Analysis" - `_. - Communications in Statistics-theory and Methods 3: 1-27. - `doi:10.1080/03610927408827101 `_. + * Caliński, T., & Harabasz, J. (1974). `"A Dendrite Method for Cluster Analysis" + `_. + :doi:`Communications in Statistics-theory and Methods 3: 1-27 + <10.1080/03610927408827101>`. .. _davies-bouldin_index: @@ -1836,61 +2048,57 @@ cluster analysis as follows: >>> kmeans = KMeans(n_clusters=3, random_state=1).fit(X) >>> labels = kmeans.labels_ >>> davies_bouldin_score(X, labels) - 0.6619... + 0.666 -Advantages -~~~~~~~~~~ +.. topic:: Advantages: -- The computation of Davies-Bouldin is simpler than that of Silhouette scores. -- The index is computed only quantities and features inherent to the dataset. + - The computation of Davies-Bouldin is simpler than that of Silhouette scores. + - The index is solely based on quantities and features inherent to the dataset + as its computation only uses point-wise distances. -Drawbacks -~~~~~~~~~ +.. topic:: Drawbacks: -- The Davies-Boulding index is generally higher for convex clusters than other - concepts of clusters, such as density based clusters like those obtained from - DBSCAN. -- The usage of centroid distance limits the distance metric to Euclidean space. + - The Davies-Bouldin index is generally higher for convex clusters than other + concepts of clusters, such as density-based clusters like those + obtained from DBSCAN. + - The usage of centroid distance limits the distance metric to Euclidean + space. -Mathematical formulation -~~~~~~~~~~~~~~~~~~~~~~~~ +.. dropdown:: Mathematical formulation -The index is defined as the average similarity between each cluster :math:`C_i` -for :math:`i=1, ..., k` and its most similar one :math:`C_j`. In the context of -this index, similarity is defined as a measure :math:`R_{ij}` that trades off: + The index is defined as the average similarity between each cluster :math:`C_i` + for :math:`i=1, ..., k` and its most similar one :math:`C_j`. In the context of + this index, similarity is defined as a measure :math:`R_{ij}` that trades off: -- :math:`s_i`, the average distance between each point of cluster :math:`i` and - the centroid of that cluster -- also know as cluster diameter. -- :math:`d_{ij}`, the distance between cluster centroids :math:`i` and :math:`j`. + - :math:`s_i`, the average distance between each point of cluster :math:`i` and + the centroid of that cluster -- also known as cluster diameter. + - :math:`d_{ij}`, the distance between cluster centroids :math:`i` and + :math:`j`. -A simple choice to construct :math:`R_{ij}` so that it is nonnegative and -symmetric is: + A simple choice to construct :math:`R_{ij}` so that it is nonnegative and + symmetric is: -.. math:: - R_{ij} = \frac{s_i + s_j}{d_{ij}} - -Then the Davies-Bouldin index is defined as: + .. 
math:: + R_{ij} = \frac{s_i + s_j}{d_{ij}} -.. math:: - DB = \frac{1}{k} \sum_{i=1}^k \max_{i \neq j} R_{ij} + Then the Davies-Bouldin index is defined as: + .. math:: + DB = \frac{1}{k} \sum_{i=1}^k \max_{i \neq j} R_{ij} -.. topic:: References +.. dropdown:: References - * Davies, David L.; Bouldin, Donald W. (1979). - "A Cluster Separation Measure" - IEEE Transactions on Pattern Analysis and Machine Intelligence. - PAMI-1 (2): 224-227. - `doi:10.1109/TPAMI.1979.4766909 `_. + * Davies, David L.; Bouldin, Donald W. (1979). :doi:`"A Cluster Separation + Measure" <10.1109/TPAMI.1979.4766909>` IEEE Transactions on Pattern Analysis + and Machine Intelligence. PAMI-1 (2): 224-227. - * Halkidi, Maria; Batistakis, Yannis; Vazirgiannis, Michalis (2001). - "On Clustering Validation Techniques" - Journal of Intelligent Information Systems, 17(2-3), 107-145. - `doi:10.1023/A:1012801612483 `_. + * Halkidi, Maria; Batistakis, Yannis; Vazirgiannis, Michalis (2001). :doi:`"On + Clustering Validation Techniques" <10.1023/A:1012801612483>` Journal of + Intelligent Information Systems, 17(2-3), 107-145. - * `Wikipedia entry for Davies-Bouldin index - `_. + * `Wikipedia entry for Davies-Bouldin index + `_. .. _contingency_matrix: @@ -1913,7 +2121,7 @@ Here is an example:: array([[2, 1, 0], [0, 1, 2]]) -The first row of output array indicates that there are three samples whose +The first row of the output array indicates that there are three samples whose true cluster is "a". Of them, two are in predicted cluster 0, one is in 1, and none is in 2. And the second row indicates that there are three samples whose true cluster is "b". Of them, none is in predicted cluster 0, one is in @@ -1924,30 +2132,28 @@ contingency matrix where the order of rows and columns correspond to a list of classes. -Advantages -~~~~~~~~~~ +.. topic:: Advantages: -- Allows to examine the spread of each true cluster across predicted - clusters and vice versa. + - Allows to examine the spread of each true cluster across predicted clusters + and vice versa. -- The contingency table calculated is typically utilized in the calculation - of a similarity statistic (like the others listed in this document) between - the two clusterings. + - The contingency table calculated is typically utilized in the calculation of + a similarity statistic (like the others listed in this document) between the + two clusterings. -Drawbacks -~~~~~~~~~ +.. topic:: Drawbacks: -- Contingency matrix is easy to interpret for a small number of clusters, but - becomes very hard to interpret for a large number of clusters. + - Contingency matrix is easy to interpret for a small number of clusters, but + becomes very hard to interpret for a large number of clusters. -- It doesn't give a single metric to use as an objective for clustering - optimisation. + - It doesn't give a single metric to use as an objective for clustering + optimisation. +.. dropdown:: References -.. topic:: References + * `Wikipedia entry for contingency matrix + `_ - * `Wikipedia entry for contingency matrix - `_ .. _pair_confusion_matrix: @@ -1970,19 +2176,19 @@ under the true and predicted clusterings. 
It has the following entries: - :math:`C_{00}` : number of pairs with both clusterings having the samples - not clustered together +:math:`C_{00}` : number of pairs with both clusterings having the samples +not clustered together - :math:`C_{10}` : number of pairs with the true label clustering having the - samples clustered together but the other clustering not having the samples - clustered together +:math:`C_{10}` : number of pairs with the true label clustering having the +samples clustered together but the other clustering not having the samples +clustered together - :math:`C_{01}` : number of pairs with the true label clustering not having - the samples clustered together but the other clustering having the samples - clustered together +:math:`C_{01}` : number of pairs with the true label clustering not having +the samples clustered together but the other clustering having the samples +clustered together - :math:`C_{11}` : number of pairs with both clusterings having the samples - clustered together +:math:`C_{11}` : number of pairs with both clusterings having the samples +clustered together Considering a pair of samples that is clustered together a positive pair, then as in binary classification the count of true negatives is @@ -2025,8 +2231,7 @@ diagonal entries:: array([[ 0, 0], [12, 0]]) -.. topic:: References +.. dropdown:: References - * L. Hubert and P. Arabie, Comparing Partitions, Journal of - Classification 1985 - _ + * :doi:`"Comparing Partitions" <10.1007/BF01908075>` L. Hubert and P. Arabie, + Journal of Classification 1985 diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 6e827304c38cd..3ef0d94236aa6 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -5,14 +5,24 @@ Pipelines and composite estimators ================================== -Transformers are usually combined with classifiers, regressors or other -estimators to build a composite estimator. The most common tool is a -:ref:`Pipeline `. Pipeline is often used in combination with -:ref:`FeatureUnion ` which concatenates the output of -transformers into a composite feature space. :ref:`TransformedTargetRegressor -` deals with transforming the :term:`target` -(i.e. log-transform :term:`y`). In contrast, Pipelines only transform the -observed data (:term:`X`). +To build a composite estimator, transformers are usually combined with other +transformers or with :term:`predictors` (such as classifiers or regressors). +The most common tool used for composing estimators is a :ref:`Pipeline +`. Pipelines require all steps except the last to be a +:term:`transformer`. The last step can be anything, a transformer, a +:term:`predictor`, or a clustering estimator which might have or not have a +`.predict(...)` method. A pipeline exposes all methods provided by the last +estimator: if the last step provides a `transform` method, then the pipeline +would have a `transform` method and behave like a transformer. If the last step +provides a `predict` method, then the pipeline would expose that method, and +given a data :term:`X`, use all steps except the last to transform the data, +and then give that transformed data to the `predict` method of the last step of +the pipeline. The class :class:`Pipeline` is often used in combination with +:ref:`ColumnTransformer ` or +:ref:`FeatureUnion ` which concatenate the output of transformers +into a composite feature space. +:ref:`TransformedTargetRegressor ` +deals with transforming the :term:`target` (i.e. log-transform :term:`y`). .. 
_pipeline: @@ -41,12 +51,21 @@ All estimators in a pipeline, except the last one, must be transformers (i.e. must have a :term:`transform` method). The last estimator may be any type (transformer, classifier, etc.). +.. note:: + + Calling ``fit`` on the pipeline is the same as calling ``fit`` on + each estimator in turn, ``transform`` the input and pass it on to the next step. + The pipeline has all the methods that the last estimator in the pipeline has, + i.e. if the last estimator is a classifier, the :class:`Pipeline` can be used + as a classifier. If the last estimator is a transformer, again, so is the + pipeline. + Usage ----- -Construction -............ +Build a pipeline +................ The :class:`Pipeline` is built using a list of ``(key, value)`` pairs, where the ``key`` is a string containing the name you want to give this step and ``value`` @@ -60,38 +79,22 @@ is an estimator object:: >>> pipe Pipeline(steps=[('reduce_dim', PCA()), ('clf', SVC())]) -The utility function :func:`make_pipeline` is a shorthand -for constructing pipelines; -it takes a variable number of estimators and returns a pipeline, -filling in the names automatically:: - - >>> from sklearn.pipeline import make_pipeline - >>> from sklearn.naive_bayes import MultinomialNB - >>> from sklearn.preprocessing import Binarizer - >>> make_pipeline(Binarizer(), MultinomialNB()) - Pipeline(steps=[('binarizer', Binarizer()), ('multinomialnb', MultinomialNB())]) +.. dropdown:: Shorthand version using :func:`make_pipeline` -Accessing steps -............... + The utility function :func:`make_pipeline` is a shorthand + for constructing pipelines; + it takes a variable number of estimators and returns a pipeline, + filling in the names automatically:: -The estimators of a pipeline are stored as a list in the ``steps`` attribute, -but can be accessed by index or name by indexing (with ``[idx]``) the -Pipeline:: + >>> from sklearn.pipeline import make_pipeline + >>> make_pipeline(PCA(), SVC()) + Pipeline(steps=[('pca', PCA()), ('svc', SVC())]) - >>> pipe.steps[0] - ('reduce_dim', PCA()) - >>> pipe[0] - PCA() - >>> pipe['reduce_dim'] - PCA() +Access pipeline steps +..................... -Pipeline's `named_steps` attribute allows accessing steps by name with tab -completion in interactive environments:: - - >>> pipe.named_steps.reduce_dim is pipe['reduce_dim'] - True - -A sub-pipeline can also be extracted using the slicing notation commonly used +The estimators of a pipeline are stored as a list in the ``steps`` attribute. +A sub-pipeline can be extracted using the slicing notation commonly used for Python Sequences such as lists or strings (although only a step of 1 is permitted). This is convenient for performing only some of the transformations (or their inverse): @@ -101,67 +104,97 @@ permitted). This is convenient for performing only some of the transformations >>> pipe[-1:] Pipeline(steps=[('clf', SVC())]) +.. dropdown:: Accessing a step by name or position -.. _pipeline_nested_parameters: - -Nested parameters -................. 
+ A specific step can also be accessed by index or name by indexing (with ``[idx]``) the + pipeline:: -Parameters of the estimators in the pipeline can be accessed using the -``__`` syntax:: + >>> pipe.steps[0] + ('reduce_dim', PCA()) + >>> pipe[0] + PCA() + >>> pipe['reduce_dim'] + PCA() - >>> pipe.set_params(clf__C=10) - Pipeline(steps=[('reduce_dim', PCA()), ('clf', SVC(C=10))]) + `Pipeline`'s `named_steps` attribute allows accessing steps by name with tab + completion in interactive environments:: -This is particularly important for doing grid searches:: + >>> pipe.named_steps.reduce_dim is pipe['reduce_dim'] + True - >>> from sklearn.model_selection import GridSearchCV - >>> param_grid = dict(reduce_dim__n_components=[2, 5, 10], - ... clf__C=[0.1, 10, 100]) - >>> grid_search = GridSearchCV(pipe, param_grid=param_grid) +Tracking feature names in a pipeline +.................................... -Individual steps may also be replaced as parameters, and non-final steps may be -ignored by setting them to ``'passthrough'``:: +To enable model inspection, :class:`~sklearn.pipeline.Pipeline` has a +``get_feature_names_out()`` method, just like all transformers. You can use +pipeline slicing to get the feature names going into each step:: + >>> from sklearn.datasets import load_iris >>> from sklearn.linear_model import LogisticRegression - >>> param_grid = dict(reduce_dim=['passthrough', PCA(5), PCA(10)], - ... clf=[SVC(), LogisticRegression()], - ... clf__C=[0.1, 10, 100]) - >>> grid_search = GridSearchCV(pipe, param_grid=param_grid) + >>> from sklearn.feature_selection import SelectKBest + >>> iris = load_iris() + >>> pipe = Pipeline(steps=[ + ... ('select', SelectKBest(k=2)), + ... ('clf', LogisticRegression())]) + >>> pipe.fit(iris.data, iris.target) + Pipeline(steps=[('select', SelectKBest(...)), ('clf', LogisticRegression(...))]) + >>> pipe[:-1].get_feature_names_out() + array(['x2', 'x3'], ...) + +.. dropdown:: Customize feature names + + You can also provide custom feature names for the input data using + ``get_feature_names_out``:: -The estimators of the pipeline can be retrieved by index: + >>> pipe[:-1].get_feature_names_out(iris.feature_names) + array(['petal length (cm)', 'petal width (cm)'], ...) - >>> pipe[0] - PCA() +.. _pipeline_nested_parameters: -or by name:: +Access to nested parameters +........................... - >>> pipe['reduce_dim'] - PCA() +It is common to adjust the parameters of an estimator within a pipeline. This parameter +is therefore nested because it belongs to a particular sub-step. Parameters of the +estimators in the pipeline are accessible using the ``__`` +syntax:: -.. topic:: Examples: + >>> pipe = Pipeline(steps=[("reduce_dim", PCA()), ("clf", SVC())]) + >>> pipe.set_params(clf__C=10) + Pipeline(steps=[('reduce_dim', PCA()), ('clf', SVC(C=10))]) - * :ref:`sphx_glr_auto_examples_feature_selection_plot_feature_selection_pipeline.py` - * :ref:`sphx_glr_auto_examples_model_selection_grid_search_text_feature_extraction.py` - * :ref:`sphx_glr_auto_examples_compose_plot_digits_pipe.py` - * :ref:`sphx_glr_auto_examples_miscellaneous_plot_kernel_approximation.py` - * :ref:`sphx_glr_auto_examples_svm_plot_svm_anova.py` - * :ref:`sphx_glr_auto_examples_compose_plot_compare_reduction.py` +.. dropdown:: When does it matter? -.. topic:: See Also: + This is particularly important for doing grid searches:: - * :ref:`composite_grid_search` + >>> from sklearn.model_selection import GridSearchCV + >>> param_grid = dict(reduce_dim__n_components=[2, 5, 10], + ... 
clf__C=[0.1, 10, 100]) + >>> grid_search = GridSearchCV(pipe, param_grid=param_grid) + Individual steps may also be replaced as parameters, and non-final steps may be + ignored by setting them to ``'passthrough'``:: -Notes ------ + >>> param_grid = dict(reduce_dim=['passthrough', PCA(5), PCA(10)], + ... clf=[SVC(), LogisticRegression()], + ... clf__C=[0.1, 10, 100]) + >>> grid_search = GridSearchCV(pipe, param_grid=param_grid) + + .. seealso:: + + * :ref:`composite_grid_search` + + +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_feature_selection_plot_feature_selection_pipeline.py` +* :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_text_feature_extraction.py` +* :ref:`sphx_glr_auto_examples_compose_plot_digits_pipe.py` +* :ref:`sphx_glr_auto_examples_miscellaneous_plot_kernel_approximation.py` +* :ref:`sphx_glr_auto_examples_svm_plot_svm_anova.py` +* :ref:`sphx_glr_auto_examples_compose_plot_compare_reduction.py` +* :ref:`sphx_glr_auto_examples_miscellaneous_plot_pipeline_display.py` -Calling ``fit`` on the pipeline is the same as calling ``fit`` on -each estimator in turn, ``transform`` the input and pass it on to the next step. -The pipeline has all the methods that the last estimator in the pipeline has, -i.e. if the last estimator is a classifier, the :class:`Pipeline` can be used -as a classifier. If the last estimator is a transformer, again, so is the -pipeline. .. _pipeline_cache: @@ -176,11 +209,11 @@ after calling ``fit``. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical. A typical example is the case of a grid search in which the transformers can be fitted only once and reused for -each configuration. +each configuration. The last step will never be cached, even if it is a transformer. The parameter ``memory`` is needed in order to cache the transformers. ``memory`` can be either a string containing the directory where to cache the -transformers or a `joblib.Memory `_ +transformers or a `joblib.Memory `_ object:: >>> from tempfile import mkdtemp @@ -197,47 +230,49 @@ object:: >>> # Clear the cache directory when you don't need it anymore >>> rmtree(cachedir) -.. warning:: **Side effect of caching transformers** - - Using a :class:`Pipeline` without cache enabled, it is possible to - inspect the original instance such as:: - - >>> from sklearn.datasets import load_digits - >>> X_digits, y_digits = load_digits(return_X_y=True) - >>> pca1 = PCA() - >>> svm1 = SVC() - >>> pipe = Pipeline([('reduce_dim', pca1), ('clf', svm1)]) - >>> pipe.fit(X_digits, y_digits) - Pipeline(steps=[('reduce_dim', PCA()), ('clf', SVC())]) - >>> # The pca instance can be inspected directly - >>> print(pca1.components_) - [[-1.77484909e-19 ... 4.07058917e-18]] - - Enabling caching triggers a clone of the transformers before fitting. - Therefore, the transformer instance given to the pipeline cannot be - inspected directly. - In following example, accessing the :class:`PCA` instance ``pca2`` - will raise an ``AttributeError`` since ``pca2`` will be an unfitted - transformer. - Instead, use the attribute ``named_steps`` to inspect estimators within - the pipeline:: - - >>> cachedir = mkdtemp() - >>> pca2 = PCA() - >>> svm2 = SVC() - >>> cached_pipe = Pipeline([('reduce_dim', pca2), ('clf', svm2)], - ... 
memory=cachedir) - >>> cached_pipe.fit(X_digits, y_digits) - Pipeline(memory=..., - steps=[('reduce_dim', PCA()), ('clf', SVC())]) - >>> print(cached_pipe.named_steps['reduce_dim'].components_) - [[-1.77484909e-19 ... 4.07058917e-18]] - >>> # Remove the cache directory - >>> rmtree(cachedir) - -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_compose_plot_compare_reduction.py` +.. dropdown:: Side effect of caching transformers + :color: warning + + Using a :class:`Pipeline` without cache enabled, it is possible to + inspect the original instance such as:: + + >>> from sklearn.datasets import load_digits + >>> X_digits, y_digits = load_digits(return_X_y=True) + >>> pca1 = PCA(n_components=10) + >>> svm1 = SVC() + >>> pipe = Pipeline([('reduce_dim', pca1), ('clf', svm1)]) + >>> pipe.fit(X_digits, y_digits) + Pipeline(steps=[('reduce_dim', PCA(n_components=10)), ('clf', SVC())]) + >>> # The pca instance can be inspected directly + >>> pca1.components_.shape + (10, 64) + + Enabling caching triggers a clone of the transformers before fitting. + Therefore, the transformer instance given to the pipeline cannot be + inspected directly. + In the following example, accessing the :class:`~sklearn.decomposition.PCA` + instance ``pca2`` will raise an ``AttributeError`` since ``pca2`` will be an + unfitted transformer. + Instead, use the attribute ``named_steps`` to inspect estimators within + the pipeline:: + + >>> cachedir = mkdtemp() + >>> pca2 = PCA(n_components=10) + >>> svm2 = SVC() + >>> cached_pipe = Pipeline([('reduce_dim', pca2), ('clf', svm2)], + ... memory=cachedir) + >>> cached_pipe.fit(X_digits, y_digits) + Pipeline(memory=..., + steps=[('reduce_dim', PCA(n_components=10)), ('clf', SVC())]) + >>> cached_pipe.named_steps['reduce_dim'].components_.shape + (10, 64) + >>> # Remove the cache directory + >>> rmtree(cachedir) + + +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_compose_plot_compare_reduction.py` .. _transformed_target_regressor: @@ -310,9 +345,9 @@ each other. However, it is possible to bypass this checking by setting pair of functions ``func`` and ``inverse_func``. However, setting both options will raise an error. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_compose_plot_transformed_target.py` +* :ref:`sphx_glr_auto_examples_compose_plot_transformed_target.py` .. _feature_union: @@ -374,9 +409,9 @@ and ignored by setting to ``'drop'``:: FeatureUnion(transformer_list=[('linear_pca', PCA()), ('kernel_pca', 'drop')]) -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_compose_plot_feature_union.py` +* :ref:`sphx_glr_auto_examples_compose_plot_feature_union.py` .. _column_transformer: @@ -426,21 +461,20 @@ By default, the remaining rating columns are ignored (``remainder='drop'``):: >>> from sklearn.feature_extraction.text import CountVectorizer >>> from sklearn.preprocessing import OneHotEncoder >>> column_trans = ColumnTransformer( - ... [('city_category', OneHotEncoder(dtype='int'),['city']), + ... [('categories', OneHotEncoder(dtype='int'), ['city']), ... ('title_bow', CountVectorizer(), 'title')], - ... remainder='drop') + ... 
remainder='drop', verbose_feature_names_out=False) >>> column_trans.fit(X) - ColumnTransformer(transformers=[('city_category', OneHotEncoder(dtype='int'), + ColumnTransformer(transformers=[('categories', OneHotEncoder(dtype='int'), ['city']), - ('title_bow', CountVectorizer(), 'title')]) + ('title_bow', CountVectorizer(), 'title')], + verbose_feature_names_out=False) - >>> column_trans.get_feature_names() - ['city_category__x0_London', 'city_category__x0_Paris', 'city_category__x0_Sallisaw', - 'title_bow__bow', 'title_bow__feast', 'title_bow__grapes', 'title_bow__his', - 'title_bow__how', 'title_bow__last', 'title_bow__learned', 'title_bow__moveable', - 'title_bow__of', 'title_bow__the', 'title_bow__trick', 'title_bow__watson', - 'title_bow__wrath'] + >>> column_trans.get_feature_names_out() + array(['city_London', 'city_Paris', 'city_Sallisaw', 'bow', 'feast', + 'grapes', 'his', 'how', 'last', 'learned', 'moveable', 'of', 'the', + 'trick', 'watson', 'wrath'], ...) >>> column_trans.transform(X).toarray() array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0], @@ -470,10 +504,10 @@ on data type or column name:: ... OneHotEncoder(), ... make_column_selector(pattern='city', dtype_include=object))]) >>> ct.fit_transform(X) - array([[ 0.904..., 0. , 1. , 0. , 0. ], - [-1.507..., 1.414..., 1. , 0. , 0. ], - [-0.301..., 0. , 0. , 1. , 0. ], - [ 0.904..., -1.414..., 0. , 0. , 1. ]]) + array([[ 0.904, 0. , 1. , 0. , 0. ], + [-1.507, 1.414, 1. , 0. , 0. ], + [-0.301, 0. , 0. , 1. , 0. ], + [ 0.904, -1.414, 0. , 0. , 1. ]]) Strings can reference columns if the input is a DataFrame, integers are always interpreted as the positional columns. @@ -527,19 +561,37 @@ above example would be:: ('countvectorizer', CountVectorizer(), 'title')]) +If :class:`~sklearn.compose.ColumnTransformer` is fitted with a dataframe +and the dataframe only has string column names, then transforming a dataframe +will use the column names to select the columns:: + + + >>> ct = ColumnTransformer( + ... [("scale", StandardScaler(), ["expert_rating"])]).fit(X) + >>> X_new = pd.DataFrame({"expert_rating": [5, 6, 1], + ... "ignored_new_col": [1.2, 0.3, -0.1]}) + >>> ct.transform(X_new) + array([[ 0.9], + [ 2.1], + [-3.9]]) + .. _visualizing_composite_estimators: Visualizing Composite Estimators ================================ -Estimators can be displayed with a HTML representation when shown in a -jupyter notebook. This can be useful to diagnose or visualize a Pipeline with -many estimators. This visualization is activated by setting the -`display` option in :func:`~sklearn.set_config`:: +Estimators are displayed with an HTML representation when shown in a +jupyter notebook. This is useful to diagnose or visualize a Pipeline with +many estimators. This visualization is activated by default:: + + >>> column_trans # doctest: +SKIP + +It can be deactivated by setting the `display` option in :func:`~sklearn.set_config` +to 'text':: >>> from sklearn import set_config - >>> set_config(display='diagram') # doctest: +SKIP - >>> # diplays HTML representation in a jupyter context + >>> set_config(display='text') # doctest: +SKIP + >>> # displays text representation in a jupyter context >>> column_trans # doctest: +SKIP An example of the HTML output can be seen in the @@ -552,7 +604,7 @@ As an alternative, the HTML can be written to a file using >>> with open('my_estimator.html', 'w') as f: # doctest: +SKIP ... f.write(estimator_html_repr(clf)) -.. topic:: Examples: +.. 
rubric:: Examples - * :ref:`sphx_glr_auto_examples_compose_plot_column_transformer.py` - * :ref:`sphx_glr_auto_examples_compose_plot_column_transformer_mixed_types.py` +* :ref:`sphx_glr_auto_examples_compose_plot_column_transformer.py` +* :ref:`sphx_glr_auto_examples_compose_plot_column_transformer_mixed_types.py` diff --git a/doc/modules/covariance.rst b/doc/modules/covariance.rst index c97676ea62108..0eadfa2c8c584 100644 --- a/doc/modules/covariance.rst +++ b/doc/modules/covariance.rst @@ -40,11 +40,10 @@ on whether the data are centered, so one may want to use the same mean vector as the training set. If not, both should be centered by the user, and ``assume_centered=True`` should be used. -.. topic:: Examples: +.. rubric:: Examples - * See :ref:`sphx_glr_auto_examples_covariance_plot_covariance_estimation.py` for - an example on how to fit an :class:`EmpiricalCovariance` object - to data. +* See :ref:`sphx_glr_auto_examples_covariance_plot_covariance_estimation.py` for + an example on how to fit an :class:`EmpiricalCovariance` object to data. .. _shrunk_covariance: @@ -77,18 +76,17 @@ smallest and the largest eigenvalues of the empirical covariance matrix. It can be done by simply shifting every eigenvalue according to a given offset, which is equivalent of finding the l2-penalized Maximum Likelihood Estimator of the covariance matrix. In practice, shrinkage -boils down to a simple a convex transformation : :math:`\Sigma_{\rm +boils down to a simple convex transformation : :math:`\Sigma_{\rm shrunk} = (1-\alpha)\hat{\Sigma} + \alpha\frac{{\rm Tr}\hat{\Sigma}}{p}\rm Id`. Choosing the amount of shrinkage, :math:`\alpha` amounts to setting a bias/variance trade-off, and is discussed below. -.. topic:: Examples: +.. rubric:: Examples - * See :ref:`sphx_glr_auto_examples_covariance_plot_covariance_estimation.py` for - an example on how to fit a :class:`ShrunkCovariance` object - to data. +* See :ref:`sphx_glr_auto_examples_covariance_plot_covariance_estimation.py` for + an example on how to fit a :class:`ShrunkCovariance` object to data. Ledoit-Wolf shrinkage @@ -109,30 +107,30 @@ fitting a :class:`LedoitWolf` object to the same sample. It is important to note that when the number of samples is much larger than the number of features, one would expect that no shrinkage would be necessary. The intuition behind this is that if the population covariance - is full rank, when the number of sample grows, the sample covariance will - also become positive definite. As a result, no shrinkage would necessary + is full rank, when the number of samples grows, the sample covariance will + also become positive definite. As a result, no shrinkage would be necessary and the method should automatically do this. This, however, is not the case in the Ledoit-Wolf procedure when the population covariance happens to be a multiple of the identity matrix. In this case, the Ledoit-Wolf shrinkage estimate approaches 1 as the number of samples increases. This indicates that the optimal estimate of the - covariance matrix in the Ledoit-Wolf sense is multiple of the identity. + covariance matrix in the Ledoit-Wolf sense is a multiple of the identity. Since the population covariance is already a multiple of the identity matrix, the Ledoit-Wolf solution is indeed a reasonable estimate. -.. topic:: Examples: +.. 
rubric:: Examples - * See :ref:`sphx_glr_auto_examples_covariance_plot_covariance_estimation.py` for - an example on how to fit a :class:`LedoitWolf` object to data and - for visualizing the performances of the Ledoit-Wolf estimator in - terms of likelihood. +* See :ref:`sphx_glr_auto_examples_covariance_plot_covariance_estimation.py` for + an example on how to fit a :class:`LedoitWolf` object to data and + for visualizing the performances of the Ledoit-Wolf estimator in + terms of likelihood. -.. topic:: References: +.. rubric:: References - .. [1] O. Ledoit and M. Wolf, "A Well-Conditioned Estimator for Large-Dimensional - Covariance Matrices", Journal of Multivariate Analysis, Volume 88, Issue 2, - February 2004, pages 365-411. +.. [1] O. Ledoit and M. Wolf, "A Well-Conditioned Estimator for Large-Dimensional + Covariance Matrices", Journal of Multivariate Analysis, Volume 88, Issue 2, + February 2004, pages 365-411. .. _oracle_approximating_shrinkage: @@ -158,20 +156,21 @@ object to the same sample. Bias-variance trade-off when setting the shrinkage: comparing the choices of Ledoit-Wolf and OAS estimators -.. topic:: References: +.. rubric:: References - .. [2] Chen et al., "Shrinkage Algorithms for MMSE Covariance Estimation", - IEEE Trans. on Sign. Proc., Volume 58, Issue 10, October 2010. +.. [2] :arxiv:`"Shrinkage algorithms for MMSE covariance estimation.", + Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O. + IEEE Transactions on Signal Processing, 58(10), 5016-5029, 2010. + <0907.4698>` -.. topic:: Examples: +.. rubric:: Examples - * See :ref:`sphx_glr_auto_examples_covariance_plot_covariance_estimation.py` for - an example on how to fit an :class:`OAS` object - to data. +* See :ref:`sphx_glr_auto_examples_covariance_plot_covariance_estimation.py` for + an example on how to fit an :class:`OAS` object to data. - * See :ref:`sphx_glr_auto_examples_covariance_plot_lw_vs_oas.py` to visualize the - Mean Squared Error difference between a :class:`LedoitWolf` and - an :class:`OAS` estimator of the covariance. +* See :ref:`sphx_glr_auto_examples_covariance_plot_lw_vs_oas.py` to visualize the + Mean Squared Error difference between a :class:`LedoitWolf` and + an :class:`OAS` estimator of the covariance. .. figure:: ../auto_examples/covariance/images/sphx_glr_plot_lw_vs_oas_001.png @@ -252,20 +251,20 @@ problem is the GLasso algorithm, from the Friedman 2008 Biostatistics paper. It is the same algorithm as in the R ``glasso`` package. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_covariance_plot_sparse_cov.py`: example on synthetic - data showing some recovery of a structure, and comparing to other - covariance estimators. +* :ref:`sphx_glr_auto_examples_covariance_plot_sparse_cov.py`: example on synthetic + data showing some recovery of a structure, and comparing to other + covariance estimators. - * :ref:`sphx_glr_auto_examples_applications_plot_stock_market.py`: example on real - stock market data, finding which symbols are most linked. +* :ref:`sphx_glr_auto_examples_applications_plot_stock_market.py`: example on real + stock market data, finding which symbols are most linked. -.. topic:: References: +.. rubric:: References - * Friedman et al, `"Sparse inverse covariance estimation with the - graphical lasso" `_, - Biostatistics 9, pp 432, 2008 +* Friedman et al, `"Sparse inverse covariance estimation with the + graphical lasso" `_, + Biostatistics 9, pp 432, 2008 .. _robust_covariance: @@ -311,24 +310,24 @@ the same time. 
Raw estimates can be accessed as ``raw_location_`` and ``raw_covariance_`` attributes of a :class:`MinCovDet` robust covariance estimator object. -.. topic:: References: +.. rubric:: References - .. [3] P. J. Rousseeuw. Least median of squares regression. - J. Am Stat Ass, 79:871, 1984. - .. [4] A Fast Algorithm for the Minimum Covariance Determinant Estimator, - 1999, American Statistical Association and the American Society - for Quality, TECHNOMETRICS. +.. [3] P. J. Rousseeuw. Least median of squares regression. + J. Am Stat Ass, 79:871, 1984. +.. [4] A Fast Algorithm for the Minimum Covariance Determinant Estimator, + 1999, American Statistical Association and the American Society + for Quality, TECHNOMETRICS. -.. topic:: Examples: +.. rubric:: Examples - * See :ref:`sphx_glr_auto_examples_covariance_plot_robust_vs_empirical_covariance.py` for - an example on how to fit a :class:`MinCovDet` object to data and see how - the estimate remains accurate despite the presence of outliers. +* See :ref:`sphx_glr_auto_examples_covariance_plot_robust_vs_empirical_covariance.py` for + an example on how to fit a :class:`MinCovDet` object to data and see how + the estimate remains accurate despite the presence of outliers. - * See :ref:`sphx_glr_auto_examples_covariance_plot_mahalanobis_distances.py` to - visualize the difference between :class:`EmpiricalCovariance` and - :class:`MinCovDet` covariance estimators in terms of Mahalanobis distance - (so we get a better estimate of the precision matrix too). +* See :ref:`sphx_glr_auto_examples_covariance_plot_mahalanobis_distances.py` to + visualize the difference between :class:`EmpiricalCovariance` and + :class:`MinCovDet` covariance estimators in terms of Mahalanobis distance + (so we get a better estimate of the precision matrix too). .. |robust_vs_emp| image:: ../auto_examples/covariance/images/sphx_glr_plot_robust_vs_empirical_covariance_001.png :target: ../auto_examples/covariance/plot_robust_vs_empirical_covariance.html diff --git a/doc/modules/cross_decomposition.rst b/doc/modules/cross_decomposition.rst index 981f7d98fbbde..01722cbd07ab6 100644 --- a/doc/modules/cross_decomposition.rst +++ b/doc/modules/cross_decomposition.rst @@ -28,9 +28,9 @@ PLS draws similarities with `Principal Component Regression `_ (PCR), where the samples are first projected into a lower-dimensional subspace, and the targets `y` are predicted using `transformed(X)`. One issue with PCR is that -the dimensionality reduction is unsupervized, and may lose some important +the dimensionality reduction is unsupervised, and may lose some important variables: PCR would keep the features with the most variance, but it's -possible that features with a small variances are relevant from predicting +possible that features with small variances are relevant for predicting the target. In a way, PLS allows for the same kind of dimensionality reduction, but by taking into account the targets `y`. An illustration of this fact is given in the following example: @@ -64,7 +64,7 @@ Set :math:`X_1` to :math:`X` and :math:`Y_1` to :math:`Y`. Then, for each :math:`C = X_k^T Y_k`. :math:`u_k` and :math:`v_k` are called the *weights*. By definition, :math:`u_k` and :math:`v_k` are - choosen so that they maximize the covariance between the projected + chosen so that they maximize the covariance between the projected :math:`X_k` and the projected target, that is :math:`\text{Cov}(X_k u_k, Y_k v_k)`. 
- b) Project :math:`X_k` and :math:`Y_k` on the singular vectors to obtain @@ -88,42 +88,39 @@ Note that the scores matrices :math:`\Xi` and :math:`\Omega` correspond to the projections of the training data :math:`X` and :math:`Y`, respectively. Step *a)* may be performed in two ways: either by computing the whole SVD of -:math:`C` and only retain the singular vectors with the biggest singular +:math:`C` and only retaining the singular vectors with the biggest singular values, or by directly computing the singular vectors using the power method (cf section 11.3 in [1]_), which corresponds to the `'nipals'` option of the `algorithm` parameter. +.. dropdown:: Transforming data -Transforming data -^^^^^^^^^^^^^^^^^ + To transform :math:`X` into :math:`\bar{X}`, we need to find a projection + matrix :math:`P` such that :math:`\bar{X} = XP`. We know that for the + training data, :math:`\Xi = XP`, and :math:`X = \Xi \Gamma^T`. Setting + :math:`P = U(\Gamma^T U)^{-1}` where :math:`U` is the matrix with the + :math:`u_k` in the columns, we have :math:`XP = X U(\Gamma^T U)^{-1} = \Xi + (\Gamma^T U) (\Gamma^T U)^{-1} = \Xi` as desired. The rotation matrix + :math:`P` can be accessed from the `x_rotations_` attribute. -To transform :math:`X` into :math:`\bar{X}`, we need to find a projection -matrix :math:`P` such that :math:`\bar{X} = XP`. We know that for the -training data, :math:`\Xi = XP`, and :math:`X = \Xi \Gamma^T`. Setting -:math:`P = U(\Gamma^T U)^{-1}` where :math:`U` is the matrix with the -:math:`u_k` in the columns, we have :math:`XP = X U(\Gamma^T U)^{-1} = \Xi -(\Gamma^T U) (\Gamma^T U)^{-1} = \Xi` as desired. The rotation matrix -:math:`P` can be accessed from the `x_rotations_` attribute. + Similarly, :math:`Y` can be transformed using the rotation matrix + :math:`V(\Delta^T V)^{-1}`, accessed via the `y_rotations_` attribute. -Similarly, :math:`Y` can be transformed using the rotation matrix -:math:`V(\Delta^T V)^{-1}`, accessed via the `y_rotations_` attribute. +.. dropdown:: Predicting the targets `Y` -Predicting the targets Y -^^^^^^^^^^^^^^^^^^^^^^^^ + To predict the targets of some data :math:`X`, we are looking for a + coefficient matrix :math:`\beta \in R^{d \times t}` such that :math:`Y = + X\beta`. -To predict the targets of some data :math:`X`, we are looking for a -coefficient matrix :math:`\beta \in R^{d \times t}` such that :math:`Y = -X\beta`. + The idea is to try to predict the transformed targets :math:`\Omega` as a + function of the transformed samples :math:`\Xi`, by computing :math:`\alpha + \in \mathbb{R}` such that :math:`\Omega = \alpha \Xi`. -The idea is to try to predict the transformed targets :math:`\Omega` as a -function of the transformed samples :math:`\Xi`, by computing :math:`\alpha -\in \mathbb{R}` such that :math:`\Omega = \alpha \Xi`. + Then, we have :math:`Y = \Omega \Delta^T = \alpha \Xi \Delta^T`, and since + :math:`\Xi` is the transformed training data we have that :math:`Y = X \alpha + P \Delta^T`, and as a result the coefficient matrix :math:`\beta = \alpha P + \Delta^T`. -Then, we have :math:`Y = \Omega \Delta^T = \alpha \Xi \Delta^T`, and since -:math:`\Xi` is the transformed training data we have that :math:`Y = X \alpha -P \Delta^T`, and as a result the coefficient matrix :math:`\beta = \alpha P -\Delta^T`. - -:math:`\beta` can be accessed through the `coef_` attribute. + :math:`\beta` can be accessed through the `coef_` attribute. 
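+
+As a minimal illustrative sketch of the quantities described in the two
+dropdowns above (using :class:`~sklearn.cross_decomposition.PLSRegression` for
+concreteness, with made-up random data and shapes), one can check the rotation
+matrix, the projected samples and the predictions::
+
+    >>> import numpy as np
+    >>> from sklearn.cross_decomposition import PLSRegression
+    >>> rng = np.random.RandomState(0)
+    >>> X = rng.normal(size=(100, 5))   # n = 100 samples, d = 5 features
+    >>> Y = rng.normal(size=(100, 2))   # t = 2 targets
+    >>> pls = PLSRegression(n_components=2).fit(X, Y)
+    >>> pls.x_rotations_.shape          # the rotation matrix P
+    (5, 2)
+    >>> pls.transform(X).shape          # the projected samples (scores)
+    (100, 2)
+    >>> pls.predict(X).shape            # predictions via the coefficient matrix
+    (100, 2)
+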
PLSSVD ------ @@ -180,15 +177,13 @@ Since :class:`CCA` involves the inversion of :math:`X_k^TX_k` and :math:`Y_k^TY_k`, this estimator can be unstable if the number of features or targets is greater than the number of samples. +.. rubric:: References -.. topic:: Reference: - - .. [1] `A survey of Partial Least Squares (PLS) methods, with emphasis on - the two-block case - `_ - JA Wegelin +.. [1] `A survey of Partial Least Squares (PLS) methods, with emphasis on the two-block + case `_, + JA Wegelin -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_cross_decomposition_plot_compare_cross_decomposition.py` - * :ref:`sphx_glr_auto_examples_cross_decomposition_plot_pcr_vs_pls.py` +* :ref:`sphx_glr_auto_examples_cross_decomposition_plot_compare_cross_decomposition.py` +* :ref:`sphx_glr_auto_examples_cross_decomposition_plot_pcr_vs_pls.py` diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index ae3d38f168f3f..b1c9ccec8f641 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -55,7 +55,7 @@ data for testing (evaluating) our classifier:: >>> clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train) >>> clf.score(X_test, y_test) - 0.96... + 0.96 When evaluating different settings ("hyperparameters") for estimators, such as the ``C`` setting that must be manually set for an SVM, @@ -86,10 +86,10 @@ the training set is split into *k* smaller sets but generally follow the same principles). The following procedure is followed for each of the *k* "folds": - * A model is trained using :math:`k-1` of the folds as training data; - * the resulting model is validated on the remaining part of the data - (i.e., it is used as a test set to compute a performance measure - such as accuracy). +* A model is trained using :math:`k-1` of the folds as training data; +* the resulting model is validated on the remaining part of the data + (i.e., it is used as a test set to compute a performance measure + such as accuracy). The performance measure reported by *k*-fold cross-validation is then the average of the values computed in the loop. @@ -102,6 +102,7 @@ where the number of samples is very small. .. image:: ../images/grid_search_cross_validation.png :width: 500px :height: 300px + :alt: A depiction of a 5 fold cross validation on a training set, while holding out a test set. :align: center Computing cross-validated metrics @@ -119,7 +120,7 @@ time):: >>> clf = svm.SVC(kernel='linear', C=1, random_state=42) >>> scores = cross_val_score(clf, X, y, cv=5) >>> scores - array([0.96..., 1. , 0.96..., 0.96..., 1. ]) + array([0.96, 1. , 0.96, 0.96, 1. ]) The mean score and the standard deviation are hence given by:: @@ -134,7 +135,7 @@ scoring parameter:: >>> scores = cross_val_score( ... clf, X, y, cv=5, scoring='f1_macro') >>> scores - array([0.96..., 1. ..., 0.96..., 0.96..., 1. ]) + array([0.96, 1., 0.96, 0.96, 1.]) See :ref:`scoring_parameter` for details. In the case of the Iris dataset, the samples are balanced across target @@ -152,7 +153,7 @@ validation iterator instead, for instance:: >>> n_samples = X.shape[0] >>> cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0) >>> cross_val_score(clf, X, y, cv=cv) - array([0.977..., 0.977..., 1. ..., 0.955..., 1. ]) + array([0.977, 0.977, 1., 0.955, 1.]) Another option is to use an iterable yielding (train, test) splits as arrays of indices, for example:: @@ -167,34 +168,34 @@ indices, for example:: ... 
>>> custom_cv = custom_cv_2folds(X) >>> cross_val_score(clf, X, y, cv=custom_cv) - array([1. , 0.973...]) + array([1. , 0.973]) -.. topic:: Data transformation with held out data +.. dropdown:: Data transformation with held-out data - Just as it is important to test a predictor on data held-out from - training, preprocessing (such as standardization, feature selection, etc.) - and similar :ref:`data transformations ` similarly should - be learnt from a training set and applied to held-out data for prediction:: + Just as it is important to test a predictor on data held-out from + training, preprocessing (such as standardization, feature selection, etc.) + and similar :ref:`data transformations ` similarly should + be learnt from a training set and applied to held-out data for prediction:: - >>> from sklearn import preprocessing - >>> X_train, X_test, y_train, y_test = train_test_split( - ... X, y, test_size=0.4, random_state=0) - >>> scaler = preprocessing.StandardScaler().fit(X_train) - >>> X_train_transformed = scaler.transform(X_train) - >>> clf = svm.SVC(C=1).fit(X_train_transformed, y_train) - >>> X_test_transformed = scaler.transform(X_test) - >>> clf.score(X_test_transformed, y_test) - 0.9333... + >>> from sklearn import preprocessing + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, test_size=0.4, random_state=0) + >>> scaler = preprocessing.StandardScaler().fit(X_train) + >>> X_train_transformed = scaler.transform(X_train) + >>> clf = svm.SVC(C=1).fit(X_train_transformed, y_train) + >>> X_test_transformed = scaler.transform(X_test) + >>> clf.score(X_test_transformed, y_test) + 0.9333 - A :class:`Pipeline ` makes it easier to compose - estimators, providing this behavior under cross-validation:: + A :class:`Pipeline ` makes it easier to compose + estimators, providing this behavior under cross-validation:: - >>> from sklearn.pipeline import make_pipeline - >>> clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1)) - >>> cross_val_score(clf, X, y, cv=cv) - array([0.977..., 0.933..., 0.955..., 0.933..., 0.977...]) + >>> from sklearn.pipeline import make_pipeline + >>> clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1)) + >>> cross_val_score(clf, X, y, cv=cv) + array([0.977, 0.933, 0.955, 0.933, 0.977]) - See :ref:`combining_estimators`. + See :ref:`combining_estimators`. .. _multimetric_cross_validation: @@ -208,8 +209,8 @@ two ways: - It allows specifying multiple metrics for evaluation. - It returns a dict containing fit-times, score-times - (and optionally training scores as well as fitted estimators) in - addition to the test score. + (and optionally training scores, fitted estimators, train-test split indices) + in addition to the test score. For single metric evaluation, where the scoring parameter is a string, callable or None, the keys will be - ``['test_score', 'fit_time', 'score_time']`` @@ -220,10 +221,10 @@ following keys - ``return_train_score`` is set to ``False`` by default to save computation time. To evaluate the scores on the training set as well you need to set it to -``True``. - -You may also retain the estimator fitted on each training set by setting -``return_estimator=True``. +``True``. You may also retain the estimator fitted on each training set by +setting ``return_estimator=True``. Similarly, you may set +`return_indices=True` to retain the training and testing indices used to split +the dataset into train and test sets for each cv split. 
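+
+As a minimal sketch (assuming a scikit-learn version recent enough to support
+``return_indices``), these options simply add extra keys to the returned
+dictionary::
+
+    >>> from sklearn.model_selection import cross_validate
+    >>> cv_results = cross_validate(clf, X, y, cv=5,
+    ...                             return_estimator=True, return_indices=True)
+    >>> sorted(cv_results.keys())
+    ['estimator', 'fit_time', 'indices', 'score_time', 'test_score']
+    >>> len(cv_results['estimator'])            # one fitted estimator per split
+    5
+    >>> len(cv_results['indices']['train'])     # one array of train indices per split
+    5
+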
The multiple metrics can be specified either as a list, tuple or set of predefined scorer names:: @@ -236,7 +237,7 @@ predefined scorer names:: >>> sorted(scores.keys()) ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro'] >>> scores['test_recall_macro'] - array([0.96..., 1. ..., 0.96..., 0.96..., 1. ]) + array([0.96, 1., 0.96, 0.96, 1.]) Or as a dict mapping scorer name to a predefined or custom scoring function:: @@ -249,7 +250,7 @@ Or as a dict mapping scorer name to a predefined or custom scoring function:: ['fit_time', 'score_time', 'test_prec_macro', 'test_rec_macro', 'train_prec_macro', 'train_rec_macro'] >>> scores['train_rec_macro'] - array([0.97..., 0.97..., 0.99..., 0.98..., 0.98...]) + array([0.97, 0.97, 0.99, 0.98, 0.98]) Here is an example of ``cross_validate`` using a single metric:: @@ -278,7 +279,7 @@ can be used (otherwise, an exception is raised). over cross-validation folds, whereas :func:`cross_val_predict` simply returns the labels (or probabilities) from several distinct models undistinguished. Thus, :func:`cross_val_predict` is not an appropriate - measure of generalisation error. + measure of generalization error. The function :func:`cross_val_predict` is appropriate for: @@ -290,14 +291,14 @@ The function :func:`cross_val_predict` is appropriate for: The available cross validation iterators are introduced in the following section. -.. topic:: Examples +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_model_selection_plot_roc_crossval.py`, - * :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_with_cross_validation.py`, - * :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_digits.py`, - * :ref:`sphx_glr_auto_examples_model_selection_grid_search_text_feature_extraction.py`, - * :ref:`sphx_glr_auto_examples_model_selection_plot_cv_predict.py`, - * :ref:`sphx_glr_auto_examples_model_selection_plot_nested_cross_validation_iris.py`. +* :ref:`sphx_glr_auto_examples_model_selection_plot_roc_crossval.py`, +* :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_with_cross_validation.py`, +* :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_digits.py`, +* :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_text_feature_extraction.py`, +* :ref:`sphx_glr_auto_examples_model_selection_plot_cv_predict.py`, +* :ref:`sphx_glr_auto_examples_model_selection_plot_nested_cross_validation_iris.py`. Cross validation iterators ========================== @@ -353,7 +354,7 @@ Example of 2-fold cross-validation on a dataset with 4 samples:: Here is a visualization of the cross-validation behavior. Note that :class:`KFold` is not affected by classes or groups. -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_004.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_006.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center :scale: 75% @@ -371,8 +372,7 @@ Thus, one can create the training/test sets using numpy indexing:: Repeated K-Fold ^^^^^^^^^^^^^^^ -:class:`RepeatedKFold` repeats K-Fold n times. It can be used when one -requires to run :class:`KFold` n times, producing different splits in +:class:`RepeatedKFold` repeats :class:`KFold` :math:`n` times, producing different splits in each repetition. 
Example of 2-fold K-Fold repeated 2 times:: @@ -391,7 +391,7 @@ Example of 2-fold K-Fold repeated 2 times:: [1 3] [0 2] -Similarly, :class:`RepeatedStratifiedKFold` repeats Stratified K-Fold n times +Similarly, :class:`RepeatedStratifiedKFold` repeats :class:`StratifiedKFold` :math:`n` times with different randomization in each repetition. .. _leave_one_out: @@ -402,7 +402,7 @@ Leave One Out (LOO) :class:`LeaveOneOut` (or LOO) is a simple cross-validation. Each learning set is created by taking all the samples except one, the test set being the sample left out. Thus, for :math:`n` samples, we have :math:`n` different -training sets and :math:`n` different tests set. This cross-validation +training sets and :math:`n` different test sets. This cross-validation procedure does not waste much data as only one sample is removed from the training set:: @@ -433,25 +433,24 @@ folds are virtually identical to each other and to the model built from the entire training set. However, if the learning curve is steep for the training size in question, -then 5- or 10- fold cross validation can overestimate the generalization error. +then 5 or 10-fold cross validation can overestimate the generalization error. -As a general rule, most authors, and empirical evidence, suggest that 5- or 10- -fold cross validation should be preferred to LOO. +As a general rule, most authors and empirical evidence suggest that 5 or 10-fold +cross validation should be preferred to LOO. +.. dropdown:: References -.. topic:: References: - - * ``_; - * T. Hastie, R. Tibshirani, J. Friedman, `The Elements of Statistical Learning - `_, Springer 2009 - * L. Breiman, P. Spector `Submodel selection and evaluation in regression: The X-random case - `_, International Statistical Review 1992; - * R. Kohavi, `A Study of Cross-Validation and Bootstrap for Accuracy Estimation and Model Selection - `_, Intl. Jnt. Conf. AI - * R. Bharat Rao, G. Fung, R. Rosales, `On the Dangers of Cross-Validation. An Experimental Evaluation - `_, SIAM 2008; - * G. James, D. Witten, T. Hastie, R Tibshirani, `An Introduction to - Statistical Learning `_, Springer 2013. + * ``_; + * T. Hastie, R. Tibshirani, J. Friedman, `The Elements of Statistical Learning + `_, Springer 2009 + * L. Breiman, P. Spector `Submodel selection and evaluation in regression: The X-random case + `_, International Statistical Review 1992; + * R. Kohavi, `A Study of Cross-Validation and Bootstrap for Accuracy Estimation and Model Selection + `_, Intl. Jnt. Conf. AI + * R. Bharat Rao, G. Fung, R. Rosales, `On the Dangers of Cross-Validation. An Experimental Evaluation + `_, SIAM 2008; + * G. James, D. Witten, T. Hastie, R. Tibshirani, `An Introduction to + Statistical Learning `_, Springer 2013. .. _leave_p_out: @@ -509,7 +508,7 @@ Here is a usage example:: Here is a visualization of the cross-validation behavior. Note that :class:`ShuffleSplit` is not affected by classes or groups. -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_006.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_008.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center :scale: 75% @@ -520,22 +519,43 @@ the proportion of samples on each side of the train / test split. .. _stratification: -Cross-validation iterators with stratification based on class labels. 
---------------------------------------------------------------------- +Cross-validation iterators with stratification based on class labels +-------------------------------------------------------------------- + +Some classification tasks can naturally exhibit rare classes: for instance, +there could be orders of magnitude more negative observations than positive +observations (e.g. medical screening, fraud detection, etc). As a result, +cross-validation splitting can generate train or validation folds without any +occurrence of a particular class. This typically leads to undefined +classification metrics (e.g. ROC AUC), exceptions raised when attempting to +call :term:`fit` or missing columns in the output of the `predict_proba` or +`decision_function` methods of multiclass classifiers trained on different +folds. -Some classification problems can exhibit a large imbalance in the distribution -of the target classes: for instance there could be several times more negative -samples than positive samples. In such cases it is recommended to use -stratified sampling as implemented in :class:`StratifiedKFold` and -:class:`StratifiedShuffleSplit` to ensure that relative class frequencies is -approximately preserved in each train and validation fold. +To mitigate such problems, splitters such as :class:`StratifiedKFold` and +:class:`StratifiedShuffleSplit` implement stratified sampling to ensure that +relative class frequencies are approximately preserved in each fold. + +.. note:: + + Stratified sampling was introduced in scikit-learn to workaround the + aforementioned engineering problems rather than solve a statistical one. + + Stratification makes cross-validation folds more homogeneous, and as a result + hides some of the variability inherent to fitting models with a limited + number of observations. + + As a result, stratification can artificially shrink the spread of the metric + measured across cross-validation iterations: the inter-fold variability does + no longer reflect the uncertainty in the performance of classifiers in the + presence of rare classes. .. _stratified_k_fold: -Stratified k-fold +Stratified K-fold ^^^^^^^^^^^^^^^^^ -:class:`StratifiedKFold` is a variation of *k-fold* which returns *stratified* +:class:`StratifiedKFold` is a variation of *K-fold* which returns *stratified* folds: each set contains approximately the same percentage of samples of each target class as the complete set. @@ -562,11 +582,11 @@ two unbalanced classes. We show the number of samples in each class and compare train - [34] | test - [11 5] We can see that :class:`StratifiedKFold` preserves the class ratios -(approximately 1 / 10) in both train and test dataset. +(approximately 1 / 10) in both train and test datasets. Here is a visualization of the cross-validation behavior. -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_007.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_009.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center :scale: 75% @@ -580,22 +600,35 @@ Stratified Shuffle Split ^^^^^^^^^^^^^^^^^^^^^^^^ :class:`StratifiedShuffleSplit` is a variation of *ShuffleSplit*, which returns -stratified splits, *i.e* which creates splits by preserving the same +stratified splits, *i.e.* which creates splits by preserving the same percentage for each target class as in the complete set. Here is a visualization of the cross-validation behavior. -.. 
figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_009.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_012.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center :scale: 75% +.. _predefined_split: + +Predefined fold-splits / Validation-sets +---------------------------------------- + +For some datasets, a pre-defined split of the data into training- and +validation fold or into several cross-validation folds already +exists. Using :class:`PredefinedSplit` it is possible to use these folds +e.g. when searching for hyperparameters. + +For example, when using a validation set, set the ``test_fold`` to 0 for all +samples that are part of the validation set, and to -1 for all other samples. + .. _group_cv: -Cross-validation iterators for grouped data. --------------------------------------------- +Cross-validation iterators for grouped data +------------------------------------------- -The i.i.d. assumption is broken if the underlying generative process yield +The i.i.d. assumption is broken if the underlying generative process yields groups of dependent samples. Such a grouping of data is domain specific. An example would be when there is @@ -614,10 +647,10 @@ parameter. .. _group_k_fold: -Group k-fold +Group K-fold ^^^^^^^^^^^^ -:class:`GroupKFold` is a variation of k-fold which ensures that the same group is +:class:`GroupKFold` is a variation of K-fold which ensures that the same group is not represented in both testing and training sets. For example if the data is obtained from different subjects with several samples per-subject and if the model is flexible enough to learn from highly person specific features it @@ -641,10 +674,71 @@ Imagine you have three subjects, each with an associated number from 1 to 3:: Each subject is in a different testing fold, and the same subject is never in both testing and training. Notice that the folds do not have exactly the same -size due to the imbalance in the data. +size due to the imbalance in the data. If class proportions must be balanced +across folds, :class:`StratifiedGroupKFold` is a better option. Here is a visualization of the cross-validation behavior. +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_007.png + :target: ../auto_examples/model_selection/plot_cv_indices.html + :align: center + :scale: 75% + +Similar to :class:`KFold`, the test sets from :class:`GroupKFold` will form a +complete partition of all the data. + +While :class:`GroupKFold` attempts to place the same number of samples in each +fold when ``shuffle=False``, when ``shuffle=True`` it attempts to place an equal +number of distinct groups in each fold (but does not account for group sizes). + +.. _stratified_group_k_fold: + +StratifiedGroupKFold +^^^^^^^^^^^^^^^^^^^^ + +:class:`StratifiedGroupKFold` is a cross-validation scheme that combines both +:class:`StratifiedKFold` and :class:`GroupKFold`. The idea is to try to +preserve the distribution of classes in each split while keeping each group +within a single split. That might be useful when you have an unbalanced +dataset so that using just :class:`GroupKFold` might produce skewed splits. + +Example:: + + >>> from sklearn.model_selection import StratifiedGroupKFold + >>> X = list(range(18)) + >>> y = [1] * 6 + [0] * 12 + >>> groups = [1, 2, 3, 3, 4, 4, 1, 1, 2, 2, 3, 4, 5, 5, 5, 6, 6, 6] + >>> sgkf = StratifiedGroupKFold(n_splits=3) + >>> for train, test in sgkf.split(X, y, groups=groups): + ... 
print("%s %s" % (train, test)) + [ 0 2 3 4 5 6 7 10 11 15 16 17] [ 1 8 9 12 13 14] + [ 0 1 4 5 6 7 8 9 11 12 13 14] [ 2 3 10 15 16 17] + [ 1 2 3 8 9 10 12 13 14 15 16 17] [ 0 4 5 6 7 11] + +.. dropdown:: Implementation notes + + - With the current implementation full shuffle is not possible in most + scenarios. When shuffle=True, the following happens: + + 1. All groups are shuffled. + 2. Groups are sorted by standard deviation of classes using stable sort. + 3. Sorted groups are iterated over and assigned to folds. + + That means that only groups with the same standard deviation of class + distribution will be shuffled, which might be useful when each group has only + a single class. + - The algorithm greedily assigns each group to one of n_splits test sets, + choosing the test set that minimises the variance in class distribution + across test sets. Group assignment proceeds from groups with highest to + lowest variance in class frequency, i.e. large groups peaked on one or few + classes are assigned first. + - This split is suboptimal in a sense that it might produce imbalanced splits + even if perfect stratification is possible. If you have relatively close + distribution of classes in each group, using :class:`GroupKFold` is better. + + +Here is a visualization of cross-validation behavior for uneven groups: + .. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_005.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center @@ -655,13 +749,14 @@ Here is a visualization of the cross-validation behavior. Leave One Group Out ^^^^^^^^^^^^^^^^^^^ -:class:`LeaveOneGroupOut` is a cross-validation scheme which holds out -the samples according to a third-party provided array of integer groups. This -group information can be used to encode arbitrary domain specific pre-defined -cross-validation folds. +:class:`LeaveOneGroupOut` is a cross-validation scheme where each split holds +out samples belonging to one specific group. Group information is +provided via an array that encodes the group of each sample. Each training set is thus constituted by all the samples except the ones -related to a specific group. +related to a specific group. This is the same as :class:`LeavePGroupsOut` with +`n_groups=1` and the same as :class:`GroupKFold` with `n_splits` equal to the +number of unique labels passed to the `groups` parameter. For example, in the cases of multiple experiments, :class:`LeaveOneGroupOut` can be used to create a cross-validation based on the different experiments: @@ -688,8 +783,10 @@ for cross-validation against time-based splits. Leave P Groups Out ^^^^^^^^^^^^^^^^^^ -:class:`LeavePGroupsOut` is similar as :class:`LeaveOneGroupOut`, but removes -samples related to :math:`P` groups for each training/test set. +:class:`LeavePGroupsOut` is similar to :class:`LeaveOneGroupOut`, but removes +samples related to :math:`P` groups for each training/test set. All possible +combinations of :math:`P` groups are left out, meaning test sets will overlap +for :math:`P>1`. Example of Leave-2-Group Out:: @@ -713,7 +810,8 @@ Group Shuffle Split The :class:`GroupShuffleSplit` iterator behaves as a combination of :class:`ShuffleSplit` and :class:`LeavePGroupsOut`, and generates a sequence of randomized partitions in which a subset of groups are held -out for each split. +out for each split. Each train/test split is performed independently meaning +there is no guaranteed relationship between successive test sets. 
Here is a usage example:: @@ -733,7 +831,7 @@ Here is a usage example:: Here is a visualization of the cross-validation behavior. -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_008.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_011.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center :scale: 75% @@ -745,23 +843,10 @@ expensive. In such a scenario, :class:`GroupShuffleSplit` provides a random sample (with replacement) of the train / test splits generated by :class:`LeavePGroupsOut`. -.. _predefined_split: - -Predefined Fold-Splits / Validation-Sets ----------------------------------------- - -For some datasets, a pre-defined split of the data into training- and -validation fold or into several cross-validation folds already -exists. Using :class:`PredefinedSplit` it is possible to use these folds -e.g. when searching for hyperparameters. - -For example, when using a validation set, set the ``test_fold`` to 0 for all -samples that are part of the validation set, and to -1 for all other samples. - Using cross-validation iterators to split train and test -------------------------------------------------------- -The above group cross-validation functions may also be useful for spitting a +The above group cross-validation functions may also be useful for splitting a dataset into training and testing subsets. Note that the convenience function :func:`train_test_split` is a wrapper around :func:`ShuffleSplit` and thus only allows for stratified splitting (using the class labels) @@ -792,13 +877,13 @@ cross-validation splitter. For example:: Cross validation of time series data ------------------------------------ -Time series data is characterised by the correlation between observations +Time series data is characterized by the correlation between observations that are near in time (*autocorrelation*). However, classical cross-validation techniques such as :class:`KFold` and :class:`ShuffleSplit` assume the samples are independent and identically distributed, and would result in unreasonable correlation between training and testing instances (yielding poor estimates of -generalisation error) on time series data. Therefore, it is very important +generalization error) on time series data. Therefore, it is very important to evaluate our model for time series data on the "future" observations least like those that are used to train the model. To achieve this, one solution is provided by :class:`TimeSeriesSplit`. @@ -816,7 +901,8 @@ Also, it adds all surplus data to the first training partition, which is always used to train the model. This class can be used to cross-validate time series data samples -that are observed at fixed time intervals. +that are observed at fixed time intervals. Indeed, the folds must +represent the same duration, in order to have comparable metrics across folds. Example of 3-split time series cross-validation on a dataset with 6 samples:: @@ -835,7 +921,7 @@ Example of 3-split time series cross-validation on a dataset with 6 samples:: Here is a visualization of the cross-validation behavior. -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_010.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_013.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center :scale: 75% @@ -844,8 +930,8 @@ A note on shuffling =================== If the data ordering is not arbitrary (e.g. 
samples with the same class label -are contiguous), shuffling it first may be essential to get a meaningful cross- -validation result. However, the opposite may be true if the samples are not +are contiguous), shuffling it first may be essential to get a meaningful +cross-validation result. However, the opposite may be true if the samples are not independently and identically distributed. For example, if samples correspond to news articles, and are ordered by their time of publication, then shuffling the data will likely lead to a model that is overfit and an inflated validation @@ -856,8 +942,8 @@ Some cross validation iterators, such as :class:`KFold`, have an inbuilt option to shuffle the data indices before splitting them. Note that: * This consumes less memory than shuffling the data directly. -* By default no shuffling occurs, including for the (stratified) K fold cross- - validation performed by specifying ``cv=some_integer`` to +* By default no shuffling occurs, including for the (stratified) K fold + cross-validation performed by specifying ``cv=some_integer`` to :func:`cross_val_score`, grid search, etc. Keep in mind that :func:`train_test_split` still returns a random split. * The ``random_state`` parameter defaults to ``None``, meaning that the @@ -882,56 +968,59 @@ Permutation test score ====================== :func:`~sklearn.model_selection.permutation_test_score` offers another way -to evaluate the performance of classifiers. It provides a permutation-based -p-value, which represents how likely an observed performance of the -classifier would be obtained by chance. The null hypothesis in this test is -that the classifier fails to leverage any statistical dependency between the -features and the labels to make correct predictions on left out data. +to evaluate the performance of a :term:`predictor`. It provides a +permutation-based p-value, which represents how likely an observed performance of the +estimator would be obtained by chance. The null hypothesis in this test is +that the estimator fails to leverage any statistical dependency between the +features and the targets to make correct predictions on left-out data. :func:`~sklearn.model_selection.permutation_test_score` generates a null distribution by calculating `n_permutations` different permutations of the -data. In each permutation the labels are randomly shuffled, thereby removing -any dependency between the features and the labels. The p-value output -is the fraction of permutations for which the average cross-validation score -obtained by the model is better than the cross-validation score obtained by -the model using the original data. For reliable results ``n_permutations`` -should typically be larger than 100 and ``cv`` between 3-10 folds. - -A low p-value provides evidence that the dataset contains real dependency -between features and labels and the classifier was able to utilize this -to obtain good results. A high p-value could be due to a lack of dependency -between features and labels (there is no difference in feature values between -the classes) or because the classifier was not able to use the dependency in -the data. In the latter case, using a more appropriate classifier that -is able to utilize the structure in the data, would result in a low -p-value. - -Cross-validation provides information about how well a classifier generalizes, -specifically the range of expected errors of the classifier. However, a -classifier trained on a high dimensional dataset with no structure may still +data. 
In each permutation the target values are randomly shuffled, thereby removing +any dependency between the features and the targets. The p-value output is the fraction +of permutations whose cross-validation score is better or equal than the true score +without permuting targets. For reliable results ``n_permutations`` should typically be +larger than 100 and ``cv`` between 3-10 folds. + +A low p-value provides evidence that the dataset contains some real dependency between +features and targets **and** that the estimator was able to utilize this dependency to +obtain good results. A high p-value, in reverse, could be due to either one of these: + +- a lack of dependency between features and targets (i.e., there is no systematic + relationship and any observed patterns are likely due to random chance) +- **or** because the estimator was not able to use the dependency in the data (for + instance because it underfit). + +In the latter case, using a more appropriate estimator that is able to use the +structure in the data, would result in a lower p-value. + +Cross-validation provides information about how well an estimator generalizes +by estimating the range of its expected scores. However, an +estimator trained on a high dimensional dataset with no structure may still perform better than expected on cross-validation, just by chance. This can typically happen with small datasets with less than a few hundred samples. :func:`~sklearn.model_selection.permutation_test_score` provides information -on whether the classifier has found a real class structure and can help in -evaluating the performance of the classifier. +on whether the estimator has found a real dependency between features and targets and +can help in evaluating the performance of the estimator. It is important to note that this test has been shown to produce low p-values even if there is only weak structure in the data because in the corresponding permutated datasets there is absolutely no structure. This -test is therefore only able to show when the model reliably outperforms +test is therefore only able to show whether the model reliably outperforms random guessing. Finally, :func:`~sklearn.model_selection.permutation_test_score` is computed -using brute force and interally fits ``(n_permutations + 1) * n_cv`` models. +using brute force and internally fits ``(n_permutations + 1) * n_cv`` models. It is therefore only tractable with small datasets for which fitting an -individual model is very fast. +individual model is very fast. Using the `n_jobs` parameter parallelizes the +computation and thus speeds it up. -.. topic:: Examples +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_feature_selection_plot_permutation_test_for_classification.py` +* :ref:`sphx_glr_auto_examples_model_selection_plot_permutation_tests_for_classification.py` -.. topic:: References: +.. dropdown:: References - * Ojala and Garriga. `Permutation Tests for Studying Classifier Performance - `_. - J. Mach. Learn. Res. 2010. + * Ojala and Garriga. `Permutation Tests for Studying Classifier Performance + `_. + J. Mach. Learn. Res. 2010. diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst index 7e8e79d9d8bdd..24fcd43a292c0 100644 --- a/doc/modules/decomposition.rst +++ b/doc/modules/decomposition.rst @@ -51,10 +51,11 @@ data based on the amount of variance it explains. As such it implements a :scale: 75% -.. topic:: Examples: +.. 
rubric:: Examples - * :ref:`sphx_glr_auto_examples_decomposition_plot_pca_vs_lda.py` - * :ref:`sphx_glr_auto_examples_decomposition_plot_pca_vs_fa_model_selection.py` +* :ref:`sphx_glr_auto_examples_decomposition_plot_pca_iris.py` +* :ref:`sphx_glr_auto_examples_decomposition_plot_pca_vs_lda.py` +* :ref:`sphx_glr_auto_examples_decomposition_plot_pca_vs_fa_model_selection.py` .. _IncrementalPCA: @@ -71,14 +72,14 @@ exactly match the results of :class:`PCA` while processing the data in a minibatch fashion. :class:`IncrementalPCA` makes it possible to implement out-of-core Principal Component Analysis either by: - * Using its ``partial_fit`` method on chunks of data fetched sequentially - from the local hard drive or a network database. +* Using its ``partial_fit`` method on chunks of data fetched sequentially + from the local hard drive or a network database. - * Calling its fit method on a sparse matrix or a memory mapped file using - ``numpy.memmap``. +* Calling its fit method on a memory mapped file using + ``numpy.memmap``. :class:`IncrementalPCA` only stores estimates of component and noise variances, -in order update ``explained_variance_ratio_`` incrementally. This is why +in order to update ``explained_variance_ratio_`` incrementally. This is why memory usage depends on the number of samples per batch, rather than the number of samples to be processed in the dataset. @@ -96,9 +97,9 @@ input data for each feature before applying the SVD. :scale: 75% -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_decomposition_plot_incremental_pca.py` +* :ref:`sphx_glr_auto_examples_decomposition_plot_incremental_pca.py` .. _RandomizedPCA: @@ -119,7 +120,7 @@ pictures of human faces look somewhat alike. The samples lie on a manifold of much lower dimension (say around 200 for instance). The PCA algorithm can be used to linearly transform the data while both reducing the dimensionality -and preserve most of the explained variance at the same time. +and preserving most of the explained variance at the same time. The class :class:`PCA` used with the optional parameter ``svd_solver='randomized'`` is very useful in that case: since we are going @@ -159,39 +160,20 @@ Note: the implementation of ``inverse_transform`` in :class:`PCA` with ``transform`` even when ``whiten=False`` (default). -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_applications_plot_face_recognition.py` - * :ref:`sphx_glr_auto_examples_decomposition_plot_faces_decomposition.py` +* :ref:`sphx_glr_auto_examples_applications_plot_face_recognition.py` +* :ref:`sphx_glr_auto_examples_decomposition_plot_faces_decomposition.py` -.. topic:: References: +.. rubric:: References - * `"Finding structure with randomness: Stochastic algorithms for - constructing approximate matrix decompositions" - `_ - Halko, et al., 2009 - - -.. _kernel_PCA: - -Kernel PCA ----------- - -:class:`KernelPCA` is an extension of PCA which achieves non-linear -dimensionality reduction through the use of kernels (see :ref:`metrics`). It -has many applications including denoising, compression and structured -prediction (kernel dependency estimation). :class:`KernelPCA` supports both -``transform`` and ``inverse_transform``. - -.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_001.png - :target: ../auto_examples/decomposition/plot_kernel_pca.html - :align: center - :scale: 75% - -.. 
topic:: Examples: - - * :ref:`sphx_glr_auto_examples_decomposition_plot_kernel_pca.py` +* Algorithm 4.3 in + :arxiv:`"Finding structure with randomness: Stochastic algorithms for + constructing approximate matrix decompositions" <0909.4061>` + Halko, et al., 2009 +* :arxiv:`"An implementation of a randomized algorithm for principal component + analysis" <1412.3510>` A. Szlam et al. 2014 .. _SparsePCA: @@ -215,7 +197,7 @@ the real underlying components can be more naturally imagined as sparse vectors; for example in face recognition, components might naturally map to parts of faces. -Sparse principal components yields a more parsimonious, interpretable +Sparse principal components yield a more parsimonious, interpretable representation, clearly emphasizing which of the original features contribute to the differences between samples. @@ -246,12 +228,14 @@ problem solved is a PCA problem (dictionary learning) with an .. math:: (U^*, V^*) = \underset{U, V}{\operatorname{arg\,min\,}} & \frac{1}{2} - ||X-UV||_2^2+\alpha||V||_1 \\ - \text{subject to } & ||U_k||_2 = 1 \text{ for all } + ||X-UV||_{\text{Fro}}^2+\alpha||V||_{1,1} \\ + \text{subject to } & ||U_k||_2 \leq 1 \text{ for all } 0 \leq k < n_{components} - -The sparsity-inducing :math:`\ell_1` norm also prevents learning +:math:`||.||_{\text{Fro}}` stands for the Frobenius norm and :math:`||.||_{1,1}` +stands for the entry-wise matrix norm which is the sum of the absolute values +of all the entries in the matrix. +The sparsity-inducing :math:`||.||_{1,1}` matrix norm also prevents learning components from noise when few training samples are available. The degree of penalization (and thus sparsity) can be adjusted through the hyperparameter ``alpha``. Small values lead to a gently regularized @@ -264,98 +248,206 @@ factorization, while larger values shrink many coefficients to zero. the algorithm is online along the features direction, not the samples direction. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_decomposition_plot_faces_decomposition.py` +* :ref:`sphx_glr_auto_examples_decomposition_plot_faces_decomposition.py` -.. topic:: References: +.. rubric:: References - .. [Mrl09] `"Online Dictionary Learning for Sparse Coding" - `_ - J. Mairal, F. Bach, J. Ponce, G. Sapiro, 2009 - .. [Jen09] `"Structured Sparse Principal Component Analysis" - `_ - R. Jenatton, G. Obozinski, F. Bach, 2009 +.. [Mrl09] `"Online Dictionary Learning for Sparse Coding" + `_ + J. Mairal, F. Bach, J. Ponce, G. Sapiro, 2009 +.. [Jen09] `"Structured Sparse Principal Component Analysis" + `_ + R. Jenatton, G. Obozinski, F. Bach, 2009 -.. _LSA: +.. _kernel_PCA: -Truncated singular value decomposition and latent semantic analysis -=================================================================== +Kernel Principal Component Analysis (kPCA) +========================================== -:class:`TruncatedSVD` implements a variant of singular value decomposition -(SVD) that only computes the :math:`k` largest singular values, -where :math:`k` is a user-specified parameter. +Exact Kernel PCA +---------------- -When truncated SVD is applied to term-document matrices -(as returned by :class:`~sklearn.feature_extraction.text.CountVectorizer` or -:class:`~sklearn.feature_extraction.text.TfidfVectorizer`), -this transformation is known as -`latent semantic analysis `_ -(LSA), because it transforms such matrices -to a "semantic" space of low dimensionality. 
-In particular, LSA is known to combat the effects of synonymy and polysemy -(both of which roughly mean there are multiple meanings per word), -which cause term-document matrices to be overly sparse -and exhibit poor similarity under measures such as cosine similarity. +:class:`KernelPCA` is an extension of PCA which achieves non-linear +dimensionality reduction through the use of kernels (see :ref:`metrics`) [Scholkopf1997]_. It +has many applications including denoising, compression and structured +prediction (kernel dependency estimation). :class:`KernelPCA` supports both +``transform`` and ``inverse_transform``. + +.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_002.png + :target: ../auto_examples/decomposition/plot_kernel_pca.html + :align: center + :scale: 75% .. note:: - LSA is also known as latent semantic indexing, LSI, - though strictly that refers to its use in persistent indexes - for information retrieval purposes. + :meth:`KernelPCA.inverse_transform` relies on a kernel ridge to learn the + function mapping samples from the PCA basis into the original feature + space [Bakir2003]_. Thus, the reconstruction obtained with + :meth:`KernelPCA.inverse_transform` is an approximation. See the example + linked below for more details. -Mathematically, truncated SVD applied to training samples :math:`X` -produces a low-rank approximation :math:`X`: +.. rubric:: Examples -.. math:: - X \approx X_k = U_k \Sigma_k V_k^\top +* :ref:`sphx_glr_auto_examples_decomposition_plot_kernel_pca.py` +* :ref:`sphx_glr_auto_examples_applications_plot_digits_denoising.py` -After this operation, :math:`U_k \Sigma_k^\top` -is the transformed training set with :math:`k` features -(called ``n_components`` in the API). +.. rubric:: References -To also transform a test set :math:`X`, we multiply it with :math:`V_k`: +.. [Scholkopf1997] SchÃļlkopf, Bernhard, Alexander Smola, and Klaus-Robert MÃŧller. + `"Kernel principal component analysis." + `_ + International conference on artificial neural networks. + Springer, Berlin, Heidelberg, 1997. -.. math:: - X' = X V_k +.. [Bakir2003] BakÄąr, GÃļkhan H., Jason Weston, and Bernhard SchÃļlkopf. + `"Learning to find pre-images." + `_ + Advances in neural information processing systems 16 (2003): 449-456. -.. note:: - Most treatments of LSA in the natural language processing (NLP) - and information retrieval (IR) literature - swap the axes of the matrix :math:`X` so that it has shape - ``n_features`` × ``n_samples``. - We present LSA in a different way that matches the scikit-learn API better, - but the singular values found are the same. +.. _kPCA_Solvers: + +Choice of solver for Kernel PCA +------------------------------- + +While in :class:`PCA` the number of components is bounded by the number of +features, in :class:`KernelPCA` the number of components is bounded by the +number of samples. Many real-world datasets have large number of samples! In +these cases finding *all* the components with a full kPCA is a waste of +computation time, as data is mostly described by the first few components +(e.g. ``n_components<=100``). In other words, the centered Gram matrix that +is eigendecomposed in the Kernel PCA fitting process has an effective rank that +is much smaller than its size. This is a situation where approximate +eigensolvers can provide speedup with very low precision loss. + + +.. 
dropdown:: Eigensolvers + + The optional parameter ``eigen_solver='randomized'`` can be used to + *significantly* reduce the computation time when the number of requested + ``n_components`` is small compared with the number of samples. It relies on + randomized decomposition methods to find an approximate solution in a shorter + time. + + The time complexity of the randomized :class:`KernelPCA` is + :math:`O(n_{\mathrm{samples}}^2 \cdot n_{\mathrm{components}})` + instead of :math:`O(n_{\mathrm{samples}}^3)` for the exact method + implemented with ``eigen_solver='dense'``. + + The memory footprint of randomized :class:`KernelPCA` is also proportional to + :math:`2 \cdot n_{\mathrm{samples}} \cdot n_{\mathrm{components}}` instead of + :math:`n_{\mathrm{samples}}^2` for the exact method. + + Note: this technique is the same as in :ref:`RandomizedPCA`. + + In addition to the above two solvers, ``eigen_solver='arpack'`` can be used as + an alternate way to get an approximate decomposition. In practice, this method + only provides reasonable execution times when the number of components to find + is extremely small. It is enabled by default when the desired number of + components is less than 10 (strict) and the number of samples is more than 200 + (strict). See :class:`KernelPCA` for details. + + .. rubric:: References + + * *dense* solver: + `scipy.linalg.eigh documentation + `_ + + * *randomized* solver: + + * Algorithm 4.3 in + :arxiv:`"Finding structure with randomness: Stochastic + algorithms for constructing approximate matrix decompositions" <0909.4061>` + Halko, et al. (2009) + + * :arxiv:`"An implementation of a randomized algorithm + for principal component analysis" <1412.3510>` + A. Szlam et al. (2014) + + * *arpack* solver: + `scipy.sparse.linalg.eigsh documentation + `_ + R. B. Lehoucq, D. C. Sorensen, and C. Yang, (1998) + + +.. _LSA: + +Truncated singular value decomposition and latent semantic analysis +=================================================================== + +:class:`TruncatedSVD` implements a variant of singular value decomposition +(SVD) that only computes the :math:`k` largest singular values, +where :math:`k` is a user-specified parameter. :class:`TruncatedSVD` is very similar to :class:`PCA`, but differs in that the matrix :math:`X` does not need to be centered. When the columnwise (per-feature) means of :math:`X` are subtracted from the feature values, truncated SVD on the resulting matrix is equivalent to PCA. -In practical terms, this means -that the :class:`TruncatedSVD` transformer accepts ``scipy.sparse`` -matrices without the need to densify them, -as densifying may fill up memory even for medium-sized document collections. -While the :class:`TruncatedSVD` transformer -works with any feature matrix, -using it on tf–idf matrices is recommended over raw frequency counts -in an LSA/document processing setting. -In particular, sublinear scaling and inverse document frequency -should be turned on (``sublinear_tf=True, use_idf=True``) -to bring the feature values closer to a Gaussian distribution, -compensating for LSA's erroneous assumptions about textual data. +.. 
dropdown:: About truncated SVD and latent semantic analysis (LSA) + + When truncated SVD is applied to term-document matrices + (as returned by :class:`~sklearn.feature_extraction.text.CountVectorizer` or + :class:`~sklearn.feature_extraction.text.TfidfVectorizer`), + this transformation is known as + `latent semantic analysis `_ + (LSA), because it transforms such matrices + to a "semantic" space of low dimensionality. + In particular, LSA is known to combat the effects of synonymy and polysemy + (both of which roughly mean there are multiple meanings per word), + which cause term-document matrices to be overly sparse + and exhibit poor similarity under measures such as cosine similarity. + + .. note:: + LSA is also known as latent semantic indexing, LSI, + though strictly that refers to its use in persistent indexes + for information retrieval purposes. + + Mathematically, truncated SVD applied to training samples :math:`X` + produces a low-rank approximation :math:`X`: + + .. math:: + X \approx X_k = U_k \Sigma_k V_k^\top -.. topic:: Examples: + After this operation, :math:`U_k \Sigma_k` + is the transformed training set with :math:`k` features + (called ``n_components`` in the API). - * :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py` + To also transform a test set :math:`X`, we multiply it with :math:`V_k`: -.. topic:: References: + .. math:: + X' = X V_k + + .. note:: + Most treatments of LSA in the natural language processing (NLP) + and information retrieval (IR) literature + swap the axes of the matrix :math:`X` so that it has shape + ``(n_features, n_samples)``. + We present LSA in a different way that matches the scikit-learn API better, + but the singular values found are the same. + + While the :class:`TruncatedSVD` transformer + works with any feature matrix, + using it on tf-idf matrices is recommended over raw frequency counts + in an LSA/document processing setting. + In particular, sublinear scaling and inverse document frequency + should be turned on (``sublinear_tf=True, use_idf=True``) + to bring the feature values closer to a Gaussian distribution, + compensating for LSA's erroneous assumptions about textual data. + +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py` + +.. rubric:: References + +* Christopher D. Manning, Prabhakar Raghavan and Hinrich SchÃŧtze (2008), + *Introduction to Information Retrieval*, Cambridge University Press, + chapter 18: `Matrix decompositions & latent semantic indexing + `_ - * Christopher D. Manning, Prabhakar Raghavan and Hinrich SchÃŧtze (2008), - *Introduction to Information Retrieval*, Cambridge University Press, - chapter 18: `Matrix decompositions & latent semantic indexing - `_ .. _DictionaryLearning: @@ -408,9 +500,9 @@ the split code is filled with the negative part of the code vector, only with a positive sign. Therefore, the split_code is non-negative. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_decomposition_plot_sparse_coding.py` +* :ref:`sphx_glr_auto_examples_decomposition_plot_sparse_coding.py` Generic dictionary learning @@ -432,8 +524,8 @@ dictionary fixed, and then updating the dictionary to best fit the sparse code. .. 
math:: (U^*, V^*) = \underset{U, V}{\operatorname{arg\,min\,}} & \frac{1}{2} - ||X-UV||_2^2+\alpha||U||_1 \\ - \text{subject to } & ||V_k||_2 = 1 \text{ for all } + ||X-UV||_{\text{Fro}}^2+\alpha||U||_{1,1} \\ + \text{subject to } & ||V_k||_2 \leq 1 \text{ for all } 0 \leq k < n_{\mathrm{atoms}} @@ -441,13 +533,15 @@ dictionary fixed, and then updating the dictionary to best fit the sparse code. :target: ../auto_examples/decomposition/plot_faces_decomposition.html :scale: 60% -.. |dict_img2| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_006.png +.. |dict_img2| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_007.png :target: ../auto_examples/decomposition/plot_faces_decomposition.html :scale: 60% .. centered:: |pca_img2| |dict_img2| - +:math:`||.||_{\text{Fro}}` stands for the Frobenius norm and :math:`||.||_{1,1}` +stands for the entry-wise matrix norm which is the sum of the absolute values +of all the entries in the matrix. After using such a procedure to fit the dictionary, the transform is simply a sparse coding step that shares the same implementation with all dictionary learning objects (see :ref:`SparseCoder`). @@ -458,19 +552,19 @@ different positivity constraints applied. Red indicates negative values, blue indicates positive values, and white represents zeros. -.. |dict_img_pos1| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_011.png +.. |dict_img_pos1| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_010.png :target: ../auto_examples/decomposition/plot_image_denoising.html :scale: 60% -.. |dict_img_pos2| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_012.png +.. |dict_img_pos2| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_011.png :target: ../auto_examples/decomposition/plot_image_denoising.html :scale: 60% -.. |dict_img_pos3| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_013.png +.. |dict_img_pos3| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_012.png :target: ../auto_examples/decomposition/plot_image_denoising.html :scale: 60% -.. |dict_img_pos4| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_014.png +.. |dict_img_pos4| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_013.png :target: ../auto_examples/decomposition/plot_image_denoising.html :scale: 60% @@ -488,16 +582,16 @@ extracted from part of the image of a raccoon face looks like. :scale: 50% -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_decomposition_plot_image_denoising.py` +* :ref:`sphx_glr_auto_examples_decomposition_plot_image_denoising.py` -.. topic:: References: +.. rubric:: References - * `"Online dictionary learning for sparse coding" - `_ - J. Mairal, F. Bach, J. Ponce, G. Sapiro, 2009 +* `"Online dictionary learning for sparse coding" + `_ + J. Mairal, F. Bach, J. Ponce, G. Sapiro, 2009 .. _MiniBatchDictionaryLearning: @@ -516,7 +610,7 @@ implement a stopping condition. The estimator also implements ``partial_fit``, which updates the dictionary by iterating only once over a mini-batch. This can be used for online learning when the data is not readily available from the start, or for when the data -does not fit into the memory. +does not fit into memory. .. currentmodule:: sklearn.cluster @@ -533,7 +627,7 @@ does not fit into the memory. 
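As a rough sketch of this online usage (with random data standing in for, say, a stream
of flattened image patches), ``partial_fit`` can be called once per incoming mini-batch::

  >>> import numpy as np
  >>> from sklearn.decomposition import MiniBatchDictionaryLearning
  >>> rng = np.random.RandomState(0)
  >>> dico = MiniBatchDictionaryLearning(n_components=15, batch_size=10,
  ...                                    random_state=0)
  >>> for _ in range(5):                     # five mini-batches arriving over time
  ...     X_batch = rng.randn(10, 64)        # 10 samples with 64 features each
  ...     dico = dico.partial_fit(X_batch)   # partial_fit returns the estimator itself
  >>> dico.components_.shape
  (15, 64)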
computationally efficient and implements on-line learning with a ``partial_fit`` method. - Example: :ref:`sphx_glr_auto_examples_cluster_plot_dict_face_patches.py` + Example: :ref:`sphx_glr_auto_examples_cluster_plot_dict_face_patches.py` .. currentmodule:: sklearn.decomposition @@ -599,7 +693,7 @@ about these components (e.g. whether they are orthogonal): :target: ../auto_examples/decomposition/plot_faces_decomposition.html :scale: 60% -.. |fa_img3| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_009.png +.. |fa_img3| image:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_008.png :target: ../auto_examples/decomposition/plot_faces_decomposition.html :scale: 60% @@ -609,7 +703,7 @@ The main advantage for Factor Analysis over :class:`PCA` is that it can model the variance in every direction of the input space independently (heteroscedastic noise): -.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_008.png +.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_faces_decomposition_009.png :target: ../auto_examples/decomposition/plot_faces_decomposition.html :align: center :scale: 75% @@ -628,10 +722,10 @@ Varimax rotation maximizes the sum of the variances of the squared loadings, i.e., it tends to produce sparser factors, which are influenced by only a few features each (the "simple structure"). See e.g., the first example below. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_decomposition_plot_varimax_fa.py` - * :ref:`sphx_glr_auto_examples_decomposition_plot_pca_vs_fa_model_selection.py` +* :ref:`sphx_glr_auto_examples_decomposition_plot_varimax_fa.py` +* :ref:`sphx_glr_auto_examples_decomposition_plot_pca_vs_fa_model_selection.py` .. _ICA: @@ -645,7 +739,7 @@ implemented in scikit-learn using the :class:`Fast ICA ` algorithm. Typically, ICA is not used for reducing dimensionality but for separating superimposed signals. Since the ICA model does not include a noise term, for the model to be correct, whitening must be applied. -This can be done internally using the whiten argument or manually using one +This can be done internally using the `whiten` argument or manually using one of the PCA variants. It is classically used to separate mixed signals (a problem known as @@ -670,11 +764,11 @@ components with some sparsity: .. centered:: |pca_img4| |ica_img4| -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_decomposition_plot_ica_blind_source_separation.py` - * :ref:`sphx_glr_auto_examples_decomposition_plot_ica_vs_pca.py` - * :ref:`sphx_glr_auto_examples_decomposition_plot_faces_decomposition.py` +* :ref:`sphx_glr_auto_examples_decomposition_plot_ica_blind_source_separation.py` +* :ref:`sphx_glr_auto_examples_decomposition_plot_ica_vs_pca.py` +* :ref:`sphx_glr_auto_examples_decomposition_plot_faces_decomposition.py` .. _NMF: @@ -718,7 +812,7 @@ faces dataset, in comparison with the PCA eigenfaces. .. centered:: |pca_img5| |nmf_img5| -The :attr:`init` attribute determines the initialization method applied, which +The `init` attribute determines the initialization method applied, which has a great impact on the performance of the method. :class:`NMF` implements the method Nonnegative Double Singular Value Decomposition. 
NNDSVD [4]_ is based on two SVD processes, one approximating the data matrix, the other approximating @@ -735,33 +829,31 @@ basic NNDSVD algorithm which introduces a lot of zeros; in this case, NNDSVDa or NNDSVDar should be preferred. :class:`NMF` can also be initialized with correctly scaled random non-negative -matrices by setting :attr:`init="random"`. An integer seed or a -``RandomState`` can also be passed to :attr:`random_state` to control +matrices by setting `init="random"`. An integer seed or a +``RandomState`` can also be passed to `random_state` to control reproducibility. -In :class:`NMF`, L1 and L2 priors can be added to the loss function in order -to regularize the model. The L2 prior uses the Frobenius norm, while the L1 -prior uses an elementwise L1 norm. As in :class:`ElasticNet`, we control the -combination of L1 and L2 with the :attr:`l1_ratio` (:math:`\rho`) parameter, -and the intensity of the regularization with the :attr:`alpha` -(:math:`\alpha`) parameter. Then the priors terms are: +In :class:`NMF`, L1 and L2 priors can be added to the loss function in order to +regularize the model. The L2 prior uses the Frobenius norm, while the L1 prior +uses an elementwise L1 norm. As in :class:`~sklearn.linear_model.ElasticNet`, +we control the combination of L1 and L2 with the `l1_ratio` (:math:`\rho`) +parameter, and the intensity of the regularization with the `alpha_W` and +`alpha_H` (:math:`\alpha_W` and :math:`\alpha_H`) parameters. The priors are +scaled by the number of samples (:math:`n\_samples`) for `H` and the number of +features (:math:`n\_features`) for `W` to keep their impact balanced with +respect to one another and to the data fit term as independent as possible of +the size of the training set. Then the priors terms are: .. math:: - \alpha \rho ||W||_1 + \alpha \rho ||H||_1 - + \frac{\alpha(1-\rho)}{2} ||W||_{\mathrm{Fro}} ^ 2 - + \frac{\alpha(1-\rho)}{2} ||H||_{\mathrm{Fro}} ^ 2 + (\alpha_W \rho ||W||_1 + \frac{\alpha_W(1-\rho)}{2} ||W||_{\mathrm{Fro}} ^ 2) * n\_features + + (\alpha_H \rho ||H||_1 + \frac{\alpha_H(1-\rho)}{2} ||H||_{\mathrm{Fro}} ^ 2) * n\_samples and the regularized objective function is: .. math:: d_{\mathrm{Fro}}(X, WH) - + \alpha \rho ||W||_1 + \alpha \rho ||H||_1 - + \frac{\alpha(1-\rho)}{2} ||W||_{\mathrm{Fro}} ^ 2 - + \frac{\alpha(1-\rho)}{2} ||H||_{\mathrm{Fro}} ^ 2 - -:class:`NMF` regularizes both W and H by default. The :attr:`regularization` -parameter allows for finer control, with which only W, only H, -or both can be regularized. + + (\alpha_W \rho ||W||_1 + \frac{\alpha_W(1-\rho)}{2} ||W||_{\mathrm{Fro}} ^ 2) * n\_features + + (\alpha_H \rho ||H||_1 + \frac{\alpha_H(1-\rho)}{2} ||H||_{\mathrm{Fro}} ^ 2) * n\_samples NMF with a beta-divergence -------------------------- @@ -785,14 +877,13 @@ Or, the Itakura-Saito (IS) divergence: d_{IS}(X, Y) = \sum_{i,j} (\frac{X_{ij}}{Y_{ij}} - \log(\frac{X_{ij}}{Y_{ij}}) - 1) These three distances are special cases of the beta-divergence family, with -:math:`\beta = 2, 1, 0` respectively [6]_. The beta-divergence are +:math:`\beta = 2, 1, 0` respectively [6]_. The beta-divergence is defined by : .. math:: d_{\beta}(X, Y) = \sum_{i,j} \frac{1}{\beta(\beta - 1)}(X_{ij}^\beta + (\beta-1)Y_{ij}^\beta - \beta X_{ij} Y_{ij}^{\beta - 1}) -.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_beta_divergence_001.png - :target: ../auto_examples/decomposition/plot_beta_divergence.html +.. 
image:: ../images/beta_divergence.png :align: center :scale: 75% @@ -800,18 +891,20 @@ Note that this definition is not valid if :math:`\beta \in (0; 1)`, yet it can be continuously extended to the definitions of :math:`d_{KL}` and :math:`d_{IS}` respectively. -:class:`NMF` implements two solvers, using Coordinate Descent ('cd') [5]_, and -Multiplicative Update ('mu') [6]_. The 'mu' solver can optimize every -beta-divergence, including of course the Frobenius norm (:math:`\beta=2`), the -(generalized) Kullback-Leibler divergence (:math:`\beta=1`) and the -Itakura-Saito divergence (:math:`\beta=0`). Note that for -:math:`\beta \in (1; 2)`, the 'mu' solver is significantly faster than for other -values of :math:`\beta`. Note also that with a negative (or 0, i.e. -'itakura-saito') :math:`\beta`, the input matrix cannot contain zero values. +.. dropdown:: NMF implemented solvers + + :class:`NMF` implements two solvers, using Coordinate Descent ('cd') [5]_, and + Multiplicative Update ('mu') [6]_. The 'mu' solver can optimize every + beta-divergence, including of course the Frobenius norm (:math:`\beta=2`), the + (generalized) Kullback-Leibler divergence (:math:`\beta=1`) and the + Itakura-Saito divergence (:math:`\beta=0`). Note that for + :math:`\beta \in (1; 2)`, the 'mu' solver is significantly faster than for other + values of :math:`\beta`. Note also that with a negative (or 0, i.e. + 'itakura-saito') :math:`\beta`, the input matrix cannot contain zero values. -The 'cd' solver can only optimize the Frobenius norm. Due to the -underlying non-convexity of NMF, the different solvers may converge to -different minima, even when optimizing the same distance function. + The 'cd' solver can only optimize the Frobenius norm. Due to the + underlying non-convexity of NMF, the different solvers may converge to + different minima, even when optimizing the same distance function. NMF is best used with the ``fit_transform`` method, which returns the matrix W. The matrix H is stored into the fitted model in the ``components_`` attribute; @@ -827,36 +920,63 @@ stored components:: >>> X_new = np.array([[1, 0], [1, 6.1], [1, 0], [1, 4], [3.2, 1], [0, 4]]) >>> W_new = model.transform(X_new) -.. topic:: Examples: - * :ref:`sphx_glr_auto_examples_decomposition_plot_faces_decomposition.py` - * :ref:`sphx_glr_auto_examples_applications_plot_topics_extraction_with_nmf_lda.py` - * :ref:`sphx_glr_auto_examples_decomposition_plot_beta_divergence.py` -.. topic:: References: +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_decomposition_plot_faces_decomposition.py` +* :ref:`sphx_glr_auto_examples_applications_plot_topics_extraction_with_nmf_lda.py` + +.. _MiniBatchNMF: - .. [1] `"Learning the parts of objects by non-negative matrix factorization" - `_ - D. Lee, S. Seung, 1999 +Mini-batch Non Negative Matrix Factorization +-------------------------------------------- - .. [2] `"Non-negative Matrix Factorization with Sparseness Constraints" - `_ - P. Hoyer, 2004 +:class:`MiniBatchNMF` [7]_ implements a faster, but less accurate version of the +non negative matrix factorization (i.e. :class:`~sklearn.decomposition.NMF`), +better suited for large datasets. - .. [4] `"SVD based initialization: A head start for nonnegative - matrix factorization" - `_ - C. Boutsidis, E. Gallopoulos, 2008 +By default, :class:`MiniBatchNMF` divides the data into mini-batches and +optimizes the NMF model in an online manner by cycling over the mini-batches +for the specified number of iterations. 
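A minimal usage sketch, with random non-negative data standing in for a real dataset,
could look like::

  >>> import numpy as np
  >>> from sklearn.decomposition import MiniBatchNMF
  >>> rng = np.random.RandomState(0)
  >>> X = np.abs(rng.randn(100, 20))           # the input must be non-negative
  >>> mbnmf = MiniBatchNMF(n_components=5, batch_size=10, random_state=0)
  >>> W = mbnmf.fit_transform(X)               # returns the matrix W
  >>> H = mbnmf.components_                    # the matrix H
  >>> W.shape, H.shape
  ((100, 5), (5, 20))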
The ``batch_size`` parameter controls +the size of the batches. - .. [5] `"Fast local algorithms for large scale nonnegative matrix and tensor - factorizations." - `_ - A. Cichocki, A. Phan, 2009 +In order to speed up the mini-batch algorithm it is also possible to scale +past batches, giving them less importance than newer batches. This is done +by introducing a so-called forgetting factor controlled by the ``forget_factor`` +parameter. - .. [6] `"Algorithms for nonnegative matrix factorization with the beta-divergence" - `_ - C. Fevotte, J. Idier, 2011 +The estimator also implements ``partial_fit``, which updates ``H`` by iterating +only once over a mini-batch. This can be used for online learning when the data +is not readily available from the start, or when the data does not fit into memory. +.. rubric:: References + +.. [1] `"Learning the parts of objects by non-negative matrix factorization" + `_ + D. Lee, S. Seung, 1999 + +.. [2] `"Non-negative Matrix Factorization with Sparseness Constraints" + `_ + P. Hoyer, 2004 + +.. [4] `"SVD based initialization: A head start for nonnegative + matrix factorization" + `_ + C. Boutsidis, E. Gallopoulos, 2008 + +.. [5] `"Fast local algorithms for large scale nonnegative matrix and tensor + factorizations." + `_ + A. Cichocki, A. Phan, 2009 + +.. [6] :arxiv:`"Algorithms for nonnegative matrix factorization with + the beta-divergence" <1010.1763>` + C. Fevotte, J. Idier, 2011 + +.. [7] :arxiv:`"Online algorithms for nonnegative matrix factorization with the + Itakura-Saito divergence" <1106.4198>` + A. Lefevre, F. Bach, C. Fevotte, 2011 .. _LatentDirichletAllocation: @@ -864,7 +984,7 @@ Latent Dirichlet Allocation (LDA) ================================= Latent Dirichlet Allocation is a generative probabilistic model for collections of -discrete dataset such as text corpora. It is also a topic model that is used for +discrete datasets such as text corpora. It is also a topic model that is used for discovering abstract topics from a collection of documents. The graphical model of LDA is a three-level generative model: @@ -872,66 +992,69 @@ The graphical model of LDA is a three-level generative model: .. image:: ../images/lda_model_graph.png :align: center -Note on notations presented in the graphical model above, which can be found in +Note on notations presented in the graphical model above, which can be found in Hoffman et al. (2013): - * The corpus is a collection of :math:`D` documents. - * A document is a sequence of :math:`N` words. - * There are :math:`K` topics in the corpus. - * The boxes represent repeated sampling. +* The corpus is a collection of :math:`D` documents. +* A document is a sequence of :math:`N` words. +* There are :math:`K` topics in the corpus. +* The boxes represent repeated sampling. -In the graphical model, each node is a random variable and has a role in the -generative process. A shaded node indicates an observed variable and an unshaded -node indicates a hidden (latent) variable. In this case, words in the corpus are -the only data that we observe. The latent variables determine the random mixture -of topics in the corpus and the distribution of words in the documents. -The goal of LDA is to use the observed words to infer the hidden topic -structure. +In the graphical model, each node is a random variable and has a role in the +generative process. A shaded node indicates an observed variable and an unshaded +node indicates a hidden (latent) variable. 
In this case, words in the corpus are +the only data that we observe. The latent variables determine the random mixture +of topics in the corpus and the distribution of words in the documents. +The goal of LDA is to use the observed words to infer the hidden topic +structure. -When modeling text corpora, the model assumes the following generative process -for a corpus with :math:`D` documents and :math:`K` topics, with :math:`K` -corresponding to :attr:`n_components` in the API: +.. dropdown:: Details on modeling text corpora - 1. For each topic :math:`k \in K`, draw :math:`\beta_k \sim - \mathrm{Dirichlet}(\eta)`. This provides a distribution over the words, - i.e. the probability of a word appearing in topic :math:`k`. - :math:`\eta` corresponds to :attr:`topic_word_prior`. + When modeling text corpora, the model assumes the following generative process + for a corpus with :math:`D` documents and :math:`K` topics, with :math:`K` + corresponding to `n_components` in the API: - 2. For each document :math:`d \in D`, draw the topic proportions - :math:`\theta_d \sim \mathrm{Dirichlet}(\alpha)`. :math:`\alpha` - corresponds to :attr:`doc_topic_prior`. + 1. For each topic :math:`k \in K`, draw :math:`\beta_k \sim + \mathrm{Dirichlet}(\eta)`. This provides a distribution over the words, + i.e. the probability of a word appearing in topic :math:`k`. + :math:`\eta` corresponds to `topic_word_prior`. - 3. For each word :math:`i` in document :math:`d`: + 2. For each document :math:`d \in D`, draw the topic proportions + :math:`\theta_d \sim \mathrm{Dirichlet}(\alpha)`. :math:`\alpha` + corresponds to `doc_topic_prior`. - a. Draw the topic assignment :math:`z_{di} \sim \mathrm{Multinomial} - (\theta_d)` - b. Draw the observed word :math:`w_{ij} \sim \mathrm{Multinomial} - (\beta_{z_{di}})` + 3. For each word :math:`i` in document :math:`d`: -For parameter estimation, the posterior distribution is: + a. Draw the topic assignment :math:`z_{di} \sim \mathrm{Multinomial} + (\theta_d)` + b. Draw the observed word :math:`w_{ij} \sim \mathrm{Multinomial} + (\beta_{z_{di}})` -.. math:: - p(z, \theta, \beta |w, \alpha, \eta) = - \frac{p(z, \theta, \beta|\alpha, \eta)}{p(w|\alpha, \eta)} + For parameter estimation, the posterior distribution is: -Since the posterior is intractable, variational Bayesian method -uses a simpler distribution :math:`q(z,\theta,\beta | \lambda, \phi, \gamma)` -to approximate it, and those variational parameters :math:`\lambda`, -:math:`\phi`, :math:`\gamma` are optimized to maximize the Evidence -Lower Bound (ELBO): + .. math:: + p(z, \theta, \beta |w, \alpha, \eta) = + \frac{p(z, \theta, \beta|\alpha, \eta)}{p(w|\alpha, \eta)} -.. math:: - \log\: P(w | \alpha, \eta) \geq L(w,\phi,\gamma,\lambda) \overset{\triangle}{=} - E_{q}[\log\:p(w,z,\theta,\beta|\alpha,\eta)] - E_{q}[\log\:q(z, \theta, \beta)] + Since the posterior is intractable, variational Bayesian method + uses a simpler distribution :math:`q(z,\theta,\beta | \lambda, \phi, \gamma)` + to approximate it, and those variational parameters :math:`\lambda`, + :math:`\phi`, :math:`\gamma` are optimized to maximize the Evidence + Lower Bound (ELBO): + + .. math:: + \log\: P(w | \alpha, \eta) \geq L(w,\phi,\gamma,\lambda) \overset{\triangle}{=} + E_{q}[\log\:p(w,z,\theta,\beta|\alpha,\eta)] - E_{q}[\log\:q(z, \theta, \beta)] + + Maximizing ELBO is equivalent to minimizing the Kullback-Leibler(KL) divergence + between :math:`q(z,\theta,\beta)` and the true posterior + :math:`p(z, \theta, \beta |w, \alpha, \eta)`. 
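As a small, purely illustrative sketch (the four toy documents and the choice of two
topics are arbitrary), the estimator is typically applied to a document-term matrix::

  >>> from sklearn.feature_extraction.text import CountVectorizer
  >>> from sklearn.decomposition import LatentDirichletAllocation
  >>> docs = ["apples and oranges are fruit",
  ...         "oranges are rich in vitamins",
  ...         "cars and trucks need fuel",
  ...         "fuel prices affect truck drivers"]
  >>> counts = CountVectorizer().fit_transform(docs)     # document-term matrix
  >>> lda = LatentDirichletAllocation(n_components=2, random_state=0)
  >>> doc_topic = lda.fit_transform(counts)              # per-document topic proportions
  >>> doc_topic.shape
  (4, 2)

The fitted topic-word matrix is stored in ``lda.components_``, one row per topic.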
-Maximizing ELBO is equivalent to minimizing the Kullback-Leibler(KL) divergence -between :math:`q(z,\theta,\beta)` and the true posterior -:math:`p(z, \theta, \beta |w, \alpha, \eta)`. -:class:`LatentDirichletAllocation` implements the online variational Bayes +:class:`LatentDirichletAllocation` implements the online variational Bayes algorithm and supports both online and batch update methods. -While the batch method updates variational variables after each full pass through -the data, the online method updates variational variables from mini-batch data +While the batch method updates variational variables after each full pass through +the data, the online method updates variational variables from mini-batch data points. .. note:: @@ -942,33 +1065,33 @@ points. When :class:`LatentDirichletAllocation` is applied on a "document-term" matrix, the matrix will be decomposed into a "topic-term" matrix and a "document-topic" matrix. While -"topic-term" matrix is stored as :attr:`components_` in the model, "document-topic" matrix +"topic-term" matrix is stored as `components_` in the model, "document-topic" matrix can be calculated from ``transform`` method. :class:`LatentDirichletAllocation` also implements ``partial_fit`` method. This is used when data can be fetched sequentially. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_applications_plot_topics_extraction_with_nmf_lda.py` +* :ref:`sphx_glr_auto_examples_applications_plot_topics_extraction_with_nmf_lda.py` -.. topic:: References: +.. rubric:: References - * `"Latent Dirichlet Allocation" - `_ - D. Blei, A. Ng, M. Jordan, 2003 +* `"Latent Dirichlet Allocation" + `_ + D. Blei, A. Ng, M. Jordan, 2003 - * `"Online Learning for Latent Dirichlet Allocation” - `_ - M. Hoffman, D. Blei, F. Bach, 2010 +* `"Online Learning for Latent Dirichlet Allocation” + `_ + M. Hoffman, D. Blei, F. Bach, 2010 - * `"Stochastic Variational Inference" - `_ - M. Hoffman, D. Blei, C. Wang, J. Paisley, 2013 +* `"Stochastic Variational Inference" + `_ + M. Hoffman, D. Blei, C. Wang, J. Paisley, 2013 - * `"The varimax criterion for analytic rotation in factor analysis" - `_ - H. F. Kaiser, 1958 +* `"The varimax criterion for analytic rotation in factor analysis" + `_ + H. F. Kaiser, 1958 See also :ref:`nca_dim_reduction` for dimensionality reduction with Neighborhood Components Analysis. diff --git a/doc/modules/density.rst b/doc/modules/density.rst index 115d318183577..16c73bd5349a2 100644 --- a/doc/modules/density.rst +++ b/doc/modules/density.rst @@ -100,6 +100,10 @@ between bias and variance in the result. A large bandwidth leads to a very smooth (i.e. high-bias) density distribution. A small bandwidth leads to an unsmooth (i.e. high-variance) density distribution. +The parameter `bandwidth` controls this smoothing. One can either set +manually this parameter or use Scott's and Silverman's estimation +methods. + :class:`~sklearn.neighbors.KernelDensity` implements several common kernel forms, which are shown in the following figure: @@ -109,36 +113,39 @@ forms, which are shown in the following figure: .. centered:: |kde_kernels| -The form of these kernels is as follows: +.. 
dropdown:: Kernels' mathematical expressions + + The form of these kernels is as follows: + + * Gaussian kernel (``kernel = 'gaussian'``) -* Gaussian kernel (``kernel = 'gaussian'``) + :math:`K(x; h) \propto \exp(- \frac{x^2}{2h^2} )` - :math:`K(x; h) \propto \exp(- \frac{x^2}{2h^2} )` + * Tophat kernel (``kernel = 'tophat'``) -* Tophat kernel (``kernel = 'tophat'``) + :math:`K(x; h) \propto 1` if :math:`x < h` - :math:`K(x; h) \propto 1` if :math:`x < h` + * Epanechnikov kernel (``kernel = 'epanechnikov'``) -* Epanechnikov kernel (``kernel = 'epanechnikov'``) + :math:`K(x; h) \propto 1 - \frac{x^2}{h^2}` - :math:`K(x; h) \propto 1 - \frac{x^2}{h^2}` + * Exponential kernel (``kernel = 'exponential'``) -* Exponential kernel (``kernel = 'exponential'``) + :math:`K(x; h) \propto \exp(-x/h)` - :math:`K(x; h) \propto \exp(-x/h)` + * Linear kernel (``kernel = 'linear'``) -* Linear kernel (``kernel = 'linear'``) + :math:`K(x; h) \propto 1 - x/h` if :math:`x < h` - :math:`K(x; h) \propto 1 - x/h` if :math:`x < h` + * Cosine kernel (``kernel = 'cosine'``) -* Cosine kernel (``kernel = 'cosine'``) + :math:`K(x; h) \propto \cos(\frac{\pi x}{2h})` if :math:`x < h` - :math:`K(x; h) \propto \cos(\frac{\pi x}{2h})` if :math:`x < h` The kernel density estimator can be used with any of the valid distance -metrics (see :class:`~sklearn.neighbors.DistanceMetric` for a list of available metrics), though -the results are properly normalized only for the Euclidean metric. One -particularly useful metric is the +metrics (see :class:`~sklearn.metrics.DistanceMetric` for a list of +available metrics), though the results are properly normalized only +for the Euclidean metric. One particularly useful metric is the `Haversine distance `_ which measures the angular distance between points on a sphere. Here is an example of using a kernel density estimate for a visualization @@ -167,14 +174,14 @@ on a PCA projection of the data: The "new" data consists of linear combinations of the input data, with weights probabilistically drawn given the KDE model. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_neighbors_plot_kde_1d.py`: computation of simple kernel - density estimates in one dimension. +* :ref:`sphx_glr_auto_examples_neighbors_plot_kde_1d.py`: computation of simple kernel + density estimates in one dimension. - * :ref:`sphx_glr_auto_examples_neighbors_plot_digits_kde_sampling.py`: an example of using - Kernel Density estimation to learn a generative model of the hand-written - digits data, and drawing new samples from this model. +* :ref:`sphx_glr_auto_examples_neighbors_plot_digits_kde_sampling.py`: an example of using + Kernel Density estimation to learn a generative model of the hand-written + digits data, and drawing new samples from this model. - * :ref:`sphx_glr_auto_examples_neighbors_plot_species_kde.py`: an example of Kernel Density - estimation using the Haversine distance metric to visualize geospatial data +* :ref:`sphx_glr_auto_examples_neighbors_plot_species_kde.py`: an example of Kernel Density + estimation using the Haversine distance metric to visualize geospatial data diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 0e0aaaafaffba..31ca150df372e 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -1,584 +1,592 @@ .. 
_ensemble: -================ -Ensemble methods -================ +=========================================================================== +Ensembles: Gradient boosting, random forests, bagging, voting, stacking +=========================================================================== .. currentmodule:: sklearn.ensemble -The goal of **ensemble methods** is to combine the predictions of several +**Ensemble methods** combine the predictions of several base estimators built with a given learning algorithm in order to improve generalizability / robustness over a single estimator. -Two families of ensemble methods are usually distinguished: +Two very famous examples of ensemble methods are :ref:`gradient-boosted trees +` and :ref:`random forests `. -- In **averaging methods**, the driving principle is to build several - estimators independently and then to average their predictions. On average, - the combined estimator is usually better than any of the single base - estimator because its variance is reduced. +More generally, ensemble models can be applied to any base learner beyond +trees, in averaging methods such as :ref:`Bagging methods `, +:ref:`model stacking `, or :ref:`Voting `, or in +boosting, as :ref:`AdaBoost `. - **Examples:** :ref:`Bagging methods `, :ref:`Forests of randomized trees `, ... +.. _gradient_boosting: -- By contrast, in **boosting methods**, base estimators are built sequentially - and one tries to reduce the bias of the combined estimator. The motivation is - to combine several weak models to produce a powerful ensemble. +Gradient-boosted trees +====================== - **Examples:** :ref:`AdaBoost `, :ref:`Gradient Tree Boosting `, ... +`Gradient Tree Boosting `_ +or Gradient Boosted Decision Trees (GBDT) is a generalization +of boosting to arbitrary differentiable loss functions, see the seminal work of +[Friedman2001]_. GBDT is an excellent model for both regression and +classification, in particular for tabular data. + +.. topic:: :class:`GradientBoostingClassifier` vs :class:`HistGradientBoostingClassifier` + + Scikit-learn provides two implementations of gradient-boosted trees: + :class:`HistGradientBoostingClassifier` vs + :class:`GradientBoostingClassifier` for classification, and the + corresponding classes for regression. The former can be **orders of + magnitude faster** than the latter when the number of samples is + larger than tens of thousands of samples. + + Missing values and categorical data are natively supported by the + Hist... version, removing the need for additional preprocessing such as + imputation. + + :class:`GradientBoostingClassifier` and + :class:`GradientBoostingRegressor` might be preferred for small sample + sizes since binning may lead to split points that are too approximate + in this setting. +.. _histogram_based_gradient_boosting: -.. _bagging: +Histogram-Based Gradient Boosting +---------------------------------- -Bagging meta-estimator -====================== +Scikit-learn 0.21 introduced two new implementations of +gradient boosted trees, namely :class:`HistGradientBoostingClassifier` +and :class:`HistGradientBoostingRegressor`, inspired by +`LightGBM `__ (See [LightGBM]_). -In ensemble algorithms, bagging methods form a class of algorithms which build -several instances of a black-box estimator on random subsets of the original -training set and then aggregate their individual predictions to form a final -prediction. 
These methods are used as a way to reduce the variance of a base -estimator (e.g., a decision tree), by introducing randomization into its -construction procedure and then making an ensemble out of it. In many cases, -bagging methods constitute a very simple way to improve with respect to a -single model, without making it necessary to adapt the underlying base -algorithm. As they provide a way to reduce overfitting, bagging methods work -best with strong and complex models (e.g., fully developed decision trees), in -contrast with boosting methods which usually work best with weak models (e.g., -shallow decision trees). +These histogram-based estimators can be **orders of magnitude faster** +than :class:`GradientBoostingClassifier` and +:class:`GradientBoostingRegressor` when the number of samples is larger +than tens of thousands of samples. -Bagging methods come in many flavours but mostly differ from each other by the -way they draw random subsets of the training set: +They also have built-in support for missing values, which avoids the need +for an imputer. - * When random subsets of the dataset are drawn as random subsets of the - samples, then this algorithm is known as Pasting [B1999]_. +These fast estimators first bin the input samples ``X`` into +integer-valued bins (typically 256 bins) which tremendously reduces the +number of splitting points to consider, and allows the algorithm to +leverage integer-based data structures (histograms) instead of relying on +sorted continuous values when building the trees. The API of these +estimators is slightly different, and some of the features from +:class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor` +are not yet supported, for instance some loss functions. - * When samples are drawn with replacement, then the method is known as - Bagging [B1996]_. +.. rubric:: Examples - * When random subsets of the dataset are drawn as random subsets of - the features, then the method is known as Random Subspaces [H1998]_. +* :ref:`sphx_glr_auto_examples_inspection_plot_partial_dependence.py` +* :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py` - * Finally, when base estimators are built on subsets of both samples and - features, then the method is known as Random Patches [LG2012]_. +Usage +^^^^^ -In scikit-learn, bagging methods are offered as a unified -:class:`BaggingClassifier` meta-estimator (resp. :class:`BaggingRegressor`), -taking as input a user-specified base estimator along with parameters -specifying the strategy to draw random subsets. In particular, ``max_samples`` -and ``max_features`` control the size of the subsets (in terms of samples and -features), while ``bootstrap`` and ``bootstrap_features`` control whether -samples and features are drawn with or without replacement. When using a subset -of the available samples the generalization accuracy can be estimated with the -out-of-bag samples by setting ``oob_score=True``. As an example, the -snippet below illustrates how to instantiate a bagging ensemble of -:class:`KNeighborsClassifier` base estimators, each built on random subsets of -50% of the samples and 50% of the features. +Most of the parameters are unchanged from +:class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor`. 
+One exception is the ``max_iter`` parameter that replaces ``n_estimators``, and +controls the number of iterations of the boosting process:: - >>> from sklearn.ensemble import BaggingClassifier - >>> from sklearn.neighbors import KNeighborsClassifier - >>> bagging = BaggingClassifier(KNeighborsClassifier(), - ... max_samples=0.5, max_features=0.5) + >>> from sklearn.ensemble import HistGradientBoostingClassifier + >>> from sklearn.datasets import make_hastie_10_2 -.. topic:: Examples: + >>> X, y = make_hastie_10_2(random_state=0) + >>> X_train, X_test = X[:2000], X[2000:] + >>> y_train, y_test = y[:2000], y[2000:] - * :ref:`sphx_glr_auto_examples_ensemble_plot_bias_variance.py` + >>> clf = HistGradientBoostingClassifier(max_iter=100).fit(X_train, y_train) + >>> clf.score(X_test, y_test) + 0.8965 -.. topic:: References +Available losses for **regression** are: - .. [B1999] L. Breiman, "Pasting small votes for classification in large - databases and on-line", Machine Learning, 36(1), 85-103, 1999. +- 'squared_error', which is the default loss; +- 'absolute_error', which is less sensitive to outliers than the squared error; +- 'gamma', which is well suited to model strictly positive outcomes; +- 'poisson', which is well suited to model counts and frequencies; +- 'quantile', which allows for estimating a conditional quantile that can later + be used to obtain prediction intervals. - .. [B1996] L. Breiman, "Bagging predictors", Machine Learning, 24(2), - 123-140, 1996. +For **classification**, 'log_loss' is the only option. For binary classification +it uses the binary log loss, also known as binomial deviance or binary +cross-entropy. For `n_classes >= 3`, it uses the multi-class log loss function, +with multinomial deviance and categorical cross-entropy as alternative names. +The appropriate loss version is selected based on :term:`y` passed to +:term:`fit`. - .. [H1998] T. Ho, "The random subspace method for constructing decision - forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844, - 1998. +The size of the trees can be controlled through the ``max_leaf_nodes``, +``max_depth``, and ``min_samples_leaf`` parameters. - .. [LG2012] G. Louppe and P. Geurts, "Ensembles on Random Patches", - Machine Learning and Knowledge Discovery in Databases, 346-361, 2012. +The number of bins used to bin the data is controlled with the ``max_bins`` +parameter. Using less bins acts as a form of regularization. It is generally +recommended to use as many bins as possible (255), which is the default. -.. _forest: +The ``l2_regularization`` parameter acts as a regularizer for the loss function, +and corresponds to :math:`\lambda` in the following expression (see equation (2) +in [XGBoost]_): -Forests of randomized trees -=========================== +.. math:: -The :mod:`sklearn.ensemble` module includes two averaging algorithms based -on randomized :ref:`decision trees `: the RandomForest algorithm -and the Extra-Trees method. Both algorithms are perturb-and-combine -techniques [B1998]_ specifically designed for trees. This means a diverse -set of classifiers is created by introducing randomness in the classifier -construction. The prediction of the ensemble is given as the averaged -prediction of the individual classifiers. 
+ \mathcal{L}(\phi) = \sum_i l(\hat{y}_i, y_i) + \frac12 \sum_k \lambda ||w_k||^2 -As other classifiers, forest classifiers have to be fitted with two -arrays: a sparse or dense array X of shape ``(n_samples, n_features)`` -holding the training samples, and an array Y of shape ``(n_samples,)`` -holding the target values (class labels) for the training samples:: +.. dropdown:: Details on l2 regularization - >>> from sklearn.ensemble import RandomForestClassifier - >>> X = [[0, 0], [1, 1]] - >>> Y = [0, 1] - >>> clf = RandomForestClassifier(n_estimators=10) - >>> clf = clf.fit(X, Y) + It is important to notice that the loss term :math:`l(\hat{y}_i, y_i)` describes + only half of the actual loss function except for the pinball loss and absolute + error. -Like :ref:`decision trees `, forests of trees also extend to -:ref:`multi-output problems ` (if Y is an array -of shape ``(n_samples, n_outputs)``). + The index :math:`k` refers to the k-th tree in the ensemble of trees. In the + case of regression and binary classification, gradient boosting models grow one + tree per iteration, then :math:`k` runs up to `max_iter`. In the case of + multiclass classification problems, the maximal value of the index :math:`k` is + `n_classes` :math:`\times` `max_iter`. -Random Forests --------------- + If :math:`T_k` denotes the number of leaves in the k-th tree, then :math:`w_k` + is a vector of length :math:`T_k`, which contains the leaf values of the form `w + = -sum_gradient / (sum_hessian + l2_regularization)` (see equation (5) in + [XGBoost]_). -In random forests (see :class:`RandomForestClassifier` and -:class:`RandomForestRegressor` classes), each tree in the ensemble is built -from a sample drawn with replacement (i.e., a bootstrap sample) from the -training set. + The leaf values :math:`w_k` are derived by dividing the sum of the gradients of + the loss function by the combined sum of hessians. Adding the regularization to + the denominator penalizes the leaves with small hessians (flat regions), + resulting in smaller updates. Those :math:`w_k` values contribute then to the + model's prediction for a given input that ends up in the corresponding leaf. The + final prediction is the sum of the base prediction and the contributions from + each tree. The result of that sum is then transformed by the inverse link + function depending on the choice of the loss function (see + :ref:`gradient_boosting_formulation`). -Furthermore, when splitting each node during the construction of a tree, the -best split is found either from all input features or a random subset of size -``max_features``. (See the :ref:`parameter tuning guidelines -` for more details). + Notice that the original paper [XGBoost]_ introduces a term :math:`\gamma\sum_k + T_k` that penalizes the number of leaves (making it a smooth version of + `max_leaf_nodes`) not presented here as it is not implemented in scikit-learn; + whereas :math:`\lambda` penalizes the magnitude of the individual tree + predictions before being rescaled by the learning rate, see + :ref:`gradient_boosting_shrinkage`. -The purpose of these two sources of randomness is to decrease the variance of -the forest estimator. Indeed, individual decision trees typically exhibit high -variance and tend to overfit. The injected randomness in forests yield decision -trees with somewhat decoupled prediction errors. By taking an average of those -predictions, some errors can cancel out. 
Random forests achieve a reduced -variance by combining diverse trees, sometimes at the cost of a slight increase -in bias. In practice the variance reduction is often significant hence yielding -an overall better model. -In contrast to the original publication [B2001]_, the scikit-learn -implementation combines classifiers by averaging their probabilistic -prediction, instead of letting each classifier vote for a single class. +Note that **early-stopping is enabled by default if the number of samples is +larger than 10,000**. The early-stopping behaviour is controlled via the +``early_stopping``, ``scoring``, ``validation_fraction``, +``n_iter_no_change``, and ``tol`` parameters. It is possible to early-stop +using an arbitrary :term:`scorer`, or just the training or validation loss. +Note that for technical reasons, using a callable as a scorer is significantly slower +than using the loss. By default, early-stopping is performed if there are at least +10,000 samples in the training set, using the validation loss. -Extremely Randomized Trees --------------------------- +.. _nan_support_hgbt: -In extremely randomized trees (see :class:`ExtraTreesClassifier` -and :class:`ExtraTreesRegressor` classes), randomness goes one step -further in the way splits are computed. As in random forests, a random -subset of candidate features is used, but instead of looking for the -most discriminative thresholds, thresholds are drawn at random for each -candidate feature and the best of these randomly-generated thresholds is -picked as the splitting rule. This usually allows to reduce the variance -of the model a bit more, at the expense of a slightly greater increase -in bias:: +Missing values support +^^^^^^^^^^^^^^^^^^^^^^ - >>> from sklearn.model_selection import cross_val_score - >>> from sklearn.datasets import make_blobs - >>> from sklearn.ensemble import RandomForestClassifier - >>> from sklearn.ensemble import ExtraTreesClassifier - >>> from sklearn.tree import DecisionTreeClassifier +:class:`HistGradientBoostingClassifier` and +:class:`HistGradientBoostingRegressor` have built-in support for missing +values (NaNs). - >>> X, y = make_blobs(n_samples=10000, n_features=10, centers=100, - ... random_state=0) +During training, the tree grower learns at each split point whether samples +with missing values should go to the left or right child, based on the +potential gain. When predicting, samples with missing values are assigned to +the left or right child consequently:: - >>> clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2, - ... random_state=0) - >>> scores = cross_val_score(clf, X, y, cv=5) - >>> scores.mean() - 0.98... + >>> from sklearn.ensemble import HistGradientBoostingClassifier + >>> import numpy as np - >>> clf = RandomForestClassifier(n_estimators=10, max_depth=None, - ... min_samples_split=2, random_state=0) - >>> scores = cross_val_score(clf, X, y, cv=5) - >>> scores.mean() - 0.999... + >>> X = np.array([0, 1, 2, np.nan]).reshape(-1, 1) + >>> y = [0, 0, 1, 1] - >>> clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, - ... min_samples_split=2, random_state=0) - >>> scores = cross_val_score(clf, X, y, cv=5) - >>> scores.mean() > 0.999 - True + >>> gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y) + >>> gbdt.predict(X) + array([0, 0, 1, 1]) -.. 
figure:: ../auto_examples/ensemble/images/sphx_glr_plot_forest_iris_001.png - :target: ../auto_examples/ensemble/plot_forest_iris.html - :align: center - :scale: 75% +When the missingness pattern is predictive, the splits can be performed on +whether the feature value is missing or not:: -.. _random_forest_parameters: + >>> X = np.array([0, np.nan, 1, 2, np.nan]).reshape(-1, 1) + >>> y = [0, 1, 0, 0, 1] + >>> gbdt = HistGradientBoostingClassifier(min_samples_leaf=1, + ... max_depth=2, + ... learning_rate=1, + ... max_iter=1).fit(X, y) + >>> gbdt.predict(X) + array([0, 1, 0, 0, 1]) -Parameters ----------- +If no missing values were encountered for a given feature during training, +then samples with missing values are mapped to whichever child has the most +samples. -The main parameters to adjust when using these methods is ``n_estimators`` and -``max_features``. The former is the number of trees in the forest. The larger -the better, but also the longer it will take to compute. In addition, note that -results will stop getting significantly better beyond a critical number of -trees. The latter is the size of the random subsets of features to consider -when splitting a node. The lower the greater the reduction of variance, but -also the greater the increase in bias. Empirical good default values are -``max_features=None`` (always considering all features instead of a random -subset) for regression problems, and ``max_features="sqrt"`` (using a random -subset of size ``sqrt(n_features)``) for classification tasks (where -``n_features`` is the number of features in the data). Good results are often -achieved when setting ``max_depth=None`` in combination with -``min_samples_split=2`` (i.e., when fully developing the trees). Bear in mind -though that these values are usually not optimal, and might result in models -that consume a lot of RAM. The best parameter values should always be -cross-validated. In addition, note that in random forests, bootstrap samples -are used by default (``bootstrap=True``) while the default strategy for -extra-trees is to use the whole dataset (``bootstrap=False``). When using -bootstrap sampling the generalization accuracy can be estimated on the left out -or out-of-bag samples. This can be enabled by setting ``oob_score=True``. +.. rubric:: Examples -.. note:: +* :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` - The size of the model with the default parameters is :math:`O( M * N * log (N) )`, - where :math:`M` is the number of trees and :math:`N` is the number of samples. - In order to reduce the size of the model, you can change these parameters: - ``min_samples_split``, ``max_leaf_nodes``, ``max_depth`` and ``min_samples_leaf``. +.. _sw_hgbdt: -Parallelization ---------------- +Sample weight support +^^^^^^^^^^^^^^^^^^^^^ -Finally, this module also features the parallel construction of the trees -and the parallel computation of the predictions through the ``n_jobs`` -parameter. If ``n_jobs=k`` then computations are partitioned into -``k`` jobs, and run on ``k`` cores of the machine. If ``n_jobs=-1`` -then all cores available on the machine are used. Note that because of -inter-process communication overhead, the speedup might not be linear -(i.e., using ``k`` jobs will unfortunately not be ``k`` times as -fast). Significant speedup can still be achieved though when building -a large number of trees, or when building a single tree requires a fair -amount of time (e.g., on large datasets). 
+:class:`HistGradientBoostingClassifier` and +:class:`HistGradientBoostingRegressor` support sample weights during +:term:`fit`. -.. topic:: Examples: +The following toy example demonstrates that samples with a sample weight of zero are ignored: - * :ref:`sphx_glr_auto_examples_ensemble_plot_forest_iris.py` - * :ref:`sphx_glr_auto_examples_ensemble_plot_forest_importances_faces.py` - * :ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py` + >>> X = [[1, 0], + ... [1, 0], + ... [1, 0], + ... [0, 1]] + >>> y = [0, 0, 1, 0] + >>> # ignore the first 2 training samples by setting their weight to 0 + >>> sample_weight = [0, 0, 1, 1] + >>> gb = HistGradientBoostingClassifier(min_samples_leaf=1) + >>> gb.fit(X, y, sample_weight=sample_weight) + HistGradientBoostingClassifier(...) + >>> gb.predict([[1, 0]]) + array([1]) + >>> gb.predict_proba([[1, 0]])[0, 1] + np.float64(0.999) -.. topic:: References +As you can see, the `[1, 0]` is comfortably classified as `1` since the first +two samples are ignored due to their sample weights. - .. [B2001] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001. +Implementation detail: taking sample weights into account amounts to +multiplying the gradients (and the hessians) by the sample weights. Note that +the binning stage (specifically the quantiles computation) does not take the +weights into account. - .. [B1998] L. Breiman, "Arcing Classifiers", Annals of Statistics 1998. +.. _categorical_support_gbdt: - * P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized - trees", Machine Learning, 63(1), 3-42, 2006. +Categorical Features Support +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. _random_forest_feature_importance: +:class:`HistGradientBoostingClassifier` and +:class:`HistGradientBoostingRegressor` have native support for categorical +features: they can consider splits on non-ordered, categorical data. -Feature importance evaluation ------------------------------ +For datasets with categorical features, using the native categorical support +is often better than relying on one-hot encoding +(:class:`~sklearn.preprocessing.OneHotEncoder`), because one-hot encoding +requires more tree depth to achieve equivalent splits. It is also usually +better to rely on the native categorical support rather than to treat +categorical features as continuous (ordinal), which happens for ordinal-encoded +categorical data, since categories are nominal quantities where order does not +matter. -The relative rank (i.e. depth) of a feature used as a decision node in a -tree can be used to assess the relative importance of that feature with -respect to the predictability of the target variable. Features used at -the top of the tree contribute to the final prediction decision of a -larger fraction of the input samples. The **expected fraction of the -samples** they contribute to can thus be used as an estimate of the -**relative importance of the features**. In scikit-learn, the fraction of -samples a feature contributes to is combined with the decrease in impurity -from splitting them to create a normalized estimate of the predictive power -of that feature. +To enable categorical support, a boolean mask can be passed to the +`categorical_features` parameter, indicating which feature is categorical. 
In +the following, the first feature will be treated as categorical and the +second feature as numerical:: -By **averaging** the estimates of predictive ability over several randomized -trees one can **reduce the variance** of such an estimate and use it -for feature selection. This is known as the mean decrease in impurity, or MDI. -Refer to [L2014]_ for more information on MDI and feature importance -evaluation with Random Forests. + >>> gbdt = HistGradientBoostingClassifier(categorical_features=[True, False]) -.. warning:: +Equivalently, one can pass a list of integers indicating the indices of the +categorical features:: - The impurity-based feature importances computed on tree-based models suffer - from two flaws that can lead to misleading conclusions. First they are - computed on statistics derived from the training dataset and therefore **do - not necessarily inform us on which features are most important to make good - predictions on held-out dataset**. Secondly, **they favor high cardinality - features**, that is features with many unique values. - :ref:`permutation_importance` is an alternative to impurity-based feature - importance that does not suffer from these flaws. These two methods of - obtaining feature importance are explored in: - :ref:`sphx_glr_auto_examples_inspection_plot_permutation_importance.py`. + >>> gbdt = HistGradientBoostingClassifier(categorical_features=[0]) -The following example shows a color-coded representation of the relative -importances of each individual pixel for a face recognition task using -a :class:`ExtraTreesClassifier` model. +When the input is a DataFrame, it is also possible to pass a list of column +names:: -.. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_forest_importances_faces_001.png - :target: ../auto_examples/ensemble/plot_forest_importances_faces.html - :align: center - :scale: 75 + >>> gbdt = HistGradientBoostingClassifier(categorical_features=["site", "manufacturer"]) -In practice those estimates are stored as an attribute named -``feature_importances_`` on the fitted model. This is an array with shape -``(n_features,)`` whose values are positive and sum to 1.0. The higher -the value, the more important is the contribution of the matching feature -to the prediction function. +Finally, when the input is a DataFrame we can use +`categorical_features="from_dtype"` in which case all columns with a categorical +`dtype` will be treated as categorical features. -.. topic:: Examples: +The cardinality of each categorical feature must be less than the `max_bins` +parameter. For an example using histogram-based gradient boosting on categorical +features, see +:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`. - * :ref:`sphx_glr_auto_examples_ensemble_plot_forest_importances_faces.py` - * :ref:`sphx_glr_auto_examples_ensemble_plot_forest_importances.py` +If there are missing values during training, the missing values will be +treated as a proper category. If there are no missing values during training, +then at prediction time, missing values are mapped to the child node that has +the most samples (just like for continuous features). When predicting, +categories that were not seen during fit time will be treated as missing +values. -.. topic:: References +.. dropdown:: Split finding with categorical features - .. [L2014] G. Louppe, - "Understanding Random Forests: From Theory to Practice", - PhD Thesis, U. of Liege, 2014. 
+ The canonical way of considering categorical splits in a tree is to consider + all of the :math:`2^{K - 1} - 1` partitions, where :math:`K` is the number of + categories. This can quickly become prohibitive when :math:`K` is large. + Fortunately, since gradient boosting trees are always regression trees (even + for classification problems), there exists a faster strategy that can yield + equivalent splits. First, the categories of a feature are sorted according to + the variance of the target, for each category `k`. Once the categories are + sorted, one can consider *continuous partitions*, i.e. treat the categories + as if they were ordered continuous values (see Fisher [Fisher1958]_ for a + formal proof). As a result, only :math:`K - 1` splits need to be considered + instead of :math:`2^{K - 1} - 1`. The initial sorting is a + :math:`\mathcal{O}(K \log(K))` operation, leading to a total complexity of + :math:`\mathcal{O}(K \log(K) + K)`, instead of :math:`\mathcal{O}(2^K)`. -.. _random_trees_embedding: +.. rubric:: Examples -Totally Random Trees Embedding ------------------------------- +* :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py` -:class:`RandomTreesEmbedding` implements an unsupervised transformation of the -data. Using a forest of completely random trees, :class:`RandomTreesEmbedding` -encodes the data by the indices of the leaves a data point ends up in. This -index is then encoded in a one-of-K manner, leading to a high dimensional, -sparse binary coding. -This coding can be computed very efficiently and can then be used as a basis -for other learning tasks. -The size and sparsity of the code can be influenced by choosing the number of -trees and the maximum depth per tree. For each tree in the ensemble, the coding -contains one entry of one. The size of the coding is at most ``n_estimators * 2 -** max_depth``, the maximum number of leaves in the forest. +.. _monotonic_cst_gbdt: -As neighboring data points are more likely to lie within the same leaf of a -tree, the transformation performs an implicit, non-parametric density -estimation. +Monotonic Constraints +^^^^^^^^^^^^^^^^^^^^^ -.. topic:: Examples: +Depending on the problem at hand, you may have prior knowledge indicating +that a given feature should in general have a positive (or negative) effect +on the target value. For example, all else being equal, a higher credit +score should increase the probability of getting approved for a loan. +Monotonic constraints allow you to incorporate such prior knowledge into the +model. - * :ref:`sphx_glr_auto_examples_ensemble_plot_random_forest_embedding.py` +For a predictor :math:`F` with two features: - * :ref:`sphx_glr_auto_examples_manifold_plot_lle_digits.py` compares non-linear - dimensionality reduction techniques on handwritten digits. +- a **monotonic increase constraint** is a constraint of the form: - * :ref:`sphx_glr_auto_examples_ensemble_plot_feature_transformation.py` compares - supervised and unsupervised tree based feature transformations. + .. math:: + x_1 \leq x_1' \implies F(x_1, x_2) \leq F(x_1', x_2) -.. seealso:: +- a **monotonic decrease constraint** is a constraint of the form: - :ref:`manifold` techniques can also be useful to derive non-linear - representations of feature space, also these approaches focus also on - dimensionality reduction. + .. math:: + x_1 \leq x_1' \implies F(x_1, x_2) \geq F(x_1', x_2) +You can specify a monotonic constraint on each feature using the +`monotonic_cst` parameter. 
For each feature, a value of 0 indicates no +constraint, while 1 and -1 indicate a monotonic increase and +monotonic decrease constraint, respectively:: -.. _adaboost: + >>> from sklearn.ensemble import HistGradientBoostingRegressor -AdaBoost -======== + ... # monotonic increase, monotonic decrease, and no constraint on the 3 features + >>> gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1, 0]) -The module :mod:`sklearn.ensemble` includes the popular boosting algorithm -AdaBoost, introduced in 1995 by Freund and Schapire [FS1995]_. +In a binary classification context, imposing a monotonic increase (decrease) constraint means that higher values of the feature are supposed +to have a positive (negative) effect on the probability of samples +to belong to the positive class. -The core principle of AdaBoost is to fit a sequence of weak learners (i.e., -models that are only slightly better than random guessing, such as small -decision trees) on repeatedly modified versions of the data. The predictions -from all of them are then combined through a weighted majority vote (or sum) to -produce the final prediction. The data modifications at each so-called boosting -iteration consist of applying weights :math:`w_1`, :math:`w_2`, ..., :math:`w_N` -to each of the training samples. Initially, those weights are all set to -:math:`w_i = 1/N`, so that the first step simply trains a weak learner on the -original data. For each successive iteration, the sample weights are -individually modified and the learning algorithm is reapplied to the reweighted -data. At a given step, those training examples that were incorrectly predicted -by the boosted model induced at the previous step have their weights increased, -whereas the weights are decreased for those that were predicted correctly. As -iterations proceed, examples that are difficult to predict receive -ever-increasing influence. Each subsequent weak learner is thereby forced to -concentrate on the examples that are missed by the previous ones in the sequence -[HTF]_. +Nevertheless, monotonic constraints only marginally constrain feature effects on the output. +For instance, monotonic increase and decrease constraints cannot be used to enforce the +following modelling constraint: -.. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_adaboost_hastie_10_2_001.png - :target: ../auto_examples/ensemble/plot_adaboost_hastie_10_2.html - :align: center - :scale: 75 +.. math:: + x_1 \leq x_1' \implies F(x_1, x_2) \leq F(x_1', x_2') -AdaBoost can be used both for classification and regression problems: +Also, monotonic constraints are not supported for multiclass classification. - - For multi-class classification, :class:`AdaBoostClassifier` implements - AdaBoost-SAMME and AdaBoost-SAMME.R [ZZRH2009]_. +For a practical implementation of monotonic constraints with the histogram-based +gradient boosting, including how they can improve generalization when domain knowledge +is available, see +:ref:`sphx_glr_auto_examples_ensemble_plot_monotonic_constraints.py`. - - For regression, :class:`AdaBoostRegressor` implements AdaBoost.R2 [D1997]_. +.. note:: + Since categories are unordered quantities, it is not possible to enforce + monotonic constraints on categorical features. -Usage ------ +.. 
rubric:: Examples -The following example shows how to fit an AdaBoost classifier with 100 weak -learners:: +* :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` - >>> from sklearn.model_selection import cross_val_score - >>> from sklearn.datasets import load_iris - >>> from sklearn.ensemble import AdaBoostClassifier +.. _interaction_cst_hgbt: - >>> X, y = load_iris(return_X_y=True) - >>> clf = AdaBoostClassifier(n_estimators=100) - >>> scores = cross_val_score(clf, X, y, cv=5) - >>> scores.mean() - 0.9... +Interaction constraints +^^^^^^^^^^^^^^^^^^^^^^^ -The number of weak learners is controlled by the parameter ``n_estimators``. The -``learning_rate`` parameter controls the contribution of the weak learners in -the final combination. By default, weak learners are decision stumps. Different -weak learners can be specified through the ``base_estimator`` parameter. -The main parameters to tune to obtain good results are ``n_estimators`` and -the complexity of the base estimators (e.g., its depth ``max_depth`` or -minimum required number of samples to consider a split ``min_samples_split``). +A priori, the histogram gradient boosted trees are allowed to use any feature +to split a node into child nodes. This creates so called interactions between +features, i.e. usage of different features as split along a branch. Sometimes, +one wants to restrict the possible interactions, see [Mayer2022]_. This can be +done by the parameter ``interaction_cst``, where one can specify the indices +of features that are allowed to interact. +For instance, with 3 features in total, ``interaction_cst=[{0}, {1}, {2}]`` +forbids all interactions. +The constraints ``[{0, 1}, {1, 2}]`` specify two groups of possibly +interacting features. Features 0 and 1 may interact with each other, as well +as features 1 and 2. But note that features 0 and 2 are forbidden to interact. +The following depicts a tree and the possible splits of the tree: -.. topic:: Examples: +.. code-block:: none - * :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_hastie_10_2.py` compares the - classification error of a decision stump, decision tree, and a boosted - decision stump using AdaBoost-SAMME and AdaBoost-SAMME.R. + 1 <- Both constraint groups could be applied from now on + / \ + 1 2 <- Left split still fulfills both constraint groups. + / \ / \ Right split at feature 2 has only group {1, 2} from now on. - * :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_multiclass.py` shows the performance - of AdaBoost-SAMME and AdaBoost-SAMME.R on a multi-class problem. +LightGBM uses the same logic for overlapping groups. - * :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_twoclass.py` shows the decision boundary - and decision function values for a non-linearly separable two-class problem - using AdaBoost-SAMME. +Note that features not listed in ``interaction_cst`` are automatically +assigned an interaction group for themselves. With again 3 features, this +means that ``[{0}]`` is equivalent to ``[{0}, {1, 2}]``. - * :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` demonstrates regression - with the AdaBoost.R2 algorithm. +.. rubric:: Examples -.. topic:: References +* :ref:`sphx_glr_auto_examples_inspection_plot_partial_dependence.py` - .. [FS1995] Y. Freund, and R. Schapire, "A Decision-Theoretic Generalization of - On-Line Learning and an Application to Boosting", 1997. +.. rubric:: References - .. [ZZRH2009] J. Zhu, H. Zou, S. Rosset, T. Hastie. "Multi-class AdaBoost", - 2009. +.. [Mayer2022] M. 
Mayer, S.C. Bourassa, M. Hoesli, and D.F. Scognamiglio. + 2022. :doi:`Machine Learning Applications to Land and Structure Valuation + <10.3390/jrfm15050193>`. + Journal of Risk and Financial Management 15, no. 5: 193 - .. [D1997] H. Drucker. "Improving Regressors using Boosting Techniques", 1997. +Low-level parallelism +^^^^^^^^^^^^^^^^^^^^^ - .. [HTF] T. Hastie, R. Tibshirani and J. Friedman, "Elements of - Statistical Learning Ed. 2", Springer, 2009. +:class:`HistGradientBoostingClassifier` and +:class:`HistGradientBoostingRegressor` use OpenMP +for parallelization through Cython. For more details on how to control the +number of threads, please refer to our :ref:`parallelism` notes. -.. _gradient_boosting: +The following parts are parallelized: -Gradient Tree Boosting -====================== +- mapping samples from real values to integer-valued bins (finding the bin + thresholds is however sequential) +- building histograms is parallelized over features +- finding the best split point at a node is parallelized over features +- during fit, mapping samples into the left and right children is + parallelized over samples +- gradient and hessians computations are parallelized over samples +- predicting is parallelized over samples + +.. _Why_it's_faster: + +Why it's faster +^^^^^^^^^^^^^^^ + +The bottleneck of a gradient boosting procedure is building the decision +trees. Building a traditional decision tree (as in the other GBDTs +:class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor`) +requires sorting the samples at each node (for +each feature). Sorting is needed so that the potential gain of a split point +can be computed efficiently. Splitting a single node has thus a complexity +of :math:`\mathcal{O}(n_\text{features} \times n \log(n))` where :math:`n` +is the number of samples at the node. + +:class:`HistGradientBoostingClassifier` and +:class:`HistGradientBoostingRegressor`, in contrast, do not require sorting the +feature values and instead use a data-structure called a histogram, where the +samples are implicitly ordered. Building a histogram has a +:math:`\mathcal{O}(n)` complexity, so the node splitting procedure has a +:math:`\mathcal{O}(n_\text{features} \times n)` complexity, much smaller +than the previous one. In addition, instead of considering :math:`n` split +points, we consider only ``max_bins`` split points, which might be much +smaller. -`Gradient Tree Boosting `_ -or Gradient Boosted Decision Trees (GBDT) is a generalization -of boosting to arbitrary -differentiable loss functions. GBDT is an accurate and effective -off-the-shelf procedure that can be used for both regression and -classification problems in a -variety of areas including Web search ranking and ecology. +In order to build histograms, the input data `X` needs to be binned into +integer-valued bins. This binning procedure does require sorting the feature +values, but it only happens once at the very beginning of the boosting process +(not at each node, like in :class:`GradientBoostingClassifier` and +:class:`GradientBoostingRegressor`). -The module :mod:`sklearn.ensemble` provides methods -for both classification and regression via gradient boosted decision -trees. +Finally, many parts of the implementation of +:class:`HistGradientBoostingClassifier` and +:class:`HistGradientBoostingRegressor` are parallelized. -.. note:: +.. 
rubric:: References - Scikit-learn 0.21 introduces two new experimental implementations of - gradient boosting trees, namely :class:`HistGradientBoostingClassifier` - and :class:`HistGradientBoostingRegressor`, inspired by - `LightGBM `__ (See [LightGBM]_). +.. [XGBoost] Tianqi Chen, Carlos Guestrin, :arxiv:`"XGBoost: A Scalable Tree + Boosting System" <1603.02754>` - These histogram-based estimators can be **orders of magnitude faster** - than :class:`GradientBoostingClassifier` and - :class:`GradientBoostingRegressor` when the number of samples is larger - than tens of thousands of samples. +.. [LightGBM] Ke et. al. `"LightGBM: A Highly Efficient Gradient + BoostingDecision Tree" `_ - They also have built-in support for missing values, which avoids the need - for an imputer. +.. [Fisher1958] Fisher, W.D. (1958). `"On Grouping for Maximum Homogeneity" + `_ + Journal of the American Statistical Association, 53, 789-798. - These estimators are described in more detail below in - :ref:`histogram_based_gradient_boosting`. - The following guide focuses on :class:`GradientBoostingClassifier` and - :class:`GradientBoostingRegressor`, which might be preferred for small - sample sizes since binning may lead to split points that are too approximate - in this setting. +:class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor` +---------------------------------------------------------------------------- The usage and the parameters of :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor` are described below. The 2 most important parameters of these estimators are `n_estimators` and `learning_rate`. -Classification ---------------- - -:class:`GradientBoostingClassifier` supports both binary and multi-class -classification. -The following example shows how to fit a gradient boosting classifier -with 100 decision stumps as weak learners:: - - >>> from sklearn.datasets import make_hastie_10_2 - >>> from sklearn.ensemble import GradientBoostingClassifier - - >>> X, y = make_hastie_10_2(random_state=0) - >>> X_train, X_test = X[:2000], X[2000:] - >>> y_train, y_test = y[:2000], y[2000:] - - >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, - ... max_depth=1, random_state=0).fit(X_train, y_train) - >>> clf.score(X_test, y_test) - 0.913... - -The number of weak learners (i.e. regression trees) is controlled by the -parameter ``n_estimators``; :ref:`The size of each tree -` can be controlled either by setting the tree -depth via ``max_depth`` or by setting the number of leaf nodes via -``max_leaf_nodes``. The ``learning_rate`` is a hyper-parameter in the range -(0.0, 1.0] that controls overfitting via :ref:`shrinkage -` . - -.. note:: - - Classification with more than 2 classes requires the induction - of ``n_classes`` regression trees at each iteration, - thus, the total number of induced trees equals - ``n_classes * n_estimators``. For datasets with a large number - of classes we strongly recommend to use - :class:`HistGradientBoostingClassifier` as an alternative to - :class:`GradientBoostingClassifier` . - -Regression ----------- - -:class:`GradientBoostingRegressor` supports a number of -:ref:`different loss functions ` -for regression which can be specified via the argument -``loss``; the default loss function for regression is least squares (``'ls'``). 
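+For instance, the two are typically adjusted together, since a smaller
+``learning_rate`` generally requires a larger ``n_estimators`` to reach a
+comparable training error. A minimal sketch (the values below are purely
+illustrative, not tuned for any particular dataset)::
+
+    >>> from sklearn.ensemble import GradientBoostingClassifier
+    >>> # a small learning rate, compensated by a larger number of estimators
+    >>> clf = GradientBoostingClassifier(n_estimators=500, learning_rate=0.05)
+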
- -:: - - >>> import numpy as np - >>> from sklearn.metrics import mean_squared_error - >>> from sklearn.datasets import make_friedman1 - >>> from sklearn.ensemble import GradientBoostingRegressor - - >>> X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0) - >>> X_train, X_test = X[:200], X[200:] - >>> y_train, y_test = y[:200], y[200:] - >>> est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, - ... max_depth=1, random_state=0, loss='ls').fit(X_train, y_train) - >>> mean_squared_error(y_test, est.predict(X_test)) - 5.00... - -The figure below shows the results of applying :class:`GradientBoostingRegressor` -with least squares loss and 500 base learners to the diabetes dataset -(:func:`sklearn.datasets.load_diabetes`). -The plot on the left shows the train and test error at each iteration. -The train error at each iteration is stored in the -:attr:`~GradientBoostingRegressor.train_score_` attribute -of the gradient boosting model. The test error at each iterations can be obtained -via the :meth:`~GradientBoostingRegressor.staged_predict` method which returns a -generator that yields the predictions at each stage. Plots like these can be used -to determine the optimal number of trees (i.e. ``n_estimators``) by early stopping. - -.. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_gradient_boosting_regression_001.png - :target: ../auto_examples/ensemble/plot_gradient_boosting_regression.html - :align: center - :scale: 75 +.. dropdown:: Classification + + :class:`GradientBoostingClassifier` supports both binary and multi-class + classification. + The following example shows how to fit a gradient boosting classifier + with 100 decision stumps as weak learners:: + + >>> from sklearn.datasets import make_hastie_10_2 + >>> from sklearn.ensemble import GradientBoostingClassifier + + >>> X, y = make_hastie_10_2(random_state=0) + >>> X_train, X_test = X[:2000], X[2000:] + >>> y_train, y_test = y[:2000], y[2000:] + + >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, + ... max_depth=1, random_state=0).fit(X_train, y_train) + >>> clf.score(X_test, y_test) + 0.913 + + The number of weak learners (i.e. regression trees) is controlled by the + parameter ``n_estimators``; :ref:`The size of each tree + ` can be controlled either by setting the tree + depth via ``max_depth`` or by setting the number of leaf nodes via + ``max_leaf_nodes``. The ``learning_rate`` is a hyper-parameter in the range + (0.0, 1.0] that controls overfitting via :ref:`shrinkage + ` . + + .. note:: + + Classification with more than 2 classes requires the induction + of ``n_classes`` regression trees at each iteration, + thus, the total number of induced trees equals + ``n_classes * n_estimators``. For datasets with a large number + of classes we strongly recommend to use + :class:`HistGradientBoostingClassifier` as an alternative to + :class:`GradientBoostingClassifier` . + +.. dropdown:: Regression + + :class:`GradientBoostingRegressor` supports a number of + :ref:`different loss functions ` + for regression which can be specified via the argument + ``loss``; the default loss function for regression is squared error + (``'squared_error'``). 
+ + :: + + >>> import numpy as np + >>> from sklearn.metrics import mean_squared_error + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.ensemble import GradientBoostingRegressor + + >>> X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0) + >>> X_train, X_test = X[:200], X[200:] + >>> y_train, y_test = y[:200], y[200:] + >>> est = GradientBoostingRegressor( + ... n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, + ... loss='squared_error' + ... ).fit(X_train, y_train) + >>> mean_squared_error(y_test, est.predict(X_test)) + 5.00 + + The figure below shows the results of applying :class:`GradientBoostingRegressor` + with least squares loss and 500 base learners to the diabetes dataset + (:func:`sklearn.datasets.load_diabetes`). + The plot shows the train and test error at each iteration. + The train error at each iteration is stored in the + `train_score_` attribute of the gradient boosting model. + The test error at each iteration can be obtained + via the :meth:`~GradientBoostingRegressor.staged_predict` method which returns a + generator that yields the predictions at each stage. Plots like these can be used + to determine the optimal number of trees (i.e. ``n_estimators``) by early stopping. + + .. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_gradient_boosting_regression_001.png + :target: ../auto_examples/ensemble/plot_gradient_boosting_regression.html + :align: center + :scale: 75 -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_regression.py` - * :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_oob.py` +* :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_regression.py` +* :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_oob.py` .. _gradient_boosting_warm_start: Fitting additional weak-learners --------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Both :class:`GradientBoostingRegressor` and :class:`GradientBoostingClassifier` support ``warm_start=True`` which allows you to add more estimators to an already @@ -586,15 +594,30 @@ fitted model. :: - >>> _ = est.set_params(n_estimators=200, warm_start=True) # set warm_start and new nr of trees + >>> import numpy as np + >>> from sklearn.metrics import mean_squared_error + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.ensemble import GradientBoostingRegressor + + >>> X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0) + >>> X_train, X_test = X[:200], X[200:] + >>> y_train, y_test = y[:200], y[200:] + >>> est = GradientBoostingRegressor( + ... n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, + ... loss='squared_error' + ... ) + >>> est = est.fit(X_train, y_train) # fit with 100 trees + >>> mean_squared_error(y_test, est.predict(X_test)) + 5.00 + >>> _ = est.set_params(n_estimators=200, warm_start=True) # set warm_start and increase num of trees >>> _ = est.fit(X_train, y_train) # fit additional 100 trees to est >>> mean_squared_error(y_test, est.predict(X_test)) - 3.84... + 3.84 .. _gradient_boosting_tree_size: Controlling the tree size -------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^ The size of the regression tree base learners defines the level of variable interactions that can be captured by the gradient boosting model. 
In general, @@ -617,54 +640,55 @@ We found that ``max_leaf_nodes=k`` gives comparable results to ``max_depth=k-1`` but is significantly faster to train at the expense of a slightly higher training error. The parameter ``max_leaf_nodes`` corresponds to the variable ``J`` in the -chapter on gradient boosting in [F2001]_ and is related to the parameter +chapter on gradient boosting in [Friedman2001]_ and is related to the parameter ``interaction.depth`` in R's gbm package where ``max_leaf_nodes == interaction.depth + 1`` . +.. _gradient_boosting_formulation: + Mathematical formulation -------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^ We first present GBRT for regression, and then detail the classification case. -Regression -^^^^^^^^^^ +.. dropdown:: Regression -GBRT regressors are additive models whose prediction :math:`y_i` for a -given input :math:`x_i` is of the following form: + GBRT regressors are additive models whose prediction :math:`\hat{y}_i` for a + given input :math:`x_i` is of the following form: .. math:: - \hat{y_i} = F_M(x_i) = \sum_{m=1}^{M} h_m(x_i) + \hat{y}_i = F_M(x_i) = \sum_{m=1}^{M} h_m(x_i) -where the :math:`h_m` are estimators called *weak learners* in the context -of boosting. Gradient Tree Boosting uses :ref:`decision tree regressors -` of fixed size as weak learners. The constant M corresponds to the -`n_estimators` parameter. + where the :math:`h_m` are estimators called *weak learners* in the context + of boosting. Gradient Tree Boosting uses :ref:`decision tree regressors + ` of fixed size as weak learners. The constant M corresponds to the + `n_estimators` parameter. -Similar to other boosting algorithms, a GBRT is built in a greedy fashion: + Similar to other boosting algorithms, a GBRT is built in a greedy fashion: .. math:: F_m(x) = F_{m-1}(x) + h_m(x), -where the newly added tree :math:`h_m` is fitted in order to minimize a sum -of losses :math:`L_m`, given the previous ensemble :math:`F_{m-1}`: + where the newly added tree :math:`h_m` is fitted in order to minimize a sum + of losses :math:`L_m`, given the previous ensemble :math:`F_{m-1}`: .. math:: h_m = \arg\min_{h} L_m = \arg\min_{h} \sum_{i=1}^{n} l(y_i, F_{m-1}(x_i) + h(x_i)), -where :math:`l(y_i, F(x_i))` is defined by the `loss` parameter, detailed -in the next section. + where :math:`l(y_i, F(x_i))` is defined by the `loss` parameter, detailed + in the next section. -By default, the initial model :math:`F_{0}` is chosen as the constant that -minimizes the loss: for a least-squares loss, this is the empirical mean of -the target values. The initial model can also be specified via the ``init`` -argument. + By default, the initial model :math:`F_{0}` is chosen as the constant that + minimizes the loss: for a least-squares loss, this is the empirical mean of + the target values. The initial model can also be specified via the ``init`` + argument. -Using a first-order Taylor approximation, the value of :math:`l` can be -approximated as follows: + Using a first-order Taylor approximation, the value of :math:`l` can be + approximated as follows: .. math:: @@ -673,113 +697,112 @@ approximated as follows: + h_m(x_i) \left[ \frac{\partial l(y_i, F(x_i))}{\partial F(x_i)} \right]_{F=F_{m - 1}}. -.. note:: + .. note:: - Briefly, a first-order Taylor approximation says that - :math:`l(z) \approx l(a) + (z - a) \frac{\partial l(a)}{\partial a}`. 
- Here, :math:`z` corresponds to :math:`F_{m - 1}(x_i) + h_m(x_i)`, and - :math:`a` corresponds to :math:`F_{m-1}(x_i)` + Briefly, a first-order Taylor approximation says that + :math:`l(z) \approx l(a) + (z - a) \frac{\partial l}{\partial z}(a)`. + Here, :math:`z` corresponds to :math:`F_{m - 1}(x_i) + h_m(x_i)`, and + :math:`a` corresponds to :math:`F_{m-1}(x_i)` -The quantity :math:`\left[ \frac{\partial l(y_i, F(x_i))}{\partial F(x_i)} -\right]_{F=F_{m - 1}}` is the derivative of the loss with respect to its -second parameter, evaluated at :math:`F_{m-1}(x)`. It is easy to compute for -any given :math:`F_{m - 1}(x_i)` in a closed form since the loss is -differentiable. We will denote it by :math:`g_i`. + The quantity :math:`\left[ \frac{\partial l(y_i, F(x_i))}{\partial F(x_i)} + \right]_{F=F_{m - 1}}` is the derivative of the loss with respect to its + second parameter, evaluated at :math:`F_{m-1}(x)`. It is easy to compute for + any given :math:`F_{m - 1}(x_i)` in a closed form since the loss is + differentiable. We will denote it by :math:`g_i`. -Removing the constant terms, we have: + Removing the constant terms, we have: .. math:: h_m \approx \arg\min_{h} \sum_{i=1}^{n} h(x_i) g_i -This is minimized if :math:`h(x_i)` is fitted to predict a value that is -proportional to the negative gradient :math:`-g_i`. Therefore, at each -iteration, **the estimator** :math:`h_m` **is fitted to predict the negative -gradients of the samples**. The gradients are updated at each iteration. -This can be considered as some kind of gradient descent in a functional -space. + This is minimized if :math:`h(x_i)` is fitted to predict a value that is + proportional to the negative gradient :math:`-g_i`. Therefore, at each + iteration, **the estimator** :math:`h_m` **is fitted to predict the negative + gradients of the samples**. The gradients are updated at each iteration. + This can be considered as some kind of gradient descent in a functional + space. -.. note:: + .. note:: - For some losses, e.g. the least absolute deviation (LAD) where the gradients - are :math:`\pm 1`, the values predicted by a fitted :math:`h_m` are not - accurate enough: the tree can only output integer values. As a result, the - leaves values of the tree :math:`h_m` are modified once the tree is - fitted, such that the leaves values minimize the loss :math:`L_m`. The - update is loss-dependent: for the LAD loss, the value of a leaf is updated - to the median of the samples in that leaf. + For some losses, e.g. ``'absolute_error'`` where the gradients + are :math:`\pm 1`, the values predicted by a fitted :math:`h_m` are not + accurate enough: the tree can only output integer values. As a result, the + leaves values of the tree :math:`h_m` are modified once the tree is + fitted, such that the leaves values minimize the loss :math:`L_m`. The + update is loss-dependent: for the absolute error loss, the value of + a leaf is updated to the median of the samples in that leaf. -Classification -^^^^^^^^^^^^^^ +.. dropdown:: Classification -Gradient boosting for classification is very similar to the regression case. -However, the sum of the trees :math:`F_M(x_i) = \sum_m h_m(x_i)` is not -homogeneous to a prediction: it cannot be a class, since the trees predict -continuous values. + Gradient boosting for classification is very similar to the regression case. + However, the sum of the trees :math:`F_M(x_i) = \sum_m h_m(x_i)` is not + homogeneous to a prediction: it cannot be a class, since the trees predict + continuous values. 
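+  On a fitted model, these raw continuous values :math:`F_M(x_i)` can be
+  inspected with :term:`decision_function`, while :term:`predict` returns
+  class labels. A minimal sketch on toy data::
+
+    >>> from sklearn.datasets import make_hastie_10_2
+    >>> from sklearn.ensemble import GradientBoostingClassifier
+
+    >>> X, y = make_hastie_10_2(random_state=0)
+    >>> clf = GradientBoostingClassifier(n_estimators=10, random_state=0).fit(X, y)
+    >>> raw_values = clf.decision_function(X[:2])  # continuous values, not classes
+    >>> class_labels = clf.predict(X[:2])          # raw values mapped to class labels
+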
-The mapping from the value :math:`F_M(x_i)` to a class or a probability is -loss-dependent. For the deviance (or log-loss), the probability that -:math:`x_i` belongs to the positive class is modeled as :math:`p(y_i = 1 | -x_i) = \sigma(F_M(x_i))` where :math:`\sigma` is the sigmoid function. + The mapping from the value :math:`F_M(x_i)` to a class or a probability is + loss-dependent. For the log-loss, the probability that + :math:`x_i` belongs to the positive class is modeled as :math:`p(y_i = 1 | + x_i) = \sigma(F_M(x_i))` where :math:`\sigma` is the sigmoid or expit function. -For multiclass classification, K trees (for K classes) are built at each of -the :math:`M` iterations. The probability that :math:`x_i` belongs to class -k is modeled as a softmax of the :math:`F_{M,k}(x_i)` values. + For multiclass classification, K trees (for K classes) are built at each of + the :math:`M` iterations. The probability that :math:`x_i` belongs to class + k is modeled as a softmax of the :math:`F_{M,k}(x_i)` values. -Note that even for a classification task, the :math:`h_m` sub-estimator is -still a regressor, not a classifier. This is because the sub-estimators are -trained to predict (negative) *gradients*, which are always continuous -quantities. + Note that even for a classification task, the :math:`h_m` sub-estimator is + still a regressor, not a classifier. This is because the sub-estimators are + trained to predict (negative) *gradients*, which are always continuous + quantities. .. _gradient_boosting_loss: Loss Functions --------------- +^^^^^^^^^^^^^^ The following loss functions are supported and can be specified using the parameter ``loss``: - * Regression - - * Least squares (``'ls'``): The natural choice for regression due - to its superior computational properties. The initial model is - given by the mean of the target values. - * Least absolute deviation (``'lad'``): A robust loss function for - regression. The initial model is given by the median of the - target values. - * Huber (``'huber'``): Another robust loss function that combines - least squares and least absolute deviation; use ``alpha`` to - control the sensitivity with regards to outliers (see [F2001]_ for - more details). - * Quantile (``'quantile'``): A loss function for quantile regression. - Use ``0 < alpha < 1`` to specify the quantile. This loss function - can be used to create prediction intervals - (see :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`). - - * Classification - - * Binomial deviance (``'deviance'``): The negative binomial - log-likelihood loss function for binary classification (provides - probability estimates). The initial model is given by the - log odds-ratio. - * Multinomial deviance (``'deviance'``): The negative multinomial - log-likelihood loss function for multi-class classification with - ``n_classes`` mutually exclusive classes. It provides - probability estimates. The initial model is given by the - prior probability of each class. At each iteration ``n_classes`` - regression trees have to be constructed which makes GBRT rather - inefficient for data sets with a large number of classes. - * Exponential loss (``'exponential'``): The same loss function - as :class:`AdaBoostClassifier`. Less robust to mislabeled - examples than ``'deviance'``; can only be used for binary - classification. +.. dropdown:: Regression + + * Squared error (``'squared_error'``): The natural choice for regression + due to its superior computational properties. 
The initial model is + given by the mean of the target values. + * Absolute error (``'absolute_error'``): A robust loss function for + regression. The initial model is given by the median of the + target values. + * Huber (``'huber'``): Another robust loss function that combines + least squares and least absolute deviation; use ``alpha`` to + control the sensitivity with regards to outliers (see [Friedman2001]_ for + more details). + * Quantile (``'quantile'``): A loss function for quantile regression. + Use ``0 < alpha < 1`` to specify the quantile. This loss function + can be used to create prediction intervals + (see :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`). + +.. dropdown:: Classification + + * Binary log-loss (``'log-loss'``): The binomial + negative log-likelihood loss function for binary classification. It provides + probability estimates. The initial model is given by the + log odds-ratio. + * Multi-class log-loss (``'log-loss'``): The multinomial + negative log-likelihood loss function for multi-class classification with + ``n_classes`` mutually exclusive classes. It provides + probability estimates. The initial model is given by the + prior probability of each class. At each iteration ``n_classes`` + regression trees have to be constructed which makes GBRT rather + inefficient for data sets with a large number of classes. + * Exponential loss (``'exponential'``): The same loss function + as :class:`AdaBoostClassifier`. Less robust to mislabeled + examples than ``'log-loss'``; can only be used for binary + classification. .. _gradient_boosting_shrinkage: Shrinkage via learning rate ---------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[F2001]_ proposed a simple regularization strategy that scales +[Friedman2001]_ proposed a simple regularization strategy that scales the contribution of each weak learner by a constant factor :math:`\nu`: .. math:: @@ -787,7 +810,7 @@ the contribution of each weak learner by a constant factor :math:`\nu`: F_m(x) = F_{m-1}(x) + \nu h_m(x) The parameter :math:`\nu` is also called the **learning rate** because -it scales the step length the gradient descent procedure; it can +it scales the step length of the gradient descent procedure; it can be set via the ``learning_rate`` parameter. The parameter ``learning_rate`` strongly interacts with the parameter @@ -796,14 +819,16 @@ of ``learning_rate`` require larger numbers of weak learners to maintain a constant training error. Empirical evidence suggests that small values of ``learning_rate`` favor better test error. [HTF]_ recommend to set the learning rate to a small constant -(e.g. ``learning_rate <= 0.1``) and choose ``n_estimators`` by early -stopping. For a more detailed discussion of the interaction between +(e.g. ``learning_rate <= 0.1``) and choose ``n_estimators`` large enough +that early stopping applies, +see :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_early_stopping.py` +for a more detailed discussion of the interaction between ``learning_rate`` and ``n_estimators`` see [R2007]_. Subsampling ------------ +^^^^^^^^^^^^ -[F1999]_ proposed stochastic gradient boosting, which combines gradient +[Friedman2002]_ proposed stochastic gradient boosting, which combines gradient boosting with bootstrap averaging (bagging). At each iteration the base classifier is trained on a fraction ``subsample`` of the available training data. The subsample is drawn without replacement. @@ -821,7 +846,7 @@ does poorly. 
:scale: 75 Another strategy to reduce the variance is by subsampling the features -analogous to the random splits in :class:`RandomForestClassifier` . +analogous to the random splits in :class:`RandomForestClassifier`. The number of subsampled features can be controlled via the ``max_features`` parameter. @@ -830,23 +855,22 @@ parameter. Stochastic gradient boosting allows to compute out-of-bag estimates of the test deviance by computing the improvement in deviance on the examples that are not included in the bootstrap sample (i.e. the out-of-bag examples). -The improvements are stored in the attribute -:attr:`~GradientBoostingRegressor.oob_improvement_`. ``oob_improvement_[i]`` holds -the improvement in terms of the loss on the OOB samples if you add the i-th stage -to the current predictions. +The improvements are stored in the attribute `oob_improvement_`. +``oob_improvement_[i]`` holds the improvement in terms of the loss on the OOB samples +if you add the i-th stage to the current predictions. Out-of-bag estimates can be used for model selection, for example to determine the optimal number of iterations. OOB estimates are usually very pessimistic thus we recommend to use cross-validation instead and only use OOB if cross-validation is too time consuming. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_regularization.py` - * :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_oob.py` - * :ref:`sphx_glr_auto_examples_ensemble_plot_ensemble_oob.py` +* :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_regularization.py` +* :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_oob.py` +* :ref:`sphx_glr_auto_examples_ensemble_plot_ensemble_oob.py` Interpretation with feature importance --------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Individual decision trees can be interpreted easily by simply visualizing the tree structure. Gradient boosting models, however, @@ -859,7 +883,7 @@ Often features do not contribute equally to predict the target response; in many situations the majority of the features are in fact irrelevant. When interpreting a model, the first question usually is: what are -those important features and how do they contributing in predicting +those important features and how do they contribute in predicting the target response? Individual decision trees intrinsically perform feature selection by selecting @@ -880,352 +904,420 @@ accessed via the ``feature_importances_`` property:: >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, ... max_depth=1, random_state=0).fit(X, y) >>> clf.feature_importances_ - array([0.10..., 0.10..., 0.11..., ... + array([0.107, 0.105, 0.113, 0.0987, 0.0947, + 0.107, 0.0916, 0.0972, 0.0958, 0.0906]) Note that this computation of feature importance is based on entropy, and it is distinct from :func:`sklearn.inspection.permutation_importance` which is based on permutation of the features. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_regression.py` +* :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_regression.py` -.. _histogram_based_gradient_boosting: +.. rubric:: References -Histogram-Based Gradient Boosting -================================= +.. [Friedman2001] Friedman, J.H. (2001). :doi:`Greedy function approximation: A gradient + boosting machine <10.1214/aos/1013203451>`. + Annals of Statistics, 29, 1189-1232. 
-Scikit-learn 0.21 introduced two new experimental implementations of -gradient boosting trees, namely :class:`HistGradientBoostingClassifier` -and :class:`HistGradientBoostingRegressor`, inspired by -`LightGBM `__ (See [LightGBM]_). +.. [Friedman2002] Friedman, J.H. (2002). `Stochastic gradient boosting. + `_. + Computational Statistics & Data Analysis, 38, 367-378. -These histogram-based estimators can be **orders of magnitude faster** -than :class:`GradientBoostingClassifier` and -:class:`GradientBoostingRegressor` when the number of samples is larger -than tens of thousands of samples. +.. [R2007] G. Ridgeway (2006). `Generalized Boosted Models: A guide to the gbm + package `_ -They also have built-in support for missing values, which avoids the need -for an imputer. +.. _forest: -These fast estimators first bin the input samples ``X`` into -integer-valued bins (typically 256 bins) which tremendously reduces the -number of splitting points to consider, and allows the algorithm to -leverage integer-based data structures (histograms) instead of relying on -sorted continuous values when building the trees. The API of these -estimators is slightly different, and some of the features from -:class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor` -are not yet supported, for instance some loss functions. +Random forests and other randomized tree ensembles +=================================================== -These estimators are still **experimental**: their predictions -and their API might change without any deprecation cycle. To use them, you -need to explicitly import ``enable_hist_gradient_boosting``:: +The :mod:`sklearn.ensemble` module includes two averaging algorithms based +on randomized :ref:`decision trees `: the RandomForest algorithm +and the Extra-Trees method. Both algorithms are perturb-and-combine +techniques [B1998]_ specifically designed for trees. This means a diverse +set of classifiers is created by introducing randomness in the classifier +construction. The prediction of the ensemble is given as the averaged +prediction of the individual classifiers. - >>> # explicitly require this experimental feature - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa - >>> # now you can import normally from ensemble - >>> from sklearn.ensemble import HistGradientBoostingClassifier +As other classifiers, forest classifiers have to be fitted with two +arrays: a sparse or dense array X of shape ``(n_samples, n_features)`` +holding the training samples, and an array Y of shape ``(n_samples,)`` +holding the target values (class labels) for the training samples:: -.. topic:: Examples: + >>> from sklearn.ensemble import RandomForestClassifier + >>> X = [[0, 0], [1, 1]] + >>> Y = [0, 1] + >>> clf = RandomForestClassifier(n_estimators=10) + >>> clf = clf.fit(X, Y) - * :ref:`sphx_glr_auto_examples_inspection_plot_partial_dependence.py` +Like :ref:`decision trees `, forests of trees also extend to +:ref:`multi-output problems ` (if Y is an array +of shape ``(n_samples, n_outputs)``). -Usage ------ +Random Forests +-------------- -Most of the parameters are unchanged from -:class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor`. 
-One exception is the ``max_iter`` parameter that replaces ``n_estimators``, and -controls the number of iterations of the boosting process:: +In random forests (see :class:`RandomForestClassifier` and +:class:`RandomForestRegressor` classes), each tree in the ensemble is built +from a sample drawn with replacement (i.e., a bootstrap sample) from the +training set. - >>> from sklearn.experimental import enable_hist_gradient_boosting - >>> from sklearn.ensemble import HistGradientBoostingClassifier - >>> from sklearn.datasets import make_hastie_10_2 +Furthermore, when splitting each node during the construction of a tree, the +best split is found through an exhaustive search of the features values of +either all input features or a random subset of size ``max_features``. +(See the :ref:`parameter tuning guidelines ` for more details.) - >>> X, y = make_hastie_10_2(random_state=0) - >>> X_train, X_test = X[:2000], X[2000:] - >>> y_train, y_test = y[:2000], y[2000:] +The purpose of these two sources of randomness is to decrease the variance of +the forest estimator. Indeed, individual decision trees typically exhibit high +variance and tend to overfit. The injected randomness in forests yield decision +trees with somewhat decoupled prediction errors. By taking an average of those +predictions, some errors can cancel out. Random forests achieve a reduced +variance by combining diverse trees, sometimes at the cost of a slight increase +in bias. In practice the variance reduction is often significant hence yielding +an overall better model. - >>> clf = HistGradientBoostingClassifier(max_iter=100).fit(X_train, y_train) - >>> clf.score(X_test, y_test) - 0.8965 +In contrast to the original publication [B2001]_, the scikit-learn +implementation combines classifiers by averaging their probabilistic +prediction, instead of letting each classifier vote for a single class. -Available losses for regression are 'least_squares', -'least_absolute_deviation', which is less sensitive to outliers, and -'poisson', which is well suited to model counts and frequencies. For -classification, 'binary_crossentropy' is used for binary classification and -'categorical_crossentropy' is used for multiclass classification. By default -the loss is 'auto' and will select the appropriate loss depending on -:term:`y` passed to :term:`fit`. +A competitive alternative to random forests are +:ref:`histogram_based_gradient_boosting` (HGBT) models: -The size of the trees can be controlled through the ``max_leaf_nodes``, -``max_depth``, and ``min_samples_leaf`` parameters. +- Building trees: Random forests typically rely on deep trees (that overfit + individually) which uses much computational resources, as they require + several splittings and evaluations of candidate splits. Boosting models + build shallow trees (that underfit individually) which are faster to fit + and predict. -The number of bins used to bin the data is controlled with the ``max_bins`` -parameter. Using less bins acts as a form of regularization. It is -generally recommended to use as many bins as possible, which is the default. +- Sequential boosting: In HGBT, the decision trees are built sequentially, + where each tree is trained to correct the errors made by the previous ones. + This allows them to iteratively improve the model's performance using + relatively few trees. In contrast, random forests use a majority vote to + predict the outcome, which can require a larger number of trees to achieve + the same level of accuracy. 
-The ``l2_regularization`` parameter is a regularizer on the loss function and -corresponds to :math:`\lambda` in equation (2) of [XGBoost]_. +- Efficient binning: HGBT uses an efficient binning algorithm that can handle + large datasets with a high number of features. The binning algorithm can + pre-process the data to speed up the subsequent tree construction (see + :ref:`Why it's faster `). In contrast, the scikit-learn + implementation of random forests does not use binning and relies on exact + splitting, which can be computationally expensive. -Note that **early-stopping is enabled by default if the number of samples is -larger than 10,000**. The early-stopping behaviour is controlled via the -``early-stopping``, ``scoring``, ``validation_fraction``, -``n_iter_no_change``, and ``tol`` parameters. It is possible to early-stop -using an arbitrary :term:`scorer`, or just the training or validation loss. -Note that for technical reasons, using a scorer is significantly slower than -using the loss. By default, early-stopping is performed if there are at least -10,000 samples in the training set, using the validation loss. +Overall, the computational cost of HGBT versus RF depends on the specific +characteristics of the dataset and the modeling task. It's a good idea +to try both models and compare their performance and computational efficiency +on your specific problem to determine which model is the best fit. -Missing values support ----------------------- +.. rubric:: Examples -:class:`HistGradientBoostingClassifier` and -:class:`HistGradientBoostingRegressor` have built-in support for missing -values (NaNs). +* :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py` -During training, the tree grower learns at each split point whether samples -with missing values should go to the left or right child, based on the -potential gain. When predicting, samples with missing values are assigned to -the left or right child consequently:: +Extremely Randomized Trees +-------------------------- - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa - >>> from sklearn.ensemble import HistGradientBoostingClassifier - >>> import numpy as np +In extremely randomized trees (see :class:`ExtraTreesClassifier` +and :class:`ExtraTreesRegressor` classes), randomness goes one step +further in the way splits are computed. As in random forests, a random +subset of candidate features is used, but instead of looking for the +most discriminative thresholds, thresholds are drawn at random for each +candidate feature and the best of these randomly-generated thresholds is +picked as the splitting rule. This usually allows to reduce the variance +of the model a bit more, at the expense of a slightly greater increase +in bias:: - >>> X = np.array([0, 1, 2, np.nan]).reshape(-1, 1) - >>> y = [0, 0, 1, 1] + >>> from sklearn.model_selection import cross_val_score + >>> from sklearn.datasets import make_blobs + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.ensemble import ExtraTreesClassifier + >>> from sklearn.tree import DecisionTreeClassifier - >>> gbdt = HistGradientBoostingClassifier(min_samples_leaf=1).fit(X, y) - >>> gbdt.predict(X) - array([0, 0, 1, 1]) + >>> X, y = make_blobs(n_samples=10000, n_features=10, centers=100, + ... random_state=0) -When the missingness pattern is predictive, the splits can be done on -whether the feature value is missing or not:: + >>> clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2, + ... 
random_state=0) + >>> scores = cross_val_score(clf, X, y, cv=5) + >>> scores.mean() + np.float64(0.98) - >>> X = np.array([0, np.nan, 1, 2, np.nan]).reshape(-1, 1) - >>> y = [0, 1, 0, 0, 1] - >>> gbdt = HistGradientBoostingClassifier(min_samples_leaf=1, - ... max_depth=2, - ... learning_rate=1, - ... max_iter=1).fit(X, y) - >>> gbdt.predict(X) - array([0, 1, 0, 0, 1]) + >>> clf = RandomForestClassifier(n_estimators=10, max_depth=None, + ... min_samples_split=2, random_state=0) + >>> scores = cross_val_score(clf, X, y, cv=5) + >>> scores.mean() + np.float64(0.999) -If no missing values were encountered for a given feature during training, -then samples with missing values are mapped to whichever child has the most -samples. + >>> clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, + ... min_samples_split=2, random_state=0) + >>> scores = cross_val_score(clf, X, y, cv=5) + >>> scores.mean() > 0.999 + np.True_ -.. _sw_hgbdt: +.. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_forest_iris_001.png + :target: ../auto_examples/ensemble/plot_forest_iris.html + :align: center + :scale: 75% -Sample weight support ---------------------- +.. _random_forest_parameters: + +Parameters +---------- + +The main parameters to adjust when using these methods is ``n_estimators`` and +``max_features``. The former is the number of trees in the forest. The larger +the better, but also the longer it will take to compute. In addition, note that +results will stop getting significantly better beyond a critical number of +trees. The latter is the size of the random subsets of features to consider +when splitting a node. The lower the greater the reduction of variance, but +also the greater the increase in bias. Empirical good default values are +``max_features=1.0`` or equivalently ``max_features=None`` (always considering +all features instead of a random subset) for regression problems, and +``max_features="sqrt"`` (using a random subset of size ``sqrt(n_features)``) +for classification tasks (where ``n_features`` is the number of features in +the data). The default value of ``max_features=1.0`` is equivalent to bagged +trees and more randomness can be achieved by setting smaller values (e.g. 0.3 +is a typical default in the literature). Good results are often achieved when +setting ``max_depth=None`` in combination with ``min_samples_split=2`` (i.e., +when fully developing the trees). Bear in mind though that these values are +usually not optimal, and might result in models that consume a lot of RAM. +The best parameter values should always be cross-validated. In addition, note +that in random forests, bootstrap samples are used by default +(``bootstrap=True``) while the default strategy for extra-trees is to use the +whole dataset (``bootstrap=False``). When using bootstrap sampling the +generalization error can be estimated on the left out or out-of-bag samples. +This can be enabled by setting ``oob_score=True``. + +.. note:: + + The size of the model with the default parameters is :math:`O( M * N * log (N) )`, + where :math:`M` is the number of trees and :math:`N` is the number of samples. + In order to reduce the size of the model, you can change these parameters: + ``min_samples_split``, ``max_leaf_nodes``, ``max_depth`` and ``min_samples_leaf``. + +Parallelization +--------------- + +Finally, this module also features the parallel construction of the trees +and the parallel computation of the predictions through the ``n_jobs`` +parameter. 
If ``n_jobs=k`` then computations are partitioned into +``k`` jobs, and run on ``k`` cores of the machine. If ``n_jobs=-1`` +then all cores available on the machine are used. Note that because of +inter-process communication overhead, the speedup might not be linear +(i.e., using ``k`` jobs will unfortunately not be ``k`` times as +fast). Significant speedup can still be achieved though when building +a large number of trees, or when building a single tree requires a fair +amount of time (e.g., on large datasets). + +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_ensemble_plot_forest_iris.py` +* :ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py` + +.. rubric:: References + +.. [B2001] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001. + +.. [B1998] L. Breiman, "Arcing Classifiers", Annals of Statistics 1998. + +* P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized + trees", Machine Learning, 63(1), 3-42, 2006. + +.. _random_forest_feature_importance: + +Feature importance evaluation +----------------------------- + +The relative rank (i.e. depth) of a feature used as a decision node in a +tree can be used to assess the relative importance of that feature with +respect to the predictability of the target variable. Features used at +the top of the tree contribute to the final prediction decision of a +larger fraction of the input samples. The **expected fraction of the +samples** they contribute to can thus be used as an estimate of the +**relative importance of the features**. In scikit-learn, the fraction of +samples a feature contributes to is combined with the decrease in impurity +from splitting them to create a normalized estimate of the predictive power +of that feature. + +By **averaging** the estimates of predictive ability over several randomized +trees one can **reduce the variance** of such an estimate and use it +for feature selection. This is known as the mean decrease in impurity, or MDI. +Refer to [L2014]_ for more information on MDI and feature importance +evaluation with Random Forests. + +.. warning:: + + The impurity-based feature importances computed on tree-based models suffer + from two flaws that can lead to misleading conclusions. First they are + computed on statistics derived from the training dataset and therefore **do + not necessarily inform us on which features are most important to make good + predictions on held-out dataset**. Secondly, **they favor high cardinality + features**, that is features with many unique values. + :ref:`permutation_importance` is an alternative to impurity-based feature + importance that does not suffer from these flaws. These two methods of + obtaining feature importance are explored in: + :ref:`sphx_glr_auto_examples_inspection_plot_permutation_importance.py`. -:class:`HistGradientBoostingClassifier` and -:class:`HistGradientBoostingRegressor` sample support weights during -:term:`fit`. +In practice those estimates are stored as an attribute named +``feature_importances_`` on the fitted model. This is an array with shape +``(n_features,)`` whose values are positive and sum to 1.0. The higher +the value, the more important is the contribution of the matching feature +to the prediction function. -The following toy example demonstrates how the model ignores the samples with -zero sample weights: +.. rubric:: Examples - >>> X = [[1, 0], - ... [1, 0], - ... [1, 0], - ... 
[0, 1]] - >>> y = [0, 0, 1, 0] - >>> # ignore the first 2 training samples by setting their weight to 0 - >>> sample_weight = [0, 0, 1, 1] - >>> gb = HistGradientBoostingClassifier(min_samples_leaf=1) - >>> gb.fit(X, y, sample_weight=sample_weight) - HistGradientBoostingClassifier(...) - >>> gb.predict([[1, 0]]) - array([1]) - >>> gb.predict_proba([[1, 0]])[0, 1] - 0.99... +* :ref:`sphx_glr_auto_examples_ensemble_plot_forest_importances.py` -As you can see, the `[1, 0]` is comfortably classified as `1` since the first -two samples are ignored due to their sample weights. +.. rubric:: References -Implementation detail: taking sample weights into account amounts to -multiplying the gradients (and the hessians) by the sample weights. Note that -the binning stage (specifically the quantiles computation) does not take the -weights into account. +.. [L2014] G. Louppe, :arxiv:`"Understanding Random Forests: From Theory to + Practice" <1407.7502>`, + PhD Thesis, U. of Liege, 2014. -.. _categorical_support_gbdt: +.. _random_trees_embedding: -Categorical Features Support ----------------------------- +Totally Random Trees Embedding +------------------------------ -:class:`HistGradientBoostingClassifier` and -:class:`HistGradientBoostingRegressor` have native support for categorical -features: they can consider splits on non-ordered, categorical data. +:class:`RandomTreesEmbedding` implements an unsupervised transformation of the +data. Using a forest of completely random trees, :class:`RandomTreesEmbedding` +encodes the data by the indices of the leaves a data point ends up in. This +index is then encoded in a one-of-K manner, leading to a high dimensional, +sparse binary coding. +This coding can be computed very efficiently and can then be used as a basis +for other learning tasks. +The size and sparsity of the code can be influenced by choosing the number of +trees and the maximum depth per tree. For each tree in the ensemble, the coding +contains one entry of one. The size of the coding is at most ``n_estimators * 2 +** max_depth``, the maximum number of leaves in the forest. -For datasets with categorical features, using the native categorical support -is often better than relying on one-hot encoding -(:class:`~sklearn.preprocessing.OneHotEncoder`), because one-hot encoding -requires more tree depth to achieve equivalent splits. It is also usually -better to rely on the native categorical support rather than to treat -categorical features as continuous (ordinal), which happens for ordinal-encoded -categorical data, since categories are nominal quantities where order does not -matter. +As neighboring data points are more likely to lie within the same leaf of a +tree, the transformation performs an implicit, non-parametric density +estimation. -To enable categorical support, a boolean mask can be passed to the -`categorical_features` parameter, indicating which feature is categorical. In -the following, the first feature will be treated as categorical and the -second feature as numerical:: +.. rubric:: Examples - >>> gbdt = HistGradientBoostingClassifier(categorical_features=[True, False]) +* :ref:`sphx_glr_auto_examples_ensemble_plot_random_forest_embedding.py` -Equivalently, one can pass a list of integers indicating the indices of the -categorical features:: +* :ref:`sphx_glr_auto_examples_manifold_plot_lle_digits.py` compares non-linear + dimensionality reduction techniques on handwritten digits. 
- >>> gbdt = HistGradientBoostingClassifier(categorical_features=[0]) +* :ref:`sphx_glr_auto_examples_ensemble_plot_feature_transformation.py` compares + supervised and unsupervised tree based feature transformations. -The cardinality of each categorical feature should be less than the `max_bins` -parameter, and each categorical feature is expected to be encoded in -`[0, max_bins - 1]`. To that end, it might be useful to pre-process the data -with an :class:`~sklearn.preprocessing.OrdinalEncoder` as done in -:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`. +.. seealso:: -If there are missing values during training, the missing values will be -treated as a proper category. If there are no missing values during training, -then at prediction time, missing values are mapped to the child node that has -the most samples (just like for continuous features). When predicting, -categories that were not seen during fit time will be treated as missing -values. + :ref:`manifold` techniques can also be useful to derive non-linear + representations of feature space, also these approaches focus also on + dimensionality reduction. -**Split finding with categorical features**: The canonical way of considering -categorical splits in a tree is to consider -all of the :math:`2^{K - 1} - 1` partitions, where :math:`K` is the number of -categories. This can quickly become prohibitive when :math:`K` is large. -Fortunately, since gradient boosting trees are always regression trees (even -for classification problems), there exist a faster strategy that can yield -equivalent splits. First, the categories of a feature are sorted according to -the variance of the target, for each category `k`. Once the categories are -sorted, one can consider *continuous partitions*, i.e. treat the categories -as if they were ordered continuous values (see Fisher [Fisher1958]_ for a -formal proof). As a result, only :math:`K - 1` splits need to be considered -instead of :math:`2^{K - 1} - 1`. The initial sorting is a -:math:`\mathcal{O}(K \log(K))` operation, leading to a total complexity of -:math:`\mathcal{O}(K \log(K) + K)`, instead of :math:`\mathcal{O}(2^K)`. - -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py` +.. _tree_ensemble_warm_start: -.. _monotonic_cst_gbdt: +Fitting additional trees +------------------------ -Monotonic Constraints ---------------------- +RandomForest, Extra-Trees and :class:`RandomTreesEmbedding` estimators all support +``warm_start=True`` which allows you to add more trees to an already fitted model. -Depending on the problem at hand, you may have prior knowledge indicating -that a given feature should in general have a positive (or negative) effect -on the target value. For example, all else being equal, a higher credit -score should increase the probability of getting approved for a loan. -Monotonic constraints allow you to incorporate such prior knowledge into the -model. 
+:: -A positive monotonic constraint is a constraint of the form: + >>> from sklearn.datasets import make_classification + >>> from sklearn.ensemble import RandomForestClassifier + + >>> X, y = make_classification(n_samples=100, random_state=1) + >>> clf = RandomForestClassifier(n_estimators=10) + >>> clf = clf.fit(X, y) # fit with 10 trees + >>> len(clf.estimators_) + 10 + >>> # set warm_start and increase num of estimators + >>> _ = clf.set_params(n_estimators=20, warm_start=True) + >>> _ = clf.fit(X, y) # fit additional 10 trees + >>> len(clf.estimators_) + 20 + +When ``random_state`` is also set, the internal random state is also preserved +between ``fit`` calls. This means that training a model once with ``n`` estimators is +the same as building the model iteratively via multiple ``fit`` calls, where the +final number of estimators is equal to ``n``. -:math:`x_1 \leq x_1' \implies F(x_1, x_2) \leq F(x_1', x_2)`, -where :math:`F` is the predictor with two features. +:: -Similarly, a negative monotonic constraint is of the form: + >>> clf = RandomForestClassifier(n_estimators=20) # set `n_estimators` to 10 + 10 + >>> _ = clf.fit(X, y) # fit `estimators_` will be the same as `clf` above -:math:`x_1 \leq x_1' \implies F(x_1, x_2) \geq F(x_1', x_2)`. +Note that this differs from the usual behavior of :term:`random_state` in that it does +*not* result in the same result across different calls. -Note that monotonic constraints only constraint the output "all else being -equal". Indeed, the following relation **is not enforced** by a positive -constraint: :math:`x_1 \leq x_1' \implies F(x_1, x_2) \leq F(x_1', x_2')`. +.. _bagging: -You can specify a monotonic constraint on each feature using the -`monotonic_cst` parameter. For each feature, a value of 0 indicates no -constraint, while -1 and 1 indicate a negative and positive constraint, -respectively:: +Bagging meta-estimator +====================== - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa - >>> from sklearn.ensemble import HistGradientBoostingRegressor +In ensemble algorithms, bagging methods form a class of algorithms which build +several instances of a black-box estimator on random subsets of the original +training set and then aggregate their individual predictions to form a final +prediction. These methods are used as a way to reduce the variance of a base +estimator (e.g., a decision tree), by introducing randomization into its +construction procedure and then making an ensemble out of it. In many cases, +bagging methods constitute a very simple way to improve with respect to a +single model, without making it necessary to adapt the underlying base +algorithm. As they provide a way to reduce overfitting, bagging methods work +best with strong and complex models (e.g., fully developed decision trees), in +contrast with boosting methods which usually work best with weak models (e.g., +shallow decision trees). - ... # positive, negative, and no constraint on the 3 features - >>> gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1, 0]) +Bagging methods come in many flavours but mostly differ from each other by the +way they draw random subsets of the training set: -In a binary classification context, imposing a monotonic constraint means -that the feature is supposed to have a positive / negative effect on the -probability to belong to the positive class. Monotonic constraints are not -supported for multiclass context. 
+* When random subsets of the dataset are drawn as random subsets of the + samples, then this algorithm is known as Pasting [B1999]_. -.. note:: - Since categories are unordered quantities, it is not possible to enforce - monotonic constraints on categorical features. +* When samples are drawn with replacement, then the method is known as + Bagging [B1996]_. -.. topic:: Examples: +* When random subsets of the dataset are drawn as random subsets of + the features, then the method is known as Random Subspaces [H1998]_. - * :ref:`sphx_glr_auto_examples_ensemble_plot_monotonic_constraints.py` +* Finally, when base estimators are built on subsets of both samples and + features, then the method is known as Random Patches [LG2012]_. -Low-level parallelism ---------------------- +In scikit-learn, bagging methods are offered as a unified +:class:`BaggingClassifier` meta-estimator (resp. :class:`BaggingRegressor`), +taking as input a user-specified estimator along with parameters +specifying the strategy to draw random subsets. In particular, ``max_samples`` +and ``max_features`` control the size of the subsets (in terms of samples and +features), while ``bootstrap`` and ``bootstrap_features`` control whether +samples and features are drawn with or without replacement. When using a subset +of the available samples the generalization accuracy can be estimated with the +out-of-bag samples by setting ``oob_score=True``. As an example, the +snippet below illustrates how to instantiate a bagging ensemble of +:class:`~sklearn.neighbors.KNeighborsClassifier` estimators, each built on random +subsets of 50% of the samples and 50% of the features. -:class:`HistGradientBoostingClassifier` and -:class:`HistGradientBoostingRegressor` have implementations that use OpenMP -for parallelization through Cython. For more details on how to control the -number of threads, please refer to our :ref:`parallelism` notes. + >>> from sklearn.ensemble import BaggingClassifier + >>> from sklearn.neighbors import KNeighborsClassifier + >>> bagging = BaggingClassifier(KNeighborsClassifier(), + ... max_samples=0.5, max_features=0.5) -The following parts are parallelized: +.. rubric:: Examples -- mapping samples from real values to integer-valued bins (finding the bin - thresholds is however sequential) -- building histograms is parallelized over features -- finding the best split point at a node is parallelized over features -- during fit, mapping samples into the left and right children is - parallelized over samples -- gradient and hessians computations are parallelized over samples -- predicting is parallelized over samples +* :ref:`sphx_glr_auto_examples_ensemble_plot_bias_variance.py` -Why it's faster ---------------- +.. rubric:: References -The bottleneck of a gradient boosting procedure is building the decision -trees. Building a traditional decision tree (as in the other GBDTs -:class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor`) -requires sorting the samples at each node (for -each feature). Sorting is needed so that the potential gain of a split point -can be computed efficiently. Splitting a single node has thus a complexity -of :math:`\mathcal{O}(n_\text{features} \times n \log(n))` where :math:`n` -is the number of samples at the node. +.. [B1999] L. Breiman, "Pasting small votes for classification in large + databases and on-line", Machine Learning, 36(1), 85-103, 1999. 
-:class:`HistGradientBoostingClassifier` and -:class:`HistGradientBoostingRegressor`, in contrast, do not require sorting the -feature values and instead use a data-structure called a histogram, where the -samples are implicitly ordered. Building a histogram has a -:math:`\mathcal{O}(n)` complexity, so the node splitting procedure has a -:math:`\mathcal{O}(n_\text{features} \times n)` complexity, much smaller -than the previous one. In addition, instead of considering :math:`n` split -points, we here consider only ``max_bins`` split points, which is much -smaller. +.. [B1996] L. Breiman, "Bagging predictors", Machine Learning, 24(2), + 123-140, 1996. -In order to build histograms, the input data `X` needs to be binned into -integer-valued bins. This binning procedure does require sorting the feature -values, but it only happens once at the very beginning of the boosting process -(not at each node, like in :class:`GradientBoostingClassifier` and -:class:`GradientBoostingRegressor`). +.. [H1998] T. Ho, "The random subspace method for constructing decision + forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844, 1998. -Finally, many parts of the implementation of -:class:`HistGradientBoostingClassifier` and -:class:`HistGradientBoostingRegressor` are parallelized. +.. [LG2012] G. Louppe and P. Geurts, "Ensembles on Random Patches", + Machine Learning and Knowledge Discovery in Databases, 346-361, 2012. -.. topic:: References - .. [F1999] Friedmann, Jerome H., 2007, `"Stochastic Gradient Boosting" - `_ - .. [R2007] G. Ridgeway, "Generalized Boosted Models: A guide to the gbm - package", 2007 - .. [XGBoost] Tianqi Chen, Carlos Guestrin, `"XGBoost: A Scalable Tree - Boosting System" `_ - .. [LightGBM] Ke et. al. `"LightGBM: A Highly Efficient Gradient - BoostingDecision Tree" `_ - .. [Fisher1958] Walter D. Fisher. `"On Grouping for Maximum Homogeneity" - `_ .. _voting_classifier: @@ -1235,7 +1327,7 @@ Voting Classifier The idea behind the :class:`VotingClassifier` is to combine conceptually different machine learning classifiers and use a majority vote or the average predicted probabilities (soft vote) to predict the class labels. -Such a classifier can be useful for a set of equally well performing model +Such a classifier can be useful for a set of equally well performing models in order to balance out their individual weaknesses. @@ -1308,7 +1400,7 @@ and averaged. The final class label is then derived from the class label with the highest average probability. To illustrate this with a simple example, let's assume we have 3 -classifiers and a 3-class classification problems where we assign +classifiers and a 3-class classification problem where we assign equal weights to all classifiers: w1=1, w2=1, w3=1. The weighted average probabilities for a sample would then be @@ -1317,69 +1409,27 @@ calculated as follows: ================ ========== ========== ========== classifier class 1 class 2 class 3 ================ ========== ========== ========== -classifier 1 w1 * 0.2 w1 * 0.5 w1 * 0.3 -classifier 2 w2 * 0.6 w2 * 0.3 w2 * 0.1 +classifier 1 w1 * 0.2 w1 * 0.5 w1 * 0.3 +classifier 2 w2 * 0.6 w2 * 0.3 w2 * 0.1 classifier 3 w3 * 0.3 w3 * 0.4 w3 * 0.3 -weighted average 0.37 0.4 0.23 +weighted average 0.37 0.4 0.23 ================ ========== ========== ========== -Here, the predicted class label is 2, since it has the -highest average probability. 
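+
+For illustration, the weighted average in the table above can be reproduced
+with a small NumPy sketch (shown purely for illustration; the per-classifier
+probabilities are the values from the table)::
+
+    >>> import numpy as np
+    >>> probas = np.array([[0.2, 0.5, 0.3],   # classifier 1
+    ...                    [0.6, 0.3, 0.1],   # classifier 2
+    ...                    [0.3, 0.4, 0.3]])  # classifier 3
+    >>> weights = [1, 1, 1]  # w1, w2, w3
+    >>> average = np.average(probas, axis=0, weights=weights)
+    >>> [float(round(p, 2)) for p in average]  # average probability per class
+    [0.37, 0.4, 0.23]
+    >>> int(np.argmax(average))  # index 1, i.e. "class 2" in the table
+    1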
- -The following example illustrates how the decision regions may change -when a soft :class:`VotingClassifier` is used based on an linear Support -Vector Machine, a Decision Tree, and a K-nearest neighbor classifier:: - - >>> from sklearn import datasets - >>> from sklearn.tree import DecisionTreeClassifier - >>> from sklearn.neighbors import KNeighborsClassifier - >>> from sklearn.svm import SVC - >>> from itertools import product - >>> from sklearn.ensemble import VotingClassifier +Here, the predicted class label is 2, since it has the highest average +predicted probability. See the example on +:ref:`sphx_glr_auto_examples_ensemble_plot_voting_decision_regions.py` for a +demonstration of how the predicted class label can be obtained from the weighted +average of predicted probabilities. - >>> # Loading some example data - >>> iris = datasets.load_iris() - >>> X = iris.data[:, [0, 2]] - >>> y = iris.target +The following figure illustrates how the decision regions may change when +a soft :class:`VotingClassifier` is trained with weights on three linear +models: - >>> # Training classifiers - >>> clf1 = DecisionTreeClassifier(max_depth=4) - >>> clf2 = KNeighborsClassifier(n_neighbors=7) - >>> clf3 = SVC(kernel='rbf', probability=True) - >>> eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)], - ... voting='soft', weights=[2, 1, 2]) - - >>> clf1 = clf1.fit(X, y) - >>> clf2 = clf2.fit(X, y) - >>> clf3 = clf3.fit(X, y) - >>> eclf = eclf.fit(X, y) - -.. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_voting_decision_regions_001.png +.. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_voting_decision_regions_002.png :target: ../auto_examples/ensemble/plot_voting_decision_regions.html :align: center :scale: 75% -Using the `VotingClassifier` with `GridSearchCV` ------------------------------------------------- - -The :class:`VotingClassifier` can also be used together with -:class:`~sklearn.model_selection.GridSearchCV` in order to tune the -hyperparameters of the individual estimators:: - - >>> from sklearn.model_selection import GridSearchCV - >>> clf1 = LogisticRegression(random_state=1) - >>> clf2 = RandomForestClassifier(random_state=1) - >>> clf3 = GaussianNB() - >>> eclf = VotingClassifier( - ... estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], - ... voting='soft' - ... ) - - >>> params = {'lr__C': [1.0, 100.0], 'rf__n_estimators': [20, 200]} - - >>> grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5) - >>> grid = grid.fit(iris.data, iris.target) - Usage ----- @@ -1399,6 +1449,26 @@ Optionally, weights can be provided for the individual classifiers:: ... voting='soft', weights=[2,5,1] ... ) +.. dropdown:: Using the :class:`VotingClassifier` with :class:`~sklearn.model_selection.GridSearchCV` + + The :class:`VotingClassifier` can also be used together with + :class:`~sklearn.model_selection.GridSearchCV` in order to tune the + hyperparameters of the individual estimators:: + + >>> from sklearn.model_selection import GridSearchCV + >>> clf1 = LogisticRegression(random_state=1) + >>> clf2 = RandomForestClassifier(random_state=1) + >>> clf3 = GaussianNB() + >>> eclf = VotingClassifier( + ... estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], + ... voting='soft' + ... ) + + >>> params = {'lr__C': [1.0, 100.0], 'rf__n_estimators': [20, 200]} + + >>> grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5) + >>> grid = grid.fit(iris.data, iris.target) + .. 
_voting_regressor: Voting Regressor @@ -1435,9 +1505,9 @@ The following example shows how to fit the VotingRegressor:: :align: center :scale: 75% -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_ensemble_plot_voting_regressor.py` +* :ref:`sphx_glr_auto_examples_ensemble_plot_voting_regressor.py` .. _stacking: @@ -1501,8 +1571,8 @@ availability, tested in the order of preference: `predict_proba`, `decision_function` and `predict`. A :class:`StackingRegressor` and :class:`StackingClassifier` can be used as -any other regressor or classifier, exposing a `predict`, `predict_proba`, and -`decision_function` methods, e.g.:: +any other regressor or classifier, exposing a `predict`, `predict_proba`, or +`decision_function` method, e.g.:: >>> y_pred = reg.predict(X_test) >>> from sklearn.metrics import r2_score @@ -1513,15 +1583,15 @@ Note that it is also possible to get the output of the stacked `estimators` using the `transform` method:: >>> reg.transform(X_test[:5]) - array([[142..., 138..., 146...], - [179..., 182..., 151...], - [139..., 132..., 158...], - [286..., 292..., 225...], - [126..., 124..., 164...]]) + array([[142, 138, 146], + [179, 182, 151], + [139, 132, 158], + [286, 292, 225], + [126, 124, 164]]) In practice, a stacking predictor predicts as good as the best predictor of the base layer and even sometimes outperforms it by combining the different -strengths of the these predictors. However, training a stacking predictor is +strengths of these predictors. However, training a stacking predictor is computationally expensive. .. note:: @@ -1556,7 +1626,99 @@ computationally expensive. ... .format(multi_layer_regressor.score(X_test, y_test))) R2 score: 0.53 -.. topic:: References +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_ensemble_plot_stack_predictors.py` + +.. rubric:: References + +.. [W1992] Wolpert, David H. "Stacked generalization." Neural networks 5.2 + (1992): 241-259. + + + +.. _adaboost: + +AdaBoost +======== + +The module :mod:`sklearn.ensemble` includes the popular boosting algorithm +AdaBoost, introduced in 1995 by Freund and Schapire [FS1995]_. + +The core principle of AdaBoost is to fit a sequence of weak learners (i.e., +models that are only slightly better than random guessing, such as small +decision trees) on repeatedly modified versions of the data. The predictions +from all of them are then combined through a weighted majority vote (or sum) to +produce the final prediction. The data modifications at each so-called boosting +iteration consists of applying weights :math:`w_1`, :math:`w_2`, ..., :math:`w_N` +to each of the training samples. Initially, those weights are all set to +:math:`w_i = 1/N`, so that the first step simply trains a weak learner on the +original data. For each successive iteration, the sample weights are +individually modified and the learning algorithm is reapplied to the reweighted +data. At a given step, those training examples that were incorrectly predicted +by the boosted model induced at the previous step have their weights increased, +whereas the weights are decreased for those that were predicted correctly. As +iterations proceed, examples that are difficult to predict receive +ever-increasing influence. Each subsequent weak learner is thereby forced to +concentrate on the examples that are missed by the previous ones in the sequence +[HTF]_. + +.. 
figure:: ../auto_examples/ensemble/images/sphx_glr_plot_adaboost_multiclass_001.png + :target: ../auto_examples/ensemble/plot_adaboost_multiclass.html + :align: center + :scale: 75 + +AdaBoost can be used both for classification and regression problems: + +- For multi-class classification, :class:`AdaBoostClassifier` implements + AdaBoost.SAMME [ZZRH2009]_. + +- For regression, :class:`AdaBoostRegressor` implements AdaBoost.R2 [D1997]_. + +Usage +----- + +The following example shows how to fit an AdaBoost classifier with 100 weak +learners:: + + >>> from sklearn.model_selection import cross_val_score + >>> from sklearn.datasets import load_iris + >>> from sklearn.ensemble import AdaBoostClassifier + + >>> X, y = load_iris(return_X_y=True) + >>> clf = AdaBoostClassifier(n_estimators=100) + >>> scores = cross_val_score(clf, X, y, cv=5) + >>> scores.mean() + np.float64(0.95) + +The number of weak learners is controlled by the parameter ``n_estimators``. The +``learning_rate`` parameter controls the contribution of the weak learners in +the final combination. By default, weak learners are decision stumps. Different +weak learners can be specified through the ``estimator`` parameter. +The main parameters to tune to obtain good results are ``n_estimators`` and +the complexity of the base estimators (e.g., its depth ``max_depth`` or +minimum required number of samples to consider a split ``min_samples_split``). + +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_multiclass.py` shows the performance + of AdaBoost on a multi-class problem. + +* :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_twoclass.py` shows the decision boundary + and decision function values for a non-linearly separable two-class problem + using AdaBoost-SAMME. + +* :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` demonstrates regression + with the AdaBoost.R2 algorithm. + +.. rubric:: References + +.. [FS1995] Y. Freund, and R. Schapire, "A Decision-Theoretic Generalization of + On-Line Learning and an Application to Boosting", 1997. + +.. [ZZRH2009] J. Zhu, H. Zou, S. Rosset, T. Hastie. "Multi-class AdaBoost", 2009. + +.. [D1997] H. Drucker. "Improving Regressors using Boosting Techniques", 1997. - .. [W1992] Wolpert, David H. "Stacked generalization." Neural networks 5.2 - (1992): 241-259. +.. [HTF] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical Learning + Ed. 2", Springer, 2009. diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst index e074d93fad7b8..42bcf18e1d572 100644 --- a/doc/modules/feature_extraction.rst +++ b/doc/modules/feature_extraction.rst @@ -1,4 +1,4 @@ -īģŋ.. _feature_extraction: +.. _feature_extraction: ================== Feature extraction @@ -13,9 +13,9 @@ consisting of formats such as text and image. .. note:: Feature extraction is very different from :ref:`feature_selection`: - the former consists in transforming arbitrary data, such as text or + the former consists of transforming arbitrary data, such as text or images, into numerical features usable for machine learning. The latter - is a machine learning technique applied on these features. + is a machine learning technique applied to these features. .. _dict_feature_extraction: @@ -33,7 +33,7 @@ need not be stored) and storing feature names in addition to values. :class:`DictVectorizer` implements what is called one-of-K or "one-hot" coding for categorical (aka nominal, discrete) features. 
Categorical features are "attribute-value" pairs where the value is restricted -to a list of discrete of possibilities without ordering (e.g. topic +to a list of discrete possibilities without ordering (e.g. topic identifiers, types of objects, tags, names...). In the following, "city" is a categorical attribute while "temperature" @@ -53,13 +53,13 @@ is a traditional numerical feature:: [ 0., 1., 0., 12.], [ 0., 0., 1., 18.]]) - >>> vec.get_feature_names() - ['city=Dubai', 'city=London', 'city=San Francisco', 'temperature'] + >>> vec.get_feature_names_out() + array(['city=Dubai', 'city=London', 'city=San Francisco', 'temperature'], ...) :class:`DictVectorizer` accepts multiple string values for one feature, like, e.g., multiple categories for a movie. -Assume a database classifies each movie using some categories (not mandatories) +Assume a database classifies each movie using some categories (not mandatory) and its year of release. >>> movie_entry = [{'category': ['thriller', 'drama'], 'year': 2003}, @@ -69,10 +69,9 @@ and its year of release. array([[0.000e+00, 1.000e+00, 0.000e+00, 1.000e+00, 2.003e+03], [1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 2.011e+03], [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.974e+03]]) - >>> vec.get_feature_names() == ['category=animation', 'category=drama', - ... 'category=family', 'category=thriller', - ... 'year'] - True + >>> vec.get_feature_names_out() + array(['category=animation', 'category=drama', 'category=family', + 'category=thriller', 'year'], ...) >>> vec.transform({'category': ['thriller'], ... 'unseen_feature': '3'}).toarray() array([[0., 0., 0., 1., 0.]]) @@ -107,12 +106,13 @@ suitable for feeding into a classifier (maybe after being piped into a >>> vec = DictVectorizer() >>> pos_vectorized = vec.fit_transform(pos_window) >>> pos_vectorized - <1x6 sparse matrix of type '<... 'numpy.float64'>' - with 6 stored elements in Compressed Sparse ... format> + >>> pos_vectorized.toarray() array([[1., 1., 1., 1., 1., 1.]]) - >>> vec.get_feature_names() - ['pos+1=PP', 'pos-1=NN', 'pos-2=DT', 'word+1=on', 'word-1=cat', 'word-2=the'] + >>> vec.get_feature_names_out() + array(['pos+1=PP', 'pos-1=NN', 'pos-2=DT', 'word+1=on', 'word-1=cat', + 'word-2=the'], ...) As you can imagine, if one extracts such a context around each individual word of a corpus of documents the resulting matrix will be very wide @@ -158,7 +158,7 @@ feature selectors that expect non-negative inputs. (like Python's ``dict`` and its variants in the ``collections`` module), ``(feature, value)`` pairs, or strings, depending on the constructor parameter ``input_type``. -Mapping are treated as lists of ``(feature, value)`` pairs, +Mappings are treated as lists of ``(feature, value)`` pairs, while single strings have an implicit value of 1, so ``['feat1', 'feat2', 'feat3']`` is interpreted as ``[('feat1', 1), ('feat2', 1), ('feat3', 1)]``. @@ -206,32 +206,32 @@ Note the use of a generator comprehension, which introduces laziness into the feature extraction: tokens are only processed on demand from the hasher. -Implementation details ----------------------- +.. dropdown:: Implementation details -:class:`FeatureHasher` uses the signed 32-bit variant of MurmurHash3. -As a result (and because of limitations in ``scipy.sparse``), -the maximum number of features supported is currently :math:`2^{31} - 1`. + :class:`FeatureHasher` uses the signed 32-bit variant of MurmurHash3. 
+ As a result (and because of limitations in ``scipy.sparse``), + the maximum number of features supported is currently :math:`2^{31} - 1`. -The original formulation of the hashing trick by Weinberger et al. -used two separate hash functions :math:`h` and :math:`\xi` -to determine the column index and sign of a feature, respectively. -The present implementation works under the assumption -that the sign bit of MurmurHash3 is independent of its other bits. + The original formulation of the hashing trick by Weinberger et al. + used two separate hash functions :math:`h` and :math:`\xi` + to determine the column index and sign of a feature, respectively. + The present implementation works under the assumption + that the sign bit of MurmurHash3 is independent of its other bits. -Since a simple modulo is used to transform the hash function to a column index, -it is advisable to use a power of two as the ``n_features`` parameter; -otherwise the features will not be mapped evenly to the columns. + Since a simple modulo is used to transform the hash function to a column index, + it is advisable to use a power of two as the ``n_features`` parameter; + otherwise the features will not be mapped evenly to the columns. + .. rubric:: References -.. topic:: References: + * `MurmurHash3 `_. - * Kilian Weinberger, Anirban Dasgupta, John Langford, Alex Smola and - Josh Attenberg (2009). `Feature hashing for large scale multitask learning - `_. Proc. ICML. - * `MurmurHash3 `_. +.. rubric:: References +* Kilian Weinberger, Anirban Dasgupta, John Langford, Alex Smola and + Josh Attenberg (2009). `Feature hashing for large scale multitask learning + `_. Proc. ICML. .. _text_feature_extraction: @@ -245,7 +245,7 @@ The Bag of Words representation ------------------------------- Text Analysis is a major application field for machine learning -algorithms. However the raw data, a sequence of symbols cannot be fed +algorithms. However the raw data, a sequence of symbols, cannot be fed directly to the algorithms themselves as most of them expect numerical feature vectors with a fixed size rather than the raw text documents with variable length. @@ -307,7 +307,7 @@ counting in a single class:: This model has many parameters, however the default values are quite reasonable (please see the :ref:`reference documentation -` for the details):: +` for the details):: >>> vectorizer = CountVectorizer() >>> vectorizer @@ -324,8 +324,8 @@ corpus of text documents:: ... ] >>> X = vectorizer.fit_transform(corpus) >>> X - <4x9 sparse matrix of type '<... 'numpy.int64'>' - with 19 stored elements in Compressed Sparse ... format> + The default configuration tokenizes the string by extracting words of at least 2 letters. The specific function that does this step can be @@ -340,10 +340,9 @@ Each term found by the analyzer during the fit is assigned a unique integer index corresponding to a column in the resulting matrix. This interpretation of the columns can be retrieved as follows:: - >>> vectorizer.get_feature_names() == ( - ... ['and', 'document', 'first', 'is', 'one', - ... 'second', 'the', 'third', 'this']) - True + >>> vectorizer.get_feature_names_out() + array(['and', 'document', 'first', 'is', 'one', 'second', 'the', + 'third', 'this'], ...) >>> X.toarray() array([[0, 1, 1, 1, 0, 0, 1, 0, 1], @@ -397,17 +396,17 @@ last document:: .. _stop_words: Using stop words -................ 
+---------------- Stop words are words like "and", "the", "him", which are presumed to be uninformative in representing the content of a text, and which may be -removed to avoid them being construed as signal for prediction. Sometimes, +removed to avoid them being construed as informative for prediction. Sometimes, however, similar words are useful for prediction, such as in classifying writing style or personality. There are several known issues in our provided 'english' stop word list. It -does not aim to be a general, 'one-size-fits-all' solution as some tasks -may require a more custom solution. See [NQY18]_ for more details. +does not aim to be a general, 'one-size-fits-all' solution as some tasks +may require a more custom solution. See [NQY18]_ for more details. Please take care in choosing a stop word list. Popular stop word lists may include words that are highly informative to @@ -420,12 +419,13 @@ tokenizer, so if *we've* is in ``stop_words``, but *ve* is not, *ve* will be retained from *we've* in transformed text. Our vectorizers will try to identify and warn about some kinds of inconsistencies. -.. topic:: References +.. rubric:: References + +.. [NQY18] J. Nothman, H. Qin and R. Yurchak (2018). + `"Stop Word Lists in Free Open-source Software Packages" + `__. + In *Proc. Workshop for NLP Open Source Software*. - .. [NQY18] J. Nothman, H. Qin and R. Yurchak (2018). - `"Stop Word Lists in Free Open-source Software Packages" - `__. - In *Proc. Workshop for NLP Open Source Software*. .. _tfidf: @@ -489,126 +489,125 @@ class:: TfidfTransformer(smooth_idf=False) Again please see the :ref:`reference documentation -` for the details on all the parameters. - -Let's take an example with the following counts. The first term is present -100% of the time hence not very interesting. The two other features only -in less than 50% of the time hence probably more representative of the -content of the documents:: - - >>> counts = [[3, 0, 1], - ... [2, 0, 0], - ... [3, 0, 0], - ... [4, 0, 0], - ... [3, 2, 0], - ... [3, 0, 2]] - ... - >>> tfidf = transformer.fit_transform(counts) - >>> tfidf - <6x3 sparse matrix of type '<... 'numpy.float64'>' - with 9 stored elements in Compressed Sparse ... format> +` for the details on all the parameters. - >>> tfidf.toarray() - array([[0.81940995, 0. , 0.57320793], - [1. , 0. , 0. ], - [1. , 0. , 0. ], - [1. , 0. , 0. ], - [0.47330339, 0.88089948, 0. ], - [0.58149261, 0. , 0.81355169]]) +.. dropdown:: Numeric example of a tf-idf matrix -Each row is normalized to have unit Euclidean norm: + Let's take an example with the following counts. The first term is present + 100% of the time hence not very interesting. The two other features only + in less than 50% of the time hence probably more representative of the + content of the documents:: -:math:`v_{norm} = \frac{v}{||v||_2} = \frac{v}{\sqrt{v{_1}^2 + -v{_2}^2 + \dots + v{_n}^2}}` + >>> counts = [[3, 0, 1], + ... [2, 0, 0], + ... [3, 0, 0], + ... [4, 0, 0], + ... [3, 2, 0], + ... [3, 0, 2]] + ... + >>> tfidf = transformer.fit_transform(counts) + >>> tfidf + -For example, we can compute the tf-idf of the first term in the first -document in the `counts` array as follows: + >>> tfidf.toarray() + array([[0.81940995, 0. , 0.57320793], + [1. , 0. , 0. ], + [1. , 0. , 0. ], + [1. , 0. , 0. ], + [0.47330339, 0.88089948, 0. ], + [0.58149261, 0. 
, 0.81355169]]) -:math:`n = 6` + Each row is normalized to have unit Euclidean norm: -:math:`\text{df}(t)_{\text{term1}} = 6` + :math:`v_{norm} = \frac{v}{||v||_2} = \frac{v}{\sqrt{v{_1}^2 + + v{_2}^2 + \dots + v{_n}^2}}` -:math:`\text{idf}(t)_{\text{term1}} = -\log \frac{n}{\text{df}(t)} + 1 = \log(1)+1 = 1` + For example, we can compute the tf-idf of the first term in the first + document in the `counts` array as follows: -:math:`\text{tf-idf}_{\text{term1}} = \text{tf} \times \text{idf} = 3 \times 1 = 3` + :math:`n = 6` -Now, if we repeat this computation for the remaining 2 terms in the document, -we get + :math:`\text{df}(t)_{\text{term1}} = 6` -:math:`\text{tf-idf}_{\text{term2}} = 0 \times (\log(6/1)+1) = 0` + :math:`\text{idf}(t)_{\text{term1}} = + \log \frac{n}{\text{df}(t)} + 1 = \log(1)+1 = 1` -:math:`\text{tf-idf}_{\text{term3}} = 1 \times (\log(6/2)+1) \approx 2.0986` + :math:`\text{tf-idf}_{\text{term1}} = \text{tf} \times \text{idf} = 3 \times 1 = 3` -and the vector of raw tf-idfs: + Now, if we repeat this computation for the remaining 2 terms in the document, + we get -:math:`\text{tf-idf}_{\text{raw}} = [3, 0, 2.0986].` + :math:`\text{tf-idf}_{\text{term2}} = 0 \times (\log(6/1)+1) = 0` + :math:`\text{tf-idf}_{\text{term3}} = 1 \times (\log(6/2)+1) \approx 2.0986` -Then, applying the Euclidean (L2) norm, we obtain the following tf-idfs -for document 1: + and the vector of raw tf-idfs: -:math:`\frac{[3, 0, 2.0986]}{\sqrt{\big(3^2 + 0^2 + 2.0986^2\big)}} -= [ 0.819, 0, 0.573].` + :math:`\text{tf-idf}_{\text{raw}} = [3, 0, 2.0986].` -Furthermore, the default parameter ``smooth_idf=True`` adds "1" to the numerator -and denominator as if an extra document was seen containing every term in the -collection exactly once, which prevents zero divisions: -:math:`\text{idf}(t) = \log{\frac{1 + n}{1+\text{df}(t)}} + 1` + Then, applying the Euclidean (L2) norm, we obtain the following tf-idfs + for document 1: -Using this modification, the tf-idf of the third term in document 1 changes to -1.8473: + :math:`\frac{[3, 0, 2.0986]}{\sqrt{\big(3^2 + 0^2 + 2.0986^2\big)}} + = [ 0.819, 0, 0.573].` -:math:`\text{tf-idf}_{\text{term3}} = 1 \times \log(7/3)+1 \approx 1.8473` + Furthermore, the default parameter ``smooth_idf=True`` adds "1" to the numerator + and denominator as if an extra document was seen containing every term in the + collection exactly once, which prevents zero divisions: -And the L2-normalized tf-idf changes to + :math:`\text{idf}(t) = \log{\frac{1 + n}{1+\text{df}(t)}} + 1` -:math:`\frac{[3, 0, 1.8473]}{\sqrt{\big(3^2 + 0^2 + 1.8473^2\big)}} -= [0.8515, 0, 0.5243]`:: + Using this modification, the tf-idf of the third term in document 1 changes to + 1.8473: - >>> transformer = TfidfTransformer() - >>> transformer.fit_transform(counts).toarray() - array([[0.85151335, 0. , 0.52433293], - [1. , 0. , 0. ], - [1. , 0. , 0. ], - [1. , 0. , 0. ], - [0.55422893, 0.83236428, 0. ], - [0.63035731, 0. , 0.77630514]]) + :math:`\text{tf-idf}_{\text{term3}} = 1 \times \log(7/3)+1 \approx 1.8473` -The weights of each -feature computed by the ``fit`` method call are stored in a model -attribute:: + And the L2-normalized tf-idf changes to - >>> transformer.idf_ - array([1. ..., 2.25..., 1.84...]) + :math:`\frac{[3, 0, 1.8473]}{\sqrt{\big(3^2 + 0^2 + 1.8473^2\big)}} + = [0.8515, 0, 0.5243]`:: + >>> transformer = TfidfTransformer() + >>> transformer.fit_transform(counts).toarray() + array([[0.85151335, 0. , 0.52433293], + [1. , 0. , 0. ], + [1. , 0. , 0. ], + [1. , 0. , 0. 
], + [0.55422893, 0.83236428, 0. ], + [0.63035731, 0. , 0.77630514]]) + The weights of each + feature computed by the ``fit`` method call are stored in a model + attribute:: + >>> transformer.idf_ + array([1., 2.25, 1.84]) -As tf–idf is very often used for text features, there is also another -class called :class:`TfidfVectorizer` that combines all the options of -:class:`CountVectorizer` and :class:`TfidfTransformer` in a single model:: + As tf-idf is very often used for text features, there is also another + class called :class:`TfidfVectorizer` that combines all the options of + :class:`CountVectorizer` and :class:`TfidfTransformer` in a single model:: - >>> from sklearn.feature_extraction.text import TfidfVectorizer - >>> vectorizer = TfidfVectorizer() - >>> vectorizer.fit_transform(corpus) - <4x9 sparse matrix of type '<... 'numpy.float64'>' - with 19 stored elements in Compressed Sparse ... format> + >>> from sklearn.feature_extraction.text import TfidfVectorizer + >>> vectorizer = TfidfVectorizer() + >>> vectorizer.fit_transform(corpus) + -While the tf–idf normalization is often very useful, there might -be cases where the binary occurrence markers might offer better -features. This can be achieved by using the ``binary`` parameter -of :class:`CountVectorizer`. In particular, some estimators such as -:ref:`bernoulli_naive_bayes` explicitly model discrete boolean random -variables. Also, very short texts are likely to have noisy tf–idf values -while the binary occurrence info is more stable. + While the tf-idf normalization is often very useful, there might + be cases where the binary occurrence markers might offer better + features. This can be achieved by using the ``binary`` parameter + of :class:`CountVectorizer`. In particular, some estimators such as + :ref:`bernoulli_naive_bayes` explicitly model discrete boolean random + variables. Also, very short texts are likely to have noisy tf-idf values + while the binary occurrence info is more stable. -As usual the best way to adjust the feature extraction parameters -is to use a cross-validated grid search, for instance by pipelining the -feature extractor with a classifier: + As usual the best way to adjust the feature extraction parameters + is to use a cross-validated grid search, for instance by pipelining the + feature extractor with a classifier: - * :ref:`sphx_glr_auto_examples_model_selection_grid_search_text_feature_extraction.py` + * :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_text_feature_extraction.py` Decoding text files @@ -638,58 +637,59 @@ or ``"replace"``. See the documentation for the Python function ``bytes.decode`` for more details (type ``help(bytes.decode)`` at the Python prompt). -If you are having trouble decoding text, here are some things to try: - -- Find out what the actual encoding of the text is. The file might come - with a header or README that tells you the encoding, or there might be some - standard encoding you can assume based on where the text comes from. - -- You may be able to find out what kind of encoding it is in general - using the UNIX command ``file``. The Python ``chardet`` module comes with - a script called ``chardetect.py`` that will guess the specific encoding, - though you cannot rely on its guess being correct. - -- You could try UTF-8 and disregard the errors. You can decode byte - strings with ``bytes.decode(errors='replace')`` to replace all - decoding errors with a meaningless character, or set - ``decode_error='replace'`` in the vectorizer. 
This may damage the - usefulness of your features. - -- Real text may come from a variety of sources that may have used different - encodings, or even be sloppily decoded in a different encoding than the - one it was encoded with. This is common in text retrieved from the Web. - The Python package `ftfy`_ can automatically sort out some classes of - decoding errors, so you could try decoding the unknown text as ``latin-1`` - and then using ``ftfy`` to fix errors. - -- If the text is in a mish-mash of encodings that is simply too hard to sort - out (which is the case for the 20 Newsgroups dataset), you can fall back on - a simple single-byte encoding such as ``latin-1``. Some text may display - incorrectly, but at least the same sequence of bytes will always represent - the same feature. - -For example, the following snippet uses ``chardet`` -(not shipped with scikit-learn, must be installed separately) -to figure out the encoding of three texts. -It then vectorizes the texts and prints the learned vocabulary. -The output is not shown here. - - >>> import chardet # doctest: +SKIP - >>> text1 = b"Sei mir gegr\xc3\xbc\xc3\x9ft mein Sauerkraut" - >>> text2 = b"holdselig sind deine Ger\xfcche" - >>> text3 = b"\xff\xfeA\x00u\x00f\x00 \x00F\x00l\x00\xfc\x00g\x00e\x00l\x00n\x00 \x00d\x00e\x00s\x00 \x00G\x00e\x00s\x00a\x00n\x00g\x00e\x00s\x00,\x00 \x00H\x00e\x00r\x00z\x00l\x00i\x00e\x00b\x00c\x00h\x00e\x00n\x00,\x00 \x00t\x00r\x00a\x00g\x00 \x00i\x00c\x00h\x00 \x00d\x00i\x00c\x00h\x00 \x00f\x00o\x00r\x00t\x00" - >>> decoded = [x.decode(chardet.detect(x)['encoding']) - ... for x in (text1, text2, text3)] # doctest: +SKIP - >>> v = CountVectorizer().fit(decoded).vocabulary_ # doctest: +SKIP - >>> for term in v: print(v) # doctest: +SKIP - -(Depending on the version of ``chardet``, it might get the first one wrong.) - -For an introduction to Unicode and character encodings in general, -see Joel Spolsky's `Absolute Minimum Every Software Developer Must Know -About Unicode `_. - -.. _`ftfy`: https://github.com/LuminosoInsight/python-ftfy +.. dropdown:: Troubleshooting decoding text + + If you are having trouble decoding text, here are some things to try: + + - Find out what the actual encoding of the text is. The file might come + with a header or README that tells you the encoding, or there might be some + standard encoding you can assume based on where the text comes from. + + - You may be able to find out what kind of encoding it is in general + using the UNIX command ``file``. The Python ``chardet`` module comes with + a script called ``chardetect.py`` that will guess the specific encoding, + though you cannot rely on its guess being correct. + + - You could try UTF-8 and disregard the errors. You can decode byte + strings with ``bytes.decode(errors='replace')`` to replace all + decoding errors with a meaningless character, or set + ``decode_error='replace'`` in the vectorizer. This may damage the + usefulness of your features. + + - Real text may come from a variety of sources that may have used different + encodings, or even be sloppily decoded in a different encoding than the + one it was encoded with. This is common in text retrieved from the Web. + The Python package `ftfy `__ + can automatically sort out some classes of + decoding errors, so you could try decoding the unknown text as ``latin-1`` + and then using ``ftfy`` to fix errors. 
+ + - If the text is in a mish-mash of encodings that is simply too hard to sort + out (which is the case for the 20 Newsgroups dataset), you can fall back on + a simple single-byte encoding such as ``latin-1``. Some text may display + incorrectly, but at least the same sequence of bytes will always represent + the same feature. + + For example, the following snippet uses ``chardet`` + (not shipped with scikit-learn, must be installed separately) + to figure out the encoding of three texts. + It then vectorizes the texts and prints the learned vocabulary. + The output is not shown here. + + >>> import chardet # doctest: +SKIP + >>> text1 = b"Sei mir gegr\xc3\xbc\xc3\x9ft mein Sauerkraut" + >>> text2 = b"holdselig sind deine Ger\xfcche" + >>> text3 = b"\xff\xfeA\x00u\x00f\x00 \x00F\x00l\x00\xfc\x00g\x00e\x00l\x00n\x00 \x00d\x00e\x00s\x00 \x00G\x00e\x00s\x00a\x00n\x00g\x00e\x00s\x00,\x00 \x00H\x00e\x00r\x00z\x00l\x00i\x00e\x00b\x00c\x00h\x00e\x00n\x00,\x00 \x00t\x00r\x00a\x00g\x00 \x00i\x00c\x00h\x00 \x00d\x00i\x00c\x00h\x00 \x00f\x00o\x00r\x00t\x00" + >>> decoded = [x.decode(chardet.detect(x)['encoding']) + ... for x in (text1, text2, text3)] # doctest: +SKIP + >>> v = CountVectorizer().fit(decoded).vocabulary_ # doctest: +SKIP + >>> for term in v: print(v) # doctest: +SKIP + + (Depending on the version of ``chardet``, it might get the first one wrong.) + + For an introduction to Unicode and character encodings in general, + see Joel Spolsky's `Absolute Minimum Every Software Developer Must Know + About Unicode `_. Applications and examples @@ -702,18 +702,18 @@ In particular in a **supervised setting** it can be successfully combined with fast and scalable linear models to train **document classifiers**, for instance: - * :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py` +* :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py` In an **unsupervised setting** it can be used to group similar documents together by applying clustering algorithms such as :ref:`k_means`: - * :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py` +* :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py` Finally it is possible to discover the main topics of a corpus by relaxing the hard assignment constraint of clustering, for instance by using :ref:`NMF`: - * :ref:`sphx_glr_auto_examples_applications_plot_topics_extraction_with_nmf_lda.py` +* :ref:`sphx_glr_auto_examples_applications_plot_topics_extraction_with_nmf_lda.py` Limitations of the Bag of Words representation @@ -742,9 +742,8 @@ decide better:: >>> ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2)) >>> counts = ngram_vectorizer.fit_transform(['words', 'wprds']) - >>> ngram_vectorizer.get_feature_names() == ( - ... [' w', 'ds', 'or', 'pr', 'rd', 's ', 'wo', 'wp']) - True + >>> ngram_vectorizer.get_feature_names_out() + array([' w', 'ds', 'or', 'pr', 'rd', 's ', 'wo', 'wp'], ...) >>> counts.toarray().astype(int) array([[1, 1, 1, 0, 1, 1, 1, 0], [1, 1, 0, 1, 1, 1, 0, 1]]) @@ -756,19 +755,18 @@ span across words:: >>> ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5)) >>> ngram_vectorizer.fit_transform(['jumpy fox']) - <1x4 sparse matrix of type '<... 'numpy.int64'>' - with 4 stored elements in Compressed Sparse ... format> - >>> ngram_vectorizer.get_feature_names() == ( - ... [' fox ', ' jump', 'jumpy', 'umpy ']) - True + + + >>> ngram_vectorizer.get_feature_names_out() + array([' fox ', ' jump', 'jumpy', 'umpy '], ...) 
>>> ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(5, 5)) >>> ngram_vectorizer.fit_transform(['jumpy fox']) - <1x5 sparse matrix of type '<... 'numpy.int64'>' - with 5 stored elements in Compressed Sparse ... format> - >>> ngram_vectorizer.get_feature_names() == ( - ... ['jumpy', 'mpy f', 'py fo', 'umpy ', 'y fox']) - True + + >>> ngram_vectorizer.get_feature_names_out() + array(['jumpy', 'mpy f', 'py fo', 'umpy ', 'y fox'], ...) The word boundaries-aware variant ``char_wb`` is especially interesting for languages that use white-spaces for word separation as it generates @@ -794,9 +792,9 @@ problems which are currently outside of the scope of scikit-learn. Vectorizing a large text corpus with the hashing trick ------------------------------------------------------ -The above vectorization scheme is simple but the fact that it holds an **in- -memory mapping from the string tokens to the integer feature indices** (the -``vocabulary_`` attribute) causes several **problems when dealing with large +The above vectorization scheme is simple but the fact that it holds an +**in-memory mapping from the string tokens to the integer feature indices** +(the ``vocabulary_`` attribute) causes several **problems when dealing with large datasets**: - the larger the corpus, the larger the vocabulary will grow and hence the @@ -815,7 +813,7 @@ datasets**: - it is not easily possible to split the vectorization work into concurrent sub tasks as the ``vocabulary_`` attribute would have to be a shared state with a fine grained synchronization barrier: the mapping from token string to - feature index is dependent on ordering of the first occurrence of each token + feature index is dependent on the ordering of the first occurrence of each token hence would have to be shared, potentially harming the concurrent workers' performance to the point of making them slower than the sequential variant. @@ -824,7 +822,7 @@ It is possible to overcome those limitations by combining the "hashing trick" :class:`~sklearn.feature_extraction.FeatureHasher` class and the text preprocessing and tokenization features of the :class:`CountVectorizer`. -This combination is implementing in :class:`HashingVectorizer`, +This combination is implemented in :class:`HashingVectorizer`, a transformer class that is mostly API compatible with :class:`CountVectorizer`. :class:`HashingVectorizer` is stateless, meaning that you don't have to call ``fit`` on it:: @@ -832,8 +830,8 @@ meaning that you don't have to call ``fit`` on it:: >>> from sklearn.feature_extraction.text import HashingVectorizer >>> hv = HashingVectorizer(n_features=10) >>> hv.transform(corpus) - <4x10 sparse matrix of type '<... 'numpy.float64'>' - with 16 stored elements in Compressed Sparse ... format> + You can see that 16 non-zero feature tokens were extracted in the vector output: this is less than the 19 non-zeros extracted previously by the @@ -850,14 +848,14 @@ Note that the dimensionality does not affect the CPU training time of algorithms which operate on CSR matrices (``LinearSVC(dual=True)``, ``Perceptron``, ``SGDClassifier``, ``PassiveAggressive``) but it does for algorithms that work with CSC matrices (``LinearSVC(dual=False)``, ``Lasso()``, -etc). +etc.). Let's try again with the default setting:: >>> hv = HashingVectorizer() >>> hv.transform(corpus) - <4x1048576 sparse matrix of type '<... 'numpy.float64'>' - with 19 stored elements in Compressed Sparse ... 
format> + We no longer get the collisions, but this comes at the expense of a much larger dimensionality of the output space. @@ -874,25 +872,25 @@ The :class:`HashingVectorizer` also comes with the following limitations: model. A :class:`TfidfTransformer` can be appended to it in a pipeline if required. -Performing out-of-core scaling with HashingVectorizer ------------------------------------------------------- +.. dropdown:: Performing out-of-core scaling with HashingVectorizer + + An interesting development of using a :class:`HashingVectorizer` is the ability + to perform `out-of-core`_ scaling. This means that we can learn from data that + does not fit into the computer's main memory. -An interesting development of using a :class:`HashingVectorizer` is the ability -to perform `out-of-core`_ scaling. This means that we can learn from data that -does not fit into the computer's main memory. + .. _out-of-core: https://en.wikipedia.org/wiki/Out-of-core_algorithm -.. _out-of-core: https://en.wikipedia.org/wiki/Out-of-core_algorithm + A strategy to implement out-of-core scaling is to stream data to the estimator + in mini-batches. Each mini-batch is vectorized using :class:`HashingVectorizer` + so as to guarantee that the input space of the estimator has always the same + dimensionality. The amount of memory used at any time is thus bounded by the + size of a mini-batch. Although there is no limit to the amount of data that can + be ingested using such an approach, from a practical point of view the learning + time is often limited by the CPU time one wants to spend on the task. -A strategy to implement out-of-core scaling is to stream data to the estimator -in mini-batches. Each mini-batch is vectorized using :class:`HashingVectorizer` -so as to guarantee that the input space of the estimator has always the same -dimensionality. The amount of memory used at any time is thus bounded by the -size of a mini-batch. Although there is no limit to the amount of data that can -be ingested using such an approach, from a practical point of view the learning -time is often limited by the CPU time one wants to spend on the task. + For a full-fledged example of out-of-core scaling in a text classification + task see :ref:`sphx_glr_auto_examples_applications_plot_out_of_core_classification.py`. -For a full-fledged example of out-of-core scaling in a text classification -task see :ref:`sphx_glr_auto_examples_applications_plot_out_of_core_classification.py`. Customizing the vectorizer classes ---------------------------------- @@ -910,19 +908,19 @@ to the vectorizer constructor:: In particular we name: - * ``preprocessor``: a callable that takes an entire document as input (as a - single string), and returns a possibly transformed version of the document, - still as an entire string. This can be used to remove HTML tags, lowercase - the entire document, etc. +* ``preprocessor``: a callable that takes an entire document as input (as a + single string), and returns a possibly transformed version of the document, + still as an entire string. This can be used to remove HTML tags, lowercase + the entire document, etc. - * ``tokenizer``: a callable that takes the output from the preprocessor - and splits it into tokens, then returns a list of these. +* ``tokenizer``: a callable that takes the output from the preprocessor + and splits it into tokens, then returns a list of these. - * ``analyzer``: a callable that replaces the preprocessor and tokenizer. 
- The default analyzers all call the preprocessor and tokenizer, but custom - analyzers will skip this. N-gram extraction and stop word filtering take - place at the analyzer level, so a custom analyzer may have to reproduce - these steps. +* ``analyzer``: a callable that replaces the preprocessor and tokenizer. + The default analyzers all call the preprocessor and tokenizer, but custom + analyzers will skip this. N-gram extraction and stop word filtering take + place at the analyzer level, so a custom analyzer may have to reproduce + these steps. (Lucene users might recognize these names, but be aware that scikit-learn concepts may not map one-to-one onto Lucene concepts.) @@ -932,7 +930,8 @@ parameters it is possible to derive from the class and override the ``build_preprocessor``, ``build_tokenizer`` and ``build_analyzer`` factory methods instead of passing custom functions. -Some tips and tricks: +.. dropdown:: Tips and tricks + :color: success * If documents are pre-tokenized by an external package, then store them in files (or strings) with the tokens separated by whitespace and pass @@ -956,7 +955,6 @@ Some tips and tricks: (Note that this will not filter out punctuation.) - The following example will, for instance, transform some British spelling to American spelling:: @@ -980,11 +978,10 @@ Some tips and tricks: for other styles of preprocessing; examples include stemming, lemmatization, or normalizing numerical tokens, with the latter illustrated in: - * :ref:`sphx_glr_auto_examples_bicluster_plot_bicluster_newsgroups.py` - + * :ref:`sphx_glr_auto_examples_bicluster_plot_bicluster_newsgroups.py` -Customizing the vectorizer can also be useful when handling Asian languages -that do not use an explicit word separator such as whitespace. + Customizing the vectorizer can also be useful when handling Asian languages + that do not use an explicit word separator such as whitespace. .. _image_feature_extraction: @@ -999,7 +996,7 @@ Patch extraction The :func:`extract_patches_2d` function extracts patches from an image stored as a two-dimensional array, or three-dimensional with color information along the third axis. For rebuilding an image from all its patches, use -:func:`reconstruct_from_patches_2d`. For example let use generate a 4x4 pixel +:func:`reconstruct_from_patches_2d`. For example let us generate a 4x4 pixel picture with 3 color channels (e.g. in RGB format):: >>> import numpy as np @@ -1037,17 +1034,19 @@ on overlapping areas:: The :class:`PatchExtractor` class works in the same way as :func:`extract_patches_2d`, only it supports multiple images as input. It is -implemented as an estimator, so it can be used in pipelines. See:: +implemented as a scikit-learn transformer, so it can be used in pipelines. See:: >>> five_images = np.arange(5 * 4 * 4 * 3).reshape(5, 4, 4, 3) >>> patches = image.PatchExtractor(patch_size=(2, 2)).transform(five_images) >>> patches.shape (45, 2, 2, 3) +.. _connectivity_graph_image: + Connectivity graph of an image ------------------------------- -Several estimators in the scikit-learn can use connectivity information between +Several estimators in scikit-learn can use connectivity information between features or samples. For instance Ward clustering (:ref:`hierarchical_clustering`) can cluster together only neighboring pixels of an image, thus forming contiguous patches: @@ -1061,8 +1060,8 @@ For this purpose, the estimators use a 'connectivity' matrix, giving which samples are connected. 
The function :func:`img_to_graph` returns such a matrix from a 2D or 3D -image. Similarly, :func:`grid_to_graph` build a connectivity matrix for -images given the shape of these image. +image. Similarly, :func:`grid_to_graph` builds a connectivity matrix for +images given the shape of these images. These matrices can be used to impose connectivity in estimators that use connectivity information, such as Ward clustering diff --git a/doc/modules/feature_selection.rst b/doc/modules/feature_selection.rst index c27a334c2ed4b..ffee801f34ccc 100644 --- a/doc/modules/feature_selection.rst +++ b/doc/modules/feature_selection.rst @@ -57,29 +57,29 @@ univariate statistical tests. It can be seen as a preprocessing step to an estimator. Scikit-learn exposes feature selection routines as objects that implement the ``transform`` method: - * :class:`SelectKBest` removes all but the :math:`k` highest scoring features +* :class:`SelectKBest` removes all but the :math:`k` highest scoring features - * :class:`SelectPercentile` removes all but a user-specified highest scoring - percentage of features +* :class:`SelectPercentile` removes all but a user-specified highest scoring + percentage of features - * using common univariate statistical tests for each feature: - false positive rate :class:`SelectFpr`, false discovery rate - :class:`SelectFdr`, or family wise error :class:`SelectFwe`. +* using common univariate statistical tests for each feature: + false positive rate :class:`SelectFpr`, false discovery rate + :class:`SelectFdr`, or family wise error :class:`SelectFwe`. - * :class:`GenericUnivariateSelect` allows to perform univariate feature - selection with a configurable strategy. This allows to select the best - univariate selection strategy with hyper-parameter search estimator. +* :class:`GenericUnivariateSelect` allows to perform univariate feature + selection with a configurable strategy. This allows to select the best + univariate selection strategy with hyper-parameter search estimator. -For instance, we can perform a :math:`\chi^2` test to the samples -to retrieve only the two best features as follows: +For instance, we can use a F-test to retrieve the two +best features for a dataset as follows: >>> from sklearn.datasets import load_iris >>> from sklearn.feature_selection import SelectKBest - >>> from sklearn.feature_selection import chi2 + >>> from sklearn.feature_selection import f_classif >>> X, y = load_iris(return_X_y=True) >>> X.shape (150, 4) - >>> X_new = SelectKBest(chi2, k=2).fit_transform(X, y) + >>> X_new = SelectKBest(f_classif, k=2).fit_transform(X, y) >>> X_new.shape (150, 2) @@ -87,14 +87,15 @@ These objects take as input a scoring function that returns univariate scores and p-values (or only scores for :class:`SelectKBest` and :class:`SelectPercentile`): - * For regression: :func:`f_regression`, :func:`mutual_info_regression` +* For regression: :func:`r_regression`, :func:`f_regression`, :func:`mutual_info_regression` - * For classification: :func:`chi2`, :func:`f_classif`, :func:`mutual_info_classif` +* For classification: :func:`chi2`, :func:`f_classif`, :func:`mutual_info_classif` The methods based on F-test estimate the degree of linear dependency between two random variables. On the other hand, mutual information methods can capture any kind of statistical dependency, but being nonparametric, they require more -samples for accurate estimation. +samples for accurate estimation. 
Note that the :math:`\chi^2`-test should only be +applied to non-negative features, such as frequencies. .. topic:: Feature selection with sparse data @@ -107,11 +108,17 @@ samples for accurate estimation. Beware not to use a regression scoring function with a classification problem, you will get useless results. -.. topic:: Examples: +.. note:: - * :ref:`sphx_glr_auto_examples_feature_selection_plot_feature_selection.py` + The :class:`SelectPercentile` and :class:`SelectKBest` support unsupervised + feature selection as well. One needs to provide a `score_func` where `y=None`. + The `score_func` should use internally `X` to compute the scores. - * :ref:`sphx_glr_auto_examples_feature_selection_plot_f_test_vs_mi.py` +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_feature_selection_plot_feature_selection.py` + +* :ref:`sphx_glr_auto_examples_feature_selection_plot_f_test_vs_mi.py` .. _rfe: @@ -124,21 +131,27 @@ is to select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through any specific attribute (such as ``coef_``, ``feature_importances_``) or callable. Then, the least important -features are pruned from current set of features. That procedure is recursively +features are pruned from the current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached. :class:`RFECV` performs RFE in a cross-validation loop to find the optimal -number of features. +number of features. In more details, the number of features selected is tuned +automatically by fitting an :class:`RFE` selector on the different +cross-validation splits (provided by the `cv` parameter). The performance +of the :class:`RFE` selector is evaluated using `scorer` for different numbers +of selected features and aggregated together. Finally, the scores are averaged +across folds and the number of features selected is set to the number of +features that maximize the cross-validation score. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_digits.py`: A recursive feature elimination example - showing the relevance of pixels in a digit classification task. +* :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_digits.py`: A recursive feature elimination example + showing the relevance of pixels in a digit classification task. - * :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_with_cross_validation.py`: A recursive feature - elimination example with automatic tuning of the number of features - selected with cross-validation. +* :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_with_cross_validation.py`: A recursive feature + elimination example with automatic tuning of the number of features + selected with cross-validation. .. _select_from_model: @@ -149,7 +162,7 @@ Feature selection using SelectFromModel estimator that assigns importance to each feature through a specific attribute (such as ``coef_``, ``feature_importances_``) or via an `importance_getter` callable after fitting. The features are considered unimportant and removed if the corresponding -importance of the feature values are below the provided +importance of the feature values is below the provided ``threshold`` parameter. Apart from specifying the threshold numerically, there are built-in heuristics for finding a threshold using a string argument. 
Available heuristics are "mean", "median" and float multiples of these like @@ -158,9 +171,9 @@ Available heuristics are "mean", "median" and float multiples of these like For examples on how it is to be used refer to the sections below. -.. topic:: Examples +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_feature_selection_plot_select_from_model_diabetes.py` +* :ref:`sphx_glr_auto_examples_feature_selection_plot_select_from_model_diabetes.py` .. _l1_feature_selection: @@ -190,41 +203,45 @@ for classification:: >>> X_new.shape (150, 3) -With SVMs and logistic-regression, the parameter C controls the sparsity: +With SVMs and logistic regression, the parameter C controls the sparsity: the smaller C the fewer features selected. With Lasso, the higher the alpha parameter, the fewer features selected. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py`: Comparison - of different algorithms for document classification including L1-based - feature selection. +* :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_dense_vs_sparse_data.py`. .. _compressive_sensing: -.. topic:: **L1-recovery and compressive sensing** - - For a good choice of alpha, the :ref:`lasso` can fully recover the - exact set of non-zero variables using only few observations, provided - certain specific conditions are met. In particular, the number of - samples should be "sufficiently large", or L1 models will perform at - random, where "sufficiently large" depends on the number of non-zero - coefficients, the logarithm of the number of features, the amount of - noise, the smallest absolute value of non-zero coefficients, and the - structure of the design matrix X. In addition, the design matrix must - display certain specific properties, such as not being too correlated. - - There is no general rule to select an alpha parameter for recovery of - non-zero coefficients. It can by set by cross-validation - (:class:`LassoCV` or :class:`LassoLarsCV`), though this may lead to - under-penalized models: including a small number of non-relevant - variables is not detrimental to prediction score. BIC - (:class:`LassoLarsIC`) tends, on the opposite, to set high values of - alpha. - - **Reference** Richard G. Baraniuk "Compressive Sensing", IEEE Signal - Processing Magazine [120] July 2007 - http://users.isr.ist.utl.pt/~aguiar/CS_notes.pdf +.. dropdown:: L1-recovery and compressive sensing + + For a good choice of alpha, the :ref:`lasso` can fully recover the + exact set of non-zero variables using only few observations, provided + certain specific conditions are met. In particular, the number of + samples should be "sufficiently large", or L1 models will perform at + random, where "sufficiently large" depends on the number of non-zero + coefficients, the logarithm of the number of features, the amount of + noise, the smallest absolute value of non-zero coefficients, and the + structure of the design matrix X. In addition, the design matrix must + display certain specific properties, such as not being too correlated. + On the use of Lasso for sparse signal recovery, see this example on + compressive sensing: + :ref:`sphx_glr_auto_examples_applications_plot_tomography_l1_reconstruction.py`. + + There is no general rule to select an alpha parameter for recovery of + non-zero coefficients. 
It can be set by cross-validation + (:class:`~sklearn.linear_model.LassoCV` or + :class:`~sklearn.linear_model.LassoLarsCV`), though this may lead to + under-penalized models: including a small number of non-relevant variables + is not detrimental to prediction score. BIC + (:class:`~sklearn.linear_model.LassoLarsIC`) tends, on the opposite, to set + high values of alpha. + + .. rubric:: References + + Richard G. Baraniuk "Compressive Sensing", IEEE Signal + Processing Magazine [120] July 2007 + http://users.isr.ist.utl.pt/~aguiar/CS_notes.pdf Tree-based feature selection @@ -245,20 +262,20 @@ meta-transformer):: >>> clf = ExtraTreesClassifier(n_estimators=50) >>> clf = clf.fit(X, y) >>> clf.feature_importances_ # doctest: +SKIP - array([ 0.04..., 0.05..., 0.4..., 0.4...]) + array([ 0.04, 0.05, 0.4, 0.4]) >>> model = SelectFromModel(clf, prefit=True) >>> X_new = model.transform(X) >>> X_new.shape # doctest: +SKIP (150, 2) -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_ensemble_plot_forest_importances.py`: example on - synthetic data showing the recovery of the actually meaningful - features. +* :ref:`sphx_glr_auto_examples_ensemble_plot_forest_importances.py`: example on + synthetic data showing the recovery of the actually meaningful features. - * :ref:`sphx_glr_auto_examples_ensemble_plot_forest_importances_faces.py`: example - on face recognition data. +* :ref:`sphx_glr_auto_examples_inspection_plot_permutation_importance.py`: example + discussing the caveats of using impurity-based feature importances as a proxy for + feature relevance. .. _sequential_feature_selection: @@ -271,43 +288,46 @@ SFS can be either forward or backward: Forward-SFS is a greedy procedure that iteratively finds the best new feature to add to the set of selected features. Concretely, we initially start with -zero feature and find the one feature that maximizes a cross-validated score +zero features and find the one feature that maximizes a cross-validated score when an estimator is trained on this single feature. Once that first feature is selected, we repeat the procedure by adding a new feature to the set of selected features. The procedure stops when the desired number of selected features is reached, as determined by the `n_features_to_select` parameter. Backward-SFS follows the same idea but works in the opposite direction: -instead of starting with no feature and greedily adding features, we start +instead of starting with no features and greedily adding features, we start with *all* the features and greedily *remove* features from the set. The `direction` parameter controls whether forward or backward SFS is used. -In general, forward and backward selection do not yield equivalent results. -Also, one may be much faster than the other depending on the requested number -of selected features: if we have 10 features and ask for 7 selected features, -forward selection would need to perform 7 iterations while backward selection -would only need to perform 3. +.. dropdown:: Details on Sequential Feature Selection -SFS differs from :class:`~sklearn.feature_selection.RFE` and -:class:`~sklearn.feature_selection.SelectFromModel` in that it does not -require the underlying model to expose a `coef_` or `feature_importances_` -attribute. It may however be slower considering that more models need to be -evaluated, compared to the other approaches. 
For example in backward -selection, the iteration going from `m` features to `m - 1` features using k-fold -cross-validation requires fitting `m * k` models, while -:class:`~sklearn.feature_selection.RFE` would require only a single fit, and -:class:`~sklearn.feature_selection.SelectFromModel` always just does a single -fit and requires no iterations. + In general, forward and backward selection do not yield equivalent results. + Also, one may be much faster than the other depending on the requested number + of selected features: if we have 10 features and ask for 7 selected features, + forward selection would need to perform 7 iterations while backward selection + would only need to perform 3. -.. topic:: Examples + SFS differs from :class:`~sklearn.feature_selection.RFE` and + :class:`~sklearn.feature_selection.SelectFromModel` in that it does not + require the underlying model to expose a `coef_` or `feature_importances_` + attribute. It may however be slower considering that more models need to be + evaluated, compared to the other approaches. For example in backward + selection, the iteration going from `m` features to `m - 1` features using k-fold + cross-validation requires fitting `m * k` models, while + :class:`~sklearn.feature_selection.RFE` would require only a single fit, and + :class:`~sklearn.feature_selection.SelectFromModel` always just does a single + fit and requires no iterations. - * :ref:`sphx_glr_auto_examples_feature_selection_plot_select_from_model_diabetes.py` + .. rubric:: References -.. topic:: References: - - .. [sfs] Ferri et al, `Comparative study of techniques for + .. [sfs] Ferri et al, `Comparative study of techniques for large-scale feature selection - `_. + `_. + + +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_feature_selection_plot_select_from_model_diabetes.py` Feature selection as part of a pipeline ======================================= diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst index 6aa9cb417aa5d..46d04ac35d832 100644 --- a/doc/modules/gaussian_process.rst +++ b/doc/modules/gaussian_process.rst @@ -1,5 +1,3 @@ - - .. _gaussian_process: ================== @@ -8,30 +6,30 @@ Gaussian Processes .. currentmodule:: sklearn.gaussian_process -**Gaussian Processes (GP)** are a generic supervised learning method designed +**Gaussian Processes (GP)** are a nonparametric supervised learning method used to solve *regression* and *probabilistic classification* problems. The advantages of Gaussian processes are: - - The prediction interpolates the observations (at least for regular - kernels). +- The prediction interpolates the observations (at least for regular + kernels). - - The prediction is probabilistic (Gaussian) so that one can compute - empirical confidence intervals and decide based on those if one should - refit (online fitting, adaptive fitting) the prediction in some - region of interest. +- The prediction is probabilistic (Gaussian) so that one can compute + empirical confidence intervals and decide based on those if one should + refit (online fitting, adaptive fitting) the prediction in some + region of interest. - - Versatile: different :ref:`kernels - ` can be specified. Common kernels are provided, but - it is also possible to specify custom kernels. +- Versatile: different :ref:`kernels + ` can be specified. Common kernels are provided, but + it is also possible to specify custom kernels. 
The disadvantages of Gaussian processes include: - - They are not sparse, i.e., they use the whole samples/features information to - perform the prediction. +- Our implementation is not sparse, i.e., they use the whole samples/features + information to perform the prediction. - - They lose efficiency in high dimensional spaces -- namely when the number - of features exceeds a few dozens. +- They lose efficiency in high dimensional spaces -- namely when the number + of features exceeds a few dozens. .. _gpr: @@ -42,31 +40,44 @@ Gaussian Process Regression (GPR) .. currentmodule:: sklearn.gaussian_process The :class:`GaussianProcessRegressor` implements Gaussian processes (GP) for -regression purposes. For this, the prior of the GP needs to be specified. The -prior mean is assumed to be constant and zero (for ``normalize_y=False``) or the -training data's mean (for ``normalize_y=True``). The prior's -covariance is specified by passing a :ref:`kernel ` object. The -hyperparameters of the kernel are optimized during fitting of -GaussianProcessRegressor by maximizing the log-marginal-likelihood (LML) based -on the passed ``optimizer``. As the LML may have multiple local optima, the -optimizer can be started repeatedly by specifying ``n_restarts_optimizer``. The -first run is always conducted starting from the initial hyperparameter values -of the kernel; subsequent runs are conducted from hyperparameter values -that have been chosen randomly from the range of allowed values. -If the initial hyperparameters should be kept fixed, `None` can be passed as -optimizer. +regression purposes. For this, the prior of the GP needs to be specified. GP +will combine this prior and the likelihood function based on training samples. +It allows to give a probabilistic approach to prediction by giving the mean and +standard deviation as output when predicting. -The noise level in the targets can be specified by passing it via the -parameter ``alpha``, either globally as a scalar or per datapoint. -Note that a moderate noise level can also be helpful for dealing with numeric -issues during fitting as it is effectively implemented as Tikhonov -regularization, i.e., by adding it to the diagonal of the kernel matrix. An -alternative to specifying the noise level explicitly is to include a -WhiteKernel component into the kernel, which can estimate the global noise -level from the data (see example below). +.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_targets_002.png + :target: ../auto_examples/gaussian_process/plot_gpr_noisy_targets.html + :align: center + +The prior mean is assumed to be constant and zero (for `normalize_y=False`) or +the training data's mean (for `normalize_y=True`). The prior's covariance is +specified by passing a :ref:`kernel ` object. The hyperparameters +of the kernel are optimized when fitting the :class:`GaussianProcessRegressor` +by maximizing the log-marginal-likelihood (LML) based on the passed +`optimizer`. As the LML may have multiple local optima, the optimizer can be +started repeatedly by specifying `n_restarts_optimizer`. The first run is +always conducted starting from the initial hyperparameter values of the kernel; +subsequent runs are conducted from hyperparameter values that have been chosen +randomly from the range of allowed values. If the initial hyperparameters +should be kept fixed, `None` can be passed as optimizer. 
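+
+For illustration, a minimal sketch of fitting a :class:`GaussianProcessRegressor`
+(on an arbitrary synthetic dataset, with starting hyperparameter values chosen
+here only for the example) could look as follows::
+
+    from sklearn.datasets import make_friedman2
+    from sklearn.gaussian_process import GaussianProcessRegressor
+    from sklearn.gaussian_process.kernels import RBF
+
+    X, y = make_friedman2(n_samples=500, noise=0.5, random_state=0)
+    # The kernel hyperparameters below are only initial values; they are
+    # refined during fit by maximizing the log-marginal-likelihood, with a
+    # few random restarts of the optimizer.
+    gpr = GaussianProcessRegressor(
+        kernel=1.0 * RBF(length_scale=1.0),
+        n_restarts_optimizer=5,
+        normalize_y=True,
+        random_state=0,
+    ).fit(X, y)
+    # Posterior predictive mean and standard deviation for new inputs.
+    y_mean, y_std = gpr.predict(X[:3], return_std=True)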
+ +The noise level in the targets can be specified by passing it via the parameter +`alpha`, either globally as a scalar or per datapoint. Note that a moderate +noise level can also be helpful for dealing with numeric instabilities during +fitting as it is effectively implemented as Tikhonov regularization, i.e., by +adding it to the diagonal of the kernel matrix. An alternative to specifying +the noise level explicitly is to include a +:class:`~sklearn.gaussian_process.kernels.WhiteKernel` component into the +kernel, which can estimate the global noise level from the data (see example +below). The figure below shows the effect of noisy target handled by setting +the parameter `alpha`. + +.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_targets_003.png + :target: ../auto_examples/gaussian_process/plot_gpr_noisy_targets.html + :align: center The implementation is based on Algorithm 2.1 of [RW2006]_. In addition to -the API of standard scikit-learn estimators, GaussianProcessRegressor: +the API of standard scikit-learn estimators, :class:`GaussianProcessRegressor`: * allows prediction without prior fitting (based on the GP prior) @@ -77,152 +88,12 @@ the API of standard scikit-learn estimators, GaussianProcessRegressor: externally for other ways of selecting hyperparameters, e.g., via Markov chain Monte Carlo. +.. rubric:: Examples -GPR examples -============ - -GPR with noise-level estimation -------------------------------- -This example illustrates that GPR with a sum-kernel including a WhiteKernel can -estimate the noise level of data. An illustration of the -log-marginal-likelihood (LML) landscape shows that there exist two local -maxima of LML. - -.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_001.png - :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html - :align: center - -The first corresponds to a model with a high noise level and a -large length scale, which explains all variations in the data by noise. - -.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_002.png - :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html - :align: center - -The second one has a smaller noise level and shorter length scale, which explains -most of the variation by the noise-free functional relationship. The second -model has a higher likelihood; however, depending on the initial value for the -hyperparameters, the gradient-based optimization might also converge to the -high-noise solution. It is thus important to repeat the optimization several -times for different initializations. - -.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_003.png - :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html - :align: center - - -Comparison of GPR and Kernel Ridge Regression ---------------------------------------------- - -Both kernel ridge regression (KRR) and GPR learn -a target function by employing internally the "kernel trick". KRR learns a -linear function in the space induced by the respective kernel which corresponds -to a non-linear function in the original space. The linear function in the -kernel space is chosen based on the mean-squared error loss with -ridge regularization. GPR uses the kernel to define the covariance of -a prior distribution over the target functions and uses the observed training -data to define a likelihood function. 
Based on Bayes theorem, a (Gaussian) -posterior distribution over target functions is defined, whose mean is used -for prediction. - -A major difference is that GPR can choose the kernel's hyperparameters based -on gradient-ascent on the marginal likelihood function while KRR needs to -perform a grid search on a cross-validated loss function (mean-squared error -loss). A further difference is that GPR learns a generative, probabilistic -model of the target function and can thus provide meaningful confidence -intervals and posterior samples along with the predictions while KRR only -provides predictions. - -The following figure illustrates both methods on an artificial dataset, which -consists of a sinusoidal target function and strong noise. The figure compares -the learned model of KRR and GPR based on a ExpSineSquared kernel, which is -suited for learning periodic functions. The kernel's hyperparameters control -the smoothness (length_scale) and periodicity of the kernel (periodicity). -Moreover, the noise level -of the data is learned explicitly by GPR by an additional WhiteKernel component -in the kernel and by the regularization parameter alpha of KRR. - -.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_compare_gpr_krr_001.png - :target: ../auto_examples/gaussian_process/plot_compare_gpr_krr.html - :align: center - -The figure shows that both methods learn reasonable models of the target -function. GPR correctly identifies the periodicity of the function to be -roughly :math:`2*\pi` (6.28), while KRR chooses the doubled periodicity -:math:`4*\pi` . Besides -that, GPR provides reasonable confidence bounds on the prediction which are not -available for KRR. A major difference between the two methods is the time -required for fitting and predicting: while fitting KRR is fast in principle, -the grid-search for hyperparameter optimization scales exponentially with the -number of hyperparameters ("curse of dimensionality"). The gradient-based -optimization of the parameters in GPR does not suffer from this exponential -scaling and is thus considerably faster on this example with 3-dimensional -hyperparameter space. The time for predicting is similar; however, generating -the variance of the predictive distribution of GPR takes considerably longer -than just predicting the mean. - -GPR on Mauna Loa CO2 data -------------------------- - -This example is based on Section 5.4.3 of [RW2006]_. -It illustrates an example of complex kernel engineering and -hyperparameter optimization using gradient ascent on the -log-marginal-likelihood. The data consists of the monthly average atmospheric -CO2 concentrations (in parts per million by volume (ppmv)) collected at the -Mauna Loa Observatory in Hawaii, between 1958 and 1997. The objective is to -model the CO2 concentration as a function of the time t. - -The kernel is composed of several terms that are responsible for explaining -different properties of the signal: - -- a long term, smooth rising trend is to be explained by an RBF kernel. The - RBF kernel with a large length-scale enforces this component to be smooth; - it is not enforced that the trend is rising which leaves this choice to the - GP. The specific length-scale and the amplitude are free hyperparameters. - -- a seasonal component, which is to be explained by the periodic - ExpSineSquared kernel with a fixed periodicity of 1 year. The length-scale - of this periodic component, controlling its smoothness, is a free parameter. 
- In order to allow decaying away from exact periodicity, the product with an - RBF kernel is taken. The length-scale of this RBF component controls the - decay time and is a further free parameter. - -- smaller, medium term irregularities are to be explained by a - RationalQuadratic kernel component, whose length-scale and alpha parameter, - which determines the diffuseness of the length-scales, are to be determined. - According to [RW2006]_, these irregularities can better be explained by - a RationalQuadratic than an RBF kernel component, probably because it can - accommodate several length-scales. - -- a "noise" term, consisting of an RBF kernel contribution, which shall - explain the correlated noise components such as local weather phenomena, - and a WhiteKernel contribution for the white noise. The relative amplitudes - and the RBF's length scale are further free parameters. - -Maximizing the log-marginal-likelihood after subtracting the target's mean -yields the following kernel with an LML of -83.214: - -:: - - 34.4**2 * RBF(length_scale=41.8) - + 3.27**2 * RBF(length_scale=180) * ExpSineSquared(length_scale=1.44, - periodicity=1) - + 0.446**2 * RationalQuadratic(alpha=17.7, length_scale=0.957) - + 0.197**2 * RBF(length_scale=0.138) + WhiteKernel(noise_level=0.0336) - -Thus, most of the target signal (34.4ppm) is explained by a long-term rising -trend (length-scale 41.8 years). The periodic component has an amplitude of -3.27ppm, a decay time of 180 years and a length-scale of 1.44. The long decay -time indicates that we have a locally very close to periodic seasonal -component. The correlated noise has an amplitude of 0.197ppm with a length -scale of 0.138 years and a white-noise contribution of 0.197ppm. Thus, the -overall noise level is very small, indicating that the data can be very well -explained by the model. The figure shows also that the model makes very -confident predictions until around 2015 - -.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_co2_001.png - :target: ../auto_examples/gaussian_process/plot_gpr_co2.html - :align: center +* :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_noisy_targets.py` +* :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_noisy.py` +* :ref:`sphx_glr_auto_examples_gaussian_process_plot_compare_gpr_krr.py` +* :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_co2.py` .. _gpc: @@ -235,11 +106,11 @@ The :class:`GaussianProcessClassifier` implements Gaussian processes (GP) for classification purposes, more specifically for probabilistic classification, where test predictions take the form of class probabilities. GaussianProcessClassifier places a GP prior on a latent function :math:`f`, -which is then squashed through a link function to obtain the probabilistic +which is then squashed through a link function :math:`\pi` to obtain the probabilistic classification. The latent function :math:`f` is a so-called nuisance function, whose values are not observed and are not relevant by themselves. Its purpose is to allow a convenient formulation of the model, and :math:`f` -is removed (integrated out) during prediction. GaussianProcessClassifier +is removed (integrated out) during prediction. :class:`GaussianProcessClassifier` implements the logistic link function, for which the integral cannot be computed analytically but is easily approximated in the binary case. @@ -263,6 +134,11 @@ that have been chosen randomly from the range of allowed values. 
If the initial hyperparameters should be kept fixed, `None` can be passed as optimizer. +In some scenarios, information about the latent function :math:`f` is desired +(i.e. the mean :math:`\bar{f_*}` and the variance :math:`\text{Var}[f_*]` described +in Eqs. (3.21) and (3.24) of [RW2006]_). The :class:`GaussianProcessClassifier` +provides access to these quantities via the `latent_mean_and_variance` method. + :class:`GaussianProcessClassifier` supports multi-class classification by performing either one-versus-rest or one-versus-one based training and prediction. In one-versus-rest, one binary Gaussian process classifier is @@ -298,7 +174,7 @@ While the hyperparameters chosen by optimizing LML have a considerably larger LML, they perform slightly worse according to the log-loss on test data. The figure shows that this is because they exhibit a steep change of the class probabilities at the class boundaries (which is good) but have predicted -probabilities close to 0.5 far away from the class boundaries (which is bad) +probabilities close to 0.5 far away from the class boundaries (which is bad). This undesirable effect is caused by the Laplace approximation used internally by GPC. @@ -338,7 +214,7 @@ Gaussian process classification (GPC) on iris dataset ----------------------------------------------------- This example illustrates the predicted probability of GPC for an isotropic -and anisotropic RBF kernel on a two-dimensional version for the iris-dataset. +and anisotropic RBF kernel on a two-dimensional version for the iris dataset. This illustrates the applicability of GPC to non-binary classification. The anisotropic RBF kernel obtains slightly higher log-marginal-likelihood by assigning different length-scales to the two feature dimensions. @@ -365,91 +241,93 @@ translations in the input space, while non-stationary kernels depend also on the specific values of the datapoints. Stationary kernels can further be subdivided into isotropic and anisotropic kernels, where isotropic kernels are also invariant to rotations in the input space. For more details, we refer to -Chapter 4 of [RW2006]_. For guidance on how to best combine different kernels, -we refer to [Duv2014]_. - -Gaussian Process Kernel API ---------------------------- -The main usage of a :class:`Kernel` is to compute the GP's covariance between -datapoints. For this, the method ``__call__`` of the kernel can be called. This -method can either be used to compute the "auto-covariance" of all pairs of -datapoints in a 2d array X, or the "cross-covariance" of all combinations -of datapoints of a 2d array X with datapoints in a 2d array Y. The following -identity holds true for all kernels k (except for the :class:`WhiteKernel`): -``k(X) == K(X, Y=X)`` - -If only the diagonal of the auto-covariance is being used, the method ``diag()`` -of a kernel can be called, which is more computationally efficient than the -equivalent call to ``__call__``: ``np.diag(k(X, X)) == k.diag(X)`` - -Kernels are parameterized by a vector :math:`\theta` of hyperparameters. These -hyperparameters can for instance control length-scales or periodicity of a -kernel (see below). All kernels support computing analytic gradients -of the kernel's auto-covariance with respect to :math:`log(\theta)` via setting -``eval_gradient=True`` in the ``__call__`` method. -That is, a ``(len(X), len(X), len(theta))`` array is returned where the entry -``[i, j, l]`` contains :math:`\frac{\partial k_\theta(x_i, x_j)}{\partial log(\theta_l)}`. 
-This gradient is used by the Gaussian process (both regressor and classifier) -in computing the gradient of the log-marginal-likelihood, which in turn is used -to determine the value of :math:`\theta`, which maximizes the log-marginal-likelihood, -via gradient ascent. For each hyperparameter, the initial value and the -bounds need to be specified when creating an instance of the kernel. The -current value of :math:`\theta` can be get and set via the property -``theta`` of the kernel object. Moreover, the bounds of the hyperparameters can be -accessed by the property ``bounds`` of the kernel. Note that both properties -(theta and bounds) return log-transformed values of the internally used values -since those are typically more amenable to gradient-based optimization. -The specification of each hyperparameter is stored in the form of an instance of -:class:`Hyperparameter` in the respective kernel. Note that a kernel using a -hyperparameter with name "x" must have the attributes self.x and self.x_bounds. - -The abstract base class for all kernels is :class:`Kernel`. Kernel implements a -similar interface as :class:`Estimator`, providing the methods ``get_params()``, -``set_params()``, and ``clone()``. This allows setting kernel values also via -meta-estimators such as :class:`Pipeline` or :class:`GridSearch`. Note that due to the nested -structure of kernels (by applying kernel operators, see below), the names of -kernel parameters might become relatively complicated. In general, for a -binary kernel operator, parameters of the left operand are prefixed with ``k1__`` -and parameters of the right operand with ``k2__``. An additional convenience -method is ``clone_with_theta(theta)``, which returns a cloned version of the -kernel but with the hyperparameters set to ``theta``. An illustrative example: - - >>> from sklearn.gaussian_process.kernels import ConstantKernel, RBF - >>> kernel = ConstantKernel(constant_value=1.0, constant_value_bounds=(0.0, 10.0)) * RBF(length_scale=0.5, length_scale_bounds=(0.0, 10.0)) + RBF(length_scale=2.0, length_scale_bounds=(0.0, 10.0)) - >>> for hyperparameter in kernel.hyperparameters: print(hyperparameter) - Hyperparameter(name='k1__k1__constant_value', value_type='numeric', bounds=array([[ 0., 10.]]), n_elements=1, fixed=False) - Hyperparameter(name='k1__k2__length_scale', value_type='numeric', bounds=array([[ 0., 10.]]), n_elements=1, fixed=False) - Hyperparameter(name='k2__length_scale', value_type='numeric', bounds=array([[ 0., 10.]]), n_elements=1, fixed=False) - >>> params = kernel.get_params() - >>> for key in sorted(params): print("%s : %s" % (key, params[key])) - k1 : 1**2 * RBF(length_scale=0.5) - k1__k1 : 1**2 - k1__k1__constant_value : 1.0 - k1__k1__constant_value_bounds : (0.0, 10.0) - k1__k2 : RBF(length_scale=0.5) - k1__k2__length_scale : 0.5 - k1__k2__length_scale_bounds : (0.0, 10.0) - k2 : RBF(length_scale=2) - k2__length_scale : 2.0 - k2__length_scale_bounds : (0.0, 10.0) - >>> print(kernel.theta) # Note: log-transformed - [ 0. -0.69314718 0.69314718] - >>> print(kernel.bounds) # Note: log-transformed - [[ -inf 2.30258509] - [ -inf 2.30258509] - [ -inf 2.30258509]] - - -All Gaussian process kernels are interoperable with :mod:`sklearn.metrics.pairwise` -and vice versa: instances of subclasses of :class:`Kernel` can be passed as -``metric`` to ``pairwise_kernels`` from :mod:`sklearn.metrics.pairwise`. Moreover, -kernel functions from pairwise can be used as GP kernels by using the wrapper -class :class:`PairwiseKernel`. 
The only caveat is that the gradient of -the hyperparameters is not analytic but numeric and all those kernels support -only isotropic distances. The parameter ``gamma`` is considered to be a -hyperparameter and may be optimized. The other kernel parameters are set -directly at initialization and are kept fixed. - +Chapter 4 of [RW2006]_. :ref:`This example +` +shows how to define a custom kernel over discrete data. For guidance on how to best +combine different kernels, we refer to [Duv2014]_. + +.. dropdown:: Gaussian Process Kernel API + + The main usage of a :class:`Kernel` is to compute the GP's covariance between + datapoints. For this, the method ``__call__`` of the kernel can be called. This + method can either be used to compute the "auto-covariance" of all pairs of + datapoints in a 2d array X, or the "cross-covariance" of all combinations + of datapoints of a 2d array X with datapoints in a 2d array Y. The following + identity holds true for all kernels k (except for the :class:`WhiteKernel`): + ``k(X) == K(X, Y=X)`` + + If only the diagonal of the auto-covariance is being used, the method ``diag()`` + of a kernel can be called, which is more computationally efficient than the + equivalent call to ``__call__``: ``np.diag(k(X, X)) == k.diag(X)`` + + Kernels are parameterized by a vector :math:`\theta` of hyperparameters. These + hyperparameters can for instance control length-scales or periodicity of a + kernel (see below). All kernels support computing analytic gradients + of the kernel's auto-covariance with respect to :math:`log(\theta)` via setting + ``eval_gradient=True`` in the ``__call__`` method. + That is, a ``(len(X), len(X), len(theta))`` array is returned where the entry + ``[i, j, l]`` contains :math:`\frac{\partial k_\theta(x_i, x_j)}{\partial log(\theta_l)}`. + This gradient is used by the Gaussian process (both regressor and classifier) + in computing the gradient of the log-marginal-likelihood, which in turn is used + to determine the value of :math:`\theta`, which maximizes the log-marginal-likelihood, + via gradient ascent. For each hyperparameter, the initial value and the + bounds need to be specified when creating an instance of the kernel. The + current value of :math:`\theta` can be get and set via the property + ``theta`` of the kernel object. Moreover, the bounds of the hyperparameters can be + accessed by the property ``bounds`` of the kernel. Note that both properties + (theta and bounds) return log-transformed values of the internally used values + since those are typically more amenable to gradient-based optimization. + The specification of each hyperparameter is stored in the form of an instance of + :class:`Hyperparameter` in the respective kernel. Note that a kernel using a + hyperparameter with name "x" must have the attributes self.x and self.x_bounds. + + The abstract base class for all kernels is :class:`Kernel`. Kernel implements a + similar interface as :class:`~sklearn.base.BaseEstimator`, providing the + methods ``get_params()``, ``set_params()``, and ``clone()``. This allows + setting kernel values also via meta-estimators such as + :class:`~sklearn.pipeline.Pipeline` or + :class:`~sklearn.model_selection.GridSearchCV`. Note that due to the nested + structure of kernels (by applying kernel operators, see below), the names of + kernel parameters might become relatively complicated. In general, for a binary + kernel operator, parameters of the left operand are prefixed with ``k1__`` and + parameters of the right operand with ``k2__``. 
An additional convenience method + is ``clone_with_theta(theta)``, which returns a cloned version of the kernel + but with the hyperparameters set to ``theta``. An illustrative example: + + >>> from sklearn.gaussian_process.kernels import ConstantKernel, RBF + >>> kernel = ConstantKernel(constant_value=1.0, constant_value_bounds=(0.0, 10.0)) * RBF(length_scale=0.5, length_scale_bounds=(0.0, 10.0)) + RBF(length_scale=2.0, length_scale_bounds=(0.0, 10.0)) + >>> for hyperparameter in kernel.hyperparameters: print(hyperparameter) + Hyperparameter(name='k1__k1__constant_value', value_type='numeric', bounds=array([[ 0., 10.]]), n_elements=1, fixed=False) + Hyperparameter(name='k1__k2__length_scale', value_type='numeric', bounds=array([[ 0., 10.]]), n_elements=1, fixed=False) + Hyperparameter(name='k2__length_scale', value_type='numeric', bounds=array([[ 0., 10.]]), n_elements=1, fixed=False) + >>> params = kernel.get_params() + >>> for key in sorted(params): print("%s : %s" % (key, params[key])) + k1 : 1**2 * RBF(length_scale=0.5) + k1__k1 : 1**2 + k1__k1__constant_value : 1.0 + k1__k1__constant_value_bounds : (0.0, 10.0) + k1__k2 : RBF(length_scale=0.5) + k1__k2__length_scale : 0.5 + k1__k2__length_scale_bounds : (0.0, 10.0) + k2 : RBF(length_scale=2) + k2__length_scale : 2.0 + k2__length_scale_bounds : (0.0, 10.0) + >>> print(kernel.theta) # Note: log-transformed + [ 0. -0.69314718 0.69314718] + >>> print(kernel.bounds) # Note: log-transformed + [[ -inf 2.30258509] + [ -inf 2.30258509] + [ -inf 2.30258509]] + + All Gaussian process kernels are interoperable with :mod:`sklearn.metrics.pairwise` + and vice versa: instances of subclasses of :class:`Kernel` can be passed as + ``metric`` to ``pairwise_kernels`` from :mod:`sklearn.metrics.pairwise`. Moreover, + kernel functions from pairwise can be used as GP kernels by using the wrapper + class :class:`PairwiseKernel`. The only caveat is that the gradient of + the hyperparameters is not analytic but numeric and all those kernels support + only isotropic distances. The parameter ``gamma`` is considered to be a + hyperparameter and may be optimized. The other kernel parameters are set + directly at initialization and are kept fixed. Basic kernels ------------- @@ -484,7 +362,7 @@ Note that magic methods ``__add__``, ``__mul___`` and ``__pow__`` are overridden on the Kernel objects, so one can use e.g. ``RBF() + RBF()`` as a shortcut for ``Sum(RBF(), RBF())``. -Radial-basis function (RBF) kernel +Radial basis function (RBF) kernel ---------------------------------- The :class:`RBF` kernel is a stationary kernel. It is also known as the "squared exponential" kernel. It is parameterized by a length-scale parameter :math:`l>0`, which @@ -510,36 +388,41 @@ MatÊrn kernel ------------- The :class:`Matern` kernel is a stationary kernel and a generalization of the :class:`RBF` kernel. It has an additional parameter :math:`\nu` which controls -the smoothness of the resulting function. It is parameterized by a length-scale parameter :math:`l>0`, which can either be a scalar (isotropic variant of the kernel) or a vector with the same number of dimensions as the inputs :math:`x` (anisotropic variant of the kernel). The kernel is given by: +the smoothness of the resulting function. It is parameterized by a length-scale parameter :math:`l>0`, which can either be a scalar (isotropic variant of the kernel) or a vector with the same number of dimensions as the inputs :math:`x` (anisotropic variant of the kernel). -.. math:: +.. 
dropdown:: Mathematical implementation of MatÊrn kernel - k(x_i, x_j) = \frac{1}{\Gamma(\nu)2^{\nu-1}}\Bigg(\frac{\sqrt{2\nu}}{l} d(x_i , x_j )\Bigg)^\nu K_\nu\Bigg(\frac{\sqrt{2\nu}}{l} d(x_i , x_j )\Bigg), + The kernel is given by: -where :math:`d(\cdot,\cdot)` is the Euclidean distance, :math:`K_\nu(\cdot)` is a modified Bessel function and :math:`\Gamma(\cdot)` is the gamma function. -As :math:`\nu\rightarrow\infty`, the MatÊrn kernel converges to the RBF kernel. -When :math:`\nu = 1/2`, the MatÊrn kernel becomes identical to the absolute -exponential kernel, i.e., + .. math:: -.. math:: - k(x_i, x_j) = \exp \Bigg(- \frac{1}{l} d(x_i , x_j ) \Bigg) \quad \quad \nu= \tfrac{1}{2} + k(x_i, x_j) = \frac{1}{\Gamma(\nu)2^{\nu-1}}\Bigg(\frac{\sqrt{2\nu}}{l} d(x_i , x_j )\Bigg)^\nu K_\nu\Bigg(\frac{\sqrt{2\nu}}{l} d(x_i , x_j )\Bigg), -In particular, :math:`\nu = 3/2`: + where :math:`d(\cdot,\cdot)` is the Euclidean distance, :math:`K_\nu(\cdot)` is a modified Bessel function and :math:`\Gamma(\cdot)` is the gamma function. + As :math:`\nu\rightarrow\infty`, the MatÊrn kernel converges to the RBF kernel. + When :math:`\nu = 1/2`, the MatÊrn kernel becomes identical to the absolute + exponential kernel, i.e., -.. math:: - k(x_i, x_j) = \Bigg(1 + \frac{\sqrt{3}}{l} d(x_i , x_j )\Bigg) \exp \Bigg(-\frac{\sqrt{3}}{l} d(x_i , x_j ) \Bigg) \quad \quad \nu= \tfrac{3}{2} + .. math:: + k(x_i, x_j) = \exp \Bigg(- \frac{1}{l} d(x_i , x_j ) \Bigg) \quad \quad \nu= \tfrac{1}{2} -and :math:`\nu = 5/2`: + In particular, :math:`\nu = 3/2`: -.. math:: - k(x_i, x_j) = \Bigg(1 + \frac{\sqrt{5}}{l} d(x_i , x_j ) +\frac{5}{3l} d(x_i , x_j )^2 \Bigg) \exp \Bigg(-\frac{\sqrt{5}}{l} d(x_i , x_j ) \Bigg) \quad \quad \nu= \tfrac{5}{2} + .. math:: + k(x_i, x_j) = \Bigg(1 + \frac{\sqrt{3}}{l} d(x_i , x_j )\Bigg) \exp \Bigg(-\frac{\sqrt{3}}{l} d(x_i , x_j ) \Bigg) \quad \quad \nu= \tfrac{3}{2} + + and :math:`\nu = 5/2`: + + .. math:: + k(x_i, x_j) = \Bigg(1 + \frac{\sqrt{5}}{l} d(x_i , x_j ) +\frac{5}{3l} d(x_i , x_j )^2 \Bigg) \exp \Bigg(-\frac{\sqrt{5}}{l} d(x_i , x_j ) \Bigg) \quad \quad \nu= \tfrac{5}{2} + + are popular choices for learning functions that are not infinitely + differentiable (as assumed by the RBF kernel) but at least once (:math:`\nu = + 3/2`) or twice differentiable (:math:`\nu = 5/2`). -are popular choices for learning functions that are not infinitely -differentiable (as assumed by the RBF kernel) but at least once (:math:`\nu = -3/2`) or twice differentiable (:math:`\nu = 5/2`). + The flexibility of controlling the smoothness of the learned function via :math:`\nu` + allows adapting to the properties of the true underlying functional relation. -The flexibility of controlling the smoothness of the learned function via :math:`\nu` -allows adapting to the properties of the true underlying functional relation. The prior and posterior of a GP resulting from a MatÊrn kernel are shown in the following figure: @@ -610,8 +493,11 @@ shown in the following figure: References ---------- -.. [RW2006] Carl Eduard Rasmussen and Christopher K.I. Williams, "Gaussian Processes for Machine Learning", MIT Press 2006, Link to an official complete PDF version of the book `here `_ . +.. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams, + "Gaussian Processes for Machine Learning", + MIT Press 2006 `_ -.. [Duv2014] David Duvenaud, "The Kernel Cookbook: Advice on Covariance functions", 2014, `Link `_ . +.. [Duv2014] `David Duvenaud, "The Kernel Cookbook: Advice on Covariance functions", 2014 + `_ .. 
currentmodule:: sklearn.gaussian_process diff --git a/doc/modules/grid_search.rst b/doc/modules/grid_search.rst index 331607c86e657..edb915b193e37 100644 --- a/doc/modules/grid_search.rst +++ b/doc/modules/grid_search.rst @@ -72,35 +72,41 @@ evaluated and the best combination is retained. .. currentmodule:: sklearn.model_selection -.. topic:: Examples: +.. rubric:: Examples - - See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_digits.py` for an example of - Grid Search computation on the digits dataset. +- See :ref:`sphx_glr_auto_examples_model_selection_plot_nested_cross_validation_iris.py` + for an example of Grid Search within a cross validation loop on the iris + dataset. This is the best practice for evaluating the performance of a + model with grid search. - - See :ref:`sphx_glr_auto_examples_model_selection_grid_search_text_feature_extraction.py` for an example - of Grid Search coupling parameters from a text documents feature - extractor (n-gram count vectorizer and TF-IDF transformer) with a - classifier (here a linear SVM trained with SGD with either elastic - net or L2 penalty) using a :class:`pipeline.Pipeline` instance. +- See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_text_feature_extraction.py` for an example + of Grid Search coupling parameters from a text documents feature + extractor (n-gram count vectorizer and TF-IDF transformer) with a + classifier (here a linear SVM trained with SGD with either elastic + net or L2 penalty) using a :class:`~sklearn.pipeline.Pipeline` instance. - - See :ref:`sphx_glr_auto_examples_model_selection_plot_nested_cross_validation_iris.py` - for an example of Grid Search within a cross validation loop on the iris - dataset. This is the best practice for evaluating the performance of a - model with grid search. - - See :ref:`sphx_glr_auto_examples_model_selection_plot_multi_metric_evaluation.py` - for an example of :class:`GridSearchCV` being used to evaluate multiple - metrics simultaneously. +.. dropdown:: Advanced examples - - See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_refit_callable.py` - for an example of using ``refit=callable`` interface in - :class:`GridSearchCV`. The example shows how this interface adds certain - amount of flexibility in identifying the "best" estimator. This interface - can also be used in multiple metrics evaluation. + - See :ref:`sphx_glr_auto_examples_model_selection_plot_nested_cross_validation_iris.py` + for an example of Grid Search within a cross validation loop on the iris + dataset. This is the best practice for evaluating the performance of a + model with grid search. + + - See :ref:`sphx_glr_auto_examples_model_selection_plot_multi_metric_evaluation.py` + for an example of :class:`GridSearchCV` being used to evaluate multiple + metrics simultaneously. + + - See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_refit_callable.py` + for an example of using ``refit=callable`` interface in + :class:`GridSearchCV`. The example shows how this interface adds a certain + amount of flexibility in identifying the "best" estimator. This interface + can also be used in multiple metrics evaluation. + + - See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_stats.py` + for an example of how to do a statistical comparison on the outputs of + :class:`GridSearchCV`. - - See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_stats.py` - for an example of how to do a statistical comparison on the outputs of - :class:`GridSearchCV`. 
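+
+As a point of reference for the randomized search described below, a minimal
+sketch of an exhaustive grid search (dataset and hyperparameter values chosen
+arbitrarily for illustration) could look as follows::
+
+    from sklearn.datasets import load_iris
+    from sklearn.model_selection import GridSearchCV
+    from sklearn.svm import SVC
+
+    X, y = load_iris(return_X_y=True)
+    # Every combination of the values below is fitted and cross-validated.
+    param_grid = {"C": [1, 10, 100], "kernel": ["linear", "rbf"]}
+    search = GridSearchCV(SVC(), param_grid, cv=5).fit(X, y)
+    search.best_params_
+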
.. _randomized_parameter_search: @@ -108,7 +114,7 @@ Randomized Parameter Optimization ================================= While using a grid of parameter settings is currently the most widely used method for parameter optimization, other search methods have more -favourable properties. +favorable properties. :class:`RandomizedSearchCV` implements a randomized search over parameters, where each setting is sampled from a distribution over possible parameter values. This has two main benefits over an exhaustive search: @@ -128,32 +134,29 @@ discrete choices (which will be sampled uniformly) can be specified:: This example uses the ``scipy.stats`` module, which contains many useful distributions for sampling parameters, such as ``expon``, ``gamma``, -``uniform`` or ``randint``. +``uniform``, ``loguniform`` or ``randint``. In principle, any function can be passed that provides a ``rvs`` (random variate sample) method to sample a value. A call to the ``rvs`` function should provide independent random samples from possible parameter values on consecutive calls. - .. warning:: +.. warning:: - The distributions in ``scipy.stats`` prior to version scipy 0.16 - do not allow specifying a random state. Instead, they use the global - numpy random state, that can be seeded via ``np.random.seed`` or set - using ``np.random.set_state``. However, beginning scikit-learn 0.18, - the :mod:`sklearn.model_selection` module sets the random state provided - by the user if scipy >= 0.16 is also available. + The distributions in ``scipy.stats`` prior to version scipy 0.16 + do not allow specifying a random state. Instead, they use the global + numpy random state, that can be seeded via ``np.random.seed`` or set + using ``np.random.set_state``. However, beginning scikit-learn 0.18, + the :mod:`sklearn.model_selection` module sets the random state provided + by the user if scipy >= 0.16 is also available. For continuous parameters, such as ``C`` above, it is important to specify a continuous distribution to take full advantage of the randomization. This way, increasing ``n_iter`` will always lead to a finer search. -A continuous log-uniform random variable is available through -:class:`~sklearn.utils.fixes.loguniform`. This is a continuous version of -log-spaced parameters. For example to specify ``C`` above, ``loguniform(1, -100)`` can be used instead of ``[1, 10, 100]`` or ``np.logspace(0, 2, -num=1000)``. This is an alias to SciPy's `stats.reciprocal -`_. +A continuous log-uniform random variable is the continuous version of +a log-spaced parameter. For example to specify the equivalent of ``C`` from above, +``loguniform(1, 100)`` can be used instead of ``[1, 10, 100]``. Mirroring the example above in grid search, we can specify a continuous random variable that is log-uniformly distributed between ``1e0`` and ``1e3``:: @@ -164,16 +167,16 @@ variable that is log-uniformly distributed between ``1e0`` and ``1e3``:: 'kernel': ['rbf'], 'class_weight':['balanced', None]} -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_model_selection_plot_randomized_search.py` compares the usage and efficiency - of randomized search and grid search. +* :ref:`sphx_glr_auto_examples_model_selection_plot_randomized_search.py` compares the usage and efficiency + of randomized search and grid search. -.. topic:: References: +.. rubric:: References - * Bergstra, J. and Bengio, Y., - Random search for hyper-parameter optimization, - The Journal of Machine Learning Research (2012) +* Bergstra, J. 
and Bengio, Y., + Random search for hyper-parameter optimization, + The Journal of Machine Learning Research (2012) .. _successive_halving_user_guide: @@ -191,6 +194,11 @@ iteration, which will be allocated more resources. For parameter tuning, the resource is typically the number of training samples, but it can also be an arbitrary numeric parameter such as `n_estimators` in a random forest. +.. note:: + + The resource increase chosen should be large enough so that a large improvement + in scores is obtained when taking into account statistical significance. + As illustrated in the figure below, only a subset of candidates 'survive' until the last iteration. These are the candidates that have consistently ranked among the top-scoring candidates across all iterations. @@ -202,7 +210,7 @@ here the number of samples. :align: center We here briefly describe the main parameters, but each parameter and their -interactions are described in more details in the sections below. The +interactions are described more in detail in the dropdown section below. The ``factor`` (> 1) parameter controls the rate at which the resources grow, and the rate at which the number of candidates decreases. In each iteration, the number of resources per candidate is multiplied by ``factor`` and the number @@ -219,328 +227,323 @@ These estimators are still **experimental**: their predictions and their API might change without any deprecation cycle. To use them, you need to explicitly import ``enable_halving_search_cv``:: - >>> # explicitly require this experimental feature >>> from sklearn.experimental import enable_halving_search_cv # noqa - >>> # now you can import normally from model_selection >>> from sklearn.model_selection import HalvingGridSearchCV >>> from sklearn.model_selection import HalvingRandomSearchCV -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_model_selection_plot_successive_halving_heatmap.py` - * :ref:`sphx_glr_auto_examples_model_selection_plot_successive_halving_iterations.py` - -Choosing ``min_resources`` and the number of candidates -------------------------------------------------------- - -Beside ``factor``, the two main parameters that influence the behaviour of a -successive halving search are the ``min_resources`` parameter, and the -number of candidates (or parameter combinations) that are evaluated. -``min_resources`` is the amount of resources allocated at the first -iteration for each candidate. The number of candidates is specified directly -in :class:`HalvingRandomSearchCV`, and is determined from the ``param_grid`` -parameter of :class:`HalvingGridSearchCV`. - -Consider a case where the resource is the number of samples, and where we -have 1000 samples. In theory, with ``min_resources=10`` and ``factor=2``, we -are able to run **at most** 7 iterations with the following number of -samples: ``[10, 20, 40, 80, 160, 320, 640]``. - -But depending on the number of candidates, we might run less than 7 -iterations: if we start with a **small** number of candidates, the last -iteration might use less than 640 samples, which means not using all the -available resources (samples). For example if we start with 5 candidates, we -only need 2 iterations: 5 candidates for the first iteration, then -`5 // 2 = 2` candidates at the second iteration, after which we know which -candidate performs the best (so we don't need a third one). We would only be -using at most 20 samples which is a waste since we have 1000 samples at our -disposal. 
On the other hand, if we start with a **high** number of -candidates, we might end up with a lot of candidates at the last iteration, -which may not always be ideal: it means that many candidates will run with -the full resources, basically reducing the procedure to standard search. - -In the case of :class:`HalvingRandomSearchCV`, the number of candidates is set -by default such that the last iteration uses as much of the available -resources as possible. For :class:`HalvingGridSearchCV`, the number of -candidates is determined by the `param_grid` parameter. Changing the value of -``min_resources`` will impact the number of possible iterations, and as a -result will also have an effect on the ideal number of candidates. - -Another consideration when choosing ``min_resources`` is whether or not it -is easy to discriminate between good and bad candidates with a small amount -of resources. For example, if you need a lot of samples to distinguish -between good and bad parameters, a high ``min_resources`` is recommended. On -the other hand if the distinction is clear even with a small amount of -samples, then a small ``min_resources`` may be preferable since it would -speed up the computation. - -Notice in the example above that the last iteration does not use the maximum -amount of resources available: 1000 samples are available, yet only 640 are -used, at most. By default, both :class:`HalvingRandomSearchCV` and -:class:`HalvingGridSearchCV` try to use as many resources as possible in the -last iteration, with the constraint that this amount of resources must be a -multiple of both `min_resources` and `factor` (this constraint will be clear -in the next section). :class:`HalvingRandomSearchCV` achieves this by -sampling the right amount of candidates, while :class:`HalvingGridSearchCV` -achieves this by properly setting `min_resources`. Please see -:ref:`exhausting_the_resources` for details. - -.. _amount_of_resource_and_number_of_candidates: - -Amount of resource and number of candidates at each iteration -------------------------------------------------------------- - -At any iteration `i`, each candidate is allocated a given amount of resources -which we denote `n_resources_i`. This quantity is controlled by the -parameters ``factor`` and ``min_resources`` as follows (`factor` is strictly -greater than 1):: - - n_resources_i = factor**i * min_resources, - -or equivalently:: - - n_resources_{i+1} = n_resources_i * factor - -where ``min_resources == n_resources_0`` is the amount of resources used at -the first iteration. ``factor`` also defines the proportions of candidates -that will be selected for the next iteration:: - - n_candidates_i = n_candidates // (factor ** i) - -or equivalently:: - - n_candidates_0 = n_candidates - n_candidates_{i+1} = n_candidates_i // factor - -So in the first iteration, we use ``min_resources`` resources -``n_candidates`` times. In the second iteration, we use ``min_resources * -factor`` resources ``n_candidates // factor`` times. The third again -multiplies the resources per candidate and divides the number of candidates. -This process stops when the maximum amount of resource per candidate is -reached, or when we have identified the best candidate. The best candidate -is identified at the iteration that is evaluating `factor` or less candidates -(see just below for an explanation). 
- -Here is an example with ``min_resources=3`` and ``factor=2``, starting with -70 candidates: - -+-----------------------+-----------------------+ -| ``n_resources_i`` | ``n_candidates_i`` | -+=======================+=======================+ -| 3 (=min_resources) | 70 (=n_candidates) | -+-----------------------+-----------------------+ -| 3 * 2 = 6 | 70 // 2 = 35 | -+-----------------------+-----------------------+ -| 6 * 2 = 12 | 35 // 2 = 17 | -+-----------------------+-----------------------+ -| 12 * 2 = 24 | 17 // 2 = 8 | -+-----------------------+-----------------------+ -| 24 * 2 = 48 | 8 // 2 = 4 | -+-----------------------+-----------------------+ -| 48 * 2 = 96 | 4 // 2 = 2 | -+-----------------------+-----------------------+ - -We can note that: - -- the process stops at the first iteration which evaluates `factor=2` - candidates: the best candidate is the best out of these 2 candidates. It - is not necessary to run an additional iteration, since it would only - evaluate one candidate (namely the best one, which we have already - identified). For this reason, in general, we want the last iteration to - run at most ``factor`` candidates. If the last iteration evaluates more - than `factor` candidates, then this last iteration reduces to a regular - search (as in :class:`RandomizedSearchCV` or :class:`GridSearchCV`). -- each ``n_resources_i`` is a multiple of both ``factor`` and - ``min_resources`` (which is confirmed by its definition above). - -The amount of resources that is used at each iteration can be found in the -`n_resources_` attribute. - -Choosing a resource -------------------- - -By default, the resource is defined in terms of number of samples. That is, -each iteration will use an increasing amount of samples to train on. You can -however manually specify a parameter to use as the resource with the -``resource`` parameter. Here is an example where the resource is defined in -terms of the number of estimators of a random forest:: - - >>> from sklearn.datasets import make_classification - >>> from sklearn.ensemble import RandomForestClassifier - >>> from sklearn.experimental import enable_halving_search_cv # noqa - >>> from sklearn.model_selection import HalvingGridSearchCV - >>> import pandas as pd - >>> - >>> param_grid = {'max_depth': [3, 5, 10], - ... 'min_samples_split': [2, 5, 10]} - >>> base_estimator = RandomForestClassifier(random_state=0) - >>> X, y = make_classification(n_samples=1000, random_state=0) - >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, - ... factor=2, resource='n_estimators', - ... max_resources=30).fit(X, y) - >>> sh.best_estimator_ - RandomForestClassifier(max_depth=5, n_estimators=24, random_state=0) - -Note that it is not possible to budget on a parameter that is part of the -parameter grid. - -.. _exhausting_the_resources: - -Exhausting the available resources ----------------------------------- - -As mentioned above, the number of resources that is used at each iteration -depends on the `min_resources` parameter. -If you have a lot of resources available but start with a low number of -resources, some of them might be wasted (i.e. not used):: - - >>> from sklearn.datasets import make_classification - >>> from sklearn.svm import SVC - >>> from sklearn.experimental import enable_halving_search_cv # noqa - >>> from sklearn.model_selection import HalvingGridSearchCV - >>> import pandas as pd - >>> param_grid= {'kernel': ('linear', 'rbf'), - ... 
'C': [1, 10, 100]} - >>> base_estimator = SVC(gamma='scale') - >>> X, y = make_classification(n_samples=1000) - >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, - ... factor=2, min_resources=20).fit(X, y) - >>> sh.n_resources_ - [20, 40, 80] - -The search process will only use 80 resources at most, while our maximum -amount of available resources is ``n_samples=1000``. Here, we have -``min_resources = r_0 = 20``. - -For :class:`HalvingGridSearchCV`, by default, the `min_resources` parameter -is set to 'exhaust'. This means that `min_resources` is automatically set -such that the last iteration can use as many resources as possible, within -the `max_resources` limit:: - - >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, - ... factor=2, min_resources='exhaust').fit(X, y) - >>> sh.n_resources_ - [250, 500, 1000] - -`min_resources` was here automatically set to 250, which results in the last -iteration using all the resources. The exact value that is used depends on -the number of candidate parameter, on `max_resources` and on `factor`. - -For :class:`HalvingRandomSearchCV`, exhausting the resources can be done in 2 -ways: - -- by setting `min_resources='exhaust'`, just like for - :class:`HalvingGridSearchCV`; -- by setting `n_candidates='exhaust'`. - -Both options are mutally exclusive: using `min_resources='exhaust'` requires -knowing the number of candidates, and symmetrically `n_candidates='exhaust'` -requires knowing `min_resources`. - -In general, exhausting the total number of resources leads to a better final -candidate parameter, and is slightly more time-intensive. +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_model_selection_plot_successive_halving_heatmap.py` +* :ref:`sphx_glr_auto_examples_model_selection_plot_successive_halving_iterations.py` + +The sections below dive into technical aspects of successive halving. + +.. dropdown:: Choosing ``min_resources`` and the number of candidates + + Beside ``factor``, the two main parameters that influence the behaviour of a + successive halving search are the ``min_resources`` parameter, and the + number of candidates (or parameter combinations) that are evaluated. + ``min_resources`` is the amount of resources allocated at the first + iteration for each candidate. The number of candidates is specified directly + in :class:`HalvingRandomSearchCV`, and is determined from the ``param_grid`` + parameter of :class:`HalvingGridSearchCV`. + + Consider a case where the resource is the number of samples, and where we + have 1000 samples. In theory, with ``min_resources=10`` and ``factor=2``, we + are able to run **at most** 7 iterations with the following number of + samples: ``[10, 20, 40, 80, 160, 320, 640]``. + + But depending on the number of candidates, we might run less than 7 + iterations: if we start with a **small** number of candidates, the last + iteration might use less than 640 samples, which means not using all the + available resources (samples). For example if we start with 5 candidates, we + only need 2 iterations: 5 candidates for the first iteration, then + `5 // 2 = 2` candidates at the second iteration, after which we know which + candidate performs the best (so we don't need a third one). We would only be + using at most 20 samples which is a waste since we have 1000 samples at our + disposal. 
On the other hand, if we start with a **high** number of + candidates, we might end up with a lot of candidates at the last iteration, + which may not always be ideal: it means that many candidates will run with + the full resources, basically reducing the procedure to standard search. + + In the case of :class:`HalvingRandomSearchCV`, the number of candidates is set + by default such that the last iteration uses as much of the available + resources as possible. For :class:`HalvingGridSearchCV`, the number of + candidates is determined by the `param_grid` parameter. Changing the value of + ``min_resources`` will impact the number of possible iterations, and as a + result will also have an effect on the ideal number of candidates. + + Another consideration when choosing ``min_resources`` is whether or not it + is easy to discriminate between good and bad candidates with a small amount + of resources. For example, if you need a lot of samples to distinguish + between good and bad parameters, a high ``min_resources`` is recommended. On + the other hand if the distinction is clear even with a small amount of + samples, then a small ``min_resources`` may be preferable since it would + speed up the computation. + + Notice in the example above that the last iteration does not use the maximum + amount of resources available: 1000 samples are available, yet only 640 are + used, at most. By default, both :class:`HalvingRandomSearchCV` and + :class:`HalvingGridSearchCV` try to use as many resources as possible in the + last iteration, with the constraint that this amount of resources must be a + multiple of both `min_resources` and `factor` (this constraint will be clear + in the next section). :class:`HalvingRandomSearchCV` achieves this by + sampling the right amount of candidates, while :class:`HalvingGridSearchCV` + achieves this by properly setting `min_resources`. + + +.. dropdown:: Amount of resource and number of candidates at each iteration + + At any iteration `i`, each candidate is allocated a given amount of resources + which we denote `n_resources_i`. This quantity is controlled by the + parameters ``factor`` and ``min_resources`` as follows (`factor` is strictly + greater than 1):: + + n_resources_i = factor**i * min_resources, + + or equivalently:: + + n_resources_{i+1} = n_resources_i * factor + + where ``min_resources == n_resources_0`` is the amount of resources used at + the first iteration. ``factor`` also defines the proportions of candidates + that will be selected for the next iteration:: + + n_candidates_i = n_candidates // (factor ** i) + + or equivalently:: + + n_candidates_0 = n_candidates + n_candidates_{i+1} = n_candidates_i // factor + + So in the first iteration, we use ``min_resources`` resources + ``n_candidates`` times. In the second iteration, we use ``min_resources * + factor`` resources ``n_candidates // factor`` times. The third again + multiplies the resources per candidate and divides the number of candidates. + This process stops when the maximum amount of resource per candidate is + reached, or when we have identified the best candidate. The best candidate + is identified at the iteration that is evaluating `factor` or less candidates + (see just below for an explanation). 
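+
+   The schedule implied by these formulas can be reproduced with a few lines of
+   plain Python. This is only an illustrative sketch (it ignores the
+   ``max_resources`` cap handled by the estimators); it prints the same values
+   as the worked example just below::
+
+      >>> min_resources, factor, n_candidates = 3, 2, 70
+      >>> while n_candidates >= 1:
+      ...     print(min_resources, n_candidates)
+      ...     if n_candidates <= factor:
+      ...         break  # the best candidate is found among ``factor`` or fewer candidates
+      ...     min_resources *= factor
+      ...     n_candidates //= factor
+      3 70
+      6 35
+      12 17
+      24 8
+      48 4
+      96 2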
+ + Here is an example with ``min_resources=3`` and ``factor=2``, starting with + 70 candidates: + + +-----------------------+-----------------------+ + | ``n_resources_i`` | ``n_candidates_i`` | + +=======================+=======================+ + | 3 (=min_resources) | 70 (=n_candidates) | + +-----------------------+-----------------------+ + | 3 * 2 = 6 | 70 // 2 = 35 | + +-----------------------+-----------------------+ + | 6 * 2 = 12 | 35 // 2 = 17 | + +-----------------------+-----------------------+ + | 12 * 2 = 24 | 17 // 2 = 8 | + +-----------------------+-----------------------+ + | 24 * 2 = 48 | 8 // 2 = 4 | + +-----------------------+-----------------------+ + | 48 * 2 = 96 | 4 // 2 = 2 | + +-----------------------+-----------------------+ + + We can note that: + + - the process stops at the first iteration which evaluates `factor=2` + candidates: the best candidate is the best out of these 2 candidates. It + is not necessary to run an additional iteration, since it would only + evaluate one candidate (namely the best one, which we have already + identified). For this reason, in general, we want the last iteration to + run at most ``factor`` candidates. If the last iteration evaluates more + than `factor` candidates, then this last iteration reduces to a regular + search (as in :class:`RandomizedSearchCV` or :class:`GridSearchCV`). + - each ``n_resources_i`` is a multiple of both ``factor`` and + ``min_resources`` (which is confirmed by its definition above). + + The amount of resources that is used at each iteration can be found in the + `n_resources_` attribute. + +.. dropdown:: Choosing a resource + + By default, the resource is defined in terms of number of samples. That is, + each iteration will use an increasing amount of samples to train on. You can + however manually specify a parameter to use as the resource with the + ``resource`` parameter. Here is an example where the resource is defined in + terms of the number of estimators of a random forest:: + + >>> from sklearn.datasets import make_classification + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> from sklearn.model_selection import HalvingGridSearchCV + >>> import pandas as pd + >>> param_grid = {'max_depth': [3, 5, 10], + ... 'min_samples_split': [2, 5, 10]} + >>> base_estimator = RandomForestClassifier(random_state=0) + >>> X, y = make_classification(n_samples=1000, random_state=0) + >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, + ... factor=2, resource='n_estimators', + ... max_resources=30).fit(X, y) + >>> sh.best_estimator_ + RandomForestClassifier(max_depth=5, n_estimators=24, random_state=0) + + Note that it is not possible to budget on a parameter that is part of the + parameter grid. + + +.. dropdown:: Exhausting the available resources + + As mentioned above, the number of resources that is used at each iteration + depends on the `min_resources` parameter. + If you have a lot of resources available but start with a low number of + resources, some of them might be wasted (i.e. not used):: + + >>> from sklearn.datasets import make_classification + >>> from sklearn.svm import SVC + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> from sklearn.model_selection import HalvingGridSearchCV + >>> import pandas as pd + >>> param_grid= {'kernel': ('linear', 'rbf'), + ... 
'C': [1, 10, 100]} + >>> base_estimator = SVC(gamma='scale') + >>> X, y = make_classification(n_samples=1000) + >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, + ... factor=2, min_resources=20).fit(X, y) + >>> sh.n_resources_ + [20, 40, 80] + + The search process will only use 80 resources at most, while our maximum + amount of available resources is ``n_samples=1000``. Here, we have + ``min_resources = r_0 = 20``. + + For :class:`HalvingGridSearchCV`, by default, the `min_resources` parameter + is set to 'exhaust'. This means that `min_resources` is automatically set + such that the last iteration can use as many resources as possible, within + the `max_resources` limit:: + + >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, + ... factor=2, min_resources='exhaust').fit(X, y) + >>> sh.n_resources_ + [250, 500, 1000] + + `min_resources` was here automatically set to 250, which results in the last + iteration using all the resources. The exact value that is used depends on + the number of candidate parameters, on `max_resources` and on `factor`. + + For :class:`HalvingRandomSearchCV`, exhausting the resources can be done in 2 + ways: + + - by setting `min_resources='exhaust'`, just like for + :class:`HalvingGridSearchCV`; + - by setting `n_candidates='exhaust'`. + + Both options are mutually exclusive: using `min_resources='exhaust'` requires + knowing the number of candidates, and symmetrically `n_candidates='exhaust'` + requires knowing `min_resources`. + + In general, exhausting the total number of resources leads to a better final + candidate parameter, and is slightly more time-intensive. .. _aggressive_elimination: Aggressive elimination of candidates ------------------------------------ -Ideally, we want the last iteration to evaluate ``factor`` candidates (see -:ref:`amount_of_resource_and_number_of_candidates`). We then just have to -pick the best one. When the number of available resources is small with -respect to the number of candidates, the last iteration may have to evaluate -more than ``factor`` candidates:: - - >>> from sklearn.datasets import make_classification - >>> from sklearn.svm import SVC - >>> from sklearn.experimental import enable_halving_search_cv # noqa - >>> from sklearn.model_selection import HalvingGridSearchCV - >>> import pandas as pd - >>> - >>> - >>> param_grid = {'kernel': ('linear', 'rbf'), - ... 'C': [1, 10, 100]} - >>> base_estimator = SVC(gamma='scale') - >>> X, y = make_classification(n_samples=1000) - >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, - ... factor=2, max_resources=40, - ... aggressive_elimination=False).fit(X, y) - >>> sh.n_resources_ - [20, 40] - >>> sh.n_candidates_ - [6, 3] - -Since we cannot use more than ``max_resources=40`` resources, the process -has to stop at the second iteration which evaluates more than ``factor=2`` -candidates. - Using the ``aggressive_elimination`` parameter, you can force the search process to end up with less than ``factor`` candidates at the last -iteration. To do this, the process will eliminate as many candidates as -necessary using ``min_resources`` resources:: - - >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, - ... factor=2, - ... max_resources=40, - ... aggressive_elimination=True, - ... 
).fit(X, y) - >>> sh.n_resources_ - [20, 20, 40] - >>> sh.n_candidates_ - [6, 3, 2] - -Notice that we end with 2 candidates at the last iteration since we have -eliminated enough candidates during the first iterations, using ``n_resources = -min_resources = 20``. +iteration. + +.. dropdown:: Code example of aggressive elimination + + Ideally, we want the last iteration to evaluate ``factor`` candidates. We + then just have to pick the best one. When the number of available resources is + small with respect to the number of candidates, the last iteration may have to + evaluate more than ``factor`` candidates:: + + >>> from sklearn.datasets import make_classification + >>> from sklearn.svm import SVC + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> from sklearn.model_selection import HalvingGridSearchCV + >>> import pandas as pd + >>> param_grid = {'kernel': ('linear', 'rbf'), + ... 'C': [1, 10, 100]} + >>> base_estimator = SVC(gamma='scale') + >>> X, y = make_classification(n_samples=1000) + >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, + ... factor=2, max_resources=40, + ... aggressive_elimination=False).fit(X, y) + >>> sh.n_resources_ + [20, 40] + >>> sh.n_candidates_ + [6, 3] + + Since we cannot use more than ``max_resources=40`` resources, the process + has to stop at the second iteration which evaluates more than ``factor=2`` + candidates. + + When using ``aggressive_elimination``, the process will eliminate as many + candidates as necessary using ``min_resources`` resources:: + + >>> sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, + ... factor=2, + ... max_resources=40, + ... aggressive_elimination=True, + ... ).fit(X, y) + >>> sh.n_resources_ + [20, 20, 40] + >>> sh.n_candidates_ + [6, 3, 2] + + Notice that we end with 2 candidates at the last iteration since we have + eliminated enough candidates during the first iterations, using ``n_resources = + min_resources = 20``. .. _successive_halving_cv_results: -Analysing results with the `cv_results_` attribute +Analyzing results with the `cv_results_` attribute -------------------------------------------------- -The ``cv_results_`` attribute contains useful information for analysing the +The ``cv_results_`` attribute contains useful information for analyzing the results of a search. It can be converted to a pandas dataframe with ``df = pd.DataFrame(est.cv_results_)``. The ``cv_results_`` attribute of :class:`HalvingGridSearchCV` and :class:`HalvingRandomSearchCV` is similar to that of :class:`GridSearchCV` and :class:`RandomizedSearchCV`, with additional information related to the successive halving process. -Here is an example with some of the columns of a (truncated) dataframe: - -==== ====== =============== ================= ======================================================================================= - .. iter n_resources mean_test_score params -==== ====== =============== ================= ======================================================================================= - 0 0 125 0.983667 {'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_split': 5} - 1 0 125 0.983667 {'criterion': 'gini', 'max_depth': None, 'max_features': 8, 'min_samples_split': 7} - 2 0 125 0.983667 {'criterion': 'gini', 'max_depth': None, 'max_features': 10, 'min_samples_split': 10} - 3 0 125 0.983667 {'criterion': 'entropy', 'max_depth': None, 'max_features': 6, 'min_samples_split': 6} - ... ... ... ... ... 
- 15 2 500 0.951958 {'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_split': 10} - 16 2 500 0.947958 {'criterion': 'gini', 'max_depth': None, 'max_features': 10, 'min_samples_split': 10} - 17 2 500 0.951958 {'criterion': 'gini', 'max_depth': None, 'max_features': 10, 'min_samples_split': 4} - 18 3 1000 0.961009 {'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_split': 10} - 19 3 1000 0.955989 {'criterion': 'gini', 'max_depth': None, 'max_features': 10, 'min_samples_split': 4} -==== ====== =============== ================= ======================================================================================= - -Each row corresponds to a given parameter combination (a candidate) and a given -iteration. The iteration is given by the ``iter`` column. The ``n_resources`` -column tells you how many resources were used. - -In the example above, the best parameter combination is ``{'criterion': -'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_split': 10}`` -since it has reached the last iteration (3) with the highest score: -0.96. - -.. topic:: References: - - .. [1] K. Jamieson, A. Talwalkar, - `Non-stochastic Best Arm Identification and Hyperparameter - Optimization `_, in - proc. of Machine Learning Research, 2016. - .. [2] L. Li, K. Jamieson, G. DeSalvo, A. Rostamizadeh, A. Talwalkar, - `Hyperband: A Novel Bandit-Based Approach to Hyperparameter Optimization - `_, in Machine Learning Research - 18, 2018. +.. dropdown:: Example of a (truncated) output dataframe: + + ==== ====== =============== ================= ======================================================================================== + .. iter n_resources mean_test_score params + ==== ====== =============== ================= ======================================================================================== + 0 0 125 0.983667 {'criterion': 'log_loss', 'max_depth': None, 'max_features': 9, 'min_samples_split': 5} + 1 0 125 0.983667 {'criterion': 'gini', 'max_depth': None, 'max_features': 8, 'min_samples_split': 7} + 2 0 125 0.983667 {'criterion': 'gini', 'max_depth': None, 'max_features': 10, 'min_samples_split': 10} + 3 0 125 0.983667 {'criterion': 'log_loss', 'max_depth': None, 'max_features': 6, 'min_samples_split': 6} + ... ... ... ... ... + 15 2 500 0.951958 {'criterion': 'log_loss', 'max_depth': None, 'max_features': 9, 'min_samples_split': 10} + 16 2 500 0.947958 {'criterion': 'gini', 'max_depth': None, 'max_features': 10, 'min_samples_split': 10} + 17 2 500 0.951958 {'criterion': 'gini', 'max_depth': None, 'max_features': 10, 'min_samples_split': 4} + 18 3 1000 0.961009 {'criterion': 'log_loss', 'max_depth': None, 'max_features': 9, 'min_samples_split': 10} + 19 3 1000 0.955989 {'criterion': 'gini', 'max_depth': None, 'max_features': 10, 'min_samples_split': 4} + ==== ====== =============== ================= ======================================================================================== + + Each row corresponds to a given parameter combination (a candidate) and a given + iteration. The iteration is given by the ``iter`` column. The ``n_resources`` + column tells you how many resources were used. + + In the example above, the best parameter combination is ``{'criterion': + 'log_loss', 'max_depth': None, 'max_features': 9, 'min_samples_split': 10}`` + since it has reached the last iteration (3) with the highest score: + 0.96. + + .. rubric:: References + + .. [1] K. Jamieson, A. 
Talwalkar, + `Non-stochastic Best Arm Identification and Hyperparameter + Optimization `_, in + proc. of Machine Learning Research, 2016. + + .. [2] L. Li, K. Jamieson, G. DeSalvo, A. Rostamizadeh, A. Talwalkar, + :arxiv:`Hyperband: A Novel Bandit-Based Approach to Hyperparameter Optimization + <1603.06560>`, in Machine Learning Research 18, 2018. + + .. _grid_search_tips: @@ -552,14 +555,15 @@ Tips for parameter search Specifying an objective metric ------------------------------ -By default, parameter search uses the ``score`` function of the estimator -to evaluate a parameter setting. These are the +By default, parameter search uses the ``score`` function of the estimator to +evaluate a parameter setting. These are the :func:`sklearn.metrics.accuracy_score` for classification and -:func:`sklearn.metrics.r2_score` for regression. For some applications, -other scoring functions are better suited (for example in unbalanced -classification, the accuracy score is often uninformative). An alternative -scoring function can be specified via the ``scoring`` parameter of most -parameter search tools. See :ref:`scoring_parameter` for more details. +:func:`sklearn.metrics.r2_score` for regression. For some applications, other +scoring functions are better suited (for example in unbalanced classification, +the accuracy score is often uninformative), see :ref:`which_scoring_function` +for some guidance. An alternative scoring function can be specified via the +``scoring`` parameter of most parameter search tools, see +:ref:`scoring_parameter` for more details. .. _multimetric_grid_search: @@ -603,20 +607,20 @@ parameters of composite or nested estimators such as >>> from sklearn.datasets import make_moons >>> X, y = make_moons() >>> calibrated_forest = CalibratedClassifierCV( - ... base_estimator=RandomForestClassifier(n_estimators=10)) + ... estimator=RandomForestClassifier(n_estimators=10)) >>> param_grid = { - ... 'base_estimator__max_depth': [2, 4, 6, 8]} + ... 'estimator__max_depth': [2, 4, 6, 8]} >>> search = GridSearchCV(calibrated_forest, param_grid, cv=5) >>> search.fit(X, y) GridSearchCV(cv=5, - estimator=CalibratedClassifierCV(...), - param_grid={'base_estimator__max_depth': [2, 4, 6, 8]}) + estimator=CalibratedClassifierCV(estimator=RandomForestClassifier(n_estimators=10)), + param_grid={'estimator__max_depth': [2, 4, 6, 8]}) Here, ```` is the parameter name of the nested estimator, -in this case ``base_estimator``. +in this case ``estimator``. If the meta-estimator is constructed as a collection of estimators as in `pipeline.Pipeline`, then ```` refers to the name of the estimator, -see :ref:`pipeline_nested_parameters`. In practice, there can be several +see :ref:`pipeline_nested_parameters`. In practice, there can be several levels of nesting:: >>> from sklearn.pipeline import Pipeline @@ -626,7 +630,7 @@ levels of nesting:: ... ('model', calibrated_forest)]) >>> param_grid = { ... 'select__k': [1, 2], - ... 'model__base_estimator__max_depth': [2, 4, 6, 8]} + ... 'model__estimator__max_depth': [2, 4, 6, 8]} >>> search = GridSearchCV(pipe, param_grid, cv=5).fit(X, y) Please refer to :ref:`pipeline` for performing parameter searches over @@ -658,12 +662,11 @@ entry for :term:`n_jobs`. Robustness to failure --------------------- -Some parameter settings may result in a failure to ``fit`` one or more folds -of the data. By default, this will cause the entire search to fail, even if -some parameter settings could be fully evaluated. 
Setting ``error_score=0`` -(or `=np.NaN`) will make the procedure robust to such failure, issuing a -warning and setting the score for that fold to 0 (or `NaN`), but completing -the search. +Some parameter settings may result in a failure to ``fit`` one or more folds of +the data. By default, the score for those settings will be `np.nan`. This can +be controlled by setting `error_score="raise"` to raise an exception if one fit +fails, or for example `error_score=0` to set another value for the score of +failing parameter combinations. .. _alternative_cv: @@ -722,7 +725,7 @@ model selection: Out of Bag Estimates -------------------- -When using ensemble methods base upon bagging, i.e. generating new +When using ensemble methods based upon bagging, i.e. generating new training sets using sampling with replacement, part of the training set remains unused. For each classifier in the ensemble, a different part of the training set is left out. diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index c3d2165e29d98..59367b647dd58 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -14,7 +14,7 @@ use incomplete datasets is to discard entire rows and/or columns containing missing values. However, this comes at the price of losing data which may be valuable (even though incomplete). A better strategy is to impute the missing values, i.e., to infer them from the known part of the data. See the -:ref:`glossary` entry on imputation. +glossary entry on :term:`imputation`. Univariate vs. Multivariate Imputation @@ -22,9 +22,9 @@ Univariate vs. Multivariate Imputation One type of imputation algorithm is univariate, which imputes values in the i-th feature dimension using only non-missing values in that feature dimension -(e.g. :class:`impute.SimpleImputer`). By contrast, multivariate imputation +(e.g. :class:`SimpleImputer`). By contrast, multivariate imputation algorithms use the entire set of available feature dimensions to estimate the -missing values (e.g. :class:`impute.IterativeImputer`). +missing values (e.g. :class:`IterativeImputer`). .. _single_imputer: @@ -50,7 +50,7 @@ that contain the missing values:: >>> X = [[np.nan, 2], [6, np.nan], [7, 6]] >>> print(imp.transform(X)) [[4. 2. ] - [6. 3.666...] + [6. 3.666] [7. 6. ]] The :class:`SimpleImputer` class also supports sparse matrices:: @@ -87,6 +87,8 @@ string values or pandas categoricals when using the ``'most_frequent'`` or ['a' 'y'] ['b' 'y']] +For another example on usage, see :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`. + .. _iterative_imputer: @@ -108,9 +110,9 @@ imputation round are returned. This estimator is still **experimental** for now: default parameters or details of behaviour might change without any deprecation cycle. Resolving the following issues would help stabilize :class:`IterativeImputer`: - convergence criteria (:issue:`14338`), default estimators (:issue:`13286`), - and use of random state (:issue:`15611`). To use it, you need to explicitly - import ``enable_iterative_imputer``. + convergence criteria (:issue:`14338`) and default estimators + (:issue:`13286`). To use it, you need to explicitly import + ``enable_iterative_imputer``. :: @@ -173,12 +175,11 @@ Note that a call to the ``transform`` method of :class:`IterativeImputer` is not allowed to change the number of samples. Therefore multiple imputations cannot be achieved by a single call to ``transform``. -References -========== +.. rubric:: References -.. [1] Stef van Buuren, Karin Groothuis-Oudshoorn (2011). 
"mice: Multivariate +.. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: Multivariate Imputation by Chained Equations in R". Journal of Statistical Software 45: - 1-67. + 1-67. `_ .. [2] Roderick J A Little and Donald B Rubin (1986). "Statistical Analysis with Missing Data". John Wiley & Sons, Inc., New York, NY, USA. @@ -190,19 +191,20 @@ Nearest neighbors imputation The :class:`KNNImputer` class provides imputation for filling in missing values using the k-Nearest Neighbors approach. By default, a euclidean distance metric -that supports missing values, :func:`~sklearn.metrics.nan_euclidean_distances`, -is used to find the nearest neighbors. Each missing feature is imputed using -values from ``n_neighbors`` nearest neighbors that have a value for the -feature. The feature of the neighbors are averaged uniformly or weighted by -distance to each neighbor. If a sample has more than one feature missing, then -the neighbors for that sample can be different depending on the particular -feature being imputed. When the number of available neighbors is less than -`n_neighbors` and there are no defined distances to the training set, the -training set average for that feature is used during imputation. If there is at -least one neighbor with a defined distance, the weighted or unweighted average -of the remaining neighbors will be used during imputation. If a feature is -always missing in training, it is removed during `transform`. For more -information on the methodology, see ref. [OL2001]_. +that supports missing values, +:func:`~sklearn.metrics.pairwise.nan_euclidean_distances`, is used to find the +nearest neighbors. Each missing feature is imputed using values from +``n_neighbors`` nearest neighbors that have a value for the feature. The +feature of the neighbors are averaged uniformly or weighted by distance to each +neighbor. If a sample has more than one feature missing, then the neighbors for +that sample can be different depending on the particular feature being imputed. +When the number of available neighbors is less than `n_neighbors` and there are +no defined distances to the training set, the training set average for that +feature is used during imputation. If there is at least one neighbor with a +defined distance, the weighted or unweighted average of the remaining neighbors +will be used during imputation. If a feature is always missing in training, it +is removed during `transform`. For more information on the methodology, see +ref. [OL2001]_. The following snippet demonstrates how to replace missing values, encoded as ``np.nan``, using the mean feature value of the two nearest @@ -219,10 +221,42 @@ neighbors of samples with missing values:: [5.5, 6. , 5. ], [8. , 8. , 7. ]]) -.. [OL2001] Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, +For another example on usage, see :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`. + +.. rubric:: References + +.. [OL2001] `Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17 no. 6, 2001 Pages 520-525. + `_ + +Keeping the number of features constant +======================================= + +By default, the scikit-learn imputers will drop fully empty features, i.e. +columns containing only missing values. 
For instance:: + + >>> imputer = SimpleImputer() + >>> X = np.array([[np.nan, 1], [np.nan, 2], [np.nan, 3]]) + >>> imputer.fit_transform(X) + array([[1.], + [2.], + [3.]]) + +The first feature in `X` containing only `np.nan` was dropped after the +imputation. While this feature will not help in predictive setting, dropping +the columns will change the shape of `X` which could be problematic when using +imputers in a more complex machine-learning pipeline. The parameter +`keep_empty_features` offers the option to keep the empty features by imputing +with a constant value. In most of the cases, this constant value is zero:: + + >>> imputer.set_params(keep_empty_features=True) + SimpleImputer(keep_empty_features=True) + >>> imputer.fit_transform(X) + array([[0., 1.], + [0., 2.], + [0., 3.]]) .. _missing_indicator: @@ -274,10 +308,12 @@ whether or not they contain missing values:: >>> indicator.features_ array([0, 1, 2, 3]) -When using the :class:`MissingIndicator` in a :class:`Pipeline`, be sure to use -the :class:`FeatureUnion` or :class:`ColumnTransformer` to add the indicator -features to the regular features. First we obtain the `iris` dataset, and add -some missing values to it. +When using the :class:`MissingIndicator` in a +:class:`~sklearn.pipeline.Pipeline`, be sure to use the +:class:`~sklearn.pipeline.FeatureUnion` or +:class:`~sklearn.compose.ColumnTransformer` to add the indicator features to +the regular features. First we obtain the `iris` dataset, and add some missing +values to it. >>> from sklearn.datasets import load_iris >>> from sklearn.impute import SimpleImputer, MissingIndicator @@ -290,9 +326,9 @@ some missing values to it. >>> X_train, X_test, y_train, _ = train_test_split(X, y, test_size=100, ... random_state=0) -Now we create a :class:`FeatureUnion`. All features will be imputed using -:class:`SimpleImputer`, in order to enable classifiers to work with this data. -Additionally, it adds the indicator variables from +Now we create a :class:`~sklearn.pipeline.FeatureUnion`. All features will be +imputed using :class:`SimpleImputer`, in order to enable classifiers to work +with this data. Additionally, it adds the indicator variables from :class:`MissingIndicator`. >>> transformer = FeatureUnion( @@ -305,11 +341,20 @@ Additionally, it adds the indicator variables from (100, 8) Of course, we cannot use the transformer to make any predictions. We should -wrap this in a :class:`Pipeline` with a classifier (e.g., a -:class:`DecisionTreeClassifier`) to be able to make predictions. +wrap this in a :class:`~sklearn.pipeline.Pipeline` with a classifier (e.g., a +:class:`~sklearn.tree.DecisionTreeClassifier`) to be able to make predictions. >>> clf = make_pipeline(transformer, DecisionTreeClassifier()) >>> clf = clf.fit(X_train, y_train) >>> results = clf.predict(X_test) >>> results.shape (100,) + +Estimators that handle NaN values +================================= + +Some estimators are designed to handle NaN values without preprocessing. +Below is the list of these estimators, classified by type +(cluster, regressor, classifier, transform): + +.. allow_nan_estimators:: diff --git a/doc/modules/isotonic.rst b/doc/modules/isotonic.rst index 8967ef18afcb3..50fbdb24e72c7 100644 --- a/doc/modules/isotonic.rst +++ b/doc/modules/isotonic.rst @@ -9,10 +9,10 @@ Isotonic regression The class :class:`IsotonicRegression` fits a non-decreasing real function to 1-dimensional data. 
It solves the following problem: - minimize :math:`\sum_i w_i (y_i - \hat{y}_i)^2` - - subject to :math:`\hat{y}_i \le \hat{y}_j` whenever :math:`X_i \le X_j`, +.. math:: + \min \sum_i w_i (y_i - \hat{y}_i)^2 +subject to :math:`\hat{y}_i \le \hat{y}_j` whenever :math:`X_i \le X_j`, where the weights :math:`w_i` are strictly positive, and both `X` and `y` are arbitrary real quantities. @@ -31,3 +31,7 @@ thus form a function that is piecewise linear: .. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_isotonic_regression_001.png :target: ../auto_examples/miscellaneous/plot_isotonic_regression.html :align: center + +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_miscellaneous_plot_isotonic_regression.py` diff --git a/doc/modules/kernel_approximation.rst b/doc/modules/kernel_approximation.rst index b7de01e308519..0bbd19d05de33 100644 --- a/doc/modules/kernel_approximation.rst +++ b/doc/modules/kernel_approximation.rst @@ -35,13 +35,65 @@ is advisable to compare results against exact kernel methods when possible. Nystroem Method for Kernel Approximation ---------------------------------------- -The Nystroem method, as implemented in :class:`Nystroem` is a general method -for low-rank approximations of kernels. It achieves this by essentially subsampling -the data on which the kernel is evaluated. -By default :class:`Nystroem` uses the ``rbf`` kernel, but it can use any -kernel function or a precomputed kernel matrix. -The number of samples used - which is also the dimensionality of the features computed - -is given by the parameter ``n_components``. +The Nystroem method, as implemented in :class:`Nystroem` is a general method for +reduced rank approximations of kernels. It achieves this by subsampling without +replacement rows/columns of the data on which the kernel is evaluated. While the +computational complexity of the exact method is +:math:`\mathcal{O}(n^3_{\text{samples}})`, the complexity of the approximation +is :math:`\mathcal{O}(n^2_{\text{components}} \cdot n_{\text{samples}})`, where +one can set :math:`n_{\text{components}} \ll n_{\text{samples}}` without a +significant decrease in performance [WS2001]_. + +We can construct the eigendecomposition of the kernel matrix :math:`K`, based +on the features of the data, and then split it into sampled and unsampled data +points. + +.. math:: + + K = U \Lambda U^T + = \begin{bmatrix} U_1 \\ U_2\end{bmatrix} \Lambda \begin{bmatrix} U_1 \\ U_2 \end{bmatrix}^T + = \begin{bmatrix} U_1 \Lambda U_1^T & U_1 \Lambda U_2^T \\ U_2 \Lambda U_1^T & U_2 \Lambda U_2^T \end{bmatrix} + \equiv \begin{bmatrix} K_{11} & K_{12} \\ K_{21} & K_{22} \end{bmatrix} + +where: + +* :math:`U` is orthonormal +* :math:`\Lambda` is diagonal matrix of eigenvalues +* :math:`U_1` is orthonormal matrix of samples that were chosen +* :math:`U_2` is orthonormal matrix of samples that were not chosen + +Given that :math:`U_1 \Lambda U_1^T` can be obtained by orthonormalization of +the matrix :math:`K_{11}`, and :math:`U_2 \Lambda U_1^T` can be evaluated (as +well as its transpose), the only remaining term to elucidate is +:math:`U_2 \Lambda U_2^T`. To do this we can express it in terms of the already +evaluated matrices: + +.. 
math:: + + \begin{align} U_2 \Lambda U_2^T &= \left(K_{21} U_1 \Lambda^{-1}\right) \Lambda \left(K_{21} U_1 \Lambda^{-1}\right)^T + \\&= K_{21} U_1 (\Lambda^{-1} \Lambda) \Lambda^{-1} U_1^T K_{21}^T + \\&= K_{21} U_1 \Lambda^{-1} U_1^T K_{21}^T + \\&= K_{21} K_{11}^{-1} K_{21}^T + \\&= \left( K_{21} K_{11}^{-\frac12} \right) \left( K_{21} K_{11}^{-\frac12} \right)^T + .\end{align} + +During ``fit``, the class :class:`Nystroem` evaluates the basis :math:`U_1`, and +computes the normalization constant, :math:`K_{11}^{-\frac12}`. Later, during +``transform``, the kernel matrix is determined between the basis (given by the +`components_` attribute) and the new data points, ``X``. This matrix is then +multiplied by the ``normalization_`` matrix for the final result. + +By default :class:`Nystroem` uses the ``rbf`` kernel, but it can use any kernel +function or a precomputed kernel matrix. The number of samples used - which is +also the dimensionality of the features computed - is given by the parameter +``n_components``. + +.. rubric:: Examples + +* See the example entitled + :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py`, + that shows an efficient machine learning pipeline that uses a + :class:`Nystroem` kernel. .. _rbf_kernel_approx: @@ -91,9 +143,9 @@ use of larger feature spaces more efficient. Comparing an exact RBF kernel (left) with the approximation (right) -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_miscellaneous_plot_kernel_approximation.py` +* :ref:`sphx_glr_auto_examples_miscellaneous_plot_kernel_approximation.py` .. _additive_chi_kernel_approx: @@ -108,7 +160,7 @@ The additive chi squared kernel as used here is given by k(x, y) = \sum_i \frac{2x_iy_i}{x_i+y_i} -This is not exactly the same as :func:`sklearn.metrics.additive_chi2_kernel`. +This is not exactly the same as :func:`sklearn.metrics.pairwise.additive_chi2_kernel`. The authors of [VZ2010]_ prefer the version above as it is always positive definite. Since the kernel is additive, it is possible to treat all components @@ -163,8 +215,8 @@ function given by: where: - * ``x``, ``y`` are the input vectors - * ``d`` is the kernel degree +* ``x``, ``y`` are the input vectors +* ``d`` is the kernel degree Intuitively, the feature space of the polynomial kernel of degree `d` consists of all possible degree-`d` products among input features, which enables @@ -189,9 +241,9 @@ In addition, this method can transform samples in time, where :math:`n_{\text{components}}` is the desired output dimension, determined by ``n_components``. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_kernel_approximation_plot_scalable_poly_kernels.py` +* :ref:`sphx_glr_auto_examples_kernel_approximation_plot_scalable_poly_kernels.py` .. _tensor_sketch_kernel_approx: @@ -231,26 +283,29 @@ The classes in this submodule allow to approximate the embedding or store training examples. -.. topic:: References: - - .. [RR2007] `"Random features for large-scale kernel machines" - `_ - Rahimi, A. and Recht, B. - Advances in neural information processing 2007, - .. [LS2010] `"Random Fourier approximations for skewed multiplicative histogram kernels" - `_ - Random Fourier approximations for skewed multiplicative histogram kernels - - Lecture Notes for Computer Sciencd (DAGM) - .. [VZ2010] `"Efficient additive kernels via explicit feature maps" - `_ - Vedaldi, A. and Zisserman, A. - Computer Vision and Pattern Recognition 2010 - .. 
[VVZ2010] `"Generalized RBF feature maps for Efficient Detection" - `_ - Vempati, S. and Vedaldi, A. and Zisserman, A. and Jawahar, CV - 2010 - .. [PP2013] `"Fast and scalable polynomial kernels via explicit feature maps" - `_ - Pham, N., & Pagh, R. - 2013 - .. [CCF2002] `"Finding frequent items in data streams" - `_ - Charikar, M., Chen, K., & Farach-Colton - 2002 - .. [WIKICS] `"Wikipedia: Count sketch" - `_ +.. rubric:: References + +.. [WS2001] `"Using the NystrÃļm method to speed up kernel machines" + `_ + Williams, C.K.I.; Seeger, M. - 2001. +.. [RR2007] `"Random features for large-scale kernel machines" + `_ + Rahimi, A. and Recht, B. - Advances in neural information processing 2007, +.. [LS2010] `"Random Fourier approximations for skewed multiplicative histogram kernels" + `_ + Li, F., Ionescu, C., and Sminchisescu, C. + - Pattern Recognition, DAGM 2010, Lecture Notes in Computer Science. +.. [VZ2010] `"Efficient additive kernels via explicit feature maps" + `_ + Vedaldi, A. and Zisserman, A. - Computer Vision and Pattern Recognition 2010 +.. [VVZ2010] `"Generalized RBF feature maps for Efficient Detection" + `_ + Vempati, S. and Vedaldi, A. and Zisserman, A. and Jawahar, CV - 2010 +.. [PP2013] :doi:`"Fast and scalable polynomial kernels via explicit feature maps" + <10.1145/2487575.2487591>` + Pham, N., & Pagh, R. - 2013 +.. [CCF2002] `"Finding frequent items in data streams" + `_ + Charikar, M., Chen, K., & Farach-Colton - 2002 +.. [WIKICS] `"Wikipedia: Count sketch" + `_ diff --git a/doc/modules/kernel_ridge.rst b/doc/modules/kernel_ridge.rst index 286e9d4ac5322..64267f4233a53 100644 --- a/doc/modules/kernel_ridge.rst +++ b/doc/modules/kernel_ridge.rst @@ -7,7 +7,7 @@ Kernel ridge regression .. currentmodule:: sklearn.kernel_ridge Kernel ridge regression (KRR) [M2012]_ combines :ref:`ridge_regression` -(linear least squares with l2-norm regularization) with the `kernel trick +(linear least squares with :math:`L_2`-norm regularization) with the `kernel trick `_. It thus learns a linear function in the space induced by the respective kernel and the data. For non-linear kernels, this corresponds to a non-linear function in the original @@ -16,7 +16,7 @@ space. The form of the model learned by :class:`KernelRidge` is identical to support vector regression (:class:`~sklearn.svm.SVR`). However, different loss functions are used: KRR uses squared error loss while support vector -regression uses :math:`\epsilon`-insensitive loss, both combined with l2 +regression uses :math:`\epsilon`-insensitive loss, both combined with :math:`L_2` regularization. In contrast to :class:`~sklearn.svm.SVR`, fitting :class:`KernelRidge` can be done in closed-form and is typically faster for medium-sized datasets. On the other hand, the learned model is non-sparse and @@ -31,7 +31,7 @@ plotted, where both complexity/regularization and bandwidth of the RBF kernel have been optimized using grid-search. The learned functions are very similar; however, fitting :class:`KernelRidge` is approximately seven times faster than fitting :class:`~sklearn.svm.SVR` (both with grid-search). -However, prediction of 100000 target values is more than three times faster +However, prediction of 100,000 target values is more than three times faster with :class:`~sklearn.svm.SVR` since it has learned a sparse model using only approximately 1/3 of the 100 training datapoints as support vectors. @@ -55,8 +55,11 @@ dense model. :target: ../auto_examples/miscellaneous/plot_kernel_ridge_regression.html :align: center +.. 
rubric:: Examples -.. topic:: References: +* :ref:`sphx_glr_auto_examples_miscellaneous_plot_kernel_ridge_regression.py` - .. [M2012] "Machine Learning: A Probabilistic Perspective" - Murphy, K. P. - chapter 14.4.3, pp. 492-493, The MIT Press, 2012 +.. rubric:: References + +.. [M2012] "Machine Learning: A Probabilistic Perspective" + Murphy, K. P. - chapter 14.4.3, pp. 492-493, The MIT Press, 2012 diff --git a/doc/modules/lda_qda.rst b/doc/modules/lda_qda.rst index 962d65705f75a..c18835d514a9f 100644 --- a/doc/modules/lda_qda.rst +++ b/doc/modules/lda_qda.rst @@ -29,10 +29,10 @@ Discriminant Analysis can only learn linear boundaries, while Quadratic Discriminant Analysis can learn quadratic boundaries and is therefore more flexible. -.. topic:: Examples: +.. rubric:: Examples - :ref:`sphx_glr_auto_examples_classification_plot_lda_qda.py`: Comparison of LDA and QDA - on synthetic data. +* :ref:`sphx_glr_auto_examples_classification_plot_lda_qda.py`: Comparison of LDA and + QDA on synthetic data. Dimensionality reduction using Linear Discriminant Analysis =========================================================== @@ -49,10 +49,10 @@ This is implemented in the `transform` method. The desired dimensionality can be set using the ``n_components`` parameter. This parameter has no influence on the `fit` and `predict` methods. -.. topic:: Examples: +.. rubric:: Examples - :ref:`sphx_glr_auto_examples_decomposition_plot_pca_vs_lda.py`: Comparison of LDA and PCA - for dimensionality reduction of the Iris dataset +* :ref:`sphx_glr_auto_examples_decomposition_plot_pca_vs_lda.py`: Comparison of LDA and + PCA for dimensionality reduction of the Iris dataset .. _lda_qda_math: @@ -93,10 +93,10 @@ predicted class is the one that maximises this log-posterior. .. note:: **Relation with Gaussian Naive Bayes** - If in the QDA model one assumes that the covariance matrices are diagonal, - then the inputs are assumed to be conditionally independent in each class, - and the resulting classifier is equivalent to the Gaussian Naive Bayes - classifier :class:`naive_bayes.GaussianNB`. + If in the QDA model one assumes that the covariance matrices are diagonal, + then the inputs are assumed to be conditionally independent in each class, + and the resulting classifier is equivalent to the Gaussian Naive Bayes + classifier :class:`naive_bayes.GaussianNB`. LDA --- @@ -137,7 +137,7 @@ Mathematical formulation of LDA dimensionality reduction First note that the K means :math:`\mu_k` are vectors in :math:`\mathcal{R}^d`, and they lie in an affine subspace :math:`H` of dimension at most :math:`K - 1` (2 points lie on a line, 3 points lie on a -plane, etc). +plane, etc.). As mentioned above, we can interpret LDA as assigning :math:`x` to the class whose mean :math:`\mu_k` is the closest in terms of Mahalanobis distance, @@ -173,11 +173,11 @@ In this scenario, the empirical sample covariance is a poor estimator, and shrinkage helps improving the generalization performance of the classifier. Shrinkage LDA can be used by setting the ``shrinkage`` parameter of -the :class:`~discriminant_analysis.LinearDiscriminantAnalysis` class to 'auto'. +the :class:`~discriminant_analysis.LinearDiscriminantAnalysis` class to `'auto'`. This automatically determines the optimal shrinkage parameter in an analytic way following the lemma introduced by Ledoit and Wolf [2]_. Note that -currently shrinkage only works when setting the ``solver`` parameter to 'lsqr' -or 'eigen'. 
+currently shrinkage only works when setting the ``solver`` parameter to `'lsqr'` +or `'eigen'`. The ``shrinkage`` parameter can also be manually set between 0 and 1. In particular, a value of 0 corresponds to no shrinkage (which means the empirical @@ -187,17 +187,17 @@ an estimate for the covariance matrix). Setting this parameter to a value between these two extrema will estimate a shrunk version of the covariance matrix. -The shrinked Ledoit and Wolf estimator of covariance may not always be the +The shrunk Ledoit and Wolf estimator of covariance may not always be the best choice. For example if the distribution of the data is normally distributed, the -Oracle Shrinkage Approximating estimator :class:`sklearn.covariance.OAS` +Oracle Approximating Shrinkage estimator :class:`sklearn.covariance.OAS` yields a smaller Mean Squared Error than the one given by Ledoit and Wolf's -formula used with shrinkage="auto". In LDA, the data are assumed to be gaussian +formula used with `shrinkage="auto"`. In LDA, the data are assumed to be gaussian conditionally to the class. If these assumptions hold, using LDA with -the OAS estimator of covariance will yield a better classification +the OAS estimator of covariance will yield a better classification accuracy than if Ledoit and Wolf or the empirical covariance estimator is used. -The covariance estimator can be chosen using with the ``covariance_estimator`` +The covariance estimator can be chosen using the ``covariance_estimator`` parameter of the :class:`discriminant_analysis.LinearDiscriminantAnalysis` class. A covariance estimator should have a :term:`fit` method and a ``covariance_`` attribute like all covariance estimators in the @@ -210,10 +210,10 @@ class. A covariance estimator should have a :term:`fit` method and a .. centered:: |shrinkage| -.. topic:: Examples: +.. rubric:: Examples - :ref:`sphx_glr_auto_examples_classification_plot_lda.py`: Comparison of LDA classifiers - with Empirical, Ledoit Wolf and OAS covariance estimator. +* :ref:`sphx_glr_auto_examples_classification_plot_lda.py`: Comparison of LDA classifiers + with Empirical, Ledoit Wolf and OAS covariance estimator. Estimation algorithms ===================== @@ -232,14 +232,14 @@ solver may be preferable in situations where the number of features is large. The 'svd' solver cannot be used with shrinkage. For QDA, the use of the SVD solver relies on the fact that the covariance matrix :math:`\Sigma_k` is, by definition, equal to :math:`\frac{1}{n - 1} -X_k^tX_k = V S^2 V^t` where :math:`V` comes from the SVD of the (centered) +X_k^tX_k = \frac{1}{n - 1} V S^2 V^t` where :math:`V` comes from the SVD of the (centered) matrix: :math:`X_k = U S V^t`. It turns out that we can compute the -log-posterior above without having to explictly compute :math:`\Sigma`: +log-posterior above without having to explicitly compute :math:`\Sigma`: computing :math:`S` and :math:`V` via the SVD of :math:`X` is enough. For LDA, two SVDs are computed: the SVD of the centered input matrix :math:`X` and the SVD of the class-wise mean vectors. -The 'lsqr' solver is an efficient algorithm that only works for +The `'lsqr'` solver is an efficient algorithm that only works for classification. It needs to explicitly compute the covariance matrix :math:`\Sigma`, and supports shrinkage and custom covariance estimators. This solver computes the coefficients @@ -247,19 +247,19 @@ This solver computes the coefficients \mu_k`, thus avoiding the explicit computation of the inverse :math:`\Sigma^{-1}`. 
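As a concrete illustration of the options discussed above, here is a minimal sketch (an editorial addition rather than part of the patched guide; the synthetic data, the 0.5 class shift and the sample sizes are illustrative assumptions) that combines the `'lsqr'` solver with either `shrinkage="auto"` or a custom covariance estimator such as :class:`sklearn.covariance.OAS`::

    import numpy as np
    from sklearn.covariance import OAS
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    # Synthetic two-class data in the regime where shrinkage is expected to
    # help: relatively few samples compared to the number of features.
    rng = np.random.RandomState(0)
    n_samples, n_features = 40, 20
    X = rng.randn(n_samples, n_features)
    y = rng.randint(2, size=n_samples)
    X[y == 1] += 0.5  # shift one class so there is a signal to learn

    # Ledoit-Wolf shrinkage, with the amount chosen analytically.
    lda_lw = LinearDiscriminantAnalysis(solver="lsqr", shrinkage="auto").fit(X, y)

    # Any covariance estimator exposing `fit` and a `covariance_` attribute
    # can be plugged in instead, e.g. the OAS estimator mentioned above.
    lda_oas = LinearDiscriminantAnalysis(
        solver="lsqr", covariance_estimator=OAS()
    ).fit(X, y)

    print(lda_lw.score(X, y), lda_oas.score(X, y))

Note that ``shrinkage`` and ``covariance_estimator`` are alternative ways of specifying the covariance estimate, which is why each estimator in the sketch sets only one of them.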
-The 'eigen' solver is based on the optimization of the between class scatter to +The `'eigen'` solver is based on the optimization of the between class scatter to within class scatter ratio. It can be used for both classification and -transform, and it supports shrinkage. However, the 'eigen' solver needs to +transform, and it supports shrinkage. However, the `'eigen'` solver needs to compute the covariance matrix, so it might not be suitable for situations with a high number of features. -.. topic:: References: +.. rubric:: References - .. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., - Friedman J., Section 4.3, p.106-119, 2008. +.. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., + Friedman J., Section 4.3, p.106-119, 2008. - .. [2] Ledoit O, Wolf M. Honey, I Shrunk the Sample Covariance Matrix. - The Journal of Portfolio Management 30(4), 110-119, 2004. +.. [2] Ledoit O, Wolf M. Honey, I Shrunk the Sample Covariance Matrix. + The Journal of Portfolio Management 30(4), 110-119, 2004. - .. [3] R. O. Duda, P. E. Hart, D. G. Stork. Pattern Classification - (Second Edition), section 2.6.2. +.. [3] R. O. Duda, P. E. Hart, D. G. Stork. Pattern Classification + (Second Edition), section 2.6.2. diff --git a/doc/modules/learning_curve.rst b/doc/modules/learning_curve.rst index 4fb90df937e15..6dca0a29af7cb 100644 --- a/doc/modules/learning_curve.rst +++ b/doc/modules/learning_curve.rst @@ -39,11 +39,11 @@ easy to see whether the estimator suffers from bias or variance. However, in high-dimensional spaces, models can become very difficult to visualize. For this reason, it is often helpful to use the tools described below. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_model_selection_plot_underfitting_overfitting.py` - * :ref:`sphx_glr_auto_examples_model_selection_plot_validation_curve.py` - * :ref:`sphx_glr_auto_examples_model_selection_plot_learning_curve.py` +* :ref:`sphx_glr_auto_examples_model_selection_plot_underfitting_overfitting.py` +* :ref:`sphx_glr_auto_examples_model_selection_plot_train_error_vs_test_error.py` +* :ref:`sphx_glr_auto_examples_model_selection_plot_learning_curve.py` .. _validation_curve: @@ -53,9 +53,9 @@ Validation curve To validate a model we need a scoring function (see :ref:`model_evaluation`), for example accuracy for classifiers. The proper way of choosing multiple -hyperparameters of an estimator are of course grid search or similar methods +hyperparameters of an estimator is of course grid search or similar methods (see :ref:`grid_search`) that select the hyperparameter with the maximum score -on a validation set or multiple validation sets. Note that if we optimized +on a validation set or multiple validation sets. Note that if we optimize the hyperparameters based on a validation score the validation score is biased and not a good estimate of the generalization any longer. 
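To make this caveat concrete, the following sketch (added editorially, not part of the patch; the train/test split and the grid over ``C`` are illustrative choices) tunes ``C`` on one part of the data and keeps a separate test set for the final estimate::

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.model_selection import GridSearchCV, train_test_split
    from sklearn.svm import SVC

    X, y = load_iris(return_X_y=True)
    # Keep a test set that the hyperparameter search never sees.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Select C on the training part only, using 5-fold cross-validation.
    search = GridSearchCV(
        SVC(kernel="linear"), {"C": np.logspace(-7, 3, 11)}, cv=5
    ).fit(X_train, y_train)

    # `best_score_` was used to pick C and is therefore optimistic; the score
    # on the held-out test set is the honest estimate of generalization.
    print(search.best_params_, search.best_score_, search.score(X_test, y_test))

The selection score ``search.best_score_`` tends to be optimistic compared to ``search.score(X_test, y_test)``, which is exactly the point made in this paragraph.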
To get a proper estimate of the generalization we have to compute the score on another test @@ -71,7 +71,7 @@ The function :func:`validation_curve` can help in this case:: >>> import numpy as np >>> from sklearn.model_selection import validation_curve >>> from sklearn.datasets import load_iris - >>> from sklearn.linear_model import Ridge + >>> from sklearn.svm import SVC >>> np.random.seed(0) >>> X, y = load_iris(return_X_y=True) @@ -79,30 +79,43 @@ The function :func:`validation_curve` can help in this case:: >>> np.random.shuffle(indices) >>> X, y = X[indices], y[indices] - >>> train_scores, valid_scores = validation_curve(Ridge(), X, y, "alpha", - ... np.logspace(-7, 3, 3), - ... cv=5) + >>> train_scores, valid_scores = validation_curve( + ... SVC(kernel="linear"), X, y, param_name="C", param_range=np.logspace(-7, 3, 3), + ... ) >>> train_scores - array([[0.93..., 0.94..., 0.92..., 0.91..., 0.92...], - [0.93..., 0.94..., 0.92..., 0.91..., 0.92...], - [0.51..., 0.52..., 0.49..., 0.47..., 0.49...]]) + array([[0.90, 0.94, 0.91, 0.89, 0.92], + [0.9 , 0.92, 0.93, 0.92, 0.93], + [0.97, 1 , 0.98, 0.97, 0.99]]) >>> valid_scores - array([[0.90..., 0.84..., 0.94..., 0.96..., 0.93...], - [0.90..., 0.84..., 0.94..., 0.96..., 0.93...], - [0.46..., 0.25..., 0.50..., 0.49..., 0.52...]]) + array([[0.9, 0.9 , 0.9 , 0.96, 0.9 ], + [0.9, 0.83, 0.96, 0.96, 0.93], + [1. , 0.93, 1 , 1 , 0.9 ]]) + +If you intend to plot the validation curves only, the class +:class:`~sklearn.model_selection.ValidationCurveDisplay` is more direct than +using matplotlib manually on the results of a call to :func:`validation_curve`. +You can use the method +:meth:`~sklearn.model_selection.ValidationCurveDisplay.from_estimator` similarly +to :func:`validation_curve` to generate and plot the validation curve: + +.. plot:: + :context: close-figs + :align: center + + from sklearn.datasets import load_iris + from sklearn.model_selection import ValidationCurveDisplay + from sklearn.svm import SVC + from sklearn.utils import shuffle + X, y = load_iris(return_X_y=True) + X, y = shuffle(X, y, random_state=0) + ValidationCurveDisplay.from_estimator( + SVC(kernel="linear"), X, y, param_name="C", param_range=np.logspace(-7, 3, 10) + ) If the training score and the validation score are both low, the estimator will be underfitting. If the training score is high and the validation score is low, the estimator is overfitting and otherwise it is working very well. A low -training score and a high validation score is usually not possible. Underfitting, -overfitting, and a working model are shown in the in the plot below where we vary -the parameter :math:`\gamma` of an SVM on the digits dataset. - -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_validation_curve_001.png - :target: ../auto_examples/model_selection/plot_validation_curve.html - :align: center - :scale: 50% - +training score and a high validation score is usually not possible. .. _learning_curve: @@ -141,11 +154,34 @@ average scores on the validation sets):: >>> train_sizes array([ 50, 80, 110]) >>> train_scores - array([[0.98..., 0.98 , 0.98..., 0.98..., 0.98...], - [0.98..., 1. , 0.98..., 0.98..., 0.98...], - [0.98..., 1. , 0.98..., 0.98..., 0.99...]]) + array([[0.98, 0.98 , 0.98, 0.98, 0.98], + [0.98, 1. , 0.98, 0.98, 0.98], + [0.98, 1. , 0.98, 0.98, 0.99]]) >>> valid_scores - array([[1. , 0.93..., 1. , 1. , 0.96...], - [1. , 0.96..., 1. , 1. , 0.96...], - [1. , 0.96..., 1. , 1. , 0.96...]]) + array([[1. , 0.93, 1. , 1. , 0.96], + [1. , 0.96, 1. , 1. 
, 0.96], + [1. , 0.96, 1. , 1. , 0.96]]) + +If you intend to plot the learning curves only, the class +:class:`~sklearn.model_selection.LearningCurveDisplay` will be easier to use. +You can use the method +:meth:`~sklearn.model_selection.LearningCurveDisplay.from_estimator` similarly +to :func:`learning_curve` to generate and plot the learning curve: + +.. plot:: + :context: close-figs + :align: center + + from sklearn.datasets import load_iris + from sklearn.model_selection import LearningCurveDisplay + from sklearn.svm import SVC + from sklearn.utils import shuffle + X, y = load_iris(return_X_y=True) + X, y = shuffle(X, y, random_state=0) + LearningCurveDisplay.from_estimator( + SVC(kernel="linear"), X, y, train_sizes=[50, 80, 110], cv=5) + +.. rubric:: Examples +* See :ref:`sphx_glr_auto_examples_model_selection_plot_learning_curve.py` for an + example of using learning curves to check the scalability of a predictive model. diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index f1f376dc641c9..9edd90321bd02 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -32,14 +32,14 @@ solves a problem of the form: .. math:: \min_{w} || X w - y||_2^2 -.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_ols_001.png - :target: ../auto_examples/linear_model/plot_ols.html +.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_ols_ridge_001.png + :target: ../auto_examples/linear_model/plot_ols_ridge.html :align: center :scale: 50% -:class:`LinearRegression` will take in its ``fit`` method arrays X, y -and will store the coefficients :math:`w` of the linear model in its -``coef_`` member:: +:class:`LinearRegression` takes in its ``fit`` method arguments ``X``, ``y``, +``sample_weight`` and stores the coefficients :math:`w` of the linear model in its +``coef_`` and ``intercept_`` attributes:: >>> from sklearn import linear_model >>> reg = linear_model.LinearRegression() @@ -47,19 +47,21 @@ and will store the coefficients :math:`w` of the linear model in its LinearRegression() >>> reg.coef_ array([0.5, 0.5]) + >>> reg.intercept_ + 0.0 The coefficient estimates for Ordinary Least Squares rely on the -independence of the features. When features are correlated and the -columns of the design matrix :math:`X` have an approximate linear +independence of the features. When features are correlated and some +columns of the design matrix :math:`X` have an approximately linear dependence, the design matrix becomes close to singular and as a result, the least-squares estimate becomes highly sensitive to random errors in the observed target, producing a large variance. This situation of *multicollinearity* can arise, for example, when data are collected without an experimental design. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_linear_model_plot_ols.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_ols_ridge.py` Non-Negative Least Squares -------------------------- @@ -68,18 +70,18 @@ It is possible to constrain all the coefficients to be non-negative, which may be useful when they represent some physical or naturally non-negative quantities (e.g., frequency counts or prices of goods). :class:`LinearRegression` accepts a boolean ``positive`` -parameter: when set to `True` `Non Negative Least Squares +parameter: when set to `True` `Non-Negative Least Squares `_ are then applied. -.. topic:: Examples: +.. 
rubric:: Examples - * :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py` Ordinary Least Squares Complexity --------------------------------- The least squares solution is computed using the singular value -decomposition of X. If X is a matrix of shape `(n_samples, n_features)` +decomposition of :math:`X`. If :math:`X` is a matrix of shape `(n_samples, n_features)` this method has a cost of :math:`O(n_{\text{samples}} n_{\text{features}}^2)`, assuming that :math:`n_{\text{samples}} \geq n_{\text{features}}`. @@ -114,7 +116,7 @@ of shrinkage and thus the coefficients become more robust to collinearity. As with other linear models, :class:`Ridge` will take in its ``fit`` method -arrays X, y and will store the coefficients :math:`w` of the linear model in +arrays ``X``, ``y`` and will store the coefficients :math:`w` of the linear model in its ``coef_`` member:: >>> from sklearn import linear_model @@ -124,8 +126,30 @@ its ``coef_`` member:: >>> reg.coef_ array([0.34545455, 0.34545455]) >>> reg.intercept_ - 0.13636... - + np.float64(0.13636) + +Note that the class :class:`Ridge` allows for the user to specify that the +solver be automatically chosen by setting `solver="auto"`. When this option +is specified, :class:`Ridge` will choose between the `"lbfgs"`, `"cholesky"`, +and `"sparse_cg"` solvers. :class:`Ridge` will begin checking the conditions +shown in the following table from top to bottom. If the condition is true, +the corresponding solver is chosen. + ++-------------+----------------------------------------------------+ +| **Solver** | **Condition** | ++-------------+----------------------------------------------------+ +| 'lbfgs' | The ``positive=True`` option is specified. | ++-------------+----------------------------------------------------+ +| 'cholesky' | The input array X is not sparse. | ++-------------+----------------------------------------------------+ +| 'sparse_cg' | None of the above conditions are fulfilled. | ++-------------+----------------------------------------------------+ + +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_linear_model_plot_ols_ridge.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_ridge_path.py` +* :ref:`sphx_glr_auto_examples_inspection_plot_linear_model_coefficient_interpretation.py` Classification -------------- @@ -140,26 +164,25 @@ the output with the highest value. It might seem questionable to use a (penalized) Least Squares loss to fit a classification model instead of the more traditional logistic or hinge -losses. However in practice all those models can lead to similar +losses. However, in practice, all those models can lead to similar cross-validation scores in terms of accuracy or precision/recall, while the penalized least squares loss used by the :class:`RidgeClassifier` allows for a very different choice of the numerical solvers with distinct computational performance profiles. The :class:`RidgeClassifier` can be significantly faster than e.g. -:class:`LogisticRegression` with a high number of classes, because it is -able to compute the projection matrix :math:`(X^T X)^{-1} X^T` only once. +:class:`LogisticRegression` with a high number of classes because it can +compute the projection matrix :math:`(X^T X)^{-1} X^T` only once. This classifier is sometimes referred to as a `Least Squares Support Vector -Machines +Machine `_ with a linear kernel. -.. topic:: Examples: +.. 
rubric:: Examples + +* :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py` - * :ref:`sphx_glr_auto_examples_linear_model_plot_ridge_path.py` - * :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py` - * :ref:`sphx_glr_auto_examples_inspection_plot_linear_model_coefficient_interpretation.py` Ridge Complexity ---------------- @@ -176,9 +199,14 @@ This method has the same order of complexity as Setting the regularization parameter: leave-one-out Cross-Validation -------------------------------------------------------------------- -:class:`RidgeCV` implements ridge regression with built-in -cross-validation of the alpha parameter. The object works in the same way -as GridSearchCV except that it defaults to Leave-One-Out Cross-Validation:: +:class:`RidgeCV` and :class:`RidgeClassifierCV` implement ridge +regression/classification with built-in cross-validation of the alpha parameter. +They work in the same way as :class:`~sklearn.model_selection.GridSearchCV` except +that it defaults to efficient Leave-One-Out :term:`cross-validation`. +When using the default :term:`cross-validation`, alpha cannot be 0 due to the +formulation used to calculate Leave-One-Out error. See [RL2007]_ for details. + +Usage example:: >>> import numpy as np >>> from sklearn import linear_model @@ -187,20 +215,18 @@ as GridSearchCV except that it defaults to Leave-One-Out Cross-Validation:: RidgeCV(alphas=array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06])) >>> reg.alpha_ - 0.01 + np.float64(0.01) Specifying the value of the :term:`cv` attribute will trigger the use of cross-validation with :class:`~sklearn.model_selection.GridSearchCV`, for example `cv=10` for 10-fold cross-validation, rather than Leave-One-Out Cross-Validation. -.. topic:: References - - * "Notes on Regularized Least Squares", Rifkin & Lippert (`technical report - `_, - `course slides - `_). +.. dropdown:: References + .. [RL2007] "Notes on Regularized Least Squares", Rifkin & Lippert (`technical report + `_, + `course slides `_). .. _lasso: @@ -210,7 +236,7 @@ Lasso The :class:`Lasso` is a linear model that estimates sparse coefficients. It is useful in some contexts due to its tendency to prefer solutions with fewer non-zero coefficients, effectively reducing the number of -features upon which the given solution is dependent. For this reason +features upon which the given solution is dependent. For this reason, Lasso and its variants are fundamental to the field of compressed sensing. Under certain conditions, it can recover the exact set of non-zero coefficients (see @@ -240,11 +266,11 @@ for another implementation:: The function :func:`lasso_path` is useful for lower-level tasks, as it computes the coefficients along the full path of possible values. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_and_elasticnet.py` - * :ref:`sphx_glr_auto_examples_applications_plot_tomography_l1_reconstruction.py` - * :ref:`sphx_glr_auto_examples_inspection_plot_linear_model_coefficient_interpretation.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_and_elasticnet.py` +* :ref:`sphx_glr_auto_examples_applications_plot_tomography_l1_reconstruction.py` +* :ref:`sphx_glr_auto_examples_inspection_plot_linear_model_coefficient_interpretation.py` .. note:: **Feature selection with Lasso** @@ -253,20 +279,19 @@ computes the coefficients along the full path of possible values. 
thus be used to perform feature selection, as detailed in :ref:`l1_feature_selection`. -The following two references explain the iterations -used in the coordinate descent solver of scikit-learn, as well as -the duality gap computation used for convergence control. +.. dropdown:: References -.. topic:: References - - * "Regularization Path For Generalized linear Models by Coordinate Descent", - Friedman, Hastie & Tibshirani, J Stat Softw, 2010 (`Paper - `__). - * "An Interior-Point Method for Large-Scale L1-Regularized Least Squares," - S. J. Kim, K. Koh, M. Lustig, S. Boyd and D. Gorinevsky, - in IEEE Journal of Selected Topics in Signal Processing, 2007 - (`Paper `__) + The following two references explain the iterations + used in the coordinate descent solver of scikit-learn, as well as + the duality gap computation used for convergence control. + * "Regularization Path For Generalized linear Models by Coordinate Descent", + Friedman, Hastie & Tibshirani, J Stat Softw, 2010 (`Paper + `__). + * "An Interior-Point Method for Large-Scale L1-Regularized Least Squares," + S. J. Kim, K. Koh, M. Lustig, S. Boyd and D. Gorinevsky, + in IEEE Journal of Selected Topics in Signal Processing, 2007 + (`Paper `__) Setting regularization parameter -------------------------------- @@ -298,6 +323,7 @@ features, it is often faster than :class:`LassoCV`. .. centered:: |lasso_cv_1| |lasso_cv_2| +.. _lasso_lars_ic: Information-criteria based model selection ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -306,22 +332,97 @@ Alternatively, the estimator :class:`LassoLarsIC` proposes to use the Akaike information criterion (AIC) and the Bayes Information criterion (BIC). It is a computationally cheaper alternative to find the optimal value of alpha as the regularization path is computed only once instead of k+1 times -when using k-fold cross-validation. However, such criteria needs a -proper estimation of the degrees of freedom of the solution, are -derived for large samples (asymptotic results) and assume the model -is correct, i.e. that the data are actually generated by this model. -They also tend to break when the problem is badly conditioned -(more features than samples). - -.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_lasso_model_selection_001.png - :target: ../auto_examples/linear_model/plot_lasso_model_selection.html +when using k-fold cross-validation. + +Indeed, these criteria are computed on the in-sample training set. In short, +they penalize the over-optimistic scores of the different Lasso models by +their flexibility (cf. the "Mathematical details" section below). + +However, such criteria need a proper estimation of the degrees of freedom of +the solution, are derived for large samples (asymptotic results) and assume the +correct model is among the candidates under investigation. They also tend to break when +the problem is badly conditioned (e.g. more features than samples). + +.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_lasso_lars_ic_001.png + :target: ../auto_examples/linear_model/plot_lasso_lars_ic.html :align: center :scale: 50% +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_model_selection.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_lars_ic.py` + +.. _aic_bic: + +AIC and BIC criteria +^^^^^^^^^^^^^^^^^^^^ + +The definition of AIC (and thus BIC) might differ in the literature. In this +section, we give more information regarding the criterion computed in +scikit-learn. + +.. 
dropdown:: Mathematical details + + The AIC criterion is defined as: + + .. math:: + AIC = -2 \log(\hat{L}) + 2 d + + where :math:`\hat{L}` is the maximum likelihood of the model and + :math:`d` is the number of parameters (as well referred to as degrees of + freedom in the previous section). + + The definition of BIC replaces the constant :math:`2` by :math:`\log(N)`: + + .. math:: + BIC = -2 \log(\hat{L}) + \log(N) d + + where :math:`N` is the number of samples. + + For a linear Gaussian model, the maximum log-likelihood is defined as: + + .. math:: + \log(\hat{L}) = - \frac{n}{2} \log(2 \pi) - \frac{n}{2} \ln(\sigma^2) - \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{2\sigma^2} + + where :math:`\sigma^2` is an estimate of the noise variance, + :math:`y_i` and :math:`\hat{y}_i` are respectively the true and predicted + targets, and :math:`n` is the number of samples. + + Plugging the maximum log-likelihood in the AIC formula yields: + + .. math:: + AIC = n \log(2 \pi \sigma^2) + \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\sigma^2} + 2 d + + The first term of the above expression is sometimes discarded since it is a + constant when :math:`\sigma^2` is provided. In addition, + it is sometimes stated that the AIC is equivalent to the :math:`C_p` statistic + [12]_. In a strict sense, however, it is equivalent only up to some constant + and a multiplicative factor. + + At last, we mentioned above that :math:`\sigma^2` is an estimate of the + noise variance. In :class:`LassoLarsIC` when the parameter `noise_variance` is + not provided (default), the noise variance is estimated via the unbiased + estimator [13]_ defined as: + + .. math:: + \sigma^2 = \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{n - p} + + where :math:`p` is the number of features and :math:`\hat{y}_i` is the + predicted target using an ordinary least squares regression. Note, that this + formula is valid only when `n_samples > n_features`. -.. topic:: Examples: + .. rubric:: References - * :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_model_selection.py` + .. [12] :arxiv:`Zou, Hui, Trevor Hastie, and Robert Tibshirani. + "On the degrees of freedom of the lasso." + The Annals of Statistics 35.5 (2007): 2173-2192. + <0712.0881.pdf>` + + .. [13] :doi:`Cherkassky, Vladimir, and Yunqian Ma. + "Comparison of model selection for regression." + Neural computation 15.7 (2003): 1691-1714. + <10.1162/089976603321891864>` Comparison with the regularization parameter of SVM ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -358,28 +459,29 @@ the MultiTaskLasso are full columns. .. centered:: Fitting a time-series model, imposing that any active feature be active at all times. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_linear_model_plot_multi_task_lasso_support.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_multi_task_lasso_support.py` -Mathematically, it consists of a linear model trained with a mixed -:math:`\ell_1` :math:`\ell_2`-norm for regularization. -The objective function to minimize is: +.. dropdown:: Mathematical details -.. math:: \min_{W} { \frac{1}{2n_{\text{samples}}} ||X W - Y||_{\text{Fro}} ^ 2 + \alpha ||W||_{21}} + Mathematically, it consists of a linear model trained with a mixed + :math:`\ell_1` :math:`\ell_2`-norm for regularization. + The objective function to minimize is: -where :math:`\text{Fro}` indicates the Frobenius norm + .. math:: \min_{W} { \frac{1}{2n_{\text{samples}}} ||X W - Y||_{\text{Fro}} ^ 2 + \alpha ||W||_{21}} -.. 
math:: ||A||_{\text{Fro}} = \sqrt{\sum_{ij} a_{ij}^2} + where :math:`\text{Fro}` indicates the Frobenius norm -and :math:`\ell_1` :math:`\ell_2` reads + .. math:: ||A||_{\text{Fro}} = \sqrt{\sum_{ij} a_{ij}^2} -.. math:: ||A||_{2 1} = \sum_i \sqrt{\sum_j a_{ij}^2}. + and :math:`\ell_1` :math:`\ell_2` reads -The implementation in the class :class:`MultiTaskLasso` uses -coordinate descent as the algorithm to fit the coefficients. + .. math:: ||A||_{2 1} = \sum_i \sqrt{\sum_j a_{ij}^2}. + The implementation in the class :class:`MultiTaskLasso` uses + coordinate descent as the algorithm to fit the coefficients. .. _elastic_net: @@ -393,7 +495,7 @@ the regularization properties of :class:`Ridge`. We control the convex combination of :math:`\ell_1` and :math:`\ell_2` using the ``l1_ratio`` parameter. -Elastic-net is useful when there are multiple features which are +Elastic-net is useful when there are multiple features that are correlated with one another. Lasso is likely to pick one of these at random, while elastic-net is likely to pick both. @@ -408,32 +510,33 @@ The objective function to minimize is in this case \frac{\alpha(1-\rho)}{2} ||w||_2 ^ 2} -.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_lasso_coordinate_descent_path_001.png - :target: ../auto_examples/linear_model/plot_lasso_coordinate_descent_path.html +.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_lasso_lasso_lars_elasticnet_path_002.png + :target: ../auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.html :align: center :scale: 50% The class :class:`ElasticNetCV` can be used to set the parameters ``alpha`` (:math:`\alpha`) and ``l1_ratio`` (:math:`\rho`) by cross-validation. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_and_elasticnet.py` - * :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_coordinate_descent_path.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_and_elasticnet.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_lasso_lars_elasticnet_path.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_elastic_net_precomputed_gram_matrix_with_weighted_samples.py` -The following two references explain the iterations -used in the coordinate descent solver of scikit-learn, as well as -the duality gap computation used for convergence control. +.. dropdown:: References -.. topic:: References + The following two references explain the iterations + used in the coordinate descent solver of scikit-learn, as well as + the duality gap computation used for convergence control. - * "Regularization Path For Generalized linear Models by Coordinate Descent", - Friedman, Hastie & Tibshirani, J Stat Softw, 2010 (`Paper - `__). - * "An Interior-Point Method for Large-Scale L1-Regularized Least Squares," - S. J. Kim, K. Koh, M. Lustig, S. Boyd and D. Gorinevsky, - in IEEE Journal of Selected Topics in Signal Processing, 2007 - (`Paper `__) + * "Regularization Path For Generalized linear Models by Coordinate Descent", + Friedman, Hastie & Tibshirani, J Stat Softw, 2010 (`Paper + `__). + * "An Interior-Point Method for Large-Scale L1-Regularized Least Squares," + S. J. Kim, K. Koh, M. Lustig, S. Boyd and D. Gorinevsky, + in IEEE Journal of Selected Topics in Signal Processing, 2007 + (`Paper `__) .. _multi_task_elastic_net: @@ -475,32 +578,32 @@ between the features. The advantages of LARS are: - - It is numerically efficient in contexts where the number of features - is significantly greater than the number of samples. 
+- It is numerically efficient in contexts where the number of features + is significantly greater than the number of samples. - - It is computationally just as fast as forward selection and has - the same order of complexity as ordinary least squares. +- It is computationally just as fast as forward selection and has + the same order of complexity as ordinary least squares. - - It produces a full piecewise linear solution path, which is - useful in cross-validation or similar attempts to tune the model. +- It produces a full piecewise linear solution path, which is + useful in cross-validation or similar attempts to tune the model. - - If two features are almost equally correlated with the target, - then their coefficients should increase at approximately the same - rate. The algorithm thus behaves as intuition would expect, and - also is more stable. +- If two features are almost equally correlated with the target, + then their coefficients should increase at approximately the same + rate. The algorithm thus behaves as intuition would expect, and + also is more stable. - - It is easily modified to produce solutions for other estimators, - like the Lasso. +- It is easily modified to produce solutions for other estimators, + like the Lasso. The disadvantages of the LARS method include: - - Because LARS is based upon an iterative refitting of the - residuals, it would appear to be especially sensitive to the - effects of noise. This problem is discussed in detail by Weisberg - in the discussion section of the Efron et al. (2004) Annals of - Statistics article. +- Because LARS is based upon an iterative refitting of the + residuals, it would appear to be especially sensitive to the + effects of noise. This problem is discussed in detail by Weisberg + in the discussion section of the Efron et al. (2004) Annals of + Statistics article. -The LARS model can be used using estimator :class:`Lars`, or its +The LARS model can be used via the estimator :class:`Lars`, or its low-level implementation :func:`lars_path` or :func:`lars_path_gram`. @@ -512,8 +615,8 @@ algorithm, and unlike the implementation based on coordinate descent, this yields the exact solution, which is piecewise linear as a function of the norm of its coefficients. -.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_lasso_lars_001.png - :target: ../auto_examples/linear_model/plot_lasso_lars.html +.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_lasso_lasso_lars_elasticnet_path_001.png + :target: ../auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.html :align: center :scale: 50% @@ -524,52 +627,50 @@ function of the norm of its coefficients. >>> reg.fit([[0, 0], [1, 1]], [0, 1]) LassoLars(alpha=0.1) >>> reg.coef_ - array([0.717157..., 0. ]) + array([0.6, 0. ]) -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_lars.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_lasso_lars_elasticnet_path.py` -The Lars algorithm provides the full path of the coefficients along +The LARS algorithm provides the full path of the coefficients along the regularization parameter almost for free, thus a common operation is to retrieve the path with one of the functions :func:`lars_path` or :func:`lars_path_gram`. 
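A minimal sketch of retrieving the path (an editorial illustration; the diabetes dataset is simply a convenient built-in choice) could look as follows::

    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import lars_path

    X, y = load_diabetes(return_X_y=True)

    # method="lasso" computes the Lasso path with the LARS algorithm;
    # `alphas` holds the values at which the active set changes and `coefs`
    # has shape (n_features, n_alphas), one column per breakpoint.
    alphas, active, coefs = lars_path(X, y, method="lasso")
    print(alphas.shape, coefs.shape)

Each column of ``coefs`` corresponds to one entry of ``alphas``, which is what the coefficient-path figures referenced above are built from.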
-Mathematical formulation ------------------------- - -The algorithm is similar to forward stepwise regression, but instead -of including features at each step, the estimated coefficients are -increased in a direction equiangular to each one's correlations with -the residual. +.. dropdown:: Mathematical formulation -Instead of giving a vector result, the LARS solution consists of a -curve denoting the solution for each value of the :math:`\ell_1` norm of the -parameter vector. The full coefficients path is stored in the array -``coef_path_``, which has size (n_features, max_features+1). The first -column is always zero. + The algorithm is similar to forward stepwise regression, but instead + of including features at each step, the estimated coefficients are + increased in a direction equiangular to each one's correlations with + the residual. -.. topic:: References: + Instead of giving a vector result, the LARS solution consists of a + curve denoting the solution for each value of the :math:`\ell_1` norm of the + parameter vector. The full coefficients path is stored in the array + ``coef_path_`` of shape `(n_features, max_features + 1)`. The first + column is always zero. - * Original Algorithm is detailed in the paper `Least Angle Regression - `_ - by Hastie et al. + .. rubric:: References + * Original Algorithm is detailed in the paper `Least Angle Regression + `_ + by Hastie et al. .. _omp: Orthogonal Matching Pursuit (OMP) ================================= -:class:`OrthogonalMatchingPursuit` and :func:`orthogonal_mp` implements the OMP +:class:`OrthogonalMatchingPursuit` and :func:`orthogonal_mp` implement the OMP algorithm for approximating the fit of a linear model with constraints imposed -on the number of non-zero coefficients (ie. the :math:`\ell_0` pseudo-norm). +on the number of non-zero coefficients (i.e. the :math:`\ell_0` pseudo-norm). Being a forward feature selection method like :ref:`least_angle_regression`, orthogonal matching pursuit can approximate the optimum solution vector with a fixed number of non-zero elements: .. math:: - \underset{w}{\operatorname{arg\,min\,}} ||y - Xw||_2^2 \text{ subject to } ||w||_0 \leq n_{\text{nonzero\_coefs}} + \underset{w}{\operatorname{arg\,min\,}} ||y - Xw||_2^2 \text{ subject to } ||w||_0 \leq n_{\text{nonzero_coefs}} Alternatively, orthogonal matching pursuit can target a specific error instead of a specific number of non-zero coefficients. This can be expressed as: @@ -585,18 +686,17 @@ residual is recomputed using an orthogonal projection on the space of the previously chosen dictionary elements. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_linear_model_plot_omp.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_omp.py` -.. topic:: References: +.. dropdown:: References - * https://www.cs.technion.ac.il/~ronrubin/Publications/KSVD-OMP-v2.pdf - - * `Matching pursuits with time-frequency dictionaries - `_, - S. G. Mallat, Z. Zhang, + * https://www.cs.technion.ac.il/~ronrubin/Publications/KSVD-OMP-v2.pdf + * `Matching pursuits with time-frequency dictionaries + `_, + S. G. Mallat, Z. Zhang, .. _bayesian_regression: @@ -619,29 +719,29 @@ variable to be estimated from the data. To obtain a fully probabilistic model, the output :math:`y` is assumed to be Gaussian distributed around :math:`X w`: -.. math:: p(y|X,w,\alpha) = \mathcal{N}(y|X w,\alpha) +.. math:: p(y|X,w,\alpha) = \mathcal{N}(y|X w,\alpha^{-1}) where :math:`\alpha` is again treated as a random variable that is to be estimated from the data. 
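A small sketch of what "estimated from the data" means in practice (an editorial addition; it uses :class:`BayesianRidge`, which is introduced just below, and the synthetic weights and noise level are assumptions)::

    import numpy as np
    from sklearn.linear_model import BayesianRidge

    # Noisy linear data: both the noise precision and the weight precision
    # are estimated from the data during `fit`.
    rng = np.random.RandomState(0)
    X = rng.randn(50, 3)
    y = X @ np.array([1.0, 0.0, -2.0]) + 0.1 * rng.randn(50)

    reg = BayesianRidge().fit(X, y)
    print(reg.coef_)                # posterior mean of the weights
    print(reg.alpha_, reg.lambda_)  # estimated noise and weight precisions

Here ``alpha_`` is the estimated precision of the noise and ``lambda_`` the estimated precision of the weights.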
The advantages of Bayesian Regression are: - - It adapts to the data at hand. +- It adapts to the data at hand. - - It can be used to include regularization parameters in the - estimation procedure. +- It can be used to include regularization parameters in the + estimation procedure. The disadvantages of Bayesian regression include: - - Inference of the model can be time consuming. +- Inference of the model can be time consuming. -.. topic:: References +.. dropdown:: References - * A good introduction to Bayesian methods is given in C. Bishop: Pattern - Recognition and Machine learning + * A good introduction to Bayesian methods is given in C. Bishop: Pattern + Recognition and Machine learning - * Original Algorithm is detailed in the book `Bayesian learning for neural - networks` by Radford M. Neal + * Original Algorithm is detailed in the book `Bayesian learning for neural + networks` by Radford M. Neal .. _bayesian_ridge_regression: @@ -675,13 +775,6 @@ There are four more hyperparameters, :math:`\alpha_1`, :math:`\alpha_2`, :math:`\alpha` and :math:`\lambda`. These are usually chosen to be *non-informative*. By default :math:`\alpha_1 = \alpha_2 = \lambda_1 = \lambda_2 = 10^{-6}`. - -.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_bayesian_ridge_001.png - :target: ../auto_examples/linear_model/plot_bayesian_ridge.html - :align: center - :scale: 50% - - Bayesian Ridge Regression is used for regression:: >>> from sklearn import linear_model @@ -701,212 +794,344 @@ The coefficients :math:`w` of the model can be accessed:: >>> reg.coef_ array([0.49999993, 0.49999993]) -Due to the Bayesian framework, the weights found are slightly different to the +Due to the Bayesian framework, the weights found are slightly different from the ones found by :ref:`ordinary_least_squares`. However, Bayesian Ridge Regression is more robust to ill-posed problems. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge.py` - * :ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge_curvefit.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge_curvefit.py` -.. topic:: References: +.. dropdown:: References - * Section 3.3 in Christopher M. Bishop: Pattern Recognition and Machine Learning, 2006 + * Section 3.3 in Christopher M. Bishop: Pattern Recognition and Machine Learning, 2006 - * David J. C. MacKay, `Bayesian Interpolation `_, 1992. + * David J. C. MacKay, `Bayesian Interpolation `_, 1992. - * Michael E. Tipping, `Sparse Bayesian Learning and the Relevance Vector Machine `_, 2001. + * Michael E. Tipping, `Sparse Bayesian Learning and the Relevance Vector Machine `_, 2001. +.. _automatic_relevance_determination: Automatic Relevance Determination - ARD --------------------------------------- -:class:`ARDRegression` is very similar to `Bayesian Ridge Regression`_, -but can lead to sparser coefficients :math:`w` [1]_ [2]_. -:class:`ARDRegression` poses a different prior over :math:`w`, by dropping the -assumption of the Gaussian being spherical. +The Automatic Relevance Determination (as being implemented in +:class:`ARDRegression`) is a kind of linear model which is very similar to the +`Bayesian Ridge Regression`_, but that leads to sparser coefficients :math:`w` +[1]_ [2]_. -Instead, the distribution over :math:`w` is assumed to be an axis-parallel, -elliptical Gaussian distribution. 
- -This means each coefficient :math:`w_{i}` is drawn from a Gaussian distribution, -centered on zero and with a precision :math:`\lambda_{i}`: +:class:`ARDRegression` poses a different prior over :math:`w`: it drops +the spherical Gaussian distribution for a centered elliptic Gaussian +distribution. This means each coefficient :math:`w_{i}` can itself be drawn from +a Gaussian distribution, centered on zero and with a precision +:math:`\lambda_{i}`: .. math:: p(w|\lambda) = \mathcal{N}(w|0,A^{-1}) -with :math:`\text{diag}(A) = \lambda = \{\lambda_{1},...,\lambda_{p}\}`. +with :math:`A` being a positive definite diagonal matrix and +:math:`\text{diag}(A) = \lambda = \{\lambda_{1},...,\lambda_{p}\}`. -In contrast to `Bayesian Ridge Regression`_, each coordinate of :math:`w_{i}` -has its own standard deviation :math:`\lambda_i`. The prior over all -:math:`\lambda_i` is chosen to be the same gamma distribution given by -hyperparameters :math:`\lambda_1` and :math:`\lambda_2`. +In contrast to the `Bayesian Ridge Regression`_, each coordinate of +:math:`w_{i}` has its own standard deviation :math:`\frac{1}{\lambda_i}`. The +prior over all :math:`\lambda_i` is chosen to be the same gamma distribution +given by the hyperparameters :math:`\lambda_1` and :math:`\lambda_2`. -.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_ard_001.png - :target: ../auto_examples/linear_model/plot_ard.html - :align: center - :scale: 50% - -ARD is also known in the literature as *Sparse Bayesian Learning* and -*Relevance Vector Machine* [3]_ [4]_. +ARD is also known in the literature as *Sparse Bayesian Learning* and *Relevance +Vector Machine* [3]_ [4]_. For a worked-out comparison between ARD and `Bayesian +Ridge Regression`_, see the example below. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_linear_model_plot_ard.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_ard.py` -.. topic:: References: - .. [1] Christopher M. Bishop: Pattern Recognition and Machine Learning, Chapter 7.2.1 +.. rubric:: References - .. [2] David Wipf and Srikantan Nagarajan: `A new view of automatic relevance determination `_ +.. [1] Christopher M. Bishop: Pattern Recognition and Machine Learning, Chapter 7.2.1 - .. [3] Michael E. Tipping: `Sparse Bayesian Learning and the Relevance Vector Machine `_ +.. [2] David Wipf and Srikantan Nagarajan: `A New View of Automatic Relevance Determination `_ - .. [4] Tristan Fletcher: `Relevance Vector Machines explained `_ +.. [3] Michael E. Tipping: `Sparse Bayesian Learning and the Relevance Vector Machine `_ +.. [4] Tristan Fletcher: `Relevance Vector Machines Explained `_ .. _Logistic_regression: Logistic regression =================== -Logistic regression, despite its name, is a linear model for classification -rather than regression. Logistic regression is also known in the literature as -logit regression, maximum-entropy classification (MaxEnt) or the log-linear -classifier. In this model, the probabilities describing the possible outcomes -of a single trial are modeled using a -`logistic function `_. +The logistic regression is implemented in :class:`LogisticRegression`. Despite +its name, it is implemented as a linear model for classification rather than +regression in terms of the scikit-learn/ML nomenclature. The logistic +regression is also known in the literature as logit regression, +maximum-entropy classification (MaxEnt) or the log-linear classifier. 
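A quick sketch of the estimator in action (an editorial illustration, not part of the patch; the iris data and `max_iter=1000` are convenient assumptions) shows the classification behaviour described here, with ``predict`` returning class labels and ``predict_proba`` the modeled probabilities::

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression

    X, y = load_iris(return_X_y=True)

    # Despite the name, this is a classifier: `predict` returns class labels,
    # while `predict_proba` returns the modeled per-class probabilities.
    clf = LogisticRegression(max_iter=1000).fit(X, y)
    print(clf.predict(X[:2]))
    print(clf.predict_proba(X[:2]).round(3))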
In this +model, the probabilities describing the possible outcomes of a single trial +are modeled using a `logistic function +`_. -Logistic regression is implemented in :class:`LogisticRegression`. This implementation can fit binary, One-vs-Rest, or multinomial logistic regression with optional :math:`\ell_1`, :math:`\ell_2` or Elastic-Net regularization. -.. note:: +.. note:: **Regularization** Regularization is applied by default, which is common in machine learning but not in statistics. Another advantage of regularization is that it improves numerical stability. No regularization amounts to setting C to a very high value. -As an optimization problem, binary class :math:`\ell_2` penalized logistic -regression minimizes the following cost function: +.. note:: **Logistic Regression as a special case of the Generalized Linear Models (GLM)** -.. math:: \min_{w, c} \frac{1}{2}w^T w + C \sum_{i=1}^n \log(\exp(- y_i (X_i^T w + c)) + 1) . + Logistic regression is a special case of + :ref:`generalized_linear_models` with a Binomial / Bernoulli conditional + distribution and a Logit link. The numerical output of the logistic + regression, which is the predicted probability, can be used as a classifier + by applying a threshold (by default 0.5) to it. This is how it is + implemented in scikit-learn, so it expects a categorical target, making + the Logistic Regression a classifier. -Similarly, :math:`\ell_1` regularized logistic regression solves the following -optimization problem: +.. rubric:: Examples -.. math:: \min_{w, c} \|w\|_1 + C \sum_{i=1}^n \log(\exp(- y_i (X_i^T w + c)) + 1). +* :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_l1_l2_sparsity.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_multinomial.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_sparse_logistic_regression_20newsgroups.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_sparse_logistic_regression_mnist.py` +* :ref:`sphx_glr_auto_examples_classification_plot_classification_probability.py` -Elastic-Net regularization is a combination of :math:`\ell_1` and -:math:`\ell_2`, and minimizes the following cost function: +Binary Case +----------- -.. math:: \min_{w, c} \frac{1 - \rho}{2}w^T w + \rho \|w\|_1 + C \sum_{i=1}^n \log(\exp(- y_i (X_i^T w + c)) + 1), +For notational ease, we assume that the target :math:`y_i` takes values in the +set :math:`\{0, 1\}` for data point :math:`i`. +Once fitted, the :meth:`~sklearn.linear_model.LogisticRegression.predict_proba` +method of :class:`~sklearn.linear_model.LogisticRegression` predicts +the probability of the positive class :math:`P(y_i=1|X_i)` as -where :math:`\rho` controls the strength of :math:`\ell_1` regularization vs. -:math:`\ell_2` regularization (it corresponds to the `l1_ratio` parameter). +.. math:: \hat{p}(X_i) = \operatorname{expit}(X_i w + w_0) = \frac{1}{1 + \exp(-X_i w - w_0)}. -Note that, in this notation, it's assumed that the target :math:`y_i` takes -values in the set :math:`{-1, 1}` at trial :math:`i`. We can also see that -Elastic-Net is equivalent to :math:`\ell_1` when :math:`\rho = 1` and equivalent -to :math:`\ell_2` when :math:`\rho=0`. -The solvers implemented in the class :class:`LogisticRegression` -are "liblinear", "newton-cg", "lbfgs", "sag" and "saga": - -The solver "liblinear" uses a coordinate descent (CD) algorithm, and relies -on the excellent C++ `LIBLINEAR library -`_, which is shipped with -scikit-learn. 
However, the CD algorithm implemented in liblinear cannot learn -a true multinomial (multiclass) model; instead, the optimization problem is -decomposed in a "one-vs-rest" fashion so separate binary classifiers are -trained for all classes. This happens under the hood, so -:class:`LogisticRegression` instances using this solver behave as multiclass -classifiers. For :math:`\ell_1` regularization :func:`sklearn.svm.l1_min_c` allows to -calculate the lower bound for C in order to get a non "null" (all feature -weights to zero) model. - -The "lbfgs", "sag" and "newton-cg" solvers only support :math:`\ell_2` -regularization or no regularization, and are found to converge faster for some -high-dimensional data. Setting `multi_class` to "multinomial" with these solvers -learns a true multinomial logistic regression model [5]_, which means that its -probability estimates should be better calibrated than the default "one-vs-rest" -setting. - -The "sag" solver uses Stochastic Average Gradient descent [6]_. It is faster -than other solvers for large datasets, when both the number of samples and the -number of features are large. - -The "saga" solver [7]_ is a variant of "sag" that also supports the -non-smooth `penalty="l1"`. This is therefore the solver of choice for sparse -multinomial logistic regression. It is also the only solver that supports -`penalty="elasticnet"`. - -The "lbfgs" is an optimization algorithm that approximates the -Broyden–Fletcher–Goldfarb–Shanno algorithm [8]_, which belongs to -quasi-Newton methods. The "lbfgs" solver is recommended for use for -small data-sets but for larger datasets its performance suffers. [9]_ - -The following table summarizes the penalties supported by each solver: - -+------------------------------+-----------------+-------------+-----------------+-----------+------------+ -| | **Solvers** | -+------------------------------+-----------------+-------------+-----------------+-----------+------------+ -| **Penalties** | **'liblinear'** | **'lbfgs'** | **'newton-cg'** | **'sag'** | **'saga'** | -+------------------------------+-----------------+-------------+-----------------+-----------+------------+ -| Multinomial + L2 penalty | no | yes | yes | yes | yes | -+------------------------------+-----------------+-------------+-----------------+-----------+------------+ -| OVR + L2 penalty | yes | yes | yes | yes | yes | -+------------------------------+-----------------+-------------+-----------------+-----------+------------+ -| Multinomial + L1 penalty | no | no | no | no | yes | -+------------------------------+-----------------+-------------+-----------------+-----------+------------+ -| OVR + L1 penalty | yes | no | no | no | yes | -+------------------------------+-----------------+-------------+-----------------+-----------+------------+ -| Elastic-Net | no | no | no | no | yes | -+------------------------------+-----------------+-------------+-----------------+-----------+------------+ -| No penalty ('none') | no | yes | yes | yes | yes | -+------------------------------+-----------------+-------------+-----------------+-----------+------------+ -| **Behaviors** | | -+------------------------------+-----------------+-------------+-----------------+-----------+------------+ -| Penalize the intercept (bad) | yes | no | no | no | no | -+------------------------------+-----------------+-------------+-----------------+-----------+------------+ -| Faster for large datasets | no | no | no | yes | yes | 
-+------------------------------+-----------------+-------------+-----------------+-----------+------------+ -| Robust to unscaled datasets | yes | yes | yes | no | no | -+------------------------------+-----------------+-------------+-----------------+-----------+------------+ - -The "lbfgs" solver is used by default for its robustness. For large datasets -the "saga" solver is usually faster. -For large dataset, you may also consider using :class:`SGDClassifier` -with 'log' loss, which might be even faster but requires more tuning. - -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_l1_l2_sparsity.py` +As an optimization problem, binary +class logistic regression with regularization term :math:`r(w)` minimizes the +following cost function: - * :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py` +.. math:: + :name: regularized-logistic-loss + + \min_{w} \frac{1}{S}\sum_{i=1}^n s_i + \left(-y_i \log(\hat{p}(X_i)) - (1 - y_i) \log(1 - \hat{p}(X_i))\right) + + \frac{r(w)}{S C}\,, + +where :math:`{s_i}` corresponds to the weights assigned by the user to a +specific training sample (the vector :math:`s` is formed by element-wise +multiplication of the class weights and sample weights), +and the sum :math:`S = \sum_{i=1}^n s_i`. + +We currently provide four choices for the regularization term :math:`r(w)` via +the `penalty` argument: + ++----------------+-------------------------------------------------+ +| penalty | :math:`r(w)` | ++================+=================================================+ +| `None` | :math:`0` | ++----------------+-------------------------------------------------+ +| :math:`\ell_1` | :math:`\|w\|_1` | ++----------------+-------------------------------------------------+ +| :math:`\ell_2` | :math:`\frac{1}{2}\|w\|_2^2 = \frac{1}{2}w^T w` | ++----------------+-------------------------------------------------+ +| `ElasticNet` | :math:`\frac{1 - \rho}{2}w^T w + \rho \|w\|_1` | ++----------------+-------------------------------------------------+ + +For ElasticNet, :math:`\rho` (which corresponds to the `l1_ratio` parameter) +controls the strength of :math:`\ell_1` regularization vs. :math:`\ell_2` +regularization. Elastic-Net is equivalent to :math:`\ell_1` when +:math:`\rho = 1` and equivalent to :math:`\ell_2` when :math:`\rho=0`. + +Note that the scale of the class weights and the sample weights will influence +the optimization problem. For instance, multiplying the sample weights by a +constant :math:`b>0` is equivalent to multiplying the (inverse) regularization +strength `C` by :math:`b`. + +Multinomial Case +---------------- - * :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_multinomial.py` +The binary case can be extended to :math:`K` classes leading to the multinomial +logistic regression, see also `log-linear model +`_. - * :ref:`sphx_glr_auto_examples_linear_model_plot_sparse_logistic_regression_20newsgroups.py` +.. note:: + It is possible to parameterize a :math:`K`-class classification model + using only :math:`K-1` weight vectors, leaving one class probability fully + determined by the other class probabilities by leveraging the fact that all + class probabilities must sum to one. We deliberately choose to overparameterize the model + using :math:`K` weight vectors for ease of implementation and to preserve the + symmetrical inductive bias regarding ordering of classes, see [16]_. This effect becomes + especially important when using regularization. 
The choice of overparameterization can be + detrimental for unpenalized models since then the solution may not be unique, as shown in [16]_. + +.. dropdown:: Mathematical details + + Let :math:`y_i \in \{1, \ldots, K\}` be the label (ordinal) encoded target variable for observation :math:`i`. + Instead of a single coefficient vector, we now have + a matrix of coefficients :math:`W` where each row vector :math:`W_k` corresponds to class + :math:`k`. We aim at predicting the class probabilities :math:`P(y_i=k|X_i)` via + :meth:`~sklearn.linear_model.LogisticRegression.predict_proba` as: + + .. math:: \hat{p}_k(X_i) = \frac{\exp(X_i W_k + W_{0, k})}{\sum_{l=0}^{K-1} \exp(X_i W_l + W_{0, l})}. + + The objective for the optimization becomes + + .. math:: + \min_W -\frac{1}{S}\sum_{i=1}^n \sum_{k=0}^{K-1} s_{ik} [y_i = k] \log(\hat{p}_k(X_i)) + + \frac{r(W)}{S C}\,, + + where :math:`[P]` represents the Iverson bracket which evaluates to :math:`0` + if :math:`P` is false, otherwise it evaluates to :math:`1`. + + Again, :math:`s_{ik}` are the weights assigned by the user (multiplication of sample + weights and class weights) with their sum :math:`S = \sum_{i=1}^n \sum_{k=0}^{K-1} s_{ik}`. + + We currently provide four choices + for the regularization term :math:`r(W)` via the `penalty` argument, where :math:`m` + is the number of features: + + +----------------+----------------------------------------------------------------------------------+ + | penalty | :math:`r(W)` | + +================+==================================================================================+ + | `None` | :math:`0` | + +----------------+----------------------------------------------------------------------------------+ + | :math:`\ell_1` | :math:`\|W\|_{1,1} = \sum_{i=1}^m\sum_{j=1}^{K}|W_{i,j}|` | + +----------------+----------------------------------------------------------------------------------+ + | :math:`\ell_2` | :math:`\frac{1}{2}\|W\|_F^2 = \frac{1}{2}\sum_{i=1}^m\sum_{j=1}^{K} W_{i,j}^2` | + +----------------+----------------------------------------------------------------------------------+ + | `ElasticNet` | :math:`\frac{1 - \rho}{2}\|W\|_F^2 + \rho \|W\|_{1,1}` | + +----------------+----------------------------------------------------------------------------------+ + +.. 
_logistic_regression_solvers: + +Solvers +------- - * :ref:`sphx_glr_auto_examples_linear_model_plot_sparse_logistic_regression_mnist.py` +The solvers implemented in the class :class:`LogisticRegression` +are "lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag" and "saga": + +The following table summarizes the penalties and multinomial multiclass supported by each solver: + ++------------------------------+-----------------+-------------+-----------------+-----------------------+-----------+------------+ +| | **Solvers** | ++------------------------------+-------------+-----------------+-----------------+-----------------------+-----------+------------+ +| **Penalties** | **'lbfgs'** | **'liblinear'** | **'newton-cg'** | **'newton-cholesky'** | **'sag'** | **'saga'** | ++------------------------------+-------------+-----------------+-----------------+-----------------------+-----------+------------+ +| L2 penalty | yes | yes | yes | yes | yes | yes | ++------------------------------+-------------+-----------------+-----------------+-----------------------+-----------+------------+ +| L1 penalty | no | yes | no | no | no | yes | ++------------------------------+-------------+-----------------+-----------------+-----------------------+-----------+------------+ +| Elastic-Net (L1 + L2) | no | no | no | no | no | yes | ++------------------------------+-------------+-----------------+-----------------+-----------------------+-----------+------------+ +| No penalty ('none') | yes | no | yes | yes | yes | yes | ++------------------------------+-------------+-----------------+-----------------+-----------------------+-----------+------------+ +| **Multiclass support** | | ++------------------------------+-------------+-----------------+-----------------+-----------------------+-----------+------------+ +| multinomial multiclass | yes | no | yes | yes | yes | yes | ++------------------------------+-------------+-----------------+-----------------+-----------------------+-----------+------------+ +| **Behaviors** | | ++------------------------------+-------------+-----------------+-----------------+-----------------------+-----------+------------+ +| Penalize the intercept (bad) | no | yes | no | no | no | no | ++------------------------------+-------------+-----------------+-----------------+-----------------------+-----------+------------+ +| Faster for large datasets | no | no | no | no | yes | yes | ++------------------------------+-------------+-----------------+-----------------+-----------------------+-----------+------------+ +| Robust to unscaled datasets | yes | yes | yes | yes | no | no | ++------------------------------+-------------+-----------------+-----------------+-----------------------+-----------+------------+ + +The "lbfgs" solver is used by default for its robustness. For +`n_samples >> n_features`, "newton-cholesky" is a good choice and can reach high +precision (tiny `tol` values). For large datasets +the "saga" solver is usually faster (than "lbfgs"), in particular for low precision +(high `tol`). +For large dataset, you may also consider using :class:`SGDClassifier` +with `loss="log_loss"`, which might be even faster but requires more tuning. .. _liblinear_differences: -.. 
topic:: Differences from liblinear: +Differences between solvers +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +There might be a difference in the scores obtained between +:class:`LogisticRegression` with ``solver=liblinear`` or +:class:`~sklearn.svm.LinearSVC` and the external liblinear library directly, +when ``fit_intercept=False`` and the fit ``coef_`` (or) the data to be predicted +are zeroes. This is because for the sample(s) with ``decision_function`` zero, +:class:`LogisticRegression` and :class:`~sklearn.svm.LinearSVC` predict the +negative class, while liblinear predicts the positive class. Note that a model +with ``fit_intercept=False`` and having many samples with ``decision_function`` +zero, is likely to be an underfit, bad model and you are advised to set +``fit_intercept=True`` and increase the ``intercept_scaling``. + +.. dropdown:: Solvers' details + + * The solver "liblinear" uses a coordinate descent (CD) algorithm, and relies + on the excellent C++ `LIBLINEAR library + `_, which is shipped with + scikit-learn. However, the CD algorithm implemented in liblinear cannot learn + a true multinomial (multiclass) model; instead, the optimization problem is + decomposed in a "one-vs-rest" fashion so separate binary classifiers are + trained for all classes. This happens under the hood, so + :class:`LogisticRegression` instances using this solver behave as multiclass + classifiers. For :math:`\ell_1` regularization :func:`sklearn.svm.l1_min_c` allows to + calculate the lower bound for C in order to get a non "null" (all feature + weights to zero) model. + + * The "lbfgs", "newton-cg" and "sag" solvers only support :math:`\ell_2` + regularization or no regularization, and are found to converge faster for some + high-dimensional data. Setting `multi_class` to "multinomial" with these solvers + learns a true multinomial logistic regression model [5]_, which means that its + probability estimates should be better calibrated than the default "one-vs-rest" + setting. + + * The "sag" solver uses Stochastic Average Gradient descent [6]_. It is faster + than other solvers for large datasets, when both the number of samples and the + number of features are large. + + * The "saga" solver [7]_ is a variant of "sag" that also supports the + non-smooth `penalty="l1"`. This is therefore the solver of choice for sparse + multinomial logistic regression. It is also the only solver that supports + `penalty="elasticnet"`. + + * The "lbfgs" is an optimization algorithm that approximates the + Broyden–Fletcher–Goldfarb–Shanno algorithm [8]_, which belongs to + quasi-Newton methods. As such, it can deal with a wide range of different training + data and is therefore the default solver. Its performance, however, suffers on poorly + scaled datasets and on datasets with one-hot encoded categorical features with rare + categories. + + * The "newton-cholesky" solver is an exact Newton solver that calculates the Hessian + matrix and solves the resulting linear system. It is a very good choice for + `n_samples` >> `n_features` and can reach high precision (tiny values of `tol`), + but has a few shortcomings: Only :math:`\ell_2` regularization is supported. + Furthermore, because the Hessian matrix is explicitly computed, the memory usage + has a quadratic dependency on `n_features` as well as on `n_classes`. + + For a comparison of some of these solvers, see [9]_. + + .. rubric:: References + + .. [5] Christopher M. Bishop: Pattern Recognition and Machine Learning, Chapter 4.3.4 + + .. 
[6] Mark Schmidt, Nicolas Le Roux, and Francis Bach: `Minimizing Finite Sums with the Stochastic Average Gradient. `_ + + .. [7] Aaron Defazio, Francis Bach, Simon Lacoste-Julien: + :arxiv:`SAGA: A Fast Incremental Gradient Method With Support for + Non-Strongly Convex Composite Objectives. <1407.0202>` + + .. [8] https://en.wikipedia.org/wiki/Broyden%E2%80%93Fletcher%E2%80%93Goldfarb%E2%80%93Shanno_algorithm + + .. [9] Thomas P. Minka `"A comparison of numerical optimizers for logistic regression" + `_ + + .. [16] :arxiv:`Simon, Noah, J. Friedman and T. Hastie. + "A Blockwise Descent Algorithm for Group-penalized Multiresponse and + Multinomial Regression." <1311.6529>` - There might be a difference in the scores obtained between - :class:`LogisticRegression` with ``solver=liblinear`` - or :class:`LinearSVC` and the external liblinear library directly, - when ``fit_intercept=False`` and the fit ``coef_`` (or) the data to - be predicted are zeroes. This is because for the sample(s) with - ``decision_function`` zero, :class:`LogisticRegression` and :class:`LinearSVC` - predict the negative class, while liblinear predicts the positive class. - Note that a model with ``fit_intercept=False`` and having many samples with - ``decision_function`` zero, is likely to be a underfit, bad model and you are - advised to set ``fit_intercept=True`` and increase the intercept_scaling. .. note:: **Feature selection with sparse logistic regression** @@ -918,7 +1143,7 @@ with 'log' loss, which might be even faster but requires more tuning. It is possible to obtain the p-values and confidence intervals for coefficients in cases of regression without penalization. The `statsmodels - package ` natively supports this. + package `_ natively supports this. Within sklearn, one could use bootstrapping instead as well. @@ -928,23 +1153,12 @@ according to the ``scoring`` attribute. The "newton-cg", "sag", "saga" and "lbfgs" solvers are found to be faster for high-dimensional dense data, due to warm-starting (see :term:`Glossary `). -.. topic:: References: - - .. [5] Christopher M. Bishop: Pattern Recognition and Machine Learning, Chapter 4.3.4 - - .. [6] Mark Schmidt, Nicolas Le Roux, and Francis Bach: `Minimizing Finite Sums with the Stochastic Average Gradient. `_ - - .. [7] Aaron Defazio, Francis Bach, Simon Lacoste-Julien: `SAGA: A Fast Incremental Gradient Method With Support for Non-Strongly Convex Composite Objectives. `_ - - .. [8] https://en.wikipedia.org/wiki/Broyden%E2%80%93Fletcher%E2%80%93Goldfarb%E2%80%93Shanno_algorithm - - .. [9] `"Performance Evaluation of Lbfgs vs other solvers" - `_ - .. _Generalized_linear_regression: -Generalized Linear Regression -============================= +.. _Generalized_linear_models: + +Generalized Linear Models +========================= Generalized Linear Models (GLM) extend linear models in two ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear @@ -959,22 +1173,23 @@ reproductive exponential dispersion model (EDM) [11]_). The minimization problem becomes: -.. math:: \min_{w} \frac{1}{2 n_{\text{samples}}} \sum_i d(y_i, \hat{y}_i) + \frac{\alpha}{2} ||w||_2, +.. math:: \min_{w} \frac{1}{2 n_{\text{samples}}} \sum_i d(y_i, \hat{y}_i) + \frac{\alpha}{2} ||w||_2^2, where :math:`\alpha` is the L2 regularization penalty. When sample weights are provided, the average becomes a weighted average. 
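To make the penalized deviance objective above concrete, the following minimal
sketch fits a Poisson GLM with a log link using :class:`PoissonRegressor`; the
toy data, the ``alpha`` value and the sample weights are arbitrary
illustrations rather than recommendations, and passing ``sample_weight`` turns
the mean deviance into the weighted average mentioned above::

    import numpy as np
    from sklearn.linear_model import PoissonRegressor

    # Toy design matrix and non-negative integer targets (counts).
    X = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0], [2.0, 1.0]])
    y = np.array([0, 1, 2, 4])

    # alpha is the L2 penalty strength in the objective above; 1e-3 is
    # only an example value, not a tuned or recommended default.
    reg = PoissonRegressor(alpha=1e-3)

    # Optional per-sample weights re-weight the individual unit deviances.
    sample_weight = np.array([1.0, 2.0, 1.0, 0.5])
    reg.fit(X, y, sample_weight=sample_weight)

    # Predictions go through the inverse log link, exp(Xw + intercept),
    # and are therefore always positive.
    print(reg.predict(X))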
-The following table lists some specific EDMs and their unit deviance (all of -these are instances of the Tweedie family): +The following table lists some specific EDMs and their unit deviance : -================= =============================== ============================================ +================= ================================ ============================================ Distribution Target Domain Unit Deviance :math:`d(y, \hat{y})` -================= =============================== ============================================ -Normal :math:`y \in (-\infty, \infty)` :math:`(y-\hat{y})^2` -Poisson :math:`y \in [0, \infty)` :math:`2(y\log\frac{y}{\hat{y}}-y+\hat{y})` -Gamma :math:`y \in (0, \infty)` :math:`2(\log\frac{\hat{y}}{y}+\frac{y}{\hat{y}}-1)` -Inverse Gaussian :math:`y \in (0, \infty)` :math:`\frac{(y-\hat{y})^2}{y\hat{y}^2}` -================= =============================== ============================================ +================= ================================ ============================================ +Normal :math:`y \in (-\infty, \infty)` :math:`(y-\hat{y})^2` +Bernoulli :math:`y \in \{0, 1\}` :math:`2({y}\log\frac{y}{\hat{y}}+({1}-{y})\log\frac{{1}-{y}}{{1}-\hat{y}})` +Categorical :math:`y \in \{0, 1, ..., k\}` :math:`2\sum_{i \in \{0, 1, ..., k\}} I(y = i) y_\text{i}\log\frac{I(y = i)}{\hat{I(y = i)}}` +Poisson :math:`y \in [0, \infty)` :math:`2(y\log\frac{y}{\hat{y}}-y+\hat{y})` +Gamma :math:`y \in (0, \infty)` :math:`2(\log\frac{\hat{y}}{y}+\frac{y}{\hat{y}}-1)` +Inverse Gaussian :math:`y \in (0, \infty)` :math:`\frac{(y-\hat{y})^2}{y\hat{y}^2}` +================= ================================ ============================================ The Probability Density Functions (PDF) of these distributions are illustrated in the following figure, @@ -989,40 +1204,58 @@ in the following figure, distribution, but not for the Gamma distribution which has a strictly positive target domain. +The Bernoulli distribution is a discrete probability distribution modelling a +Bernoulli trial - an event that has only two mutually exclusive outcomes. +The Categorical distribution is a generalization of the Bernoulli distribution +for a categorical random variable. While a random variable in a Bernoulli +distribution has two possible outcomes, a Categorical random variable can take +on one of K possible categories, with the probability of each category +specified separately. + The choice of the distribution depends on the problem at hand: * If the target values :math:`y` are counts (non-negative integer valued) or - relative frequencies (non-negative), you might use a Poisson deviance - with log-link. -* If the target values are positive valued and skewed, you might try a - Gamma deviance with log-link. -* If the target values seem to be heavier tailed than a Gamma distribution, - you might try an Inverse Gaussian deviance (or even higher variance powers - of the Tweedie family). - - -Examples of use cases include: - -* Agriculture / weather modeling: number of rain events per year (Poisson), - amount of rainfall per event (Gamma), total rainfall per year (Tweedie / - Compound Poisson Gamma). -* Risk modeling / insurance policy pricing: number of claim events / - policyholder per year (Poisson), cost per event (Gamma), total cost per - policyholder per year (Tweedie / Compound Poisson Gamma). 
-* Predictive maintenance: number of production interruption events per year - (Poisson), duration of interruption (Gamma), total interruption time per year - (Tweedie / Compound Poisson Gamma). - - -.. topic:: References: - - .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, - Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - - .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models - and analysis of deviance. Monografias de matemÃĄtica, no. 51. See also - `Exponential dispersion model. - `_ + relative frequencies (non-negative), you might use a Poisson distribution + with a log-link. +* If the target values are positive valued and skewed, you might try a Gamma + distribution with a log-link. +* If the target values seem to be heavier tailed than a Gamma distribution, you + might try an Inverse Gaussian distribution (or even higher variance powers of + the Tweedie family). +* If the target values :math:`y` are probabilities, you can use the Bernoulli + distribution. The Bernoulli distribution with a logit link can be used for + binary classification. The Categorical distribution with a softmax link can be + used for multiclass classification. + + +.. dropdown:: Examples of use cases + + * Agriculture / weather modeling: number of rain events per year (Poisson), + amount of rainfall per event (Gamma), total rainfall per year (Tweedie / + Compound Poisson Gamma). + * Risk modeling / insurance policy pricing: number of claim events / + policyholder per year (Poisson), cost per event (Gamma), total cost per + policyholder per year (Tweedie / Compound Poisson Gamma). + * Credit Default: probability that a loan can't be paid back (Bernoulli). + * Fraud Detection: probability that a financial transaction like a cash transfer + is a fraudulent transaction (Bernoulli). + * Predictive maintenance: number of production interruption events per year + (Poisson), duration of interruption (Gamma), total interruption time per year + (Tweedie / Compound Poisson Gamma). + * Medical Drug Testing: probability of curing a patient in a set of trials or + probability that a patient will experience side effects (Bernoulli). + * News Classification: classification of news articles into three categories + namely Business News, Politics and Entertainment news (Categorical). + +.. rubric:: References + +.. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, + Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + +.. [11] Jørgensen, B. (1992). The theory of exponential dispersion models + and analysis of deviance. Monografias de matemÃĄtica, no. 51. See also + `Exponential dispersion model. + `_ Usage ----- @@ -1051,39 +1284,38 @@ Usage example:: >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) TweedieRegressor(alpha=0.5, link='log', power=1) >>> reg.coef_ - array([0.2463..., 0.4337...]) + array([0.2463, 0.4337]) >>> reg.intercept_ - -0.7638... + np.float64(-0.7638) -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_regression_non_normal_loss.py` - * :ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_regression_non_normal_loss.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py` -Practical considerations ------------------------- +.. dropdown:: Practical considerations -The feature matrix `X` should be standardized before fitting. 
This ensures -that the penalty treats features equally. + The feature matrix `X` should be standardized before fitting. This ensures + that the penalty treats features equally. -Since the linear predictor :math:`Xw` can be negative and Poisson, -Gamma and Inverse Gaussian distributions don't support negative values, it -is necessary to apply an inverse link function that guarantees the -non-negativeness. For example with `link='log'`, the inverse link function -becomes :math:`h(Xw)=\exp(Xw)`. + Since the linear predictor :math:`Xw` can be negative and Poisson, + Gamma and Inverse Gaussian distributions don't support negative values, it + is necessary to apply an inverse link function that guarantees the + non-negativeness. For example with `link='log'`, the inverse link function + becomes :math:`h(Xw)=\exp(Xw)`. -If you want to model a relative frequency, i.e. counts per exposure (time, -volume, ...) you can do so by using a Poisson distribution and passing -:math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values -together with :math:`\mathrm{exposure}` as sample weights. For a concrete -example see e.g. -:ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py`. + If you want to model a relative frequency, i.e. counts per exposure (time, + volume, ...) you can do so by using a Poisson distribution and passing + :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values + together with :math:`\mathrm{exposure}` as sample weights. For a concrete + example see e.g. + :ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py`. -When performing cross-validation for the `power` parameter of -`TweedieRegressor`, it is advisable to specify an explicit `scoring` function, -because the default scorer :meth:`TweedieRegressor.score` is a function of -`power` itself. + When performing cross-validation for the `power` parameter of + `TweedieRegressor`, it is advisable to specify an explicit `scoring` function, + because the default scorer :meth:`TweedieRegressor.score` is a function of + `power` itself. Stochastic Gradient Descent - SGD ================================= @@ -1100,9 +1332,7 @@ E.g., with ``loss="log"``, :class:`SGDClassifier` fits a logistic regression model, while with ``loss="hinge"`` it fits a linear support vector machine (SVM). -.. topic:: References - - * :ref:`sgd` +You can refer to the dedicated :ref:`sgd` documentation section for more details. .. _perceptron: @@ -1112,16 +1342,21 @@ Perceptron The :class:`Perceptron` is another simple classification algorithm suitable for large scale learning. By default: - - It does not require a learning rate. +- It does not require a learning rate. - - It is not regularized (penalized). +- It is not regularized (penalized). - - It updates its model only on mistakes. +- It updates its model only on mistakes. The last characteristic implies that the Perceptron is slightly faster to train than SGD with the hinge loss and that the resulting models are sparser. +In fact, the :class:`Perceptron` is a wrapper around the :class:`SGDClassifier` +class using a perceptron loss and a constant learning rate. Refer to +:ref:`mathematical section ` of the SGD procedure +for more details. + .. _passive_aggressive: Passive Aggressive Algorithms @@ -1138,13 +1373,11 @@ For classification, :class:`PassiveAggressiveClassifier` can be used with ``loss='epsilon_insensitive'`` (PA-I) or ``loss='squared_epsilon_insensitive'`` (PA-II). -.. 
topic:: References: - - - * `"Online Passive-Aggressive Algorithms" - `_ - K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR 7 (2006) +.. dropdown:: References + * `"Online Passive-Aggressive Algorithms" + `_ + K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR 7 (2006) Robustness regression: outliers and modeling errors ===================================================== @@ -1203,7 +1436,7 @@ Note that in general, robust fitting in high-dimensional setting (large in these settings. -.. topic:: **Trade-offs: which estimator?** +.. topic:: Trade-offs: which estimator ? Scikit-learn provides 3 robust regression estimators: :ref:`RANSAC `, @@ -1212,7 +1445,7 @@ in these settings. * :ref:`HuberRegressor ` should be faster than :ref:`RANSAC ` and :ref:`Theil Sen ` - unless the number of samples are very large, i.e ``n_samples`` >> ``n_features``. + unless the number of samples is very large, i.e. ``n_samples`` >> ``n_features``. This is because :ref:`RANSAC ` and :ref:`Theil Sen ` fit on smaller subsets of the data. However, both :ref:`Theil Sen ` and :ref:`RANSAC ` are unlikely to be as robust as @@ -1228,7 +1461,7 @@ in these settings. medium-size outliers in the X direction, but this property will disappear in high-dimensional settings. - When in doubt, use :ref:`RANSAC `. + When in doubt, use :ref:`RANSAC `. .. _ransac_regression: @@ -1254,50 +1487,48 @@ estimated only from the determined inliers. :align: center :scale: 50% -Details of the algorithm -^^^^^^^^^^^^^^^^^^^^^^^^ - -Each iteration performs the following steps: - -1. Select ``min_samples`` random samples from the original data and check - whether the set of data is valid (see ``is_data_valid``). -2. Fit a model to the random subset (``base_estimator.fit``) and check - whether the estimated model is valid (see ``is_model_valid``). -3. Classify all data as inliers or outliers by calculating the residuals - to the estimated model (``base_estimator.predict(X) - y``) - all data - samples with absolute residuals smaller than the ``residual_threshold`` - are considered as inliers. -4. Save fitted model as best model if number of inlier samples is - maximal. In case the current estimated model has the same number of - inliers, it is only considered as the best model if it has better score. - -These steps are performed either a maximum number of times (``max_trials``) or -until one of the special stop criteria are met (see ``stop_n_inliers`` and -``stop_score``). The final model is estimated using all inlier samples (consensus -set) of the previously determined best model. - -The ``is_data_valid`` and ``is_model_valid`` functions allow to identify and reject -degenerate combinations of random sub-samples. If the estimated model is not -needed for identifying degenerate cases, ``is_data_valid`` should be used as it -is called prior to fitting the model and thus leading to better computational -performance. - - -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_linear_model_plot_ransac.py` - * :ref:`sphx_glr_auto_examples_linear_model_plot_robust_fit.py` - -.. topic:: References: - - * https://en.wikipedia.org/wiki/RANSAC - * `"Random Sample Consensus: A Paradigm for Model Fitting with Applications to - Image Analysis and Automated Cartography" - `_ - Martin A. Fischler and Robert C. Bolles - SRI International (1981) - * `"Performance Evaluation of RANSAC Family" - `_ - Sunglok Choi, Taemin Kim and Wonpil Yu - BMVC (2009) +.. 
rubric:: Examples + +* :ref:`sphx_glr_auto_examples_linear_model_plot_ransac.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_robust_fit.py` + +.. dropdown:: Details of the algorithm + + Each iteration performs the following steps: + + 1. Select ``min_samples`` random samples from the original data and check + whether the set of data is valid (see ``is_data_valid``). + 2. Fit a model to the random subset (``estimator.fit``) and check + whether the estimated model is valid (see ``is_model_valid``). + 3. Classify all data as inliers or outliers by calculating the residuals + to the estimated model (``estimator.predict(X) - y``) - all data + samples with absolute residuals smaller than or equal to the + ``residual_threshold`` are considered as inliers. + 4. Save fitted model as best model if number of inlier samples is + maximal. In case the current estimated model has the same number of + inliers, it is only considered as the best model if it has better score. + + These steps are performed either a maximum number of times (``max_trials``) or + until one of the special stop criteria are met (see ``stop_n_inliers`` and + ``stop_score``). The final model is estimated using all inlier samples (consensus + set) of the previously determined best model. + + The ``is_data_valid`` and ``is_model_valid`` functions allow to identify and reject + degenerate combinations of random sub-samples. If the estimated model is not + needed for identifying degenerate cases, ``is_data_valid`` should be used as it + is called prior to fitting the model and thus leading to better computational + performance. + +.. dropdown:: References + + * https://en.wikipedia.org/wiki/RANSAC + * `"Random Sample Consensus: A Paradigm for Model Fitting with Applications to + Image Analysis and Automated Cartography" + `_ + Martin A. Fischler and Robert C. Bolles - SRI International (1981) + * `"Performance Evaluation of RANSAC Family" + `_ + Sunglok Choi, Taemin Kim and Wonpil Yu - BMVC (2009) .. _theil_sen_regression: @@ -1310,67 +1541,62 @@ that the robustness of the estimator decreases quickly with the dimensionality of the problem. It loses its robustness properties and becomes no better than an ordinary least squares in high dimension. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_linear_model_plot_theilsen.py` - * :ref:`sphx_glr_auto_examples_linear_model_plot_robust_fit.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_theilsen.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_robust_fit.py` -.. topic:: References: - * https://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator +.. dropdown:: Theoretical considerations -Theoretical considerations -^^^^^^^^^^^^^^^^^^^^^^^^^^ + :class:`TheilSenRegressor` is comparable to the :ref:`Ordinary Least Squares + (OLS) ` in terms of asymptotic efficiency and as an + unbiased estimator. In contrast to OLS, Theil-Sen is a non-parametric + method which means it makes no assumption about the underlying + distribution of the data. Since Theil-Sen is a median-based estimator, it + is more robust against corrupted data aka outliers. In univariate + setting, Theil-Sen has a breakdown point of about 29.3% in case of a + simple linear regression which means that it can tolerate arbitrary + corrupted data of up to 29.3%. -:class:`TheilSenRegressor` is comparable to the :ref:`Ordinary Least Squares -(OLS) ` in terms of asymptotic efficiency and as an -unbiased estimator. 
In contrast to OLS, Theil-Sen is a non-parametric -method which means it makes no assumption about the underlying -distribution of the data. Since Theil-Sen is a median-based estimator, it -is more robust against corrupted data aka outliers. In univariate -setting, Theil-Sen has a breakdown point of about 29.3% in case of a -simple linear regression which means that it can tolerate arbitrary -corrupted data of up to 29.3%. - -.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_theilsen_001.png - :target: ../auto_examples/linear_model/plot_theilsen.html - :align: center - :scale: 50% + .. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_theilsen_001.png + :target: ../auto_examples/linear_model/plot_theilsen.html + :align: center + :scale: 50% -The implementation of :class:`TheilSenRegressor` in scikit-learn follows a -generalization to a multivariate linear regression model [#f1]_ using the -spatial median which is a generalization of the median to multiple -dimensions [#f2]_. + The implementation of :class:`TheilSenRegressor` in scikit-learn follows a + generalization to a multivariate linear regression model [#f1]_ using the + spatial median which is a generalization of the median to multiple + dimensions [#f2]_. -In terms of time and space complexity, Theil-Sen scales according to + In terms of time and space complexity, Theil-Sen scales according to -.. math:: - \binom{n_{\text{samples}}}{n_{\text{subsamples}}} + .. math:: + \binom{n_{\text{samples}}}{n_{\text{subsamples}}} -which makes it infeasible to be applied exhaustively to problems with a -large number of samples and features. Therefore, the magnitude of a -subpopulation can be chosen to limit the time and space complexity by -considering only a random subset of all possible combinations. + which makes it infeasible to be applied exhaustively to problems with a + large number of samples and features. Therefore, the magnitude of a + subpopulation can be chosen to limit the time and space complexity by + considering only a random subset of all possible combinations. -.. topic:: Examples: + .. rubric:: References - * :ref:`sphx_glr_auto_examples_linear_model_plot_theilsen.py` + .. [#f1] Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang: `Theil-Sen Estimators in a Multiple Linear Regression Model. `_ -.. topic:: References: + .. [#f2] T. Kärkkäinen and S. ÄyrämÃļ: `On Computation of Spatial Median for Robust Data Mining. `_ - .. [#f1] Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang: `Theil-Sen Estimators in a Multiple Linear Regression Model. `_ + Also see the `Wikipedia page `_ - .. [#f2] T. Kärkkäinen and S. ÄyrämÃļ: `On Computation of Spatial Median for Robust Data Mining. `_ .. _huber_regression: Huber Regression ---------------- -The :class:`HuberRegressor` is different to :class:`Ridge` because it applies a -linear loss to samples that are classified as outliers. +The :class:`HuberRegressor` is different from :class:`Ridge` because it applies a +linear loss to samples that are defined as outliers by the `epsilon` parameter. A sample is classified as an inlier if the absolute error of that sample is -lesser than a certain threshold. It differs from :class:`TheilSenRegressor` +less than the threshold `epsilon`. It differs from :class:`TheilSenRegressor` and :class:`RANSACRegressor` because it does not ignore the effect of the outliers but gives a lesser weight to them. @@ -1379,25 +1605,35 @@ but gives a lesser weight to them. 
:align: center :scale: 50% -The loss function that :class:`HuberRegressor` minimizes is given by +.. rubric:: Examples -.. math:: +* :ref:`sphx_glr_auto_examples_linear_model_plot_huber_vs_ridge.py` - \min_{w, \sigma} {\sum_{i=1}^n\left(\sigma + H_{\epsilon}\left(\frac{X_{i}w - y_{i}}{\sigma}\right)\sigma\right) + \alpha {||w||_2}^2} +.. dropdown:: Mathematical details -where + :class:`HuberRegressor` minimizes -.. math:: + .. math:: - H_{\epsilon}(z) = \begin{cases} - z^2, & \text {if } |z| < \epsilon, \\ - 2\epsilon|z| - \epsilon^2, & \text{otherwise} - \end{cases} + \min_{w, \sigma} {\sum_{i=1}^n\left(\sigma + H_{\epsilon}\left(\frac{X_{i}w - y_{i}}{\sigma}\right)\sigma\right) + \alpha {||w||_2}^2} -It is advised to set the parameter ``epsilon`` to 1.35 to achieve 95% statistical efficiency. + where the loss function is given by + + .. math:: + + H_{\epsilon}(z) = \begin{cases} + z^2, & \text {if } |z| < \epsilon, \\ + 2\epsilon|z| - \epsilon^2, & \text{otherwise} + \end{cases} + + It is advised to set the parameter ``epsilon`` to 1.35 to achieve 95% + statistical efficiency. + + .. rubric:: References + + * Peter J. Huber, Elvezio M. Ronchetti: Robust Statistics, Concomitant scale + estimates, p. 172. -Notes ------ The :class:`HuberRegressor` differs from using :class:`SGDRegressor` with loss set to `huber` in the following ways. @@ -1410,18 +1646,89 @@ in the following ways. samples while :class:`SGDRegressor` needs a number of passes on the training data to produce the same robustness. -.. topic:: Examples: +Note that this estimator is different from the `R implementation of Robust +Regression `_ because the R +implementation does a weighted least squares implementation with weights given to each +sample on the basis of how much the residual is greater than a certain threshold. + +.. _quantile_regression: + +Quantile Regression +=================== + +Quantile regression estimates the median or other quantiles of :math:`y` +conditional on :math:`X`, while ordinary least squares (OLS) estimates the +conditional mean. + +Quantile regression may be useful if one is interested in predicting an +interval instead of point prediction. Sometimes, prediction intervals are +calculated based on the assumption that prediction error is distributed +normally with zero mean and constant variance. Quantile regression provides +sensible prediction intervals even for errors with non-constant (but +predictable) variance or non-normal distribution. + +.. figure:: /auto_examples/linear_model/images/sphx_glr_plot_quantile_regression_002.png + :target: ../auto_examples/linear_model/plot_quantile_regression.html + :align: center + :scale: 50% + +Based on minimizing the pinball loss, conditional quantiles can also be +estimated by models other than linear models. For example, +:class:`~sklearn.ensemble.GradientBoostingRegressor` can predict conditional +quantiles if its parameter ``loss`` is set to ``"quantile"`` and parameter +``alpha`` is set to the quantile that should be predicted. See the example in +:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`. + +Most implementations of quantile regression are based on linear programming +problem. The current implementation is based on +:func:`scipy.optimize.linprog`. + +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_linear_model_plot_quantile_regression.py` + +.. 
dropdown:: Mathematical details + + As a linear model, the :class:`QuantileRegressor` gives linear predictions + :math:`\hat{y}(w, X) = Xw` for the :math:`q`-th quantile, :math:`q \in (0, 1)`. + The weights or coefficients :math:`w` are then found by the following + minimization problem: - * :ref:`sphx_glr_auto_examples_linear_model_plot_huber_vs_ridge.py` + .. math:: + \min_{w} {\frac{1}{n_{\text{samples}}} + \sum_i PB_q(y_i - X_i w) + \alpha ||w||_1}. -.. topic:: References: + This consists of the pinball loss (also known as linear loss), + see also :class:`~sklearn.metrics.mean_pinball_loss`, - * Peter J. Huber, Elvezio M. Ronchetti: Robust Statistics, Concomitant scale estimates, pg 172 + .. math:: + PB_q(t) = q \max(t, 0) + (1 - q) \max(-t, 0) = + \begin{cases} + q t, & t > 0, \\ + 0, & t = 0, \\ + (q-1) t, & t < 0 + \end{cases} + + and the L1 penalty controlled by parameter ``alpha``, similar to + :class:`Lasso`. + + As the pinball loss is only linear in the residuals, quantile regression is + much more robust to outliers than squared error based estimation of the mean. + Somewhat in between is the :class:`HuberRegressor`. + +.. dropdown:: References + + * Koenker, R., & Bassett Jr, G. (1978). `Regression quantiles. + `_ + Econometrica: journal of the Econometric Society, 33-50. + + * Portnoy, S., & Koenker, R. (1997). :doi:`The Gaussian hare and the Laplacian + tortoise: computability of squared-error versus absolute-error estimators. + Statistical Science, 12, 279-300 <10.1214/ss/1030037960>`. + + * Koenker, R. (2005). :doi:`Quantile Regression <10.1017/CBO9780511754098>`. + Cambridge University Press. -Note that this estimator is different from the R implementation of Robust Regression -(http://www.ats.ucla.edu/stat/r/dae/rreg.htm) because the R implementation does a weighted least -squares implementation with weights given to each sample on the basis of how much the residual is -greater than a certain threshold. .. _polynomial_regression: @@ -1435,32 +1742,34 @@ on nonlinear functions of the data. This approach maintains the generally fast performance of linear methods, while allowing them to fit a much wider range of data. -For example, a simple linear regression can be extended by constructing -**polynomial features** from the coefficients. In the standard linear -regression case, you might have a model that looks like this for -two-dimensional data: +.. dropdown:: Mathematical details + + For example, a simple linear regression can be extended by constructing + **polynomial features** from the coefficients. In the standard linear + regression case, you might have a model that looks like this for + two-dimensional data: -.. math:: \hat{y}(w, x) = w_0 + w_1 x_1 + w_2 x_2 + .. math:: \hat{y}(w, x) = w_0 + w_1 x_1 + w_2 x_2 -If we want to fit a paraboloid to the data instead of a plane, we can combine -the features in second-order polynomials, so that the model looks like this: + If we want to fit a paraboloid to the data instead of a plane, we can combine + the features in second-order polynomials, so that the model looks like this: -.. math:: \hat{y}(w, x) = w_0 + w_1 x_1 + w_2 x_2 + w_3 x_1 x_2 + w_4 x_1^2 + w_5 x_2^2 + .. math:: \hat{y}(w, x) = w_0 + w_1 x_1 + w_2 x_2 + w_3 x_1 x_2 + w_4 x_1^2 + w_5 x_2^2 -The (sometimes surprising) observation is that this is *still a linear model*: -to see this, imagine creating a new set of features + The (sometimes surprising) observation is that this is *still a linear model*: + to see this, imagine creating a new set of features -.. 
math:: z = [x_1, x_2, x_1 x_2, x_1^2, x_2^2] + .. math:: z = [x_1, x_2, x_1 x_2, x_1^2, x_2^2] -With this re-labeling of the data, our problem can be written + With this re-labeling of the data, our problem can be written -.. math:: \hat{y}(w, z) = w_0 + w_1 z_1 + w_2 z_2 + w_3 z_3 + w_4 z_4 + w_5 z_5 + .. math:: \hat{y}(w, z) = w_0 + w_1 z_1 + w_2 z_2 + w_3 z_3 + w_4 z_4 + w_5 z_5 -We see that the resulting *polynomial regression* is in the same class of -linear models we considered above (i.e. the model is linear in :math:`w`) -and can be solved by the same techniques. By considering linear fits within -a higher-dimensional space built with these basis functions, the model has the -flexibility to fit a much broader range of data. + We see that the resulting *polynomial regression* is in the same class of + linear models we considered above (i.e. the model is linear in :math:`w`) + and can be solved by the same techniques. By considering linear fits within + a higher-dimensional space built with these basis functions, the model has the + flexibility to fit a much broader range of data. Here is an example of applying this idea to one-dimensional data, using polynomial features of varying degrees: diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst index 8de2a73477c87..aec992a8f9dc1 100644 --- a/doc/modules/manifold.rst +++ b/doc/modules/manifold.rst @@ -7,23 +7,40 @@ Manifold learning ================= -.. rst-class:: quote - - | Look for the bare necessities - | The simple bare necessities - | Forget about your worries and your strife - | I mean the bare necessities - | Old Mother Nature's recipes - | That bring the bare necessities of life - | - | -- Baloo's song [The Jungle Book] +| Look for the bare necessities +| The simple bare necessities +| Forget about your worries and your strife +| I mean the bare necessities +| Old Mother Nature's recipes +| That bring the bare necessities of life +| +| -- Baloo's song [The Jungle Book] .. figure:: ../auto_examples/manifold/images/sphx_glr_plot_compare_methods_001.png :target: ../auto_examples/manifold/plot_compare_methods.html :align: center - :scale: 60 + :scale: 70% + +.. |manifold_img3| image:: ../auto_examples/manifold/images/sphx_glr_plot_compare_methods_003.png + :target: ../auto_examples/manifold/plot_compare_methods.html + :scale: 60% + +.. |manifold_img4| image:: ../auto_examples/manifold/images/sphx_glr_plot_compare_methods_004.png + :target: ../auto_examples/manifold/plot_compare_methods.html + :scale: 60% + +.. |manifold_img5| image:: ../auto_examples/manifold/images/sphx_glr_plot_compare_methods_005.png + :target: ../auto_examples/manifold/plot_compare_methods.html + :scale: 60% + +.. |manifold_img6| image:: ../auto_examples/manifold/images/sphx_glr_plot_compare_methods_006.png + :target: ../auto_examples/manifold/plot_compare_methods.html + :scale: 60% + +.. centered:: |manifold_img3| |manifold_img4| |manifold_img5| |manifold_img6| + Manifold learning is an approach to non-linear dimensionality reduction. Algorithms for this task are based on the idea that the dimensionality of @@ -83,13 +100,23 @@ unsupervised: it learns the high-dimensional structure of the data from the data itself, without the use of predetermined classifications. -.. topic:: Examples: +.. rubric:: Examples + +* See :ref:`sphx_glr_auto_examples_manifold_plot_lle_digits.py` for an example of + dimensionality reduction on handwritten digits. 
+ +* See :ref:`sphx_glr_auto_examples_manifold_plot_compare_methods.py` for an example of + dimensionality reduction on a toy "S-curve" dataset. - * See :ref:`sphx_glr_auto_examples_manifold_plot_lle_digits.py` for an example of - dimensionality reduction on handwritten digits. +* See :ref:`sphx_glr_auto_examples_applications_plot_stock_market.py` for an example of + using manifold learning to map the stock market structure based on historical stock + prices. - * See :ref:`sphx_glr_auto_examples_manifold_plot_compare_methods.py` for an example of - dimensionality reduction on a toy "S-curve" dataset. +* See :ref:`sphx_glr_auto_examples_manifold_plot_manifold_sphere.py` for an example of + manifold learning techniques applied to a spherical data-set. + +* See :ref:`sphx_glr_auto_examples_manifold_plot_swissroll.py` for an example of using + manifold learning techniques on a Swiss Roll dataset. The manifold learning implementations available in scikit-learn are summarized below @@ -111,43 +138,43 @@ distances between all points. Isomap can be performed with the object :align: center :scale: 50 -Complexity ----------- -The Isomap algorithm comprises three stages: +.. dropdown:: Complexity + + The Isomap algorithm comprises three stages: -1. **Nearest neighbor search.** Isomap uses - :class:`~sklearn.neighbors.BallTree` for efficient neighbor search. - The cost is approximately :math:`O[D \log(k) N \log(N)]`, for :math:`k` - nearest neighbors of :math:`N` points in :math:`D` dimensions. + 1. **Nearest neighbor search.** Isomap uses + :class:`~sklearn.neighbors.BallTree` for efficient neighbor search. + The cost is approximately :math:`O[D \log(k) N \log(N)]`, for :math:`k` + nearest neighbors of :math:`N` points in :math:`D` dimensions. -2. **Shortest-path graph search.** The most efficient known algorithms - for this are *Dijkstra's Algorithm*, which is approximately - :math:`O[N^2(k + \log(N))]`, or the *Floyd-Warshall algorithm*, which - is :math:`O[N^3]`. The algorithm can be selected by the user with - the ``path_method`` keyword of ``Isomap``. If unspecified, the code - attempts to choose the best algorithm for the input data. + 2. **Shortest-path graph search.** The most efficient known algorithms + for this are *Dijkstra's Algorithm*, which is approximately + :math:`O[N^2(k + \log(N))]`, or the *Floyd-Warshall algorithm*, which + is :math:`O[N^3]`. The algorithm can be selected by the user with + the ``path_method`` keyword of ``Isomap``. If unspecified, the code + attempts to choose the best algorithm for the input data. -3. **Partial eigenvalue decomposition.** The embedding is encoded in the - eigenvectors corresponding to the :math:`d` largest eigenvalues of the - :math:`N \times N` isomap kernel. For a dense solver, the cost is - approximately :math:`O[d N^2]`. This cost can often be improved using - the ``ARPACK`` solver. The eigensolver can be specified by the user - with the ``eigen_solver`` keyword of ``Isomap``. If unspecified, the - code attempts to choose the best algorithm for the input data. + 3. **Partial eigenvalue decomposition.** The embedding is encoded in the + eigenvectors corresponding to the :math:`d` largest eigenvalues of the + :math:`N \times N` isomap kernel. For a dense solver, the cost is + approximately :math:`O[d N^2]`. This cost can often be improved using + the ``ARPACK`` solver. The eigensolver can be specified by the user + with the ``eigen_solver`` keyword of ``Isomap``. 
If unspecified, the + code attempts to choose the best algorithm for the input data. -The overall complexity of Isomap is -:math:`O[D \log(k) N \log(N)] + O[N^2(k + \log(N))] + O[d N^2]`. + The overall complexity of Isomap is + :math:`O[D \log(k) N \log(N)] + O[N^2(k + \log(N))] + O[d N^2]`. -* :math:`N` : number of training data points -* :math:`D` : input dimension -* :math:`k` : number of nearest neighbors -* :math:`d` : output dimension + * :math:`N` : number of training data points + * :math:`D` : input dimension + * :math:`k` : number of nearest neighbors + * :math:`d` : output dimension -.. topic:: References: +.. rubric:: References - * `"A global geometric framework for nonlinear dimensionality reduction" - `_ - Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. Science 290 (5500) +* `"A global geometric framework for nonlinear dimensionality reduction" + `_ + Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. Science 290 (5500) .. _locally_linear_embedding: @@ -168,33 +195,32 @@ Locally linear embedding can be performed with function :align: center :scale: 50 -Complexity ----------- +.. dropdown:: Complexity -The standard LLE algorithm comprises three stages: + The standard LLE algorithm comprises three stages: -1. **Nearest Neighbors Search**. See discussion under Isomap above. + 1. **Nearest Neighbors Search**. See discussion under Isomap above. -2. **Weight Matrix Construction**. :math:`O[D N k^3]`. - The construction of the LLE weight matrix involves the solution of a - :math:`k \times k` linear equation for each of the :math:`N` local - neighborhoods + 2. **Weight Matrix Construction**. :math:`O[D N k^3]`. + The construction of the LLE weight matrix involves the solution of a + :math:`k \times k` linear equation for each of the :math:`N` local + neighborhoods. -3. **Partial Eigenvalue Decomposition**. See discussion under Isomap above. + 3. **Partial Eigenvalue Decomposition**. See discussion under Isomap above. -The overall complexity of standard LLE is -:math:`O[D \log(k) N \log(N)] + O[D N k^3] + O[d N^2]`. + The overall complexity of standard LLE is + :math:`O[D \log(k) N \log(N)] + O[D N k^3] + O[d N^2]`. -* :math:`N` : number of training data points -* :math:`D` : input dimension -* :math:`k` : number of nearest neighbors -* :math:`d` : output dimension + * :math:`N` : number of training data points + * :math:`D` : input dimension + * :math:`k` : number of nearest neighbors + * :math:`d` : output dimension -.. topic:: References: +.. rubric:: References - * `"Nonlinear dimensionality reduction by locally linear embedding" - `_ - Roweis, S. & Saul, L. Science 290:2323 (2000) +* `"Nonlinear dimensionality reduction by locally linear embedding" + `_ + Roweis, S. & Saul, L. Science 290:2323 (2000) Modified Locally Linear Embedding @@ -222,35 +248,34 @@ It requires ``n_neighbors > n_components``. :align: center :scale: 50 -Complexity ----------- +.. dropdown:: Complexity -The MLLE algorithm comprises three stages: + The MLLE algorithm comprises three stages: -1. **Nearest Neighbors Search**. Same as standard LLE + 1. **Nearest Neighbors Search**. Same as standard LLE -2. **Weight Matrix Construction**. Approximately - :math:`O[D N k^3] + O[N (k-D) k^2]`. The first term is exactly equivalent - to that of standard LLE. The second term has to do with constructing the - weight matrix from multiple weights. In practice, the added cost of - constructing the MLLE weight matrix is relatively small compared to the - cost of stages 1 and 3. + 2. **Weight Matrix Construction**. 
Approximately + :math:`O[D N k^3] + O[N (k-D) k^2]`. The first term is exactly equivalent + to that of standard LLE. The second term has to do with constructing the + weight matrix from multiple weights. In practice, the added cost of + constructing the MLLE weight matrix is relatively small compared to the + cost of stages 1 and 3. -3. **Partial Eigenvalue Decomposition**. Same as standard LLE + 3. **Partial Eigenvalue Decomposition**. Same as standard LLE -The overall complexity of MLLE is -:math:`O[D \log(k) N \log(N)] + O[D N k^3] + O[N (k-D) k^2] + O[d N^2]`. + The overall complexity of MLLE is + :math:`O[D \log(k) N \log(N)] + O[D N k^3] + O[N (k-D) k^2] + O[d N^2]`. -* :math:`N` : number of training data points -* :math:`D` : input dimension -* :math:`k` : number of nearest neighbors -* :math:`d` : output dimension + * :math:`N` : number of training data points + * :math:`D` : input dimension + * :math:`k` : number of nearest neighbors + * :math:`d` : output dimension -.. topic:: References: +.. rubric:: References - * `"MLLE: Modified Locally Linear Embedding Using Multiple Weights" - `_ - Zhang, Z. & Wang, J. +* `"MLLE: Modified Locally Linear Embedding Using Multiple Weights" + `_ + Zhang, Z. & Wang, J. Hessian Eigenmapping @@ -272,33 +297,32 @@ It requires ``n_neighbors > n_components * (n_components + 3) / 2``. :align: center :scale: 50 -Complexity ----------- +.. dropdown:: Complexity -The HLLE algorithm comprises three stages: + The HLLE algorithm comprises three stages: -1. **Nearest Neighbors Search**. Same as standard LLE + 1. **Nearest Neighbors Search**. Same as standard LLE -2. **Weight Matrix Construction**. Approximately - :math:`O[D N k^3] + O[N d^6]`. The first term reflects a similar - cost to that of standard LLE. The second term comes from a QR - decomposition of the local hessian estimator. + 2. **Weight Matrix Construction**. Approximately + :math:`O[D N k^3] + O[N d^6]`. The first term reflects a similar + cost to that of standard LLE. The second term comes from a QR + decomposition of the local hessian estimator. -3. **Partial Eigenvalue Decomposition**. Same as standard LLE + 3. **Partial Eigenvalue Decomposition**. Same as standard LLE. -The overall complexity of standard HLLE is -:math:`O[D \log(k) N \log(N)] + O[D N k^3] + O[N d^6] + O[d N^2]`. + The overall complexity of standard HLLE is + :math:`O[D \log(k) N \log(N)] + O[D N k^3] + O[N d^6] + O[d N^2]`. -* :math:`N` : number of training data points -* :math:`D` : input dimension -* :math:`k` : number of nearest neighbors -* :math:`d` : output dimension + * :math:`N` : number of training data points + * :math:`D` : input dimension + * :math:`k` : number of nearest neighbors + * :math:`d` : output dimension -.. topic:: References: +.. rubric:: References - * `"Hessian Eigenmaps: Locally linear embedding techniques for - high-dimensional data" `_ - Donoho, D. & Grimes, C. Proc Natl Acad Sci USA. 100:5591 (2003) +* `"Hessian Eigenmaps: Locally linear embedding techniques for + high-dimensional data" `_ + Donoho, D. & Grimes, C. Proc Natl Acad Sci USA. 100:5591 (2003) .. _spectral_embedding: @@ -316,35 +340,34 @@ preserving local distances. Spectral embedding can be performed with the function :func:`spectral_embedding` or its object-oriented counterpart :class:`SpectralEmbedding`. -Complexity ----------- +.. dropdown:: Complexity -The Spectral Embedding (Laplacian Eigenmaps) algorithm comprises three stages: + The Spectral Embedding (Laplacian Eigenmaps) algorithm comprises three stages: -1. 
**Weighted Graph Construction**. Transform the raw input data into - graph representation using affinity (adjacency) matrix representation. + 1. **Weighted Graph Construction**. Transform the raw input data into + graph representation using affinity (adjacency) matrix representation. -2. **Graph Laplacian Construction**. unnormalized Graph Laplacian - is constructed as :math:`L = D - A` for and normalized one as - :math:`L = D^{-\frac{1}{2}} (D - A) D^{-\frac{1}{2}}`. + 2. **Graph Laplacian Construction**. unnormalized Graph Laplacian + is constructed as :math:`L = D - A` for and normalized one as + :math:`L = D^{-\frac{1}{2}} (D - A) D^{-\frac{1}{2}}`. -3. **Partial Eigenvalue Decomposition**. Eigenvalue decomposition is - done on graph Laplacian + 3. **Partial Eigenvalue Decomposition**. Eigenvalue decomposition is + done on graph Laplacian. -The overall complexity of spectral embedding is -:math:`O[D \log(k) N \log(N)] + O[D N k^3] + O[d N^2]`. + The overall complexity of spectral embedding is + :math:`O[D \log(k) N \log(N)] + O[D N k^3] + O[d N^2]`. -* :math:`N` : number of training data points -* :math:`D` : input dimension -* :math:`k` : number of nearest neighbors -* :math:`d` : output dimension + * :math:`N` : number of training data points + * :math:`D` : input dimension + * :math:`k` : number of nearest neighbors + * :math:`d` : output dimension -.. topic:: References: +.. rubric:: References - * `"Laplacian Eigenmaps for Dimensionality Reduction - and Data Representation" - `_ - M. Belkin, P. Niyogi, Neural Computation, June 2003; 15 (6):1373-1396 +* `"Laplacian Eigenmaps for Dimensionality Reduction + and Data Representation" + `_ + M. Belkin, P. Niyogi, Neural Computation, June 2003; 15 (6):1373-1396 Local Tangent Space Alignment @@ -364,33 +387,32 @@ tangent spaces to learn the embedding. LTSA can be performed with function :align: center :scale: 50 -Complexity ----------- +.. dropdown:: Complexity -The LTSA algorithm comprises three stages: + The LTSA algorithm comprises three stages: -1. **Nearest Neighbors Search**. Same as standard LLE + 1. **Nearest Neighbors Search**. Same as standard LLE -2. **Weight Matrix Construction**. Approximately - :math:`O[D N k^3] + O[k^2 d]`. The first term reflects a similar - cost to that of standard LLE. + 2. **Weight Matrix Construction**. Approximately + :math:`O[D N k^3] + O[k^2 d]`. The first term reflects a similar + cost to that of standard LLE. -3. **Partial Eigenvalue Decomposition**. Same as standard LLE + 3. **Partial Eigenvalue Decomposition**. Same as standard LLE -The overall complexity of standard LTSA is -:math:`O[D \log(k) N \log(N)] + O[D N k^3] + O[k^2 d] + O[d N^2]`. + The overall complexity of standard LTSA is + :math:`O[D \log(k) N \log(N)] + O[D N k^3] + O[k^2 d] + O[d N^2]`. -* :math:`N` : number of training data points -* :math:`D` : input dimension -* :math:`k` : number of nearest neighbors -* :math:`d` : output dimension + * :math:`N` : number of training data points + * :math:`D` : input dimension + * :math:`k` : number of nearest neighbors + * :math:`d` : output dimension -.. topic:: References: +.. rubric:: References - * `"Principal manifolds and nonlinear dimensionality reduction via - tangent space alignment" - `_ - Zhang, Z. & Zha, H. Journal of Shanghai Univ. 8:406 (2004) +* :arxiv:`"Principal manifolds and nonlinear dimensionality reduction via + tangent space alignment" + ` + Zhang, Z. & Zha, H. Journal of Shanghai Univ. 8:406 (2004) .. 
_multidimensional_scaling: @@ -402,20 +424,19 @@ Multi-dimensional Scaling (MDS) representation of the data in which the distances respect well the distances in the original high-dimensional space. -In general, :class:`MDS` is a technique used for analyzing similarity or -dissimilarity data. It attempts to model similarity or dissimilarity data as -distances in a geometric spaces. The data can be ratings of similarity between +In general, :class:`MDS` is a technique used for analyzing +dissimilarity data. It attempts to model dissimilarities as +distances in a Euclidean space. The data can be ratings of dissimilarity between objects, interaction frequencies of molecules, or trade indices between countries. -There exists two types of MDS algorithm: metric and non metric. In the -scikit-learn, the class :class:`MDS` implements both. In Metric MDS, the input -similarity matrix arises from a metric (and thus respects the triangular -inequality), the distances between output two points are then set to be as -close as possible to the similarity or dissimilarity data. In the non-metric -version, the algorithms will try to preserve the order of the distances, and +There exist two types of MDS algorithm: metric and non-metric. In +scikit-learn, the class :class:`MDS` implements both. In metric MDS, +the distances in the embedding space are set as +close as possible to the dissimilarity data. In the non-metric +version, the algorithm will try to preserve the order of the distances, and hence seek for a monotonic relationship between the distances in the embedded -space and the similarities/dissimilarities. +space and the input dissimilarities. .. figure:: ../auto_examples/manifold/images/sphx_glr_plot_lle_digits_010.png :target: ../auto_examples/manifold/plot_lle_digits.html @@ -423,54 +444,68 @@ space and the similarities/dissimilarities. :scale: 50 -Let :math:`S` be the similarity matrix, and :math:`X` the coordinates of the -:math:`n` input points. Disparities :math:`\hat{d}_{ij}` are transformation of -the similarities chosen in some optimal ways. The objective, called the -stress, is then defined by :math:`\sum_{i < j} d_{ij}(X) - \hat{d}_{ij}(X)` +Let :math:`\delta_{ij}` be the dissimilarity matrix between the +:math:`n` input points (possibly arising as some pairwise distances +:math:`d_{ij}(X)` between the coordinates :math:`X` of the input points). +Disparities :math:`\hat{d}_{ij} = f(\delta_{ij})` are some transformation of +the dissimilarities. The MDS objective, called the raw stress, is then +defined by :math:`\sum_{i < j} (\hat{d}_{ij} - d_{ij}(Z))^2`, +where :math:`d_{ij}(Z)` are the pairwise distances between the +coordinates :math:`Z` of the embedded points. -Metric MDS ----------- +.. dropdown:: Metric MDS -The simplest metric :class:`MDS` model, called *absolute MDS*, disparities are defined by -:math:`\hat{d}_{ij} = S_{ij}`. With absolute MDS, the value :math:`S_{ij}` -should then correspond exactly to the distance between point :math:`i` and -:math:`j` in the embedding point. + In the metric :class:`MDS` model (sometimes also called *absolute MDS*), + disparities are simply equal to the input dissimilarities + :math:`\hat{d}_{ij} = \delta_{ij}`. -Most commonly, disparities are set to :math:`\hat{d}_{ij} = b S_{ij}`. +.. dropdown:: Nonmetric MDS -Nonmetric MDS -------------- + Non metric :class:`MDS` focuses on the ordination of the data. If + :math:`\delta_{ij} > \delta_{kl}`, then the embedding + seeks to enforce :math:`d_{ij}(Z) > d_{kl}(Z)`. 
A simple algorithm + to enforce proper ordination is to use an + isotonic regression of :math:`d_{ij}(Z)` on :math:`\delta_{ij}`, yielding + disparities :math:`\hat{d}_{ij}` that are a monotonic transformation + of dissimilarities :math:`\delta_{ij}` and hence having the same ordering. + This is done repeatedly after every step of the optimization algorithm. + In order to avoid the trivial solution where all embedding points are + overlapping, the disparities :math:`\hat{d}_{ij}` are normalized. -Non metric :class:`MDS` focuses on the ordination of the data. If -:math:`S_{ij} < S_{jk}`, then the embedding should enforce :math:`d_{ij} < -d_{jk}`. A simple algorithm to enforce that is to use a monotonic regression -of :math:`d_{ij}` on :math:`S_{ij}`, yielding disparities :math:`\hat{d}_{ij}` -in the same order as :math:`S_{ij}`. + Note that since we only care about relative ordering, our objective should be + invariant to simple translation and scaling, however the stress used in metric + MDS is sensitive to scaling. To address this, non-metric MDS returns + normalized stress, also known as Stress-1, defined as -A trivial solution to this problem is to set all the points on the origin. In -order to avoid that, the disparities :math:`\hat{d}_{ij}` are normalized. + .. math:: + \sqrt{\frac{\sum_{i < j} (\hat{d}_{ij} - d_{ij}(Z))^2}{\sum_{i < j} + d_{ij}(Z)^2}}. + Normalized Stress-1 is returned if `normalized_stress=True`. -.. figure:: ../auto_examples/manifold/images/sphx_glr_plot_mds_001.png - :target: ../auto_examples/manifold/plot_mds.html - :align: center - :scale: 60 + .. figure:: ../auto_examples/manifold/images/sphx_glr_plot_mds_001.png + :target: ../auto_examples/manifold/plot_mds.html + :align: center + :scale: 60 +.. rubric:: References -.. topic:: References: +* `"More on Multidimensional Scaling and Unfolding in R: smacof Version 2" + `_ + Mair P, Groenen P., de Leeuw J. Journal of Statistical Software (2022) - * `"Modern Multidimensional Scaling - Theory and Applications" - `_ - Borg, I.; Groenen P. Springer Series in Statistics (1997) +* `"Modern Multidimensional Scaling - Theory and Applications" + `_ + Borg, I.; Groenen P. Springer Series in Statistics (1997) - * `"Nonmetric multidimensional scaling: a numerical method" - `_ - Kruskal, J. Psychometrika, 29 (1964) +* `"Nonmetric multidimensional scaling: a numerical method" + `_ + Kruskal, J. Psychometrika, 29 (1964) - * `"Multidimensional scaling by optimizing goodness of fit to a nonmetric hypothesis" - `_ - Kruskal, J. Psychometrika, 29, (1964) +* `"Multidimensional scaling by optimizing goodness of fit to a nonmetric hypothesis" + `_ + Kruskal, J. Psychometrika, 29, (1964) .. _t_sne: @@ -518,105 +553,110 @@ The disadvantages to using t-SNE are roughly: :align: center :scale: 50 -Optimizing t-SNE ----------------- -The main purpose of t-SNE is visualization of high-dimensional data. Hence, -it works best when the data will be embedded on two or three dimensions. - -Optimizing the KL divergence can be a little bit tricky sometimes. There are -five parameters that control the optimization of t-SNE and therefore possibly -the quality of the resulting embedding: - -* perplexity -* early exaggeration factor -* learning rate -* maximum number of iterations -* angle (not used in the exact method) - -The perplexity is defined as :math:`k=2^{(S)}` where :math:`S` is the Shannon -entropy of the conditional probability distribution. 
The perplexity of a -:math:`k`-sided die is :math:`k`, so that :math:`k` is effectively the number of -nearest neighbors t-SNE considers when generating the conditional probabilities. -Larger perplexities lead to more nearest neighbors and less sensitive to small -structure. Conversely a lower perplexity considers a smaller number of -neighbors, and thus ignores more global information in favour of the -local neighborhood. As dataset sizes get larger more points will be -required to get a reasonable sample of the local neighborhood, and hence -larger perplexities may be required. Similarly noisier datasets will require -larger perplexity values to encompass enough local neighbors to see beyond -the background noise. - -The maximum number of iterations is usually high enough and does not need -any tuning. The optimization consists of two phases: the early exaggeration -phase and the final optimization. During early exaggeration the joint -probabilities in the original space will be artificially increased by -multiplication with a given factor. Larger factors result in larger gaps -between natural clusters in the data. If the factor is too high, the KL -divergence could increase during this phase. Usually it does not have to be -tuned. A critical parameter is the learning rate. If it is too low gradient -descent will get stuck in a bad local minimum. If it is too high the KL -divergence will increase during optimization. More tips can be found in -Laurens van der Maaten's FAQ (see references). The last parameter, angle, -is a tradeoff between performance and accuracy. Larger angles imply that we -can approximate larger regions by a single point, leading to better speed -but less accurate results. - -`"How to Use t-SNE Effectively" `_ -provides a good discussion of the effects of the various parameters, as well -as interactive plots to explore the effects of different parameters. - -Barnes-Hut t-SNE ----------------- - -The Barnes-Hut t-SNE that has been implemented here is usually much slower than -other manifold learning algorithms. The optimization is quite difficult -and the computation of the gradient is :math:`O[d N log(N)]`, where :math:`d` -is the number of output dimensions and :math:`N` is the number of samples. The -Barnes-Hut method improves on the exact method where t-SNE complexity is -:math:`O[d N^2]`, but has several other notable differences: - -* The Barnes-Hut implementation only works when the target dimensionality is 3 - or less. The 2D case is typical when building visualizations. -* Barnes-Hut only works with dense input data. Sparse data matrices can only be - embedded with the exact method or can be approximated by a dense low rank - projection for instance using :class:`~sklearn.decomposition.TruncatedSVD` -* Barnes-Hut is an approximation of the exact method. The approximation is - parameterized with the angle parameter, therefore the angle parameter is - unused when method="exact" -* Barnes-Hut is significantly more scalable. Barnes-Hut can be used to embed - hundred of thousands of data points while the exact method can handle - thousands of samples before becoming computationally intractable - -For visualization purpose (which is the main use case of t-SNE), using the -Barnes-Hut method is strongly recommended. The exact t-SNE method is useful -for checking the theoretically properties of the embedding possibly in higher -dimensional space but limit to small datasets due to computational constraints. 
- -Also note that the digits labels roughly match the natural grouping found by -t-SNE while the linear 2D projection of the PCA model yields a representation -where label regions largely overlap. This is a strong clue that this data can -be well separated by non linear methods that focus on the local structure (e.g. -an SVM with a Gaussian RBF kernel). However, failing to visualize well -separated homogeneously labeled groups with t-SNE in 2D does not necessarily -imply that the data cannot be correctly classified by a supervised model. It -might be the case that 2 dimensions are not low enough to accurately represents -the internal structure of the data. - - -.. topic:: References: - - * `"Visualizing High-Dimensional Data Using t-SNE" - `_ - van der Maaten, L.J.P.; Hinton, G. Journal of Machine Learning Research - (2008) - - * `"t-Distributed Stochastic Neighbor Embedding" - `_ - van der Maaten, L.J.P. - - * `"Accelerating t-SNE using Tree-Based Algorithms." - `_ - L.J.P. van der Maaten. Journal of Machine Learning Research 15(Oct):3221-3245, 2014. +.. dropdown:: Optimizing t-SNE + + The main purpose of t-SNE is visualization of high-dimensional data. Hence, + it works best when the data will be embedded on two or three dimensions. + + Optimizing the KL divergence can be a little bit tricky sometimes. There are + five parameters that control the optimization of t-SNE and therefore possibly + the quality of the resulting embedding: + + * perplexity + * early exaggeration factor + * learning rate + * maximum number of iterations + * angle (not used in the exact method) + + The perplexity is defined as :math:`k=2^{(S)}` where :math:`S` is the Shannon + entropy of the conditional probability distribution. The perplexity of a + :math:`k`-sided die is :math:`k`, so that :math:`k` is effectively the number of + nearest neighbors t-SNE considers when generating the conditional probabilities. + Larger perplexities lead to more nearest neighbors and less sensitive to small + structure. Conversely a lower perplexity considers a smaller number of + neighbors, and thus ignores more global information in favour of the + local neighborhood. As dataset sizes get larger more points will be + required to get a reasonable sample of the local neighborhood, and hence + larger perplexities may be required. Similarly noisier datasets will require + larger perplexity values to encompass enough local neighbors to see beyond + the background noise. + + The maximum number of iterations is usually high enough and does not need + any tuning. The optimization consists of two phases: the early exaggeration + phase and the final optimization. During early exaggeration the joint + probabilities in the original space will be artificially increased by + multiplication with a given factor. Larger factors result in larger gaps + between natural clusters in the data. If the factor is too high, the KL + divergence could increase during this phase. Usually it does not have to be + tuned. A critical parameter is the learning rate. If it is too low gradient + descent will get stuck in a bad local minimum. If it is too high the KL + divergence will increase during optimization. A heuristic suggested in + Belkina et al. (2019) is to set the learning rate to the sample size + divided by the early exaggeration factor. We implement this heuristic + as `learning_rate='auto'` argument. More tips can be found in + Laurens van der Maaten's FAQ (see references). The last parameter, angle, + is a tradeoff between performance and accuracy. 
Larger angles imply that we + can approximate larger regions by a single point, leading to better speed + but less accurate results. + + `"How to Use t-SNE Effectively" `_ + provides a good discussion of the effects of the various parameters, as well + as interactive plots to explore the effects of different parameters. + +.. dropdown:: Barnes-Hut t-SNE + + The Barnes-Hut t-SNE that has been implemented here is usually much slower than + other manifold learning algorithms. The optimization is quite difficult + and the computation of the gradient is :math:`O[d N log(N)]`, where :math:`d` + is the number of output dimensions and :math:`N` is the number of samples. The + Barnes-Hut method improves on the exact method where t-SNE complexity is + :math:`O[d N^2]`, but has several other notable differences: + + * The Barnes-Hut implementation only works when the target dimensionality is 3 + or less. The 2D case is typical when building visualizations. + * Barnes-Hut only works with dense input data. Sparse data matrices can only be + embedded with the exact method or can be approximated by a dense low rank + projection for instance using :class:`~sklearn.decomposition.PCA` + * Barnes-Hut is an approximation of the exact method. The approximation is + parameterized with the angle parameter, therefore the angle parameter is + unused when method="exact" + * Barnes-Hut is significantly more scalable. Barnes-Hut can be used to embed + hundreds of thousands of data points while the exact method can handle + thousands of samples before becoming computationally intractable + + For visualization purpose (which is the main use case of t-SNE), using the + Barnes-Hut method is strongly recommended. The exact t-SNE method is useful + for checking the theoretical properties of the embedding possibly in higher + dimensional space but limited to small datasets due to computational constraints. + + Also note that the digits labels roughly match the natural grouping found by + t-SNE while the linear 2D projection of the PCA model yields a representation + where label regions largely overlap. This is a strong clue that this data can + be well separated by non linear methods that focus on the local structure (e.g. + an SVM with a Gaussian RBF kernel). However, failing to visualize well + separated homogeneously labeled groups with t-SNE in 2D does not necessarily + imply that the data cannot be correctly classified by a supervised model. It + might be the case that 2 dimensions are not high enough to accurately represent + the internal structure of the data. + +.. rubric:: References + +* `"Visualizing High-Dimensional Data Using t-SNE" + `_ + van der Maaten, L.J.P.; Hinton, G. Journal of Machine Learning Research (2008) + +* `"t-Distributed Stochastic Neighbor Embedding" + `_ van der Maaten, L.J.P. + +* `"Accelerating t-SNE using Tree-Based Algorithms" + `_ + van der Maaten, L.J.P.; Journal of Machine Learning Research 15(Oct):3221-3245, 2014. + +* `"Automated optimized parameters for T-distributed stochastic neighbor + embedding improve visualization and analysis of large datasets" + `_ + Belkina, A.C., Ciccolella, C.O., Anno, R., Halpert, R., Spidlen, J., + Snyder-Cappione, J.E., Nature Communications 10, 5415 (2019). Tips on practical use ===================== @@ -649,5 +689,5 @@ Tips on practical use .. 
seealso:: :ref:`random_trees_embedding` can also be useful to derive non-linear - representations of feature space, also it does not perform + representations of feature space, but it does not perform dimensionality reduction. diff --git a/doc/modules/metrics.rst b/doc/modules/metrics.rst index 0926980aaaf8a..f65d86a758b03 100644 --- a/doc/modules/metrics.rst +++ b/doc/modules/metrics.rst @@ -28,9 +28,9 @@ There are a number of ways to convert between a distance metric and a similarity measure, such as a kernel. Let ``D`` be the distance, and ``S`` be the kernel: - 1. ``S = np.exp(-D * gamma)``, where one heuristic for choosing - ``gamma`` is ``1 / num_features`` - 2. ``S = 1. / (D / np.max(D))`` +1. ``S = np.exp(-D * gamma)``, where one heuristic for choosing + ``gamma`` is ``1 / num_features`` +2. ``S = 1. / (D / np.max(D))`` .. currentmodule:: sklearn.metrics @@ -87,11 +87,11 @@ represented as tf-idf vectors. can produce normalized vectors, in which case :func:`cosine_similarity` is equivalent to :func:`linear_kernel`, only slower.) -.. topic:: References: +.. rubric:: References - * C.D. Manning, P. Raghavan and H. SchÃŧtze (2008). Introduction to - Information Retrieval. Cambridge University Press. - https://nlp.stanford.edu/IR-book/html/htmledition/the-vector-space-model-for-scoring-1.html +* C.D. Manning, P. Raghavan and H. SchÃŧtze (2008). Introduction to + Information Retrieval. Cambridge University Press. + https://nlp.stanford.edu/IR-book/html/htmledition/the-vector-space-model-for-scoring-1.html .. _linear_kernel: @@ -111,7 +111,7 @@ Polynomial kernel ----------------- The function :func:`polynomial_kernel` computes the degree-d polynomial kernel between two vectors. The polynomial kernel represents the similarity between two -vectors. Conceptually, the polynomial kernels considers not only the similarity +vectors. Conceptually, the polynomial kernel considers not only the similarity between vectors under the same dimension, but also across dimensions. When used in machine learning algorithms, this allows to account for feature interaction. @@ -123,8 +123,8 @@ The polynomial kernel is defined as: where: - * ``x``, ``y`` are the input vectors - * ``d`` is the kernel degree +* ``x``, ``y`` are the input vectors +* ``d`` is the kernel degree If :math:`c_0 = 0` the kernel is said to be homogeneous. @@ -143,9 +143,9 @@ activation function). It is defined as: where: - * ``x``, ``y`` are the input vectors - * :math:`\gamma` is known as slope - * :math:`c_0` is known as intercept +* ``x``, ``y`` are the input vectors +* :math:`\gamma` is known as slope +* :math:`c_0` is known as intercept .. _rbf_kernel: @@ -165,14 +165,14 @@ the kernel is known as the Gaussian kernel of variance :math:`\sigma^2`. Laplacian kernel ---------------- -The function :func:`laplacian_kernel` is a variant on the radial basis +The function :func:`laplacian_kernel` is a variant on the radial basis function kernel defined as: .. math:: k(x, y) = \exp( -\gamma \| x-y \|_1) -where ``x`` and ``y`` are the input vectors and :math:`\|x-y\|_1` is the +where ``x`` and ``y`` are the input vectors and :math:`\|x-y\|_1` is the Manhattan distance between the input vectors. It has proven useful in ML applied to noiseless data. @@ -222,11 +222,10 @@ which is a distance between discrete probability distributions. The chi squared kernel is most commonly used on histograms (bags) of visual words. -.. topic:: References: - - * Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C. 
- Local features and kernels for classification of texture and object - categories: A comprehensive study - International Journal of Computer Vision 2007 - https://research.microsoft.com/en-us/um/people/manik/projects/trade-off/papers/ZhangIJCV06.pdf +.. rubric:: References +* Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C. + Local features and kernels for classification of texture and object + categories: A comprehensive study + International Journal of Computer Vision 2007 + https://hal.archives-ouvertes.fr/hal-00171412/document diff --git a/doc/modules/mixture.rst b/doc/modules/mixture.rst index fb8e897270f0b..694bde784d61e 100644 --- a/doc/modules/mixture.rst +++ b/doc/modules/mixture.rst @@ -14,13 +14,13 @@ matrices supported), sample them, and estimate them from data. Facilities to help determine the appropriate number of components are also provided. - .. figure:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_pdf_001.png - :target: ../auto_examples/mixture/plot_gmm_pdf.html - :align: center - :scale: 50% +.. figure:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_pdf_001.png + :target: ../auto_examples/mixture/plot_gmm_pdf.html + :align: center + :scale: 50% - **Two-component Gaussian mixture model:** *data points, and equi-probability - surfaces of the model.* + **Two-component Gaussian mixture model:** *data points, and equi-probability + surfaces of the model.* A Gaussian mixture model is a probabilistic model that assumes all the data points are generated from a mixture of a finite number of @@ -42,8 +42,8 @@ algorithm for fitting mixture-of-Gaussian models. It can also draw confidence ellipsoids for multivariate models, and compute the Bayesian Information Criterion to assess the number of clusters in the data. A :meth:`GaussianMixture.fit` method is provided that learns a Gaussian -Mixture Model from train data. Given test data, it can assign to each -sample the Gaussian it mostly probably belong to using +Mixture Model from training data. Given test data, it can assign to each +sample the Gaussian it most probably belongs to using the :meth:`GaussianMixture.predict` method. .. @@ -60,80 +60,111 @@ full covariance. :align: center :scale: 75% -.. topic:: Examples: +.. rubric:: Examples - * See :ref:`sphx_glr_auto_examples_mixture_plot_gmm_covariances.py` for an example of - using the Gaussian mixture as clustering on the iris dataset. +* See :ref:`sphx_glr_auto_examples_mixture_plot_gmm_covariances.py` for an example of + using the Gaussian mixture as clustering on the iris dataset. - * See :ref:`sphx_glr_auto_examples_mixture_plot_gmm_pdf.py` for an example on plotting the - density estimation. +* See :ref:`sphx_glr_auto_examples_mixture_plot_gmm_pdf.py` for an example on plotting the + density estimation. -Pros and cons of class :class:`GaussianMixture` ------------------------------------------------ +.. dropdown:: Pros and cons of class GaussianMixture -Pros -.... + .. rubric:: Pros -:Speed: It is the fastest algorithm for learning mixture models + :Speed: It is the fastest algorithm for learning mixture models -:Agnostic: As this algorithm maximizes only the likelihood, it - will not bias the means towards zero, or bias the cluster sizes to - have specific structures that might or might not apply. + :Agnostic: As this algorithm maximizes only the likelihood, it + will not bias the means towards zero, or bias the cluster sizes to + have specific structures that might or might not apply. -Cons -.... + .. 
rubric:: Cons -:Singularities: When one has insufficiently many points per - mixture, estimating the covariance matrices becomes difficult, - and the algorithm is known to diverge and find solutions with - infinite likelihood unless one regularizes the covariances artificially. + :Singularities: When one has insufficiently many points per + mixture, estimating the covariance matrices becomes difficult, + and the algorithm is known to diverge and find solutions with + infinite likelihood unless one regularizes the covariances artificially. -:Number of components: This algorithm will always use all the - components it has access to, needing held-out data - or information theoretical criteria to decide how many components to use - in the absence of external cues. + :Number of components: This algorithm will always use all the + components it has access to, needing held-out data + or information theoretical criteria to decide how many components to use + in the absence of external cues. -Selecting the number of components in a classical Gaussian Mixture Model ------------------------------------------------------------------------- +.. dropdown:: Selecting the number of components in a classical Gaussian Mixture model -The BIC criterion can be used to select the number of components in a Gaussian -Mixture in an efficient way. In theory, it recovers the true number of -components only in the asymptotic regime (i.e. if much data is available and -assuming that the data was actually generated i.i.d. from a mixture of Gaussian -distribution). Note that using a :ref:`Variational Bayesian Gaussian mixture ` -avoids the specification of the number of components for a Gaussian mixture -model. + The BIC criterion can be used to select the number of components in a Gaussian + Mixture in an efficient way. In theory, it recovers the true number of + components only in the asymptotic regime (i.e. if much data is available and + assuming that the data was actually generated i.i.d. from a mixture of Gaussian + distributions). Note that using a :ref:`Variational Bayesian Gaussian mixture ` + avoids the specification of the number of components for a Gaussian mixture + model. -.. figure:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_selection_001.png - :target: ../auto_examples/mixture/plot_gmm_selection.html - :align: center - :scale: 50% + .. figure:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_selection_002.png + :target: ../auto_examples/mixture/plot_gmm_selection.html + :align: center + :scale: 50% -.. topic:: Examples: + .. rubric:: Examples - * See :ref:`sphx_glr_auto_examples_mixture_plot_gmm_selection.py` for an example - of model selection performed with classical Gaussian mixture. + * See :ref:`sphx_glr_auto_examples_mixture_plot_gmm_selection.py` for an example + of model selection performed with classical Gaussian mixture. .. _expectation_maximization: -Estimation algorithm Expectation-maximization ------------------------------------------------ - -The main difficulty in learning Gaussian mixture models from unlabeled -data is that it is one usually doesn't know which points came from -which latent component (if one has access to this information it gets -very easy to fit a separate Gaussian distribution to each set of -points). `Expectation-maximization -`_ -is a well-founded statistical -algorithm to get around this problem by an iterative process. 
First -one assumes random components (randomly centered on data points, -learned from k-means, or even just normally distributed around the -origin) and computes for each point a probability of being generated by -each component of the model. Then, one tweaks the -parameters to maximize the likelihood of the data given those -assignments. Repeating this process is guaranteed to always converge -to a local optimum. +.. dropdown:: Estimation algorithm expectation-maximization + + The main difficulty in learning Gaussian mixture models from unlabeled + data is that one usually doesn't know which points came from + which latent component (if one has access to this information it gets + very easy to fit a separate Gaussian distribution to each set of + points). `Expectation-maximization + `_ + is a well-founded statistical + algorithm to get around this problem by an iterative process. First + one assumes random components (randomly centered on data points, + learned from k-means, or even just normally distributed around the + origin) and computes for each point a probability of being generated by + each component of the model. Then, one tweaks the + parameters to maximize the likelihood of the data given those + assignments. Repeating this process is guaranteed to always converge + to a local optimum. + +.. dropdown:: Choice of the Initialization method + + There is a choice of four initialization methods (as well as inputting user defined + initial means) to generate the initial centers for the model components: + + k-means (default) + This applies a traditional k-means clustering algorithm. + This can be computationally expensive compared to other initialization methods. + + k-means++ + This uses the initialization method of k-means clustering: k-means++. + This will pick the first center at random from the data. Subsequent centers will be + chosen from a weighted distribution of the data favouring points further away from + existing centers. k-means++ is the default initialization for k-means so will be + quicker than running a full k-means but can still take a significant amount of + time for large data sets with many components. + + random_from_data + This will pick random data points from the input data as the initial + centers. This is a very fast method of initialization but can produce non-convergent + results if the chosen points are too close to each other. + + random + Centers are chosen as a small perturbation away from the mean of all data. + This method is simple but can lead to the model taking longer to converge. + + .. figure:: ../auto_examples/mixture/images/sphx_glr_plot_gmm_init_001.png + :target: ../auto_examples/mixture/plot_gmm_init.html + :align: center + :scale: 50% + + .. rubric:: Examples + + * See :ref:`sphx_glr_auto_examples_mixture_plot_gmm_init.py` for an example of + using different initializations in Gaussian Mixture. .. _bgmm: @@ -142,12 +173,11 @@ Variational Bayesian Gaussian Mixture The :class:`BayesianGaussianMixture` object implements a variant of the Gaussian mixture model with variational inference algorithms. The API is -similar as the one defined by :class:`GaussianMixture`. +similar to the one defined by :class:`GaussianMixture`. .. 
_variational_inference: -Estimation algorithm: variational inference ---------------------------------------------- +**Estimation algorithm: variational inference** Variational inference is an extension of expectation-maximization that maximizes a lower bound on model evidence (including @@ -162,13 +192,13 @@ expectation-maximization solutions but introduces some subtle biases to the model. Inference is often notably slower, but not usually as much so as to render usage unpractical. -Due to its Bayesian nature, the variational algorithm needs more hyper- -parameters than expectation-maximization, the most important of these being the +Due to its Bayesian nature, the variational algorithm needs more hyperparameters +than expectation-maximization, the most important of these being the concentration parameter ``weight_concentration_prior``. Specifying a low value -for the concentration prior will make the model put most of the weight on few -components set the remaining components weights very close to zero. High values -of the concentration prior will allow a larger number of components to be active -in the mixture. +for the concentration prior will make the model put most of the weight on a few +components and set the remaining components' weights very close to zero. High +values of the concentration prior will allow a larger number of components to +be active in the mixture. The parameters implementation of the :class:`BayesianGaussianMixture` class proposes two types of prior for the weights distribution: a finite mixture model @@ -178,7 +208,7 @@ uses a truncated distribution with a fixed maximum number of components (called the Stick-breaking representation). The number of components actually used almost always depends on the data. -The next figure compares the results obtained for the different type of the +The next figure compares the results obtained for the different types of the weight concentration prior (parameter ``weight_concentration_prior_type``) for different values of ``weight_concentration_prior``. Here, we can see the value of the ``weight_concentration_prior`` parameter @@ -229,64 +259,58 @@ from the two resulting mixtures. -.. topic:: Examples: - - * See :ref:`sphx_glr_auto_examples_mixture_plot_gmm.py` for an example on - plotting the confidence ellipsoids for both :class:`GaussianMixture` - and :class:`BayesianGaussianMixture`. - - * :ref:`sphx_glr_auto_examples_mixture_plot_gmm_sin.py` shows using - :class:`GaussianMixture` and :class:`BayesianGaussianMixture` to fit a - sine wave. - - * See :ref:`sphx_glr_auto_examples_mixture_plot_concentration_prior.py` - for an example plotting the confidence ellipsoids for the - :class:`BayesianGaussianMixture` with different - ``weight_concentration_prior_type`` for different values of the parameter - ``weight_concentration_prior``. +.. rubric:: Examples +* See :ref:`sphx_glr_auto_examples_mixture_plot_gmm.py` for an example on + plotting the confidence ellipsoids for both :class:`GaussianMixture` + and :class:`BayesianGaussianMixture`. -Pros and cons of variational inference with :class:`BayesianGaussianMixture` ----------------------------------------------------------------------------- +* :ref:`sphx_glr_auto_examples_mixture_plot_gmm_sin.py` shows using + :class:`GaussianMixture` and :class:`BayesianGaussianMixture` to fit a + sine wave. -Pros -..... 
+* See :ref:`sphx_glr_auto_examples_mixture_plot_concentration_prior.py` + for an example plotting the confidence ellipsoids for the + :class:`BayesianGaussianMixture` with different + ``weight_concentration_prior_type`` for different values of the parameter + ``weight_concentration_prior``. -:Automatic selection: when ``weight_concentration_prior`` is small enough and - ``n_components`` is larger than what is found necessary by the model, the - Variational Bayesian mixture model has a natural tendency to set some mixture - weights values close to zero. This makes it possible to let the model choose - a suitable number of effective components automatically. Only an upper bound - of this number needs to be provided. Note however that the "ideal" number of - active components is very application specific and is typically ill-defined - in a data exploration setting. +.. dropdown:: Pros and cons of variational inference with BayesianGaussianMixture -:Less sensitivity to the number of parameters: unlike finite models, which will - almost always use all components as much as they can, and hence will produce - wildly different solutions for different numbers of components, the - variational inference with a Dirichlet process prior - (``weight_concentration_prior_type='dirichlet_process'``) won't change much - with changes to the parameters, leading to more stability and less tuning. + .. rubric:: Pros -:Regularization: due to the incorporation of prior information, - variational solutions have less pathological special cases than - expectation-maximization solutions. + :Automatic selection: When ``weight_concentration_prior`` is small enough and + ``n_components`` is larger than what is found necessary by the model, the + Variational Bayesian mixture model has a natural tendency to set some mixture + weights values close to zero. This makes it possible to let the model choose + a suitable number of effective components automatically. Only an upper bound + of this number needs to be provided. Note however that the "ideal" number of + active components is very application specific and is typically ill-defined + in a data exploration setting. + :Less sensitivity to the number of parameters: Unlike finite models, which will + almost always use all components as much as they can, and hence will produce + wildly different solutions for different numbers of components, the + variational inference with a Dirichlet process prior + (``weight_concentration_prior_type='dirichlet_process'``) won't change much + with changes to the parameters, leading to more stability and less tuning. -Cons -..... + :Regularization: Due to the incorporation of prior information, + variational solutions have less pathological special cases than + expectation-maximization solutions. -:Speed: the extra parametrization necessary for variational inference make - inference slower, although not by much. + .. rubric:: Cons -:Hyperparameters: this algorithm needs an extra hyperparameter - that might need experimental tuning via cross-validation. + :Speed: The extra parametrization necessary for variational inference makes + inference slower, although not by much. -:Bias: there are many implicit biases in the inference algorithms (and also in - the Dirichlet process if used), and whenever there is a mismatch between - these biases and the data it might be possible to fit better models using a - finite mixture. + :Hyperparameters: This algorithm needs an extra hyperparameter + that might need experimental tuning via cross-validation. 
+ :Bias: There are many implicit biases in the inference algorithms (and also in + the Dirichlet process if used), and whenever there is a mismatch between + these biases and the data it might be possible to fit better models using a + finite mixture. .. _dirichlet_process: @@ -312,7 +336,7 @@ group of the mixture. At the end, to represent the infinite mixture, we associate the last remaining piece of the stick to the proportion of points that don't fall into all the other groups. The length of each piece is a random variable with probability proportional to the concentration parameter. Smaller -value of the concentration will divide the unit-length into larger pieces of +values of the concentration will divide the unit-length into larger pieces of the stick (defining more concentrated distribution). Larger concentration values will create smaller pieces of the stick (increasing the number of components with non zero weights). diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index c807af982e277..c304966fccdb2 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -6,18 +6,158 @@ Metrics and scoring: quantifying the quality of predictions =========================================================== +.. _which_scoring_function: + +Which scoring function should I use? +==================================== + +Before we take a closer look into the details of the many scores and +:term:`evaluation metrics`, we want to give some guidance, inspired by statistical +decision theory, on the choice of **scoring functions** for **supervised learning**, +see [Gneiting2009]_: + +- *Which scoring function should I use?* +- *Which scoring function is a good one for my task?* + +In a nutshell, if the scoring function is given, e.g. in a kaggle competition +or in a business context, use that one. +If you are free to choose, it starts by considering the ultimate goal and application +of the prediction. It is useful to distinguish two steps: + +* Predicting +* Decision making + +**Predicting:** +Usually, the response variable :math:`Y` is a random variable, in the sense that there +is *no deterministic* function :math:`Y = g(X)` of the features :math:`X`. +Instead, there is a probability distribution :math:`F` of :math:`Y`. +One can aim to predict the whole distribution, known as *probabilistic prediction*, +or---more the focus of scikit-learn---issue a *point prediction* (or point forecast) +by choosing a property or functional of that distribution :math:`F`. +Typical examples are the mean (expected value), the median or a quantile of the +response variable :math:`Y` (conditionally on :math:`X`). + +Once that is settled, use a **strictly consistent** scoring function for that +(target) functional, see [Gneiting2009]_. +This means using a scoring function that is aligned with *measuring the distance +between predictions* `y_pred` *and the true target functional using observations of* +:math:`Y`, i.e. `y_true`. +For classification **strictly proper scoring rules**, see +`Wikipedia entry for Scoring rule `_ +and [Gneiting2007]_, coincide with strictly consistent scoring functions. +The table further below provides examples. +One could say that consistent scoring functions act as *truth serum* in that +they guarantee *"that truth telling [. . .] is an optimal strategy in +expectation"* [Gneiting2014]_. 
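To make this concrete, here is a minimal editorial sketch (not taken from the patch itself, and using a synthetic right-skewed target as an assumption): the squared error, being strictly consistent for the mean, ranks a constant mean prediction best, whereas the absolute error, strictly consistent for the median, ranks the median prediction best::

    import numpy as np
    from sklearn.metrics import mean_absolute_error, mean_squared_error

    rng = np.random.default_rng(0)
    y = rng.lognormal(sigma=1.0, size=10_000)      # skewed target: mean > median

    pred_mean = np.full_like(y, y.mean())          # constant prediction of the mean
    pred_median = np.full_like(y, np.median(y))    # constant prediction of the median

    # Squared error (strictly consistent for the mean) prefers the mean prediction ...
    assert mean_squared_error(y, pred_mean) < mean_squared_error(y, pred_median)
    # ... while absolute error (strictly consistent for the median) prefers the median.
    assert mean_absolute_error(y, pred_median) < mean_absolute_error(y, pred_mean)
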
+ +Once a strictly consistent scoring function is chosen, it is best used for both: as +loss function for model training and as metric/score in model evaluation and model +comparison. + +Note that for regressors, the prediction is done with :term:`predict` while for +classifiers it is usually :term:`predict_proba`. + +**Decision Making:** +The most common decisions are done on binary classification tasks, where the result of +:term:`predict_proba` is turned into a single outcome, e.g., from the predicted +probability of rain a decision is made on how to act (whether to take mitigating +measures like an umbrella or not). +For classifiers, this is what :term:`predict` returns. +See also :ref:`TunedThresholdClassifierCV`. +There are many scoring functions which measure different aspects of such a +decision, most of them are covered with or derived from the +:func:`metrics.confusion_matrix`. + +**List of strictly consistent scoring functions:** +Here, we list some of the most relevant statistical functionals and corresponding +strictly consistent scoring functions for tasks in practice. Note that the list is not +complete and that there are more of them. +For further criteria on how to select a specific one, see [Fissler2022]_. + +================== =================================================== ==================== ================================= +functional scoring or loss function response `y` prediction +================== =================================================== ==================== ================================= +**Classification** +mean :ref:`Brier score ` :sup:`1` multi-class ``predict_proba`` +mean :ref:`log loss ` multi-class ``predict_proba`` +mode :ref:`zero-one loss ` :sup:`2` multi-class ``predict``, categorical +**Regression** +mean :ref:`squared error ` :sup:`3` all reals ``predict``, all reals +mean :ref:`Poisson deviance ` non-negative ``predict``, strictly positive +mean :ref:`Gamma deviance ` strictly positive ``predict``, strictly positive +mean :ref:`Tweedie deviance ` depends on ``power`` ``predict``, depends on ``power`` +median :ref:`absolute error ` all reals ``predict``, all reals +quantile :ref:`pinball loss ` all reals ``predict``, all reals +mode no consistent one exists reals +================== =================================================== ==================== ================================= + +:sup:`1` The Brier score is just a different name for the squared error in case of +classification. + +:sup:`2` The zero-one loss is only consistent but not strictly consistent for the mode. +The zero-one loss is equivalent to one minus the accuracy score, meaning it gives +different score values but the same ranking. + +:sup:`3` R² gives the same ranking as squared error. + +**Fictitious Example:** +Let's make the above arguments more tangible. Consider a setting in network reliability +engineering, such as maintaining stable internet or Wi-Fi connections. +As provider of the network, you have access to the dataset of log entries of network +connections containing network load over time and many interesting features. +Your goal is to improve the reliability of the connections. +In fact, you promise your customers that on at least 99% of all days there are no +connection discontinuities larger than 1 minute. +Therefore, you are interested in a prediction of the 99% quantile (of longest +connection interruption duration per day) in order to know in advance when to add +more bandwidth and thereby satisfy your customers. 
So the *target functional* is the +99% quantile. From the table above, you choose the pinball loss as scoring function +(fair enough, not much choice given), for model training (e.g. +`HistGradientBoostingRegressor(loss="quantile", quantile=0.99)`) as well as model +evaluation (`mean_pinball_loss(..., alpha=0.99)` - we apologize for the different +argument names, `quantile` and `alpha`) be it in grid search for finding +hyperparameters or in comparing to other models like +`QuantileRegressor(quantile=0.99)`. + +.. rubric:: References + +.. [Gneiting2007] T. Gneiting and A. E. Raftery. :doi:`Strictly Proper + Scoring Rules, Prediction, and Estimation <10.1198/016214506000001437>` + In: Journal of the American Statistical Association 102 (2007), + pp. 359– 378. + `link to pdf `_ + +.. [Gneiting2009] T. Gneiting. :arxiv:`Making and Evaluating Point Forecasts + <0912.0902>` + Journal of the American Statistical Association 106 (2009): 746 - 762. + +.. [Gneiting2014] T. Gneiting and M. Katzfuss. :doi:`Probabilistic Forecasting + <10.1146/annurev-statistics-062713-085831>`. In: Annual Review of Statistics and Its Application 1.1 (2014), pp. 125–151. + +.. [Fissler2022] T. Fissler, C. Lorentzen and M. Mayer. :arxiv:`Model + Comparison and Calibration Assessment: User Guide for Consistent Scoring + Functions in Machine Learning and Actuarial Practice. <2202.12780>` + +.. _scoring_api_overview: + +Scoring API overview +==================== + There are 3 different APIs for evaluating the quality of a model's predictions: * **Estimator score method**: Estimators have a ``score`` method providing a default evaluation criterion for the problem they are designed to solve. - This is not discussed on this page, but in each estimator's documentation. + Most commonly this is :ref:`accuracy ` for classifiers and the + :ref:`coefficient of determination ` (:math:`R^2`) for regressors. + Details for each estimator can be found in its documentation. -* **Scoring parameter**: Model-evaluation tools using +* **Scoring parameter**: Model-evaluation tools that use :ref:`cross-validation ` (such as - :func:`model_selection.cross_val_score` and - :class:`model_selection.GridSearchCV`) rely on an internal *scoring* strategy. - This is discussed in the section :ref:`scoring_parameter`. + :class:`model_selection.GridSearchCV`, :func:`model_selection.validation_curve` and + :class:`linear_model.LogisticRegressionCV`) rely on an internal *scoring* strategy. + This can be specified using the `scoring` parameter of that tool and is discussed + in the section :ref:`scoring_parameter`. * **Metric functions**: The :mod:`sklearn.metrics` module implements functions assessing prediction error for specific purposes. These metrics are detailed @@ -38,24 +178,39 @@ value of those metrics for random predictions. The ``scoring`` parameter: defining model evaluation rules ========================================================== -Model selection and evaluation using tools, such as -:class:`model_selection.GridSearchCV` and -:func:`model_selection.cross_val_score`, take a ``scoring`` parameter that +Model selection and evaluation tools that internally use +:ref:`cross-validation ` (such as +:class:`model_selection.GridSearchCV`, :func:`model_selection.validation_curve` and +:class:`linear_model.LogisticRegressionCV`) take a ``scoring`` parameter that controls what metric they apply to the estimators evaluated. 
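For instance, continuing the fictitious 99% quantile example above, one possible way (a sketch on synthetic data, not part of the patch itself) to wire the pinball loss into such a tool via ``scoring`` is to wrap it with :func:`make_scorer`, since it needs the ``alpha`` parameter and has no predefined string name::

    from sklearn.datasets import make_regression
    from sklearn.ensemble import HistGradientBoostingRegressor
    from sklearn.metrics import make_scorer, mean_pinball_loss
    from sklearn.model_selection import cross_val_score

    X, y = make_regression(n_samples=500, n_features=5, noise=10.0, random_state=0)

    # Train with the same strictly consistent loss that is used for evaluation.
    model = HistGradientBoostingRegressor(loss="quantile", quantile=0.99, random_state=0)

    # The pinball loss is a loss, so it is negated to follow the convention that
    # higher scorer values are better.
    pinball_99 = make_scorer(mean_pinball_loss, alpha=0.99, greater_is_better=False)

    scores = cross_val_score(model, X, y, cv=5, scoring=pinball_99)
    print(scores)  # one negated pinball loss per fold
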
-Common cases: predefined values -------------------------------- +They can be specified in several ways: + +* `None`: the estimator's default evaluation criterion (i.e., the metric used in the + estimator's `score` method) is used. +* :ref:`String name `: common metrics can be passed via a string + name. +* :ref:`Callable `: more complex metrics can be passed via a custom + metric callable (e.g., function). + +Some tools do also accept multiple metric evaluation. See :ref:`multimetric_scoring` +for details. + +.. _scoring_string_names: + +String name scorers +------------------- For the most common use cases, you can designate a scorer object with the -``scoring`` parameter; the table below shows all possible values. +``scoring`` parameter via a string name; the table below shows all possible values. All scorer objects follow the convention that **higher return values are better -than lower return values**. Thus metrics which measure the distance between +than lower return values**. Thus metrics which measure the distance between the model and the data, like :func:`metrics.mean_squared_error`, are -available as neg_mean_squared_error which return the negated value +available as 'neg_mean_squared_error' which return the negated value of the metric. ==================================== ============================================== ================================== -Scoring Function Comment +Scoring string name Function Comment ==================================== ============================================== ================================== **Classification** 'accuracy' :func:`metrics.accuracy_score` @@ -77,6 +232,7 @@ Scoring Function 'roc_auc_ovo' :func:`metrics.roc_auc_score` 'roc_auc_ovr_weighted' :func:`metrics.roc_auc_score` 'roc_auc_ovo_weighted' :func:`metrics.roc_auc_score` +'d2_log_loss_score' :func:`metrics.d2_log_loss_score` **Clustering** 'adjusted_mutual_info_score' :func:`metrics.adjusted_mutual_info_score` @@ -91,19 +247,20 @@ Scoring Function **Regression** 'explained_variance' :func:`metrics.explained_variance_score` -'max_error' :func:`metrics.max_error` +'neg_max_error' :func:`metrics.max_error` 'neg_mean_absolute_error' :func:`metrics.mean_absolute_error` 'neg_mean_squared_error' :func:`metrics.mean_squared_error` -'neg_root_mean_squared_error' :func:`metrics.mean_squared_error` +'neg_root_mean_squared_error' :func:`metrics.root_mean_squared_error` 'neg_mean_squared_log_error' :func:`metrics.mean_squared_log_error` +'neg_root_mean_squared_log_error' :func:`metrics.root_mean_squared_log_error` 'neg_median_absolute_error' :func:`metrics.median_absolute_error` 'r2' :func:`metrics.r2_score` 'neg_mean_poisson_deviance' :func:`metrics.mean_poisson_deviance` 'neg_mean_gamma_deviance' :func:`metrics.mean_gamma_deviance` 'neg_mean_absolute_percentage_error' :func:`metrics.mean_absolute_percentage_error` +'d2_absolute_error_score' :func:`metrics.d2_absolute_error_score` ==================================== ============================================== ================================== - Usage examples: >>> from sklearn import svm, datasets @@ -111,47 +268,51 @@ Usage examples: >>> X, y = datasets.load_iris(return_X_y=True) >>> clf = svm.SVC(random_state=0) >>> cross_val_score(clf, X, y, cv=5, scoring='recall_macro') - array([0.96..., 0.96..., 0.96..., 0.93..., 1. ]) - >>> model = svm.SVC() - >>> cross_val_score(model, X, y, cv=5, scoring='wrong_choice') - Traceback (most recent call last): - ValueError: 'wrong_choice' is not a valid scoring value. 
Use sorted(sklearn.metrics.SCORERS.keys()) to get valid options. + array([0.96, 0.96, 0.96, 0.93, 1. ]) .. note:: - The values listed by the ``ValueError`` exception correspond to the functions measuring - prediction accuracy described in the following sections. - The scorer objects for those functions are stored in the dictionary - ``sklearn.metrics.SCORERS``. + If a wrong scoring name is passed, an ``InvalidParameterError`` is raised. + You can retrieve the names of all available scorers by calling + :func:`~sklearn.metrics.get_scorer_names`. .. currentmodule:: sklearn.metrics -.. _scoring: +.. _scoring_callable: -Defining your scoring strategy from metric functions ------------------------------------------------------ +Callable scorers +---------------- -The module :mod:`sklearn.metrics` also exposes a set of simple functions -measuring a prediction error given ground truth and prediction: +For more complex use cases and more flexibility, you can pass a callable to +the `scoring` parameter. This can be done by: -- functions ending with ``_score`` return a value to - maximize, the higher the better. +* :ref:`scoring_adapt_metric` +* :ref:`scoring_custom` (most flexible) -- functions ending with ``_error`` or ``_loss`` return a - value to minimize, the lower the better. When converting - into a scorer object using :func:`make_scorer`, set - the ``greater_is_better`` parameter to ``False`` (``True`` by default; see the - parameter description below). +.. _scoring_adapt_metric: -Metrics available for various machine learning tasks are detailed in sections -below. +Adapting predefined metrics via `make_scorer` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Many metrics are not given names to be used as ``scoring`` values, +The following metric functions are not implemented as named scorers, sometimes because they require additional parameters, such as -:func:`fbeta_score`. In such cases, you need to generate an appropriate -scoring object. The simplest way to generate a callable object for scoring -is by using :func:`make_scorer`. That function converts metrics -into callables that can be used for model evaluation. +:func:`fbeta_score`. They cannot be passed to the ``scoring`` +parameters; instead their callable needs to be passed to +:func:`make_scorer` together with the value of the user-settable +parameters. + +===================================== ========= ============================================== +Function Parameter Example usage +===================================== ========= ============================================== +**Classification** +:func:`metrics.fbeta_score` ``beta`` ``make_scorer(fbeta_score, beta=2)`` + +**Regression** +:func:`metrics.mean_tweedie_deviance` ``power`` ``make_scorer(mean_tweedie_deviance, power=1.5)`` +:func:`metrics.mean_pinball_loss` ``alpha`` ``make_scorer(mean_pinball_loss, alpha=0.95)`` +:func:`metrics.d2_tweedie_score` ``power`` ``make_scorer(d2_tweedie_score, power=1.5)`` +:func:`metrics.d2_pinball_score` ``alpha`` ``make_scorer(d2_pinball_score, alpha=0.95)`` +===================================== ========= ============================================== One typical use case is to wrap an existing metric function from the library with non-default values for its parameters, such as the ``beta`` parameter for @@ -164,67 +325,101 @@ the :func:`fbeta_score` function:: >>> grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, ... 
scoring=ftwo_scorer, cv=5) -The second use case is to build a completely custom scorer object -from a simple python function using :func:`make_scorer`, which can -take several parameters: - -* the python function you want to use (``my_custom_loss_func`` - in the example below) - -* whether the python function returns a score (``greater_is_better=True``, - the default) or a loss (``greater_is_better=False``). If a loss, the output - of the python function is negated by the scorer object, conforming to - the cross validation convention that scorers return higher values for better models. - -* for classification metrics only: whether the python function you provided requires continuous decision - certainties (``needs_threshold=True``). The default value is - False. +The module :mod:`sklearn.metrics` also exposes a set of simple functions +measuring a prediction error given ground truth and prediction: -* any additional parameters, such as ``beta`` or ``labels`` in :func:`f1_score`. +- functions ending with ``_score`` return a value to + maximize, the higher the better. -Here is an example of building custom scorers, and of using the -``greater_is_better`` parameter:: +- functions ending with ``_error``, ``_loss``, or ``_deviance`` return a + value to minimize, the lower the better. When converting + into a scorer object using :func:`make_scorer`, set + the ``greater_is_better`` parameter to ``False`` (``True`` by default; see the + parameter description below). - >>> import numpy as np - >>> def my_custom_loss_func(y_true, y_pred): - ... diff = np.abs(y_true - y_pred).max() - ... return np.log1p(diff) - ... - >>> # score will negate the return value of my_custom_loss_func, - >>> # which will be np.log(2), 0.693, given the values for X - >>> # and y defined below. - >>> score = make_scorer(my_custom_loss_func, greater_is_better=False) - >>> X = [[1], [1]] - >>> y = [0, 1] - >>> from sklearn.dummy import DummyClassifier - >>> clf = DummyClassifier(strategy='most_frequent', random_state=0) - >>> clf = clf.fit(X, y) - >>> my_custom_loss_func(y, clf.predict(X)) - 0.69... - >>> score(clf, X, y) - -0.69... - - -.. _diy_scoring: - -Implementing your own scoring object ------------------------------------- -You can generate even more flexible model scorers by constructing your own -scoring object from scratch, without using the :func:`make_scorer` factory. -For a callable to be a scorer, it needs to meet the protocol specified by -the following two rules: - -- It can be called with parameters ``(estimator, X, y)``, where ``estimator`` - is the model that should be evaluated, ``X`` is validation data, and ``y`` is - the ground truth target for ``X`` (in the supervised case) or ``None`` (in the - unsupervised case). - -- It returns a floating point number that quantifies the - ``estimator`` prediction quality on ``X``, with reference to ``y``. - Again, by convention higher numbers are better, so if your scorer - returns loss, that value should be negated. - -.. note:: **Using custom scorers in functions where n_jobs > 1** +.. _scoring_custom: + +Creating a custom scorer object +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can create your own custom scorer object using +:func:`make_scorer` or for the most flexibility, from scratch. See below for details. + +.. 
dropdown:: Custom scorer objects using `make_scorer` + + You can build a completely custom scorer object + from a simple python function using :func:`make_scorer`, which can + take several parameters: + + * the python function you want to use (``my_custom_loss_func`` + in the example below) + + * whether the python function returns a score (``greater_is_better=True``, + the default) or a loss (``greater_is_better=False``). If a loss, the output + of the python function is negated by the scorer object, conforming to + the cross validation convention that scorers return higher values for better models. + + * for classification metrics only: whether the python function you provided requires + continuous decision certainties. If the scoring function only accepts probability + estimates (e.g. :func:`metrics.log_loss`), then one needs to set the parameter + `response_method="predict_proba"`. Some scoring + functions do not necessarily require probability estimates but rather non-thresholded + decision values (e.g. :func:`metrics.roc_auc_score`). In this case, one can provide a + list (e.g., `response_method=["decision_function", "predict_proba"]`), + and scorer will use the first available method, in the order given in the list, + to compute the scores. + + * any additional parameters of the scoring function, such as ``beta`` or ``labels``. + + Here is an example of building custom scorers, and of using the + ``greater_is_better`` parameter:: + + >>> import numpy as np + >>> def my_custom_loss_func(y_true, y_pred): + ... diff = np.abs(y_true - y_pred).max() + ... return float(np.log1p(diff)) + ... + >>> # score will negate the return value of my_custom_loss_func, + >>> # which will be np.log(2), 0.693, given the values for X + >>> # and y defined below. + >>> score = make_scorer(my_custom_loss_func, greater_is_better=False) + >>> X = [[1], [1]] + >>> y = [0, 1] + >>> from sklearn.dummy import DummyClassifier + >>> clf = DummyClassifier(strategy='most_frequent', random_state=0) + >>> clf = clf.fit(X, y) + >>> my_custom_loss_func(y, clf.predict(X)) + 0.69 + >>> score(clf, X, y) + -0.69 + +.. dropdown:: Custom scorer objects from scratch + + You can generate even more flexible model scorers by constructing your own + scoring object from scratch, without using the :func:`make_scorer` factory. + + For a callable to be a scorer, it needs to meet the protocol specified by + the following two rules: + + - It can be called with parameters ``(estimator, X, y)``, where ``estimator`` + is the model that should be evaluated, ``X`` is validation data, and ``y`` is + the ground truth target for ``X`` (in the supervised case) or ``None`` (in the + unsupervised case). + + - It returns a floating point number that quantifies the + ``estimator`` prediction quality on ``X``, with reference to ``y``. + Again, by convention higher numbers are better, so if your scorer + returns loss, that value should be negated. + + - Advanced: If it requires extra metadata to be passed to it, it should expose + a ``get_metadata_routing`` method returning the requested metadata. The user + should be able to set the requested metadata via a ``set_score_request`` + method. Please see :ref:`User Guide ` and :ref:`Developer + Guide ` for + more details. + + +.. 
dropdown:: Using custom scorers in functions where n_jobs > 1 While defining the custom scoring function alongside the calling function should work out of the box with the default joblib backend (loky), @@ -255,13 +450,15 @@ There are three ways to specify multiple scoring metrics for the ``scoring`` parameter: - As an iterable of string metrics:: - >>> scoring = ['accuracy', 'precision'] + + >>> scoring = ['accuracy', 'precision'] - As a ``dict`` mapping the scorer name to the scoring function:: - >>> from sklearn.metrics import accuracy_score - >>> from sklearn.metrics import make_scorer - >>> scoring = {'accuracy': make_scorer(accuracy_score), - ... 'prec': 'precision'} + + >>> from sklearn.metrics import accuracy_score + >>> from sklearn.metrics import make_scorer + >>> scoring = {'accuracy': make_scorer(accuracy_score), + ... 'prec': 'precision'} Note that the dict values can either be scorer functions or one of the predefined metric strings. @@ -307,6 +504,7 @@ Some of these are restricted to the binary classification case: precision_recall_curve roc_curve + class_likelihood_ratios det_curve @@ -340,6 +538,7 @@ Some also work in the multilabel case: recall_score roc_auc_score zero_one_loss + d2_log_loss_score And some work with binary and multilabel (but not multiclass) problems: @@ -428,18 +627,18 @@ where :math:`1(x)` is the `indicator function >>> accuracy_score(y_true, y_pred) 0.5 >>> accuracy_score(y_true, y_pred, normalize=False) - 2 + 2.0 In the multilabel case with binary label indicators:: >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2))) 0.5 -.. topic:: Example: +.. rubric:: Examples - * See :ref:`sphx_glr_auto_examples_feature_selection_plot_permutation_test_for_classification.py` - for an example of accuracy score usage using permutations of - the dataset. +* See :ref:`sphx_glr_auto_examples_model_selection_plot_permutation_tests_for_classification.py` + for an example of accuracy score usage using permutations of + the dataset. .. _top_k_accuracy_score: @@ -477,7 +676,7 @@ where :math:`k` is the number of guesses allowed and :math:`1(x)` is the 0.75 >>> # Not normalizing gives the number of "correctly" classified samples >>> top_k_accuracy_score(y_true, y_score, k=2, normalize=False) - 3 + 3.0 .. _balanced_accuracy_score: @@ -510,7 +709,7 @@ In contrast, if the conventional accuracy is above chance only because the classifier takes advantage of an imbalanced test set, then the balanced accuracy, as appropriate, will drop to :math:`\frac{1}{n\_classes}`. -The score ranges from 0 to 1, or when ``adjusted=True`` is used, it rescaled to +The score ranges from 0 to 1, or when ``adjusted=True`` is used, it is rescaled to the range :math:`\frac{1}{1 - n\_classes}` to 1, inclusive, with performance at random scoring 0. @@ -550,21 +749,20 @@ or *informedness*. * Balanced Accuracy as described in [Urbanowicz2015]_: the average of sensitivity and specificity is computed for each class and then averaged over total number of classes. -.. topic:: References: - - .. [Guyon2015] I. Guyon, K. Bennett, G. Cawley, H.J. Escalante, S. Escalera, T.K. Ho, N. Macià, - B. Ray, M. Saeed, A.R. Statnikov, E. Viegas, `Design of the 2015 ChaLearn AutoML Challenge - `_, - IJCNN 2015. - .. [Mosley2013] L. Mosley, `A balanced approach to the multi-class imbalance problem - `_, - IJCV 2010. - .. [Kelleher2015] John. D. Kelleher, Brian Mac Namee, Aoife D'Arcy, `Fundamentals of - Machine Learning for Predictive Data Analytics: Algorithms, Worked Examples, - and Case Studies `_, - 2015. - .. 
[Urbanowicz2015] Urbanowicz R.J., Moore, J.H. `ExSTraCS 2.0: description and evaluation of a scalable learning - classifier system `_, Evol. Intel. (2015) 8: 89. +.. rubric:: References + +.. [Guyon2015] I. Guyon, K. Bennett, G. Cawley, H.J. Escalante, S. Escalera, T.K. Ho, N. Macià, + B. Ray, M. Saeed, A.R. Statnikov, E. Viegas, `Design of the 2015 ChaLearn AutoML Challenge + `_, IJCNN 2015. +.. [Mosley2013] L. Mosley, `A balanced approach to the multi-class imbalance problem + `_, IJCV 2010. +.. [Kelleher2015] John. D. Kelleher, Brian Mac Namee, Aoife D'Arcy, `Fundamentals of + Machine Learning for Predictive Data Analytics: Algorithms, Worked Examples, + and Case Studies `_, + 2015. +.. [Urbanowicz2015] Urbanowicz R.J., Moore, J.H. :doi:`ExSTraCS 2.0: description + and evaluation of a scalable learning classifier + system <10.1007/s12065-015-0128-8>`, Evol. Intel. (2015) 8: 89. .. _cohen_kappa: @@ -576,7 +774,7 @@ The function :func:`cohen_kappa_score` computes `Cohen's kappa This measure is intended to compare labelings by different human annotators, not a classifier versus a ground truth. -The kappa score (see docstring) is a number between -1 and 1. +The kappa score is a number between -1 and 1. Scores above .8 are generally considered good agreement; zero or lower means no agreement (practically random labels). @@ -585,9 +783,9 @@ but not for multilabel problems (except by manually computing a per-label score) and not for more than two annotators. >>> from sklearn.metrics import cohen_kappa_score - >>> y_true = [2, 0, 2, 2, 0, 1] - >>> y_pred = [0, 0, 2, 2, 0, 2] - >>> cohen_kappa_score(y_true, y_pred) + >>> labeling1 = [2, 0, 2, 2, 0, 1] + >>> labeling2 = [0, 0, 2, 2, 0, 2] + >>> cohen_kappa_score(labeling1, labeling2) 0.4285714285714286 .. _confusion_matrix: @@ -639,23 +837,23 @@ false negatives and true positives as follows:: >>> y_true = [0, 0, 0, 1, 1, 1, 1, 1] >>> y_pred = [0, 1, 0, 1, 0, 1, 0, 1] - >>> tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() + >>> tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel().tolist() >>> tn, fp, fn, tp (2, 1, 2, 3) -.. topic:: Example: +.. rubric:: Examples - * See :ref:`sphx_glr_auto_examples_model_selection_plot_confusion_matrix.py` - for an example of using a confusion matrix to evaluate classifier output - quality. +* See :ref:`sphx_glr_auto_examples_model_selection_plot_confusion_matrix.py` + for an example of using a confusion matrix to evaluate classifier output + quality. - * See :ref:`sphx_glr_auto_examples_classification_plot_digits_classification.py` - for an example of using a confusion matrix to classify - hand-written digits. +* See :ref:`sphx_glr_auto_examples_classification_plot_digits_classification.py` + for an example of using a confusion matrix to classify + hand-written digits. - * See :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py` - for an example of using a confusion matrix to classify text - documents. +* See :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py` + for an example of using a confusion matrix to classify text + documents. .. _classification_report: @@ -682,19 +880,15 @@ and inferred labels:: weighted avg 0.67 0.60 0.59 5 -.. topic:: Example: - - * See :ref:`sphx_glr_auto_examples_classification_plot_digits_classification.py` - for an example of classification report usage for - hand-written digits. +.. 
rubric:: Examples - * See :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py` - for an example of classification report usage for text - documents. +* See :ref:`sphx_glr_auto_examples_classification_plot_digits_classification.py` + for an example of classification report usage for + hand-written digits. - * See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_digits.py` - for an example of classification report usage for - grid search with nested cross-validation. +* See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_digits.py` + for an example of classification report usage for + grid search with nested cross-validation. .. _hamming_loss: @@ -705,17 +899,21 @@ The :func:`hamming_loss` computes the average Hamming loss or `Hamming distance `_ between two sets of samples. -If :math:`\hat{y}_j` is the predicted value for the :math:`j`-th label of -a given sample, :math:`y_j` is the corresponding true value, and -:math:`n_\text{labels}` is the number of classes or labels, then the -Hamming loss :math:`L_{Hamming}` between two samples is defined as: +If :math:`\hat{y}_{i,j}` is the predicted value for the :math:`j`-th label of a +given sample :math:`i`, :math:`y_{i,j}` is the corresponding true value, +:math:`n_\text{samples}` is the number of samples and :math:`n_\text{labels}` +is the number of labels, then the Hamming loss :math:`L_{Hamming}` is defined +as: .. math:: - L_{Hamming}(y, \hat{y}) = \frac{1}{n_\text{labels}} \sum_{j=0}^{n_\text{labels} - 1} 1(\hat{y}_j \not= y_j) + L_{Hamming}(y, \hat{y}) = \frac{1}{n_\text{samples} * n_\text{labels}} \sum_{i=0}^{n_\text{samples}-1} \sum_{j=0}^{n_\text{labels} - 1} 1(\hat{y}_{i,j} \not= y_{i,j}) where :math:`1(x)` is the `indicator function -`_. :: +`_. + +The equation above does not hold true in the case of multiclass classification. +Please refer to the note below for more information. :: >>> from sklearn.metrics import hamming_loss >>> y_pred = [1, 2, 3, 4] @@ -795,47 +993,44 @@ score: recall_score Note that the :func:`precision_recall_curve` function is restricted to the -binary case. The :func:`average_precision_score` function works only in -binary classification and multilabel indicator format. The -:func:`plot_precision_recall_curve` function plots the precision recall as -follows. +binary case. The :func:`average_precision_score` function supports multiclass +and multilabel formats by computing each class score in a One-vs-the-rest (OvR) +fashion and averaging them or not depending of its ``average`` argument value. + +The :func:`PrecisionRecallDisplay.from_estimator` and +:func:`PrecisionRecallDisplay.from_predictions` functions will plot the +precision-recall curve as follows. .. image:: ../auto_examples/model_selection/images/sphx_glr_plot_precision_recall_001.png :target: ../auto_examples/model_selection/plot_precision_recall.html#plot-the-precision-recall-curve :scale: 75 :align: center -.. topic:: Examples: +.. rubric:: Examples - * See :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py` - for an example of :func:`f1_score` usage to classify text - documents. +* See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_digits.py` + for an example of :func:`precision_score` and :func:`recall_score` usage + to estimate parameters using grid search with nested cross-validation. 
- * See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_digits.py` - for an example of :func:`precision_score` and :func:`recall_score` usage - to estimate parameters using grid search with nested cross-validation. +* See :ref:`sphx_glr_auto_examples_model_selection_plot_precision_recall.py` + for an example of :func:`precision_recall_curve` usage to evaluate + classifier output quality. - * See :ref:`sphx_glr_auto_examples_model_selection_plot_precision_recall.py` - for an example of :func:`precision_recall_curve` usage to evaluate - classifier output quality. - - -.. topic:: References: - - .. [Manning2008] C.D. Manning, P. Raghavan, H. SchÃŧtze, `Introduction to Information Retrieval - `_, - 2008. - .. [Everingham2010] M. Everingham, L. Van Gool, C.K.I. Williams, J. Winn, A. Zisserman, - `The Pascal Visual Object Classes (VOC) Challenge - `_, - IJCV 2010. - .. [Davis2006] J. Davis, M. Goadrich, `The Relationship Between Precision-Recall and ROC Curves - `_, - ICML 2006. - .. [Flach2015] P.A. Flach, M. Kull, `Precision-Recall-Gain Curves: PR Analysis Done Right - `_, - NIPS 2015. +.. rubric:: References +.. [Manning2008] C.D. Manning, P. Raghavan, H. SchÃŧtze, `Introduction to Information Retrieval + `_, + 2008. +.. [Everingham2010] M. Everingham, L. Van Gool, C.K.I. Williams, J. Winn, A. Zisserman, + `The Pascal Visual Object Classes (VOC) Challenge + `_, + IJCV 2010. +.. [Davis2006] J. Davis, M. Goadrich, `The Relationship Between Precision-Recall and ROC Curves + `_, + ICML 2006. +.. [Flach2015] P.A. Flach, M. Kull, `Precision-Recall-Gain Curves: PR Analysis Done Right + `_, + NIPS 2015. Binary classification ^^^^^^^^^^^^^^^^^^^^^ @@ -856,20 +1051,36 @@ following table: | | Missing result | Correct absence of result| +-------------------+---------------------+--------------------------+ -In this context, we can define the notions of precision, recall and F-measure: +In this context, we can define the notions of precision and recall: .. math:: - \text{precision} = \frac{tp}{tp + fp}, + \text{precision} = \frac{\text{tp}}{\text{tp} + \text{fp}}, .. math:: - \text{recall} = \frac{tp}{tp + fn}, + \text{recall} = \frac{\text{tp}}{\text{tp} + \text{fn}}, + +(Sometimes recall is also called ''sensitivity'') + +F-measure is the weighted harmonic mean of precision and recall, with precision's +contribution to the mean weighted by some parameter :math:`\beta`: .. math:: - F_\beta = (1 + \beta^2) \frac{\text{precision} \times \text{recall}}{\beta^2 \text{precision} + \text{recall}}. + F_\beta = (1 + \beta^2) \frac{\text{precision} \times \text{recall}}{\beta^2 \text{precision} + \text{recall}} +To avoid division by zero when precision and recall are zero, Scikit-Learn calculates F-measure with this +otherwise-equivalent formula: + +.. math:: + + F_\beta = \frac{(1 + \beta^2) \text{tp}}{(1 + \beta^2) \text{tp} + \text{fp} + \beta^2 \text{fn}} + +Note that this formula is still undefined when there are no true positives, false +positives, or false negatives. By default, F-1 for a set of exclusively true negatives +is calculated as 0, however this behavior can be changed using the `zero_division` +parameter. Here are some small examples in binary classification:: >>> from sklearn import metrics @@ -880,15 +1091,15 @@ Here are some small examples in binary classification:: >>> metrics.recall_score(y_true, y_pred) 0.5 >>> metrics.f1_score(y_true, y_pred) - 0.66... + 0.66 >>> metrics.fbeta_score(y_true, y_pred, beta=0.5) - 0.83... 
+ 0.83 >>> metrics.fbeta_score(y_true, y_pred, beta=1) - 0.66... + 0.66 >>> metrics.fbeta_score(y_true, y_pred, beta=2) - 0.55... + 0.55 >>> metrics.precision_recall_fscore_support(y_true, y_pred, beta=0.5) - (array([0.66..., 1. ]), array([1. , 0.5]), array([0.71..., 0.83...]), array([2, 2])) + (array([0.66, 1. ]), array([1. , 0.5]), array([0.71, 0.83]), array([2, 2])) >>> import numpy as np @@ -898,34 +1109,41 @@ Here are some small examples in binary classification:: >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) >>> precision, recall, threshold = precision_recall_curve(y_true, y_scores) >>> precision - array([0.66..., 0.5 , 1. , 1. ]) + array([0.5 , 0.66, 0.5 , 1. , 1. ]) >>> recall - array([1. , 0.5, 0.5, 0. ]) + array([1. , 1. , 0.5, 0.5, 0. ]) >>> threshold - array([0.35, 0.4 , 0.8 ]) + array([0.1 , 0.35, 0.4 , 0.8 ]) >>> average_precision_score(y_true, y_scores) - 0.83... + 0.83 Multiclass and multilabel classification ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In multiclass and multilabel classification task, the notions of precision, +In a multiclass and multilabel classification task, the notions of precision, recall, and F-measures can be applied to each label independently. There are a few ways to combine results across labels, specified by the ``average`` argument to the -:func:`average_precision_score` (multilabel only), :func:`f1_score`, +:func:`average_precision_score`, :func:`f1_score`, :func:`fbeta_score`, :func:`precision_recall_fscore_support`, :func:`precision_score` and :func:`recall_score` functions, as described -:ref:`above `. Note that if all labels are included, "micro"-averaging -in a multiclass setting will produce precision, recall and :math:`F` -that are all identical to accuracy. Also note that "weighted" averaging may -produce an F-score that is not between precision and recall. +:ref:`above `. + +Note the following behaviors when averaging: + +* If all labels are included, "micro"-averaging in a multiclass setting will produce + precision, recall and :math:`F` that are all identical to accuracy. +* "weighted" averaging may produce a F-score that is not between precision and recall. +* "macro" averaging for F-measures is calculated as the arithmetic mean over + per-label/class F-measures, not the harmonic mean over the arithmetic precision and + recall means. Both calculations can be seen in the literature but are not equivalent, + see [OB2019]_ for details. 
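+
+For instance, the first of these behaviors can be checked directly. The short
+sketch below is only illustrative (a toy multiclass problem; any labels would
+do) and shows that micro-averaged precision, recall and F-measure all coincide
+with the accuracy::
+
+  >>> from sklearn import metrics
+  >>> y_true = [0, 1, 2, 3]
+  >>> y_pred = [0, 1, 3, 2]
+  >>> metrics.accuracy_score(y_true, y_pred)
+  0.5
+  >>> metrics.precision_score(y_true, y_pred, average='micro')
+  0.5
+  >>> metrics.recall_score(y_true, y_pred, average='micro')
+  0.5
+  >>> metrics.f1_score(y_true, y_pred, average='micro')
+  0.5
+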
To make this more explicit, consider the following notation: -* :math:`y` the set of *predicted* :math:`(sample, label)` pairs -* :math:`\hat{y}` the set of *true* :math:`(sample, label)` pairs +* :math:`y` the set of *true* :math:`(sample, label)` pairs +* :math:`\hat{y}` the set of *predicted* :math:`(sample, label)` pairs * :math:`L` the set of labels * :math:`S` the set of samples * :math:`y_s` the subset of :math:`y` with sample :math:`s`, @@ -933,10 +1151,10 @@ To make this more explicit, consider the following notation: * :math:`y_l` the subset of :math:`y` with label :math:`l` * similarly, :math:`\hat{y}_s` and :math:`\hat{y}_l` are subsets of :math:`\hat{y}` -* :math:`P(A, B) := \frac{\left| A \cap B \right|}{\left|A\right|}` for some +* :math:`P(A, B) := \frac{\left| A \cap B \right|}{\left|B\right|}` for some sets :math:`A` and :math:`B` -* :math:`R(A, B) := \frac{\left| A \cap B \right|}{\left|B\right|}` - (Conventions vary on handling :math:`B = \emptyset`; this implementation uses +* :math:`R(A, B) := \frac{\left| A \cap B \right|}{\left|A\right|}` + (Conventions vary on handling :math:`A = \emptyset`; this implementation uses :math:`R(A, B):=0`, and similar for :math:`P`.) * :math:`F_\beta(A, B) := \left(1 + \beta^2\right) \frac{P(A, B) \times R(A, B)}{\beta^2 P(A, B) + R(A, B)}` @@ -951,7 +1169,7 @@ Then the metrics are defined as: +---------------+------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+ |``"macro"`` | :math:`\frac{1}{\left|L\right|} \sum_{l \in L} P(y_l, \hat{y}_l)` | :math:`\frac{1}{\left|L\right|} \sum_{l \in L} R(y_l, \hat{y}_l)` | :math:`\frac{1}{\left|L\right|} \sum_{l \in L} F_\beta(y_l, \hat{y}_l)` | +---------------+------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+ -|``"weighted"`` | :math:`\frac{1}{\sum_{l \in L} \left|\hat{y}_l\right|} \sum_{l \in L} \left|\hat{y}_l\right| P(y_l, \hat{y}_l)` | :math:`\frac{1}{\sum_{l \in L} \left|\hat{y}_l\right|} \sum_{l \in L} \left|\hat{y}_l\right| R(y_l, \hat{y}_l)` | :math:`\frac{1}{\sum_{l \in L} \left|\hat{y}_l\right|} \sum_{l \in L} \left|\hat{y}_l\right| F_\beta(y_l, \hat{y}_l)`| +|``"weighted"`` | :math:`\frac{1}{\sum_{l \in L} \left|y_l\right|} \sum_{l \in L} \left|y_l\right| P(y_l, \hat{y}_l)` | :math:`\frac{1}{\sum_{l \in L} \left|y_l\right|} \sum_{l \in L} \left|y_l\right| R(y_l, \hat{y}_l)` | :math:`\frac{1}{\sum_{l \in L} \left|y_l\right|} \sum_{l \in L} \left|y_l\right| F_\beta(y_l, \hat{y}_l)` | +---------------+------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+ |``None`` | :math:`\langle P(y_l, \hat{y}_l) | l \in L \rangle` | :math:`\langle R(y_l, \hat{y}_l) | l \in L \rangle` | :math:`\langle F_\beta(y_l, \hat{y}_l) | 
l \in L \rangle` | +---------------+------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+ @@ -960,15 +1178,15 @@ Then the metrics are defined as: >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] >>> metrics.precision_score(y_true, y_pred, average='macro') - 0.22... + 0.22 >>> metrics.recall_score(y_true, y_pred, average='micro') - 0.33... + 0.33 >>> metrics.f1_score(y_true, y_pred, average='weighted') - 0.26... + 0.267 >>> metrics.fbeta_score(y_true, y_pred, average='macro', beta=0.5) - 0.23... + 0.238 >>> metrics.precision_recall_fscore_support(y_true, y_pred, beta=0.5, average=None) - (array([0.66..., 0. , 0. ]), array([1., 0., 0.]), array([0.71..., 0. , 0. ]), array([2, 2, 2]...)) + (array([0.667, 0., 0.]), array([1., 0., 0.]), array([0.714, 0., 0.]), array([2, 2, 2])) For multiclass classification with a "negative class", it is possible to exclude some labels: @@ -979,7 +1197,12 @@ For multiclass classification with a "negative class", it is possible to exclude Similarly, labels not present in the data sample may be accounted for in macro-averaging. >>> metrics.precision_score(y_true, y_pred, labels=[0, 1, 2, 3], average='macro') - 0.166... + 0.166 + +.. rubric:: References + +.. [OB2019] :arxiv:`Opitz, J., & Burst, S. (2019). "Macro f1 and macro f1." + <1911.03347>` .. _jaccard_similarity_score: @@ -990,17 +1213,16 @@ The :func:`jaccard_score` function computes the average of `Jaccard similarity coefficients `_, also called the Jaccard index, between pairs of label sets. -The Jaccard similarity coefficient of the :math:`i`-th samples, -with a ground truth label set :math:`y_i` and predicted label set -:math:`\hat{y}_i`, is defined as +The Jaccard similarity coefficient with a ground truth label set :math:`y` and +predicted label set :math:`\hat{y}`, is defined as .. math:: - J(y_i, \hat{y}_i) = \frac{|y_i \cap \hat{y}_i|}{|y_i \cup \hat{y}_i|}. + J(y, \hat{y}) = \frac{|y \cap \hat{y}|}{|y \cup \hat{y}|}. -:func:`jaccard_score` works like :func:`precision_recall_fscore_support` as a -naively set-wise measure applying natively to binary targets, and extended to -apply to multilabel and multiclass through the use of `average` (see +The :func:`jaccard_score` (like :func:`precision_recall_fscore_support`) applies +natively to binary targets. By computing it set-wise it can be extended to apply +to multilabel and multiclass through the use of `average` (see :ref:`above `). In the binary case:: @@ -1012,14 +1234,19 @@ In the binary case:: >>> y_pred = np.array([[1, 1, 1], ... [1, 0, 0]]) >>> jaccard_score(y_true[0], y_pred[0]) - 0.6666... + 0.6666 + +In the 2D comparison case (e.g. image similarity): + + >>> jaccard_score(y_true, y_pred, average="micro") + 0.6 In the multilabel case with binary label indicators:: >>> jaccard_score(y_true, y_pred, average='samples') - 0.5833... + 0.5833 >>> jaccard_score(y_true, y_pred, average='macro') - 0.6666... + 0.6666 >>> jaccard_score(y_true, y_pred, average=None) array([0.5, 0.5, 1. ]) @@ -1029,11 +1256,11 @@ multilabel problem:: >>> y_pred = [0, 2, 1, 2] >>> y_true = [0, 1, 2, 2] >>> jaccard_score(y_true, y_pred, average=None) - array([1. , 0. , 0.33...]) + array([1. , 0. , 0.33]) >>> jaccard_score(y_true, y_pred, average='macro') - 0.44... 
+ 0.44 >>> jaccard_score(y_true, y_pred, average='micro') - 0.33... + 0.33 .. _hinge_loss: @@ -1046,29 +1273,35 @@ the model and the data using that considers only prediction errors. (Hinge loss is used in maximal margin classifiers such as support vector machines.) -If the labels are encoded with +1 and -1, :math:`y`: is the true -value, and :math:`w` is the predicted decisions as output by -``decision_function``, then the hinge loss is defined as: +If the true label :math:`y_i` of a binary classification task is encoded as +:math:`y_i=\left\{-1, +1\right\}` for every sample :math:`i`; and :math:`w_i` +is the corresponding predicted decision (an array of shape (`n_samples`,) as +output by the `decision_function` method), then the hinge loss is defined as: .. math:: - L_\text{Hinge}(y, w) = \max\left\{1 - wy, 0\right\} = \left|1 - wy\right|_+ + L_\text{Hinge}(y, w) = \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples}-1} \max\left\{1 - w_i y_i, 0\right\} If there are more than two labels, :func:`hinge_loss` uses a multiclass variant due to Crammer & Singer. -`Here `_ is +`Here `_ is the paper describing it. -If :math:`y_w` is the predicted decision for true label and :math:`y_t` is the -maximum of the predicted decisions for all other labels, where predicted -decisions are output by decision function, then multiclass hinge loss is defined -by: +In this case the predicted decision is an array of shape (`n_samples`, +`n_labels`). If :math:`w_{i, y_i}` is the predicted decision for the true label +:math:`y_i` of the :math:`i`-th sample; and +:math:`\hat{w}_{i, y_i} = \max\left\{w_{i, y_j}~|~y_j \ne y_i \right\}` +is the maximum of the +predicted decisions for all the other labels, then the multi-class hinge loss +is defined by: .. math:: - L_\text{Hinge}(y_w, y_t) = \max\left\{1 + y_t - y_w, 0\right\} + L_\text{Hinge}(y, w) = \frac{1}{n_\text{samples}} + \sum_{i=0}^{n_\text{samples}-1} \max\left\{1 + \hat{w}_{i, y_i} + - w_{i, y_i}, 0\right\} -Here a small example demonstrating the use of the :func:`hinge_loss` function +Here is a small example demonstrating the use of the :func:`hinge_loss` function with a svm classifier in a binary class problem:: >>> from sklearn import svm @@ -1080,9 +1313,9 @@ with a svm classifier in a binary class problem:: LinearSVC(random_state=0) >>> pred_decision = est.decision_function([[-2], [3], [0.5]]) >>> pred_decision - array([-2.18..., 2.36..., 0.09...]) + array([-2.18, 2.36, 0.09]) >>> hinge_loss([-1, 1, 1], pred_decision) - 0.3... + 0.3 Here is an example demonstrating the use of the :func:`hinge_loss` function with a svm classifier in a multiclass problem:: @@ -1095,8 +1328,8 @@ with a svm classifier in a multiclass problem:: LinearSVC() >>> pred_decision = est.decision_function([[-1], [2], [3]]) >>> y_true = [0, 2, 3] - >>> hinge_loss(y_true, pred_decision, labels) - 0.56... + >>> hinge_loss(y_true, pred_decision, labels=labels) + 0.56 .. _log_loss: @@ -1111,30 +1344,30 @@ probability outputs (``predict_proba``) of a classifier instead of its discrete predictions. For binary classification with a true label :math:`y \in \{0,1\}` -and a probability estimate :math:`p = \operatorname{Pr}(y = 1)`, +and a probability estimate :math:`\hat{p} \approx \operatorname{Pr}(y = 1)`, the log loss per sample is the negative log-likelihood of the classifier given the true label: .. 
math:: - L_{\log}(y, p) = -\log \operatorname{Pr}(y|p) = -(y \log (p) + (1 - y) \log (1 - p)) + L_{\log}(y, \hat{p}) = -\log \operatorname{Pr}(y|\hat{p}) = -(y \log (\hat{p}) + (1 - y) \log (1 - \hat{p})) This extends to the multiclass case as follows. Let the true labels for a set of samples be encoded as a 1-of-K binary indicator matrix :math:`Y`, i.e., :math:`y_{i,k} = 1` if sample :math:`i` has label :math:`k` taken from a set of :math:`K` labels. -Let :math:`P` be a matrix of probability estimates, -with :math:`p_{i,k} = \operatorname{Pr}(y_{i,k} = 1)`. +Let :math:`\hat{P}` be a matrix of probability estimates, +with elements :math:`\hat{p}_{i,k} \approx \operatorname{Pr}(y_{i,k} = 1)`. Then the log loss of the whole set is .. math:: - L_{\log}(Y, P) = -\log \operatorname{Pr}(Y|P) = - \frac{1}{N} \sum_{i=0}^{N-1} \sum_{k=0}^{K-1} y_{i,k} \log p_{i,k} + L_{\log}(Y, \hat{P}) = -\log \operatorname{Pr}(Y|\hat{P}) = - \frac{1}{N} \sum_{i=0}^{N-1} \sum_{k=0}^{K-1} y_{i,k} \log \hat{p}_{i,k} To see how this generalizes the binary log loss given above, note that in the binary case, -:math:`p_{i,0} = 1 - p_{i,1}` and :math:`y_{i,0} = 1 - y_{i,1}`, +:math:`\hat{p}_{i,0} = 1 - \hat{p}_{i,1}` and :math:`y_{i,0} = 1 - y_{i,1}`, so expanding the inner sum over :math:`y_{i,k} \in \{0,1\}` gives the binary log loss. @@ -1146,7 +1379,7 @@ method. >>> y_true = [0, 0, 1, 1] >>> y_pred = [[.9, .1], [.8, .2], [.3, .7], [.01, .99]] >>> log_loss(y_true, y_pred) - 0.1738... + 0.1738 The first ``[.9, .1]`` in ``y_pred`` denotes 90% probability that the first sample has label 0. The log loss is non-negative. @@ -1201,8 +1434,9 @@ Then the multiclass MCC is defined as: When there are more than two labels, the value of the MCC will no longer range between -1 and +1. Instead the minimum value will be somewhere between -1 and 0 -depending on the number and distribution of ground true labels. The maximum +depending on the number and distribution of ground truth labels. The maximum value is always +1. +For additional information, see [WikipediaMCC2021]_. Here is a small example illustrating the usage of the :func:`matthews_corrcoef` function: @@ -1211,7 +1445,14 @@ function: >>> y_true = [+1, +1, +1, -1] >>> y_pred = [+1, -1, +1, +1] >>> matthews_corrcoef(y_true, y_pred) - -0.33... + -0.33 + +.. rubric:: References + +.. [WikipediaMCC2021] Wikipedia contributors. Phi coefficient. + Wikipedia, The Free Encyclopedia. April 21, 2021, 12:21 CEST. + Available at: https://en.wikipedia.org/wiki/Phi_coefficient + Accessed April 21, 2021. .. _multilabel_confusion_matrix: @@ -1336,10 +1577,10 @@ Quoting Wikipedia : positive rate), at various threshold settings. TPR is also known as sensitivity, and FPR is one minus the specificity or true negative rate." -This function requires the true binary -value and the target scores, which can either be probability estimates of the -positive class, confidence values, or binary decisions. -Here is a small example of how to use the :func:`roc_curve` function:: +This function requires the true binary value and the target scores, which can +either be probability estimates of the positive class, confidence values, or +binary decisions. Here is a small example of how to use the :func:`roc_curve` +function:: >>> import numpy as np >>> from sklearn.metrics import roc_curve @@ -1351,25 +1592,29 @@ Here is a small example of how to use the :func:`roc_curve` function:: >>> tpr array([0. , 0.5, 0.5, 1. , 1. 
]) >>> thresholds - array([1.8 , 0.8 , 0.4 , 0.35, 0.1 ]) + array([ inf, 0.8 , 0.4 , 0.35, 0.1 ]) + +Compared to metrics such as the subset accuracy, the Hamming loss, or the +F1 score, ROC doesn't require optimizing a threshold for each label. + +The :func:`roc_auc_score` function, denoted by ROC-AUC or AUROC, computes the +area under the ROC curve. By doing so, the curve information is summarized in +one number. -This figure shows an example of such an ROC curve: +The following figure shows the ROC curve and ROC-AUC score for a classifier +aimed to distinguish the virginica flower from the rest of the species in the +:ref:`iris_dataset`: .. image:: ../auto_examples/model_selection/images/sphx_glr_plot_roc_001.png :target: ../auto_examples/model_selection/plot_roc.html :scale: 75 :align: center -The :func:`roc_auc_score` function computes the area under the receiver -operating characteristic (ROC) curve, which is also denoted by -AUC or AUROC. By computing the -area under the roc curve, the curve information is summarized in one number. + + For more information see the `Wikipedia article on AUC `_. -Compared to metrics such as the subset accuracy, the Hamming loss, or the -F1 score, ROC doesn't require optimizing a threshold for each label. - .. _roc_auc_binary: Binary case @@ -1387,7 +1632,7 @@ Therefore, the `y_score` parameter is of size (n_samples,). >>> from sklearn.linear_model import LogisticRegression >>> from sklearn.metrics import roc_auc_score >>> X, y = load_breast_cancer(return_X_y=True) - >>> clf = LogisticRegression(solver="liblinear").fit(X, y) + >>> clf = LogisticRegression().fit(X, y) >>> clf.classes_ array([0, 1]) @@ -1395,12 +1640,12 @@ We can use the probability estimates corresponding to `clf.classes_[1]`. >>> y_score = clf.predict_proba(X)[:, 1] >>> roc_auc_score(y, y_score) - 0.99... + 0.99 Otherwise, we can use the non-thresholded decision values >>> roc_auc_score(y, clf.decision_function(X)) - 0.99... + 0.99 .. _roc_auc_multiclass: @@ -1417,50 +1662,57 @@ correspond to the probability estimates that a sample belongs to a particular class. The OvO and OvR algorithms support weighting uniformly (``average='macro'``) and by prevalence (``average='weighted'``). -**One-vs-one Algorithm**: Computes the average AUC of all possible pairwise -combinations of classes. [HT2001]_ defines a multiclass AUC metric weighted -uniformly: +.. dropdown:: One-vs-one Algorithm -.. math:: + Computes the average AUC of all possible pairwise + combinations of classes. [HT2001]_ defines a multiclass AUC metric weighted + uniformly: - \frac{1}{c(c-1)}\sum_{j=1}^{c}\sum_{k > j}^c (\text{AUC}(j | k) + - \text{AUC}(k | j)) + .. math:: -where :math:`c` is the number of classes and :math:`\text{AUC}(j | k)` is the -AUC with class :math:`j` as the positive class and class :math:`k` as the -negative class. In general, -:math:`\text{AUC}(j | k) \neq \text{AUC}(k | j))` in the multiclass -case. This algorithm is used by setting the keyword argument ``multiclass`` -to ``'ovo'`` and ``average`` to ``'macro'``. + \frac{1}{c(c-1)}\sum_{j=1}^{c}\sum_{k > j}^c (\text{AUC}(j | k) + + \text{AUC}(k | j)) -The [HT2001]_ multiclass AUC metric can be extended to be weighted by the -prevalence: + where :math:`c` is the number of classes and :math:`\text{AUC}(j | k)` is the + AUC with class :math:`j` as the positive class and class :math:`k` as the + negative class. In general, + :math:`\text{AUC}(j | k) \neq \text{AUC}(k | j))` in the multiclass + case. 
This algorithm is used by setting the keyword argument ``multiclass`` + to ``'ovo'`` and ``average`` to ``'macro'``. -.. math:: + The [HT2001]_ multiclass AUC metric can be extended to be weighted by the + prevalence: - \frac{1}{c(c-1)}\sum_{j=1}^{c}\sum_{k > j}^c p(j \cup k)( - \text{AUC}(j | k) + \text{AUC}(k | j)) + .. math:: -where :math:`c` is the number of classes. This algorithm is used by setting -the keyword argument ``multiclass`` to ``'ovo'`` and ``average`` to -``'weighted'``. The ``'weighted'`` option returns a prevalence-weighted average -as described in [FC2009]_. + \frac{1}{c(c-1)}\sum_{j=1}^{c}\sum_{k > j}^c p(j \cup k)( + \text{AUC}(j | k) + \text{AUC}(k | j)) -**One-vs-rest Algorithm**: Computes the AUC of each class against the rest -[PD2000]_. The algorithm is functionally the same as the multilabel case. To -enable this algorithm set the keyword argument ``multiclass`` to ``'ovr'``. -Like OvO, OvR supports two types of averaging: ``'macro'`` [F2006]_ and -``'weighted'`` [F2001]_. + where :math:`c` is the number of classes. This algorithm is used by setting + the keyword argument ``multiclass`` to ``'ovo'`` and ``average`` to + ``'weighted'``. The ``'weighted'`` option returns a prevalence-weighted average + as described in [FC2009]_. -In applications where a high false positive rate is not tolerable the parameter -``max_fpr`` of :func:`roc_auc_score` can be used to summarize the ROC curve up -to the given limit. +.. dropdown:: One-vs-rest Algorithm + Computes the AUC of each class against the rest + [PD2000]_. The algorithm is functionally the same as the multilabel case. To + enable this algorithm set the keyword argument ``multiclass`` to ``'ovr'``. + Additionally to ``'macro'`` [F2006]_ and ``'weighted'`` [F2001]_ averaging, OvR + supports ``'micro'`` averaging. -.. image:: ../auto_examples/model_selection/images/sphx_glr_plot_roc_002.png - :target: ../auto_examples/model_selection/plot_roc.html - :scale: 75 - :align: center + In applications where a high false positive rate is not tolerable the parameter + ``max_fpr`` of :func:`roc_auc_score` can be used to summarize the ROC curve up + to the given limit. + + The following figure shows the micro-averaged ROC curve and its corresponding + ROC-AUC score for a classifier aimed to distinguish the different species in + the :ref:`iris_dataset`: + + .. image:: ../auto_examples/model_selection/images/sphx_glr_plot_roc_002.png + :target: ../auto_examples/model_selection/plot_roc.html + :scale: 75 + :align: center .. _roc_auc_multilabel: @@ -1476,11 +1728,11 @@ class with the greater label for each output. >>> from sklearn.datasets import make_multilabel_classification >>> from sklearn.multioutput import MultiOutputClassifier >>> X, y = make_multilabel_classification(random_state=0) - >>> inner_clf = LogisticRegression(solver="liblinear", random_state=0) + >>> inner_clf = LogisticRegression(random_state=0) >>> clf = MultiOutputClassifier(inner_clf).fit(X, y) >>> y_score = np.transpose([y_pred[:, 1] for y_pred in clf.predict_proba(X)]) >>> roc_auc_score(y, y_score, average=None) - array([0.82..., 0.86..., 0.94..., 0.85... , 0.94...]) + array([0.828, 0.851, 0.94, 0.87, 0.95]) And the decision values do not require such processing. @@ -1488,46 +1740,45 @@ And the decision values do not require such processing. >>> clf = RidgeClassifierCV().fit(X, y) >>> y_score = clf.decision_function(X) >>> roc_auc_score(y, y_score, average=None) - array([0.81..., 0.84... 
, 0.93..., 0.87..., 0.94...]) + array([0.82, 0.85, 0.93, 0.87, 0.94]) -.. topic:: Examples: +.. rubric:: Examples - * See :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py` - for an example of using ROC to - evaluate the quality of the output of a classifier. +* See :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py` for an example of + using ROC to evaluate the quality of the output of a classifier. - * See :ref:`sphx_glr_auto_examples_model_selection_plot_roc_crossval.py` - for an example of using ROC to - evaluate classifier output quality, using cross-validation. +* See :ref:`sphx_glr_auto_examples_model_selection_plot_roc_crossval.py` for an + example of using ROC to evaluate classifier output quality, using cross-validation. - * See :ref:`sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py` - for an example of using ROC to - model species distribution. +* See :ref:`sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py` + for an example of using ROC to model species distribution. -.. topic:: References: +.. rubric:: References - .. [HT2001] Hand, D.J. and Till, R.J., (2001). `A simple generalisation - of the area under the ROC curve for multiple class classification problems. - `_ - Machine learning, 45(2), pp.171-186. +.. [HT2001] Hand, D.J. and Till, R.J., (2001). `A simple generalisation + of the area under the ROC curve for multiple class classification problems. + `_ + Machine learning, 45(2), pp. 171-186. - .. [FC2009] Ferri, Cèsar & Hernandez-Orallo, Jose & Modroiu, R. (2009). - `An Experimental Comparison of Performance Measures for Classification. - `_ - Pattern Recognition Letters. 30. 27-38. +.. [FC2009] Ferri, Cèsar & Hernandez-Orallo, Jose & Modroiu, R. (2009). + `An Experimental Comparison of Performance Measures for Classification. + `_ + Pattern Recognition Letters. 30. 27-38. - .. [PD2000] Provost, F., Domingos, P. (2000). Well-trained PETs: Improving - probability estimation trees (Section 6.2), CeDER Working Paper #IS-00-04, - Stern School of Business, New York University. +.. [PD2000] Provost, F., Domingos, P. (2000). `Well-trained PETs: Improving + probability estimation trees + `_ + (Section 6.2), CeDER Working Paper #IS-00-04, Stern School of Business, + New York University. - .. [F2006] Fawcett, T., 2006. `An introduction to ROC analysis. - `_ - Pattern Recognition Letters, 27(8), pp. 861-874. +.. [F2006] Fawcett, T., 2006. `An introduction to ROC analysis. + `_ + Pattern Recognition Letters, 27(8), pp. 861-874. - .. [F2001] Fawcett, T., 2001. `Using rule sets to maximize - ROC performance `_ - In Data Mining, 2001. - Proceedings IEEE International Conference, pp. 131-138. +.. [F2001] Fawcett, T., 2001. `Using rule sets to maximize + ROC performance `_ + In Data Mining, 2001. + Proceedings IEEE International Conference, pp. 131-138. .. _det_curve: @@ -1563,59 +1814,57 @@ same classification task: :scale: 75 :align: center -**Properties:** +.. dropdown:: Properties -* DET curves form a linear curve in normal deviate scale if the detection - scores are normally (or close-to normally) distributed. - It was shown by [Navratil2007]_ that the reverse it not necessarily true and - even more general distributions are able produce linear DET curves. + * DET curves form a linear curve in normal deviate scale if the detection + scores are normally (or close-to normally) distributed. 
+ It was shown by [Navratil2007]_ that the reverse is not necessarily true and + even more general distributions are able to produce linear DET curves. -* The normal deviate scale transformation spreads out the points such that a - comparatively larger space of plot is occupied. - Therefore curves with similar classification performance might be easier to - distinguish on a DET plot. + * The normal deviate scale transformation spreads out the points such that a + comparatively larger space of plot is occupied. + Therefore curves with similar classification performance might be easier to + distinguish on a DET plot. -* With False Negative Rate being "inverse" to True Positive Rate the point - of perfection for DET curves is the origin (in contrast to the top left - corner for ROC curves). + * With False Negative Rate being "inverse" to True Positive Rate the point + of perfection for DET curves is the origin (in contrast to the top left + corner for ROC curves). -**Applications and limitations:** +.. dropdown:: Applications and limitations -DET curves are intuitive to read and hence allow quick visual assessment of a -classifier's performance. -Additionally DET curves can be consulted for threshold analysis and operating -point selection. -This is particularly helpful if a comparison of error types is required. + DET curves are intuitive to read and hence allow quick visual assessment of a + classifier's performance. + Additionally DET curves can be consulted for threshold analysis and operating + point selection. + This is particularly helpful if a comparison of error types is required. -One the other hand DET curves do not provide their metric as a single number. -Therefore for either automated evaluation or comparison to other -classification tasks metrics like the derived area under ROC curve might be -better suited. + On the other hand DET curves do not provide their metric as a single number. + Therefore for either automated evaluation or comparison to other + classification tasks metrics like the derived area under ROC curve might be + better suited. -.. topic:: Examples: +.. rubric:: Examples - * See :ref:`sphx_glr_auto_examples_model_selection_plot_det.py` - for an example comparison between receiver operating characteristic (ROC) - curves and Detection error tradeoff (DET) curves. +* See :ref:`sphx_glr_auto_examples_model_selection_plot_det.py` + for an example comparison between receiver operating characteristic (ROC) + curves and Detection error tradeoff (DET) curves. -.. topic:: References: +.. rubric:: References - .. [WikipediaDET2017] Wikipedia contributors. Detection error tradeoff. - Wikipedia, The Free Encyclopedia. September 4, 2017, 23:33 UTC. - Available at: https://en.wikipedia.org/w/index.php?title=Detection_error_tradeoff&oldid=798982054. - Accessed February 19, 2018. +.. [WikipediaDET2017] Wikipedia contributors. Detection error tradeoff. + Wikipedia, The Free Encyclopedia. September 4, 2017, 23:33 UTC. + Available at: https://en.wikipedia.org/w/index.php?title=Detection_error_tradeoff&oldid=798982054. + Accessed February 19, 2018. - .. [Martin1997] A. Martin, G. Doddington, T. Kamm, M. Ordowski, and M. Przybocki, - `The DET Curve in Assessment of Detection Task Performance - `_, - NIST 1997. +.. [Martin1997] A. Martin, G. Doddington, T. Kamm, M. Ordowski, and M. Przybocki, + `The DET Curve in Assessment of Detection Task Performance + `_, NIST 1997. - .. [Navratil2007] J. Navractil and D. 
Klusacek, - "`On Linear DETs, - `_" - 2007 IEEE International Conference on Acoustics, - Speech and Signal Processing - ICASSP '07, Honolulu, - HI, 2007, pp. IV-229-IV-232. +.. [Navratil2007] J. Navratil and D. Klusacek, + `"On Linear DETs" `_, + 2007 IEEE International Conference on Acoustics, + Speech and Signal Processing - ICASSP '07, Honolulu, + HI, 2007, pp. IV-229-IV-232. .. _zero_one_loss: @@ -1631,7 +1880,7 @@ In multilabel classification, the :func:`zero_one_loss` scores a subset as one if its labels strictly match the predictions, and as a zero if there are any errors. By default, the function returns the percentage of imperfectly predicted subsets. To get the count of such subsets instead, set -``normalize`` to ``False`` +``normalize`` to ``False``. If :math:`\hat{y}_i` is the predicted value of the :math:`i`-th sample and :math:`y_i` is the corresponding true value, @@ -1639,10 +1888,11 @@ then the 0-1 loss :math:`L_{0-1}` is defined as: .. math:: - L_{0-1}(y_i, \hat{y}_i) = 1(\hat{y}_i \not= y_i) + L_{0-1}(y, \hat{y}) = \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples}-1} 1(\hat{y}_i \not= y_i) where :math:`1(x)` is the `indicator function -`_. +`_. The zero-one +loss can also be computed as :math:`\text{zero-one loss} = 1 - \text{accuracy}`. >>> from sklearn.metrics import zero_one_loss @@ -1651,7 +1901,7 @@ where :math:`1(x)` is the `indicator function >>> zero_one_loss(y_true, y_pred) 0.25 >>> zero_one_loss(y_true, y_pred, normalize=False) - 1 + 1.0 In the multilabel case with binary label indicators, where the first label set [0,1] has an error:: @@ -1660,54 +1910,77 @@ set [0,1] has an error:: 0.5 >>> zero_one_loss(np.array([[0, 1], [1, 1]]), np.ones((2, 2)), normalize=False) - 1 + 1.0 -.. topic:: Example: +.. rubric:: Examples - * See :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_with_cross_validation.py` - for an example of zero one loss usage to perform recursive feature - elimination with cross-validation. +* See :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_with_cross_validation.py` + for an example of zero one loss usage to perform recursive feature + elimination with cross-validation. .. _brier_score_loss: Brier score loss ---------------- -The :func:`brier_score_loss` function computes the -`Brier score `_ -for binary classes [Brier1950]_. Quoting Wikipedia: +The :func:`brier_score_loss` function computes the `Brier score +`_ for binary and multiclass +probabilistic predictions and is equivalent to the mean squared error. +Quoting Wikipedia: + + "The Brier score is a strictly proper scoring rule that measures the accuracy of + probabilistic predictions. [...] [It] is applicable to tasks in which predictions + must assign probabilities to a set of mutually exclusive discrete outcomes or + classes." + +Let the true labels for a set of :math:`N` data points be encoded as a 1-of-K binary +indicator matrix :math:`Y`, i.e., :math:`y_{i,k} = 1` if sample :math:`i` has +label :math:`k` taken from a set of :math:`K` labels. Let :math:`\hat{P}` be a matrix +of probability estimates with elements :math:`\hat{p}_{i,k} \approx \operatorname{Pr}(y_{i,k} = 1)`. +Following the original definition by [Brier1950]_, the Brier score is given by: + +.. math:: + + BS(Y, \hat{P}) = \frac{1}{N}\sum_{i=0}^{N-1}\sum_{k=0}^{K-1}(y_{i,k} - \hat{p}_{i,k})^{2} - "The Brier score is a proper score function that measures the accuracy of - probabilistic predictions. 
It is applicable to tasks in which predictions - must assign probabilities to a set of mutually exclusive discrete outcomes." +The Brier score lies in the interval :math:`[0, 2]` and the lower the value the +better the probability estimates are (the mean squared difference is smaller). +Actually, the Brier score is a strictly proper scoring rule, meaning that it +achieves the best score only when the estimated probabilities equal the +true ones. -This function returns the mean squared error of the actual outcome -:math:`y \in \{0,1\}` and the predicted probability estimate -:math:`p = \operatorname{Pr}(y = 1)` (:term:`predict_proba`) as outputted by: +Note that in the binary case, the Brier score is usually divided by two and +ranges between :math:`[0,1]`. For binary targets :math:`y_i \in {0, 1}` and +probability estimates :math:`\hat{p}_i \approx \operatorname{Pr}(y_i = 1)` +for the positive class, the Brier score is then equal to: .. math:: - BS = \frac{1}{n_{\text{samples}}} \sum_{i=0}^{n_{\text{samples}} - 1}(y_i - p_i)^2 + BS(y, \hat{p}) = \frac{1}{N} \sum_{i=0}^{N - 1}(y_i - \hat{p}_i)^2 -The Brier score loss is also between 0 to 1 and the lower the value (the mean -square difference is smaller), the more accurate the prediction is. +The :func:`brier_score_loss` function computes the Brier score given the +ground-truth labels and predicted probabilities, as returned by an estimator's +``predict_proba`` method. The `scale_by_half` parameter controls which of the +two above definitions to follow. -Here is a small example of usage of this function:: >>> import numpy as np >>> from sklearn.metrics import brier_score_loss >>> y_true = np.array([0, 1, 1, 0]) >>> y_true_categorical = np.array(["spam", "ham", "ham", "spam"]) >>> y_prob = np.array([0.1, 0.9, 0.8, 0.4]) - >>> y_pred = np.array([0, 1, 1, 0]) >>> brier_score_loss(y_true, y_prob) 0.055 >>> brier_score_loss(y_true, 1 - y_prob, pos_label=0) 0.055 >>> brier_score_loss(y_true_categorical, y_prob, pos_label="ham") 0.055 - >>> brier_score_loss(y_true, y_prob > 0.5) - 0.0 + >>> brier_score_loss( + ... ["eggs", "ham", "spam"], + ... [[0.8, 0.1, 0.1], [0.2, 0.7, 0.1], [0.2, 0.2, 0.6]], + ... labels=["eggs", "ham", "spam"], + ... ) + 0.146 The Brier score can be used to assess how well a classifier is calibrated. However, a lower Brier score loss does not always mean a better calibration. @@ -1722,28 +1995,220 @@ necessarily mean a better calibrated model. "Only when refinement loss remains the same does a lower Brier score loss always mean better calibration" [Bella2012]_, [Flach2008]_. -.. topic:: Example: +.. rubric:: Examples + +* See :ref:`sphx_glr_auto_examples_calibration_plot_calibration.py` + for an example of Brier score loss usage to perform probability + calibration of classifiers. + +.. rubric:: References - * See :ref:`sphx_glr_auto_examples_calibration_plot_calibration.py` - for an example of Brier score loss usage to perform probability - calibration of classifiers. +.. [Brier1950] G. Brier, `Verification of forecasts expressed in terms of probability + `_, + Monthly weather review 78.1 (1950) -.. topic:: References: +.. [Bella2012] Bella, Ferri, HernÃĄndez-Orallo, and Ramírez-Quintana + `"Calibration of Machine Learning Models" + `_ + in Khosrow-Pour, M. "Machine learning: concepts, methodologies, tools + and applications." Hershey, PA: Information Science Reference (2012). - .. [Brier1950] G. Brier, `Verification of forecasts expressed in terms of - probability - `_, - Monthly weather review 78.1 (1950) +.. 
[Flach2008] Flach, Peter, and Edson Matsubara. `"On classification, ranking, + and probability estimation." `_ + Dagstuhl Seminar Proceedings. Schloss Dagstuhl-Leibniz-Zentrum fÃŧr Informatik (2008). - .. [Bella2012] Bella, Ferri, HernÃĄndez-Orallo, and Ramírez-Quintana - `"Calibration of Machine Learning Models" - `_ - in Khosrow-Pour, M. "Machine learning: concepts, methodologies, tools - and applications." Hershey, PA: Information Science Reference (2012). +.. _class_likelihood_ratios: + +Class likelihood ratios +----------------------- + +The :func:`class_likelihood_ratios` function computes the `positive and negative +likelihood ratios +`_ +:math:`LR_\pm` for binary classes, which can be interpreted as the ratio of +post-test to pre-test odds as explained below. As a consequence, this metric is +invariant w.r.t. the class prevalence (the number of samples in the positive +class divided by the total number of samples) and **can be extrapolated between +populations regardless of any possible class imbalance.** + +The :math:`LR_\pm` metrics are therefore very useful in settings where the data +available to learn and evaluate a classifier is a study population with nearly +balanced classes, such as a case-control study, while the target application, +i.e. the general population, has very low prevalence. + +The positive likelihood ratio :math:`LR_+` is the probability of a classifier to +correctly predict that a sample belongs to the positive class divided by the +probability of predicting the positive class for a sample belonging to the +negative class: + +.. math:: + + LR_+ = \frac{\text{PR}(P+|T+)}{\text{PR}(P+|T-)}. + +The notation here refers to predicted (:math:`P`) or true (:math:`T`) label and +the sign :math:`+` and :math:`-` refer to the positive and negative class, +respectively, e.g. :math:`P+` stands for "predicted positive". + +Analogously, the negative likelihood ratio :math:`LR_-` is the probability of a +sample of the positive class being classified as belonging to the negative class +divided by the probability of a sample of the negative class being correctly +classified: + +.. math:: + + LR_- = \frac{\text{PR}(P-|T+)}{\text{PR}(P-|T-)}. + +For classifiers above chance :math:`LR_+` above 1 **higher is better**, while +:math:`LR_-` ranges from 0 to 1 and **lower is better**. +Values of :math:`LR_\pm\approx 1` correspond to chance level. + +Notice that probabilities differ from counts, for instance +:math:`\operatorname{PR}(P+|T+)` is not equal to the number of true positive +counts ``tp`` (see `the wikipedia page +`_ for +the actual formulas). + +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_model_selection_plot_likelihood_ratios.py` + +.. dropdown:: Interpretation across varying prevalence + + Both class likelihood ratios are interpretable in terms of an odds ratio + (pre-test and post-tests): + + .. math:: + + \text{post-test odds} = \text{Likelihood ratio} \times \text{pre-test odds}. + + Odds are in general related to probabilities via + + .. math:: + + \text{odds} = \frac{\text{probability}}{1 - \text{probability}}, + + or equivalently + + .. math:: + + \text{probability} = \frac{\text{odds}}{1 + \text{odds}}. + + On a given population, the pre-test probability is given by the prevalence. By + converting odds to probabilities, the likelihood ratios can be translated into a + probability of truly belonging to either class before and after a classifier + prediction: + + .. 
math:: + + \text{post-test odds} = \text{Likelihood ratio} \times + \frac{\text{pre-test probability}}{1 - \text{pre-test probability}}, + + .. math:: + + \text{post-test probability} = \frac{\text{post-test odds}}{1 + \text{post-test odds}}. + +.. dropdown:: Mathematical divergences + + The positive likelihood ratio (`LR+`) is undefined when :math:`fp=0`, meaning the + classifier does not misclassify any negative labels as positives. This condition can + either indicate a perfect identification of all the negative cases or, if there are + also no true positive predictions (:math:`tp=0`), that the classifier does not predict + the positive class at all. In the first case, `LR+` can be interpreted as `np.inf`, in + the second case (for instance, with highly imbalanced data) it can be interpreted as + `np.nan`. + + The negative likelihood ratio (`LR-`) is undefined when :math:`tn=0`. Such + divergence is invalid, as :math:`LR_- > 1.0` would indicate an increase in the odds of + a sample belonging to the positive class after being classified as negative, as if the + act of classifying caused the positive condition. This includes the case of a + :class:`~sklearn.dummy.DummyClassifier` that always predicts the positive class + (i.e. when :math:`tn=fn=0`). + + Both class likelihood ratios (`LR+ and LR-`) are undefined when :math:`tp=fn=0`, which + means that no samples of the positive class were present in the test set. This can + happen when cross-validating on highly imbalanced data and also leads to a division by + zero. + + If a division by zero occurs and `raise_warning` is set to `True` (default), + :func:`class_likelihood_ratios` raises an `UndefinedMetricWarning` and returns + `np.nan` by default to avoid pollution when averaging over cross-validation folds. + Users can set return values in case of a division by zero with the + `replace_undefined_by` param. + + For a worked-out demonstration of the :func:`class_likelihood_ratios` function, + see the example below. + +.. dropdown:: References + + * `Wikipedia entry for Likelihood ratios in diagnostic testing + `_ + + * Brenner, H., & Gefeller, O. (1997). + Variation of sensitivity, specificity, likelihood ratios and predictive + values with disease prevalence. Statistics in medicine, 16(9), 981-991. + + +.. _d2_score_classification: + +D² score for classification +--------------------------- + +The D² score computes the fraction of deviance explained. +It is a generalization of R², where the squared error is generalized and replaced +by a classification deviance of choice :math:`\text{dev}(y, \hat{y})` +(e.g., Log loss). D² is a form of a *skill score*. +It is calculated as + +.. math:: + + D^2(y, \hat{y}) = 1 - \frac{\text{dev}(y, \hat{y})}{\text{dev}(y, y_{\text{null}})} \,. + +Where :math:`y_{\text{null}}` is the optimal prediction of an intercept-only model +(e.g., the per-class proportion of `y_true` in the case of the Log loss). + +Like R², the best possible score is 1.0 and it can be negative (because the +model can be arbitrarily worse). A constant model that always predicts +:math:`y_{\text{null}}`, disregarding the input features, would get a D² score +of 0.0. + +.. dropdown:: D2 log loss score + + The :func:`d2_log_loss_score` function implements the special case + of D² with the log loss, see :ref:`log_loss`, i.e.: + + .. math:: + + \text{dev}(y, \hat{y}) = \text{log_loss}(y, \hat{y}). 
+ + Here are some usage examples of the :func:`d2_log_loss_score` function:: + + >>> from sklearn.metrics import d2_log_loss_score + >>> y_true = [1, 1, 2, 3] + >>> y_pred = [ + ... [0.5, 0.25, 0.25], + ... [0.5, 0.25, 0.25], + ... [0.5, 0.25, 0.25], + ... [0.5, 0.25, 0.25], + ... ] + >>> d2_log_loss_score(y_true, y_pred) + 0.0 + >>> y_true = [1, 2, 3] + >>> y_pred = [ + ... [0.98, 0.01, 0.01], + ... [0.01, 0.98, 0.01], + ... [0.01, 0.01, 0.98], + ... ] + >>> d2_log_loss_score(y_true, y_pred) + 0.981 + >>> y_true = [1, 2, 3] + >>> y_pred = [ + ... [0.1, 0.6, 0.3], + ... [0.1, 0.6, 0.3], + ... [0.4, 0.5, 0.1], + ... ] + >>> d2_log_loss_score(y_true, y_pred) + -0.552 - .. [Flach2008] Flach, Peter, and Edson Matsubara. `"On classification, ranking, - and probability estimation." `_ - Dagstuhl Seminar Proceedings. Schloss Dagstuhl-Leibniz-Zentrum fr Informatik (2008). .. _multilabel_ranking_metrics: @@ -1765,7 +2230,7 @@ The :func:`coverage_error` function computes the average number of labels that have to be included in the final prediction such that all true labels are predicted. This is useful if you want to know how many top-scored-labels you have to predict in average without missing any true one. The best value -of this metrics is thus the average number of true labels. +of this metric is thus the average number of true labels. .. note:: @@ -1841,7 +2306,7 @@ Here is a small example of usage of this function:: >>> y_true = np.array([[1, 0, 0], [0, 0, 1]]) >>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]]) >>> label_ranking_average_precision_score(y_true, y_score) - 0.416... + 0.416 .. _label_ranking_loss: @@ -1876,18 +2341,19 @@ Here is a small example of usage of this function:: >>> y_true = np.array([[1, 0, 0], [0, 0, 1]]) >>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]]) >>> label_ranking_loss(y_true, y_score) - 0.75... + 0.75 >>> # With the following prediction, we have perfect and minimal loss >>> y_score = np.array([[1.0, 0.1, 0.2], [0.1, 0.2, 0.9]]) >>> label_ranking_loss(y_true, y_score) 0.0 -.. topic:: References: +.. dropdown:: References * Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010). Mining multi-label data. In Data mining and knowledge discovery handbook (pp. 667-685). Springer US. + .. _ndcg: Normalized Discounted Cumulative Gain @@ -1906,7 +2372,7 @@ engine algorithms or related applications. Using a graded relevance scale of documents in a search-engine result set, DCG measures the usefulness, or gain, of a document based on its position in the result list. The gain is accumulated from the top of the result list to the bottom, with the gain of each result -discounted at lower ranks" +discounted at lower ranks." DCG orders the true targets (e.g. relevance of query answers) in the predicted order, then multiplies them by a logarithmic decay and sums the result. The sum @@ -1932,7 +2398,7 @@ DCG score is and the NDCG score is the DCG score divided by the DCG score obtained for :math:`y`. -.. topic:: References: +.. dropdown:: References * `Wikipedia entry for Discounted Cumulative Gain `_ @@ -1950,6 +2416,7 @@ and the NDCG score is the DCG score divided by the DCG score obtained for European conference on information retrieval (pp. 414-421). Springer, Berlin, Heidelberg. + .. _regression_metrics: Regression metrics @@ -1960,17 +2427,18 @@ Regression metrics The :mod:`sklearn.metrics` module implements several loss, score, and utility functions to measure regression performance. 
Some of those have been enhanced to handle the multioutput case: :func:`mean_squared_error`, -:func:`mean_absolute_error`, :func:`explained_variance_score`, -:func:`r2_score` and :func:`mean_pinball_loss`. +:func:`mean_absolute_error`, :func:`r2_score`, +:func:`explained_variance_score`, :func:`mean_pinball_loss`, :func:`d2_pinball_score` +and :func:`d2_absolute_error_score`. -These functions have an ``multioutput`` keyword argument which specifies the +These functions have a ``multioutput`` keyword argument which specifies the way the scores or losses for each individual target should be averaged. The default is ``'uniform_average'``, which specifies a uniformly weighted mean over outputs. If an ``ndarray`` of shape ``(n_outputs,)`` is passed, then its entries are interpreted as weights and an according weighted average is -returned. If ``multioutput`` is ``'raw_values'`` is specified, then all -unaltered individual scores or losses will be returned in an array of shape +returned. If ``multioutput`` is ``'raw_values'``, then all unaltered +individual scores or losses will be returned in an array of shape ``(n_outputs,)``. @@ -1979,76 +2447,89 @@ value ``'variance_weighted'`` for the ``multioutput`` parameter. This option leads to a weighting of each individual score by the variance of the corresponding target variable. This setting quantifies the globally captured unscaled variance. If the target variables are of different scale, then this -score puts more importance on well explaining the higher variance variables. -``multioutput='variance_weighted'`` is the default value for :func:`r2_score` -for backward compatibility. This will be changed to ``uniform_average`` in the -future. +score puts more importance on explaining the higher variance variables. -.. _explained_variance_score: - -Explained variance score -------------------------- - -The :func:`explained_variance_score` computes the `explained variance -regression score `_. +.. _r2_score: -If :math:`\hat{y}` is the estimated target output, :math:`y` the corresponding -(correct) target output, and :math:`Var` is `Variance -`_, the square of the standard deviation, -then the explained variance is estimated as follow: +R² score, the coefficient of determination +------------------------------------------- -.. math:: +The :func:`r2_score` function computes the `coefficient of +determination `_, +usually denoted as :math:`R^2`. - explained\_{}variance(y, \hat{y}) = 1 - \frac{Var\{ y - \hat{y}\}}{Var\{y\}} +It represents the proportion of variance (of y) that has been explained by the +independent variables in the model. It provides an indication of goodness of +fit and therefore a measure of how well unseen samples are likely to be +predicted by the model, through the proportion of explained variance. -The best possible score is 1.0, lower values are worse. +As such variance is dataset dependent, :math:`R^2` may not be meaningfully comparable +across different datasets. Best possible score is 1.0 and it can be negative +(because the model can be arbitrarily worse). A constant model that always +predicts the expected (average) value of y, disregarding the input features, +would get an :math:`R^2` score of 0.0. -Here is a small example of usage of the :func:`explained_variance_score` -function:: +Note: when the prediction residuals have zero mean, the :math:`R^2` score and +the :ref:`explained_variance_score` are identical. 
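+As a quick illustration of this equivalence (an informal sketch with
+hand-picked numbers, not taken from the example gallery), the two scores can
+be checked to coincide when the residuals sum to zero::
+
+  >>> import numpy as np
+  >>> from sklearn.metrics import explained_variance_score, r2_score
+  >>> y_true = np.array([3.0, -0.5, 2.0, 7.0])
+  >>> residuals = np.array([0.5, -0.5, 1.0, -1.0])  # residuals have zero mean
+  >>> y_pred = y_true - residuals
+  >>> r2 = r2_score(y_true, y_pred)
+  >>> ev = explained_variance_score(y_true, y_pred)
+  >>> bool(np.isclose(r2, ev))
+  True
+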
- >>> from sklearn.metrics import explained_variance_score - >>> y_true = [3, -0.5, 2, 7] - >>> y_pred = [2.5, 0.0, 2, 8] - >>> explained_variance_score(y_true, y_pred) - 0.957... - >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] - >>> y_pred = [[0, 2], [-1, 2], [8, -5]] - >>> explained_variance_score(y_true, y_pred, multioutput='raw_values') - array([0.967..., 1. ]) - >>> explained_variance_score(y_true, y_pred, multioutput=[0.3, 0.7]) - 0.990... - -.. _max_error: +If :math:`\hat{y}_i` is the predicted value of the :math:`i`-th sample +and :math:`y_i` is the corresponding true value for total :math:`n` samples, +the estimated :math:`R^2` is defined as: -Max error -------------------- +.. math:: -The :func:`max_error` function computes the maximum `residual error -`_ , a metric -that captures the worst case error between the predicted value and -the true value. In a perfectly fitted single output regression -model, ``max_error`` would be ``0`` on the training set and though this -would be highly unlikely in the real world, this metric shows the -extent of error that the model had when it was fitted. + R^2(y, \hat{y}) = 1 - \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\sum_{i=1}^{n} (y_i - \bar{y})^2} +where :math:`\bar{y} = \frac{1}{n} \sum_{i=1}^{n} y_i` and :math:`\sum_{i=1}^{n} (y_i - \hat{y}_i)^2 = \sum_{i=1}^{n} \epsilon_i^2`. -If :math:`\hat{y}_i` is the predicted value of the :math:`i`-th sample, -and :math:`y_i` is the corresponding true value, then the max error is -defined as +Note that :func:`r2_score` calculates unadjusted :math:`R^2` without correcting for +bias in sample variance of y. -.. math:: +In the particular case where the true target is constant, the :math:`R^2` score is +not finite: it is either ``NaN`` (perfect predictions) or ``-Inf`` (imperfect +predictions). Such non-finite scores may prevent correct model optimization +such as grid-search cross-validation to be performed correctly. For this reason +the default behaviour of :func:`r2_score` is to replace them with 1.0 (perfect +predictions) or 0.0 (imperfect predictions). If ``force_finite`` +is set to ``False``, this score falls back on the original :math:`R^2` definition. - \text{Max Error}(y, \hat{y}) = max(| y_i - \hat{y}_i |) +Here is a small example of usage of the :func:`r2_score` function:: -Here is a small example of usage of the :func:`max_error` function:: + >>> from sklearn.metrics import r2_score + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> r2_score(y_true, y_pred) + 0.948 + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> r2_score(y_true, y_pred, multioutput='variance_weighted') + 0.938 + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> r2_score(y_true, y_pred, multioutput='uniform_average') + 0.936 + >>> r2_score(y_true, y_pred, multioutput='raw_values') + array([0.965, 0.908]) + >>> r2_score(y_true, y_pred, multioutput=[0.3, 0.7]) + 0.925 + >>> y_true = [-2, -2, -2] + >>> y_pred = [-2, -2, -2] + >>> r2_score(y_true, y_pred) + 1.0 + >>> r2_score(y_true, y_pred, force_finite=False) + nan + >>> y_true = [-2, -2, -2] + >>> y_pred = [-2, -2, -2 + 1e-8] + >>> r2_score(y_true, y_pred) + 0.0 + >>> r2_score(y_true, y_pred, force_finite=False) + -inf - >>> from sklearn.metrics import max_error - >>> y_true = [3, 2, 7, 1] - >>> y_pred = [9, 2, 7, 1] - >>> max_error(y_true, y_pred) - 6 +.. rubric:: Examples -The :func:`max_error` does not support multioutput. 
+* See :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_and_elasticnet.py` + for an example of R² score usage to + evaluate Lasso and Elastic Net on sparse signals. .. _mean_absolute_error: @@ -2082,14 +2563,14 @@ Here is a small example of usage of the :func:`mean_absolute_error` function:: >>> mean_absolute_error(y_true, y_pred, multioutput='raw_values') array([0.5, 1. ]) >>> mean_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7]) - 0.85... + 0.85 .. _mean_squared_error: Mean squared error ------------------- -The :func:`mean_squared_error` function computes `mean square +The :func:`mean_squared_error` function computes `mean squared error `_, a risk metric corresponding to the expected value of the squared (quadratic) error or loss. @@ -2113,13 +2594,16 @@ function:: >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] >>> y_pred = [[0, 2], [-1, 2], [8, -5]] >>> mean_squared_error(y_true, y_pred) - 0.7083... + 0.7083 + +.. rubric:: Examples -.. topic:: Examples: +* See :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_regression.py` + for an example of mean squared error usage to evaluate gradient boosting regression. - * See :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_regression.py` - for an example of mean squared error usage to - evaluate gradient boosting regression. +Taking the square root of the MSE, called the root mean squared error (RMSE), is another +common metric that provides a measure in the same units as the target variable. RMSE is +available through the :func:`root_mean_squared_error` function. .. _mean_squared_log_error: @@ -2152,11 +2636,14 @@ function:: >>> y_true = [3, 5, 2.5, 7] >>> y_pred = [2.5, 5, 4, 8] >>> mean_squared_log_error(y_true, y_pred) - 0.039... + 0.0397 >>> y_true = [[0.5, 1], [1, 2], [7, 6]] >>> y_pred = [[0.5, 2], [1, 2.5], [8, 8]] >>> mean_squared_log_error(y_true, y_pred) - 0.044... + 0.044 + +The root mean squared logarithmic error (RMSLE) is available through the +:func:`root_mean_squared_log_error` function. .. _mean_absolute_percentage_error: @@ -2173,7 +2660,7 @@ error (MAPE) estimated over :math:`n_{\text{samples}}` is defined as .. math:: - \text{MAPE}(y, \hat{y}) = \frac{1}{n_{\text{samples}}} \sum_{i=0}^{n_{\text{samples}}-1} \frac{{}\left| y_i - \hat{y}_i \right|}{max(\epsilon, \left| y_i \right|)} + \text{MAPE}(y, \hat{y}) = \frac{1}{n_{\text{samples}}} \sum_{i=0}^{n_{\text{samples}}-1} \frac{{}\left| y_i - \hat{y}_i \right|}{\max(\epsilon, \left| y_i \right|)} where :math:`\epsilon` is an arbitrary small yet strictly positive number to avoid undefined results when y is zero. @@ -2187,13 +2674,29 @@ function:: >>> y_true = [1, 10, 1e6] >>> y_pred = [0.9, 15, 1.2e6] >>> mean_absolute_percentage_error(y_true, y_pred) - 0.2666... + 0.2666 In above example, if we had used `mean_absolute_error`, it would have ignored the small magnitude values and only reflected the error in prediction of highest magnitude value. But that problem is resolved in case of MAPE because it calculates relative percentage error with respect to actual output. +.. note:: + + The MAPE formula here does not represent the common "percentage" definition: the + percentage in the range [0, 100] is converted to a relative value in the range [0, + 1] by dividing by 100. Thus, an error of 200% corresponds to a relative error of 2. + The motivation here is to have a range of values that is more consistent with other + error metrics in scikit-learn, such as `accuracy_score`. 
+ + To obtain the mean absolute percentage error as per the Wikipedia formula, + multiply the `mean_absolute_percentage_error` computed here by 100. + +.. dropdown:: References + + * `Wikipedia entry for Mean Absolute Percentage Error + `_ + .. _median_absolute_error: Median absolute error @@ -2222,65 +2725,102 @@ function:: >>> median_absolute_error(y_true, y_pred) 0.5 -.. _r2_score: -R² score, the coefficient of determination -------------------------------------------- -The :func:`r2_score` function computes the `coefficient of -determination `_, -usually denoted as R². +.. _max_error: -It represents the proportion of variance (of y) that has been explained by the -independent variables in the model. It provides an indication of goodness of -fit and therefore a measure of how well unseen samples are likely to be -predicted by the model, through the proportion of explained variance. +Max error +------------------- -As such variance is dataset dependent, R² may not be meaningfully comparable -across different datasets. Best possible score is 1.0 and it can be negative -(because the model can be arbitrarily worse). A constant model that always -predicts the expected value of y, disregarding the input features, would get a -R² score of 0.0. +The :func:`max_error` function computes the maximum `residual error +`_ , a metric +that captures the worst case error between the predicted value and +the true value. In a perfectly fitted single output regression +model, ``max_error`` would be ``0`` on the training set and though this +would be highly unlikely in the real world, this metric shows the +extent of error that the model had when it was fitted. -If :math:`\hat{y}_i` is the predicted value of the :math:`i`-th sample -and :math:`y_i` is the corresponding true value for total :math:`n` samples, -the estimated R² is defined as: + +If :math:`\hat{y}_i` is the predicted value of the :math:`i`-th sample, +and :math:`y_i` is the corresponding true value, then the max error is +defined as .. math:: - R^2(y, \hat{y}) = 1 - \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\sum_{i=1}^{n} (y_i - \bar{y})^2} + \text{Max Error}(y, \hat{y}) = \max(| y_i - \hat{y}_i |) -where :math:`\bar{y} = \frac{1}{n} \sum_{i=1}^{n} y_i` and :math:`\sum_{i=1}^{n} (y_i - \hat{y}_i)^2 = \sum_{i=1}^{n} \epsilon_i^2`. +Here is a small example of usage of the :func:`max_error` function:: -Note that :func:`r2_score` calculates unadjusted R² without correcting for -bias in sample variance of y. + >>> from sklearn.metrics import max_error + >>> y_true = [3, 2, 7, 1] + >>> y_pred = [9, 2, 7, 1] + >>> max_error(y_true, y_pred) + 6.0 -Here is a small example of usage of the :func:`r2_score` function:: +The :func:`max_error` does not support multioutput. - >>> from sklearn.metrics import r2_score - >>> y_true = [3, -0.5, 2, 7] - >>> y_pred = [2.5, 0.0, 2, 8] - >>> r2_score(y_true, y_pred) - 0.948... - >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] - >>> y_pred = [[0, 2], [-1, 2], [8, -5]] - >>> r2_score(y_true, y_pred, multioutput='variance_weighted') - 0.938... - >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] - >>> y_pred = [[0, 2], [-1, 2], [8, -5]] - >>> r2_score(y_true, y_pred, multioutput='uniform_average') - 0.936... - >>> r2_score(y_true, y_pred, multioutput='raw_values') - array([0.965..., 0.908...]) - >>> r2_score(y_true, y_pred, multioutput=[0.3, 0.7]) - 0.925... +.. 
_explained_variance_score:
+
+Explained variance score
+-------------------------
+
+The :func:`explained_variance_score` computes the `explained variance
+regression score `_.
+If :math:`\hat{y}` is the estimated target output, :math:`y` the corresponding
+(correct) target output, and :math:`Var` is `Variance
+`_, the square of the standard deviation,
+then the explained variance is estimated as follows:
+
+.. math::
+
+  explained\_{}variance(y, \hat{y}) = 1 - \frac{Var\{ y - \hat{y}\}}{Var\{y\}}
+
+The best possible score is 1.0, lower values are worse.
+
+.. topic:: Link to :ref:`r2_score`
-.. topic:: Example:
+
+    The difference between the explained variance score and the :ref:`r2_score`
+    is that the explained variance score does not account for
+    systematic offset in the prediction. For this reason, the
+    :ref:`r2_score` should be preferred in general.
-  * See :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_and_elasticnet.py`
-    for an example of R² score usage to
-    evaluate Lasso and Elastic Net on sparse signals.
+
+In the particular case where the true target is constant, the Explained
+Variance score is not finite: it is either ``NaN`` (perfect predictions) or
+``-Inf`` (imperfect predictions). Such non-finite scores may prevent model
+optimization, such as grid-search cross-validation, from being performed
+correctly. For this reason the default behaviour of
+:func:`explained_variance_score` is to replace them with 1.0 (perfect
+predictions) or 0.0 (imperfect predictions). You can set the ``force_finite``
+parameter to ``False`` to prevent this fix from happening and fall back on the
+original Explained Variance score.
+
+Here is a small example of usage of the :func:`explained_variance_score`
+function::
+
+  >>> from sklearn.metrics import explained_variance_score
+  >>> y_true = [3, -0.5, 2, 7]
+  >>> y_pred = [2.5, 0.0, 2, 8]
+  >>> explained_variance_score(y_true, y_pred)
+  0.957
+  >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]
+  >>> y_pred = [[0, 2], [-1, 2], [8, -5]]
+  >>> explained_variance_score(y_true, y_pred, multioutput='raw_values')
+  array([0.967, 1. ])
+  >>> explained_variance_score(y_true, y_pred, multioutput=[0.3, 0.7])
+  0.990
+  >>> y_true = [-2, -2, -2]
+  >>> y_pred = [-2, -2, -2]
+  >>> explained_variance_score(y_true, y_pred)
+  1.0
+  >>> explained_variance_score(y_true, y_pred, force_finite=False)
+  nan
+  >>> y_true = [-2, -2, -2]
+  >>> y_pred = [-2, -2, -2 + 1e-8]
+  >>> explained_variance_score(y_true, y_pred)
+  0.0
+  >>> explained_variance_score(y_true, y_pred, force_finite=False)
+  -inf
 
 .. _mean_tweedie_deviance:
@@ -2310,10 +2850,10 @@ is defined as
     \sum_{i=0}^{n_\text{samples} - 1}
     \begin{cases}
       (y_i-\hat{y}_i)^2, & \text{for }p=0\text{ (Normal)}\\
-      2(y_i \log(y/\hat{y}_i) + \hat{y}_i - y_i),  & \text{for}p=1\text{ (Poisson)}\\
-      2(\log(\hat{y}_i/y_i) + y_i/\hat{y}_i - 1),  & \text{for}p=2\text{ (Gamma)}\\
+      2(y_i \log(y_i/\hat{y}_i) + \hat{y}_i - y_i),  & \text{for }p=1\text{ (Poisson)}\\
+      2(\log(\hat{y}_i/y_i) + y_i/\hat{y}_i - 1),  & \text{for }p=2\text{ (Gamma)}\\
       2\left(\frac{\max(y_i,0)^{2-p}}{(1-p)(2-p)}-
-      \frac{y\,\hat{y}^{1-p}_i}{1-p}+\frac{\hat{y}^{2-p}_i}{2-p}\right),
+      \frac{y_i\,\hat{y}_i^{1-p}}{1-p}+\frac{\hat{y}_i^{2-p}}{2-p}\right),
       & \text{otherwise}
     \end{cases}
 
@@ -2325,8 +2865,8 @@ distribution (``power=0``), quadratically.
 In general, the higher ``power`` the less weight is given to extreme
 deviations between true and predicted targets. 
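+As an informal cross-check of the Poisson (``power=1``) branch of the formula
+above (a small sketch with a single made-up observation, not part of the
+rendered examples), the deviance can be reproduced by hand and compared with
+the value used in the comparison that follows::
+
+  >>> import numpy as np
+  >>> from sklearn.metrics import mean_tweedie_deviance
+  >>> y, y_hat = 1.0, 1.5
+  >>> poisson_dev = 2 * (y * np.log(y / y_hat) + y_hat - y)
+  >>> bool(np.isclose(poisson_dev, mean_tweedie_deviance([y], [y_hat], power=1)))
+  True
+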
-For instance, let's compare the two predictions 1.0 and 100 that are both -50% of their corresponding true value. +For instance, let's compare the two predictions 1.5 and 150 that are both +50% larger than their corresponding true value. The mean squared error (``power=0``) is very sensitive to the prediction difference of the second point,:: @@ -2340,16 +2880,16 @@ prediction difference of the second point,:: If we increase ``power`` to 1,:: >>> mean_tweedie_deviance([1.0], [1.5], power=1) - 0.18... + 0.189 >>> mean_tweedie_deviance([100.], [150.], power=1) - 18.9... + 18.9 the difference in errors decreases. Finally, by setting, ``power=2``:: >>> mean_tweedie_deviance([1.0], [1.5], power=2) - 0.14... + 0.144 >>> mean_tweedie_deviance([100.], [150.], power=2) - 0.14... + 0.144 we would get identical errors. The deviance when ``power=2`` is thus only sensitive to relative errors. @@ -2360,33 +2900,35 @@ Pinball loss ------------ The :func:`mean_pinball_loss` function is used to evaluate the predictive -performance of quantile regression models. The `pinball loss -`_ is equivalent -to :func:`mean_absolute_error` when the quantile parameter ``alpha`` is set to -0.5. +performance of `quantile regression +`_ models. .. math:: \text{pinball}(y, \hat{y}) = \frac{1}{n_{\text{samples}}} \sum_{i=0}^{n_{\text{samples}}-1} \alpha \max(y_i - \hat{y}_i, 0) + (1 - \alpha) \max(\hat{y}_i - y_i, 0) +The value of pinball loss is equivalent to half of :func:`mean_absolute_error` when the quantile +parameter ``alpha`` is set to 0.5. + + Here is a small example of usage of the :func:`mean_pinball_loss` function:: >>> from sklearn.metrics import mean_pinball_loss >>> y_true = [1, 2, 3] >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.1) - 0.03... + 0.033 >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.1) - 0.3... + 0.3 >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.9) - 0.3... + 0.3 >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.9) - 0.03... + 0.033 >>> mean_pinball_loss(y_true, y_true, alpha=0.1) 0.0 >>> mean_pinball_loss(y_true, y_true, alpha=0.9) 0.0 -It is possible to build a scorer object with a specific choice of alpha:: +It is possible to build a scorer object with a specific choice of ``alpha``:: >>> from sklearn.metrics import make_scorer >>> mean_pinball_loss_95p = make_scorer(mean_pinball_loss, alpha=0.95) @@ -2405,32 +2947,190 @@ quantile regressor via cross-validation: ... random_state=0, ... ) >>> cross_val_score(estimator, X, y, cv=5, scoring=mean_pinball_loss_95p) - array([11.1..., 10.4... , 24.4..., 9.2..., 12.9...]) + array([13.6, 9.7, 23.3, 9.5, 10.4]) It is also possible to build scorer objects for hyper-parameter tuning. The sign of the loss must be switched to ensure that greater means better as explained in the example linked below. -.. topic:: Example: +.. rubric:: Examples + +* See :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py` + for an example of using the pinball loss to evaluate and tune the + hyper-parameters of quantile regression models on data with non-symmetric + noise and outliers. + +.. _d2_score: + +D² score +-------- + +The D² score computes the fraction of deviance explained. +It is a generalization of R², where the squared error is generalized and replaced +by a deviance of choice :math:`\text{dev}(y, \hat{y})` +(e.g., Tweedie, pinball or mean absolute error). D² is a form of a *skill score*. +It is calculated as + +.. math:: + + D^2(y, \hat{y}) = 1 - \frac{\text{dev}(y, \hat{y})}{\text{dev}(y, y_{\text{null}})} \,. 
+ +Where :math:`y_{\text{null}}` is the optimal prediction of an intercept-only model +(e.g., the mean of `y_true` for the Tweedie case, the median for absolute +error and the alpha-quantile for pinball loss). + +Like R², the best possible score is 1.0 and it can be negative (because the +model can be arbitrarily worse). A constant model that always predicts +:math:`y_{\text{null}}`, disregarding the input features, would get a D² score +of 0.0. + +.. dropdown:: D² Tweedie score + + The :func:`d2_tweedie_score` function implements the special case of D² + where :math:`\text{dev}(y, \hat{y})` is the Tweedie deviance, see :ref:`mean_tweedie_deviance`. + It is also known as D² Tweedie and is related to McFadden's likelihood ratio index. + + The argument ``power`` defines the Tweedie power as for + :func:`mean_tweedie_deviance`. Note that for `power=0`, + :func:`d2_tweedie_score` equals :func:`r2_score` (for single targets). + + A scorer object with a specific choice of ``power`` can be built by:: + + >>> from sklearn.metrics import d2_tweedie_score, make_scorer + >>> d2_tweedie_score_15 = make_scorer(d2_tweedie_score, power=1.5) + +.. dropdown:: D² pinball score - * See :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py` - for an example of using a the pinball loss to evaluate and tune the - hyper-parameters of quantile regression models on data with non-symmetric - noise and outliers. + The :func:`d2_pinball_score` function implements the special case + of D² with the pinball loss, see :ref:`pinball_loss`, i.e.: + .. math:: + + \text{dev}(y, \hat{y}) = \text{pinball}(y, \hat{y}). + + The argument ``alpha`` defines the slope of the pinball loss as for + :func:`mean_pinball_loss` (:ref:`pinball_loss`). It determines the + quantile level ``alpha`` for which the pinball loss and also D² + are optimal. Note that for `alpha=0.5` (the default) :func:`d2_pinball_score` + equals :func:`d2_absolute_error_score`. + + A scorer object with a specific choice of ``alpha`` can be built by:: + + >>> from sklearn.metrics import d2_pinball_score, make_scorer + >>> d2_pinball_score_08 = make_scorer(d2_pinball_score, alpha=0.8) + +.. dropdown:: D² absolute error score + + The :func:`d2_absolute_error_score` function implements the special case of + the :ref:`mean_absolute_error`: + + .. math:: + + \text{dev}(y, \hat{y}) = \text{MAE}(y, \hat{y}). + + Here are some usage examples of the :func:`d2_absolute_error_score` function:: + + >>> from sklearn.metrics import d2_absolute_error_score + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> d2_absolute_error_score(y_true, y_pred) + 0.764 + >>> y_true = [1, 2, 3] + >>> y_pred = [1, 2, 3] + >>> d2_absolute_error_score(y_true, y_pred) + 1.0 + >>> y_true = [1, 2, 3] + >>> y_pred = [2, 2, 2] + >>> d2_absolute_error_score(y_true, y_pred) + 0.0 + + +.. _visualization_regression_evaluation: + +Visual evaluation of regression models +-------------------------------------- + +Among methods to assess the quality of regression models, scikit-learn provides +the :class:`~sklearn.metrics.PredictionErrorDisplay` class. It allows to +visually inspect the prediction errors of a model in two different manners. + +.. image:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_predict_001.png + :target: ../auto_examples/model_selection/plot_cv_predict.html + :scale: 75 + :align: center + +The plot on the left shows the actual values vs predicted values. 
For a
+noise-free regression task aiming to predict the (conditional) expectation of
+`y`, a perfect regression model would display data points on the diagonal
+defined by predicted equal to actual values. The further away from this optimal
+line, the larger the error of the model. In a more realistic setting with
+irreducible noise, that is, when not all the variations of `y` can be explained
+by features in `X`, then the best model would lead to a cloud of points densely
+arranged around the diagonal.
+
+Note that the above only holds when the predicted value is the expected value
+of `y` given `X`. This is typically the case for regression models that
+minimize the mean squared error objective function or more generally the
+:ref:`mean Tweedie deviance ` for any value of its
+"power" parameter.
+
+When plotting the predictions of an estimator that predicts a quantile
+of `y` given `X`, e.g. :class:`~sklearn.linear_model.QuantileRegressor`
+or any other model minimizing the :ref:`pinball loss `, a
+fraction of the points is expected to lie either above or below the diagonal,
+depending on the estimated quantile level.
+
+All in all, while intuitive to read, this plot does not really inform us about
+what to do to obtain a better model.
+
+The right-hand side plot shows the residuals (i.e. the difference between the
+actual and the predicted values) vs. the predicted values.
+
+This plot makes it easier to visualize whether the residuals follow a
+`homoscedastic or heteroscedastic
+`_
+distribution.
+
+In particular, if the true distribution of `y|X` is Poisson or Gamma
+distributed, it is expected that the variance of the residuals of the optimal
+model would grow with the predicted value of `E[y|X]` (either linearly for
+Poisson or quadratically for Gamma).
+
+When fitting a linear least squares regression model (see
+:class:`~sklearn.linear_model.LinearRegression` and
+:class:`~sklearn.linear_model.Ridge`), we can use this plot to check
+if some of the `model assumptions
+`_
+are met, in particular that the residuals should be uncorrelated, their
+expected value should be zero and that their variance should be constant
+(homoscedasticity).
+
+If this is not the case, and in particular if the residual plot shows some
+banana-shaped structure, this is a hint that the model is likely mis-specified
+and that non-linear feature engineering or switching to a non-linear regression
+model might be useful.
+
+Refer to the example below to see a model evaluation that makes use of this
+display.
+
+.. rubric:: Examples
+
+* See :ref:`sphx_glr_auto_examples_compose_plot_transformed_target.py` for
+  an example of how to use :class:`~sklearn.metrics.PredictionErrorDisplay`
+  to visualize the prediction quality improvement of a regression model
+  obtained by transforming the target before learning.
 
 .. _clustering_metrics:
 
 Clustering metrics
-======================
+==================
 
 .. currentmodule:: sklearn.metrics
 
 The :mod:`sklearn.metrics` module implements several loss, score, and utility
-functions. For more information see the :ref:`clustering_evaluation`
-section for instance clustering, and :ref:`biclustering_evaluation` for
-biclustering.
-
+functions to measure clustering performance. For more information see the
+:ref:`clustering_evaluation` section for instance clustering, and
+:ref:`biclustering_evaluation` for biclustering.
 
 .. 
_dummy_estimators: @@ -2472,19 +3172,19 @@ Next, let's compare the accuracy of ``SVC`` and ``most_frequent``:: >>> from sklearn.svm import SVC >>> clf = SVC(kernel='linear', C=1).fit(X_train, y_train) >>> clf.score(X_test, y_test) - 0.63... + 0.63 >>> clf = DummyClassifier(strategy='most_frequent', random_state=0) >>> clf.fit(X_train, y_train) DummyClassifier(random_state=0, strategy='most_frequent') >>> clf.score(X_test, y_test) - 0.57... + 0.579 We see that ``SVC`` doesn't do much better than a dummy classifier. Now, let's change the kernel:: >>> clf = SVC(kernel='rbf', C=1).fit(X_train, y_train) >>> clf.score(X_test, y_test) - 0.94... + 0.94 We see that the accuracy was boosted to almost 100%. A cross validation strategy is recommended for a better estimate of the accuracy, if it diff --git a/doc/modules/model_persistence.rst b/doc/modules/model_persistence.rst deleted file mode 100644 index e00212d80fd10..0000000000000 --- a/doc/modules/model_persistence.rst +++ /dev/null @@ -1,121 +0,0 @@ -.. _model_persistence: - -================= -Model persistence -================= - -After training a scikit-learn model, it is desirable to have a way to persist -the model for future use without having to retrain. The following sections give -you some hints on how to persist a scikit-learn model. - -Python specific serialization ------------------------------ - -It is possible to save a model in scikit-learn by using Python's built-in -persistence model, namely `pickle -`_:: - - >>> from sklearn import svm - >>> from sklearn import datasets - >>> clf = svm.SVC() - >>> X, y= datasets.load_iris(return_X_y=True) - >>> clf.fit(X, y) - SVC() - - >>> import pickle - >>> s = pickle.dumps(clf) - >>> clf2 = pickle.loads(s) - >>> clf2.predict(X[0:1]) - array([0]) - >>> y[0] - 0 - -In the specific case of scikit-learn, it may be better to use joblib's -replacement of pickle (``dump`` & ``load``), which is more efficient on -objects that carry large numpy arrays internally as is often the case for -fitted scikit-learn estimators, but can only pickle to the disk and not to a -string:: - - >>> from joblib import dump, load - >>> dump(clf, 'filename.joblib') # doctest: +SKIP - -Later you can load back the pickled model (possibly in another Python process) -with:: - - >>> clf = load('filename.joblib') # doctest:+SKIP - -.. note:: - - ``dump`` and ``load`` functions also accept file-like object - instead of filenames. More information on data persistence with Joblib is - available `here - `_. - -.. _persistence_limitations: - -Security & maintainability limitations -...................................... - -pickle (and joblib by extension), has some issues regarding maintainability -and security. Because of this, - -* Never unpickle untrusted data as it could lead to malicious code being - executed upon loading. -* While models saved using one version of scikit-learn might load in - other versions, this is entirely unsupported and inadvisable. It should - also be kept in mind that operations performed on such data could give - different and unexpected results. - -In order to rebuild a similar model with future versions of scikit-learn, -additional metadata should be saved along the pickled model: - -* The training data, e.g. 
a reference to an immutable snapshot -* The python source code used to generate the model -* The versions of scikit-learn and its dependencies -* The cross validation score obtained on the training data - -This should make it possible to check that the cross-validation score is in the -same range as before. - -Aside for a few exceptions, pickled models should be portable across -architectures assuming the same versions of dependencies and Python are used. -If you encounter an estimator that is not portable please open an issue on -GitHub. Pickled models are often deployed in production using containers, like -Docker, in order to freeze the environment and dependencies. - -If you want to know more about these issues and explore other possible -serialization methods, please refer to this -`talk by Alex Gaynor -`_. - -Interoperable formats ---------------------- - -For reproducibility and quality control needs, when different architectures -and environments should be taken into account, exporting the model in -`Open Neural Network -Exchange `_ format or `Predictive Model Markup Language -(PMML) `_ format -might be a better approach than using `pickle` alone. -These are helpful where you may want to use your model for prediction in a -different environment from where the model was trained. - -ONNX is a binary serialization of the model. It has been developed to improve -the usability of the interoperable representation of data models. -It aims to facilitate the conversion of the data -models between different machine learning frameworks, and to improve their -portability on different computing architectures. More details are available -from the `ONNX tutorial `_. -To convert scikit-learn model to ONNX a specific tool `sklearn-onnx -`_ has been developed. - -PMML is an implementation of the `XML -`_ document standard -defined to represent data models together with the data used to generate them. -Being human and machine readable, -PMML is a good option for model validation on different platforms and -long term archiving. On the other hand, as XML in general, its verbosity does -not help in production when performance is critical. -To convert scikit-learn model to PMML you can use for example `sklearn2pmml -`_ distributed under the Affero GPLv3 -license. 
diff --git a/doc/modules/multiclass.rst b/doc/modules/multiclass.rst index b3ea8d838574e..ef7d6ab3000e1 100644 --- a/doc/modules/multiclass.rst +++ b/doc/modules/multiclass.rst @@ -63,8 +63,8 @@ can provide additional strategies beyond what is built-in: - :class:`semi_supervised.LabelSpreading` - :class:`discriminant_analysis.LinearDiscriminantAnalysis` - :class:`svm.LinearSVC` (setting multi_class="crammer_singer") - - :class:`linear_model.LogisticRegression` (setting multi_class="multinomial") - - :class:`linear_model.LogisticRegressionCV` (setting multi_class="multinomial") + - :class:`linear_model.LogisticRegression` (with most solvers) + - :class:`linear_model.LogisticRegressionCV` (with most solvers) - :class:`neural_network.MLPClassifier` - :class:`neighbors.NearestCentroid` - :class:`discriminant_analysis.QuadraticDiscriminantAnalysis` @@ -86,8 +86,8 @@ can provide additional strategies beyond what is built-in: - :class:`ensemble.GradientBoostingClassifier` - :class:`gaussian_process.GaussianProcessClassifier` (setting multi_class = "one_vs_rest") - :class:`svm.LinearSVC` (setting multi_class="ovr") - - :class:`linear_model.LogisticRegression` (setting multi_class="ovr") - - :class:`linear_model.LogisticRegressionCV` (setting multi_class="ovr") + - :class:`linear_model.LogisticRegression` (most solvers) + - :class:`linear_model.LogisticRegressionCV` (most solvers) - :class:`linear_model.SGDClassifier` - :class:`linear_model.Perceptron` - :class:`linear_model.PassiveAggressiveClassifier` @@ -102,6 +102,7 @@ can provide additional strategies beyond what is built-in: - :class:`neural_network.MLPClassifier` - :class:`neighbors.RadiusNeighborsClassifier` - :class:`ensemble.RandomForestClassifier` + - :class:`linear_model.RidgeClassifier` - :class:`linear_model.RidgeClassifierCV` @@ -146,35 +147,38 @@ Target format Valid :term:`multiclass` representations for :func:`~sklearn.utils.multiclass.type_of_target` (`y`) are: - - 1d or column vector containing more than two discrete values. An - example of a vector ``y`` for 4 samples: - - >>> import numpy as np - >>> y = np.array(['apple', 'pear', 'apple', 'orange']) - >>> print(y) - ['apple' 'pear' 'apple' 'orange'] - - - Dense or sparse :term:`binary` matrix of shape ``(n_samples, n_classes)`` - with a single sample per row, where each column represents one class. An - example of both a dense and sparse :term:`binary` matrix ``y`` for 4 - samples, where the columns, in order, are apple, orange, and pear: - - >>> import numpy as np - >>> from sklearn.preprocessing import LabelBinarizer - >>> y = np.array(['apple', 'pear', 'apple', 'orange']) - >>> y_dense = LabelBinarizer().fit_transform(y) - >>> print(y_dense) - [[1 0 0] - [0 0 1] - [1 0 0] - [0 1 0]] - >>> from scipy import sparse - >>> y_sparse = sparse.csr_matrix(y_dense) - >>> print(y_sparse) - (0, 0) 1 - (1, 2) 1 - (2, 0) 1 - (3, 1) 1 +- 1d or column vector containing more than two discrete values. An + example of a vector ``y`` for 4 samples: + + >>> import numpy as np + >>> y = np.array(['apple', 'pear', 'apple', 'orange']) + >>> print(y) + ['apple' 'pear' 'apple' 'orange'] + +- Dense or sparse :term:`binary` matrix of shape ``(n_samples, n_classes)`` + with a single sample per row, where each column represents one class. 
An + example of both a dense and sparse :term:`binary` matrix ``y`` for 4 + samples, where the columns, in order, are apple, orange, and pear: + + >>> import numpy as np + >>> from sklearn.preprocessing import LabelBinarizer + >>> y = np.array(['apple', 'pear', 'apple', 'orange']) + >>> y_dense = LabelBinarizer().fit_transform(y) + >>> print(y_dense) + [[1 0 0] + [0 0 1] + [1 0 0] + [0 1 0]] + >>> from scipy import sparse + >>> y_sparse = sparse.csr_matrix(y_dense) + >>> print(y_sparse) + + Coords Values + (0, 0) 1 + (1, 2) 1 + (2, 0) 1 + (3, 1) 1 For more information about :class:`~sklearn.preprocessing.LabelBinarizer`, refer to :ref:`preprocessing_targets`. @@ -221,9 +225,11 @@ in which cell [i, j] indicates the presence of label j in sample i. :scale: 75% -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_miscellaneous_plot_multilabel.py` +* :ref:`sphx_glr_auto_examples_miscellaneous_plot_multilabel.py` +* :ref:`sphx_glr_auto_examples_classification_plot_classification_probability.py` +* :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_multinomial.py` .. _ovo_classification: @@ -262,10 +268,10 @@ Below is an example of multiclass learning using OvO:: 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]) -.. topic:: References: +.. rubric:: References - * "Pattern Recognition and Machine Learning. Springer", - Christopher M. Bishop, page 183, (First Edition) +* "Pattern Recognition and Machine Learning. Springer", + Christopher M. Bishop, page 183, (First Edition) .. _ecoc: @@ -295,7 +301,7 @@ used. It is a percentage of the total number of classes. A number between 0 and 1 will require fewer classifiers than one-vs-the-rest. In theory, ``log2(n_classes) / n_classes`` is sufficient to represent each class unambiguously. However, in practice, it may not lead to -good accuracy since ``log2(n_classes)`` is much smaller than n_classes. +good accuracy since ``log2(n_classes)`` is much smaller than `n_classes`. A number greater than 1 will require more classifiers than one-vs-the-rest. In this case, some classifiers will in theory correct for @@ -310,8 +316,7 @@ Below is an example of multiclass learning using Output-Codes:: >>> from sklearn.multiclass import OutputCodeClassifier >>> from sklearn.svm import LinearSVC >>> X, y = datasets.load_iris(return_X_y=True) - >>> clf = OutputCodeClassifier(LinearSVC(random_state=0), - ... code_size=2, random_state=0) + >>> clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0) >>> clf.fit(X, y).predict(X) array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -321,21 +326,16 @@ Below is an example of multiclass learning using Output-Codes:: 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]) -.. topic:: References: +.. rubric:: References - * "Solving multiclass learning problems via error-correcting output codes", - Dietterich T., Bakiri G., - Journal of Artificial Intelligence Research 2, - 1995. +* "Solving multiclass learning problems via error-correcting output codes", + Dietterich T., Bakiri G., Journal of Artificial Intelligence Research 2, 1995. - .. [3] "The error coding method and PICTs", - James G., Hastie T., - Journal of Computational and Graphical statistics 7, - 1998. +.. [3] "The error coding method and PICTs", James G., Hastie T., + Journal of Computational and Graphical statistics 7, 1998. 
- * "The Elements of Statistical Learning", - Hastie T., Tibshirani R., Friedman J., page 606 (second-edition) - 2008. +* "The Elements of Statistical Learning", + Hastie T., Tibshirani R., Friedman J., page 606 (second-edition), 2008. .. _multilabel_classification: @@ -382,10 +382,13 @@ An example of the same ``y`` in sparse matrix form: >>> y_sparse = sparse.csr_matrix(y) >>> print(y_sparse) - (0, 0) 1 - (0, 3) 1 - (1, 2) 1 - (1, 3) 1 + + Coords Values + (0, 0) 1 + (0, 3) 1 + (1, 2) 1 + (1, 3) 1 .. _multioutputclassfier: @@ -400,33 +403,11 @@ to be able to estimate a series of target functions (f1,f2,f3...,fn) that are trained on a single X predictor matrix to predict a series of responses (y1,y2,y3...,yn). -Below is an example of multilabel classification: - - >>> from sklearn.datasets import make_classification - >>> from sklearn.multioutput import MultiOutputClassifier - >>> from sklearn.ensemble import RandomForestClassifier - >>> from sklearn.utils import shuffle - >>> import numpy as np - >>> X, y1 = make_classification(n_samples=10, n_features=100, n_informative=30, n_classes=3, random_state=1) - >>> y2 = shuffle(y1, random_state=1) - >>> y3 = shuffle(y1, random_state=2) - >>> Y = np.vstack((y1, y2, y3)).T - >>> n_samples, n_features = X.shape # 10,100 - >>> n_outputs = Y.shape[1] # 3 - >>> n_classes = 3 - >>> forest = RandomForestClassifier(random_state=1) - >>> multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1) - >>> multi_target_forest.fit(X, Y).predict(X) - array([[2, 2, 0], - [1, 2, 1], - [2, 1, 0], - [0, 0, 2], - [0, 2, 1], - [0, 0, 2], - [1, 1, 0], - [1, 1, 1], - [0, 0, 2], - [2, 0, 0]]) +You can find a usage example for +:class:`~sklearn.multioutput.MultiOutputClassifier` +as part of the section on :ref:`multiclass_multioutput_classification` +since it is a generalization of multilabel classification to +multiclass outputs instead of binary outputs. .. _classifierchain: @@ -454,10 +435,10 @@ one does not know the optimal ordering of the models in the chain so typically many randomly ordered chains are fit and their predictions are averaged together. -.. topic:: References: +.. rubric:: References - Jesse Read, Bernhard Pfahringer, Geoff Holmes, Eibe Frank, - "Classifier Chains for Multi-label Classification", 2009. +* Jesse Read, Bernhard Pfahringer, Geoff Holmes, Eibe Frank, + "Classifier Chains for Multi-label Classification", 2009. .. _multiclass_multioutput_classification: @@ -487,6 +468,36 @@ as a special case. Multitask classification is similar to the multioutput classification task with different model formulations. For more information, see the relevant estimator documentation. +Below is an example of multiclass-multioutput classification: + + >>> from sklearn.datasets import make_classification + >>> from sklearn.multioutput import MultiOutputClassifier + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.utils import shuffle + >>> import numpy as np + >>> X, y1 = make_classification(n_samples=10, n_features=100, + ... n_informative=30, n_classes=3, + ... 
random_state=1) + >>> y2 = shuffle(y1, random_state=1) + >>> y3 = shuffle(y1, random_state=2) + >>> Y = np.vstack((y1, y2, y3)).T + >>> n_samples, n_features = X.shape # 10,100 + >>> n_outputs = Y.shape[1] # 3 + >>> n_classes = 3 + >>> forest = RandomForestClassifier(random_state=1) + >>> multi_target_forest = MultiOutputClassifier(forest, n_jobs=2) + >>> multi_target_forest.fit(X, Y).predict(X) + array([[2, 2, 0], + [1, 2, 1], + [2, 1, 0], + [0, 0, 2], + [0, 2, 1], + [0, 0, 2], + [1, 1, 0], + [1, 1, 1], + [0, 0, 2], + [2, 0, 0]]) + .. warning:: At present, no metric in :mod:`sklearn.metrics` supports the multiclass-multioutput classification task. @@ -520,11 +531,42 @@ using data obtained at a certain location. Each sample would be data obtained at one location and both wind speed and direction would be output for each sample. +The following regressors natively support multioutput regression: + +- :class:`cross_decomposition.CCA` +- :class:`tree.DecisionTreeRegressor` +- :class:`dummy.DummyRegressor` +- :class:`linear_model.ElasticNet` +- :class:`tree.ExtraTreeRegressor` +- :class:`ensemble.ExtraTreesRegressor` +- :class:`gaussian_process.GaussianProcessRegressor` +- :class:`neighbors.KNeighborsRegressor` +- :class:`kernel_ridge.KernelRidge` +- :class:`linear_model.Lars` +- :class:`linear_model.Lasso` +- :class:`linear_model.LassoLars` +- :class:`linear_model.LinearRegression` +- :class:`multioutput.MultiOutputRegressor` +- :class:`linear_model.MultiTaskElasticNet` +- :class:`linear_model.MultiTaskElasticNetCV` +- :class:`linear_model.MultiTaskLasso` +- :class:`linear_model.MultiTaskLassoCV` +- :class:`linear_model.OrthogonalMatchingPursuit` +- :class:`cross_decomposition.PLSCanonical` +- :class:`cross_decomposition.PLSRegression` +- :class:`linear_model.RANSACRegressor` +- :class:`neighbors.RadiusNeighborsRegressor` +- :class:`ensemble.RandomForestRegressor` +- :class:`multioutput.RegressorChain` +- :class:`linear_model.Ridge` +- :class:`linear_model.RidgeCV` +- :class:`compose.TransformedTargetRegressor` + Target format ------------- A valid representation of :term:`multioutput` `y` is a dense matrix of shape -``(n_samples, n_classes)`` of floats. A column wise concatenation of +``(n_samples, n_output)`` of floats. A column wise concatenation of :term:`continuous` variables. An example of ``y`` for 3 samples: >>> y = np.array([[31.4, 94], [40.5, 109], [25.0, 30]]) diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index b2dd4cf5a7cd3..b25334a902050 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -69,11 +69,11 @@ On the flip side, although naive Bayes is known as a decent classifier, it is known to be a bad estimator, so the probability outputs from ``predict_proba`` are not to be taken too seriously. -.. topic:: References: +.. dropdown:: References - * H. Zhang (2004). `The optimality of Naive Bayes. - `_ - Proc. FLAIRS. + * H. Zhang (2004). `The optimality of Naive Bayes. + `_ + Proc. FLAIRS. .. _gaussian_naive_bayes: @@ -117,7 +117,7 @@ for each class :math:`y`, where :math:`n` is the number of features and :math:`\theta_{yi}` is the probability :math:`P(x_i \mid y)` of feature :math:`i` appearing in a sample belonging to class :math:`y`. -The parameters :math:`\theta_y` is estimated by a smoothed +The parameters :math:`\theta_y` are estimated by a smoothed version of maximum likelihood, i.e. relative frequency counting: .. math:: @@ -125,13 +125,13 @@ version of maximum likelihood, i.e. 
relative frequency counting: \hat{\theta}_{yi} = \frac{ N_{yi} + \alpha}{N_y + \alpha n} where :math:`N_{yi} = \sum_{x \in T} x_i` is -the number of times feature :math:`i` appears in a sample of class :math:`y` +the number of times feature :math:`i` appears in all samples of class :math:`y` in the training set :math:`T`, and :math:`N_{y} = \sum_{i=1}^{n} N_{yi}` is the total count of all features for class :math:`y`. -The smoothing priors :math:`\alpha \ge 0` accounts for -features not present in the learning samples and prevents zero probabilities +The smoothing priors :math:`\alpha \ge 0` account for +features not present in the learning samples and prevent zero probabilities in further computations. Setting :math:`\alpha = 1` is called Laplace smoothing, while :math:`\alpha < 1` is called Lidstone smoothing. @@ -147,38 +147,42 @@ that is particularly suited for imbalanced data sets. Specifically, CNB uses statistics from the *complement* of each class to compute the model's weights. The inventors of CNB show empirically that the parameter estimates for CNB are more stable than those for MNB. Further, CNB regularly outperforms MNB (often -by a considerable margin) on text classification tasks. The procedure for -calculating the weights is as follows: +by a considerable margin) on text classification tasks. -.. math:: +.. dropdown:: Weights calculation - \hat{\theta}_{ci} = \frac{\alpha_i + \sum_{j:y_j \neq c} d_{ij}} - {\alpha + \sum_{j:y_j \neq c} \sum_{k} d_{kj}} + The procedure for calculating the weights is as follows: - w_{ci} = \log \hat{\theta}_{ci} + .. math:: - w_{ci} = \frac{w_{ci}}{\sum_{j} |w_{cj}|} + \hat{\theta}_{ci} = \frac{\alpha_i + \sum_{j:y_j \neq c} d_{ij}} + {\alpha + \sum_{j:y_j \neq c} \sum_{k} d_{kj}} -where the summations are over all documents :math:`j` not in class :math:`c`, -:math:`d_{ij}` is either the count or tf-idf value of term :math:`i` in document -:math:`j`, :math:`\alpha_i` is a smoothing hyperparameter like that found in -MNB, and :math:`\alpha = \sum_{i} \alpha_i`. The second normalization addresses -the tendency for longer documents to dominate parameter estimates in MNB. The -classification rule is: + w_{ci} = \log \hat{\theta}_{ci} -.. math:: + w_{ci} = \frac{w_{ci}}{\sum_{j} |w_{cj}|} + + where the summations are over all documents :math:`j` not in class :math:`c`, + :math:`d_{ij}` is either the count or tf-idf value of term :math:`i` in document + :math:`j`, :math:`\alpha_i` is a smoothing hyperparameter like that found in + MNB, and :math:`\alpha = \sum_{i} \alpha_i`. The second normalization addresses + the tendency for longer documents to dominate parameter estimates in MNB. The + classification rule is: + + .. math:: - \hat{c} = \arg\min_c \sum_{i} t_i w_{ci} + \hat{c} = \arg\min_c \sum_{i} t_i w_{ci} -i.e., a document is assigned to the class that is the *poorest* complement -match. + i.e., a document is assigned to the class that is the *poorest* complement + match. -.. topic:: References: +.. dropdown:: References + + * Rennie, J. D., Shih, L., Teevan, J., & Karger, D. R. (2003). + `Tackling the poor assumptions of naive bayes text classifiers. + `_ + In ICML (Vol. 3, pp. 616-623). - * Rennie, J. D., Shih, L., Teevan, J., & Karger, D. R. (2003). - `Tackling the poor assumptions of naive bayes text classifiers. - `_ - In ICML (Vol. 3, pp. 616-623). .. 
_bernoulli_naive_bayes: @@ -190,14 +194,14 @@ algorithms for data that is distributed according to multivariate Bernoulli distributions; i.e., there may be multiple features but each one is assumed to be a binary-valued (Bernoulli, boolean) variable. Therefore, this class requires samples to be represented as binary-valued -feature vectors; if handed any other kind of data, a ``BernoulliNB`` instance +feature vectors; if handed any other kind of data, a :class:`BernoulliNB` instance may binarize its input (depending on the ``binarize`` parameter). The decision rule for Bernoulli naive Bayes is based on .. math:: - P(x_i \mid y) = P(i \mid y) x_i + (1 - P(i \mid y)) (1 - x_i) + P(x_i \mid y) = P(x_i = 1 \mid y) x_i + (1 - P(x_i = 1 \mid y)) (1 - x_i) which differs from multinomial NB's rule in that it explicitly penalizes the non-occurrence of a feature :math:`i` @@ -205,57 +209,60 @@ that is an indicator for class :math:`y`, where the multinomial variant would simply ignore a non-occurring feature. In the case of text classification, word occurrence vectors (rather than word -count vectors) may be used to train and use this classifier. ``BernoulliNB`` +count vectors) may be used to train and use this classifier. :class:`BernoulliNB` might perform better on some datasets, especially those with shorter documents. It is advisable to evaluate both models, if time permits. -.. topic:: References: +.. dropdown:: References + + * C.D. Manning, P. Raghavan and H. SchÃŧtze (2008). Introduction to + Information Retrieval. Cambridge University Press, pp. 234-265. - * C.D. Manning, P. Raghavan and H. SchÃŧtze (2008). Introduction to - Information Retrieval. Cambridge University Press, pp. 234-265. + * A. McCallum and K. Nigam (1998). + `A comparison of event models for Naive Bayes text classification. + `_ + Proc. AAAI/ICML-98 Workshop on Learning for Text Categorization, pp. 41-48. - * A. McCallum and K. Nigam (1998). - `A comparison of event models for Naive Bayes text classification. - `_ - Proc. AAAI/ICML-98 Workshop on Learning for Text Categorization, pp. 41-48. + * V. Metsis, I. Androutsopoulos and G. Paliouras (2006). + `Spam filtering with Naive Bayes -- Which Naive Bayes? + `_ + 3rd Conf. on Email and Anti-Spam (CEAS). - * V. Metsis, I. Androutsopoulos and G. Paliouras (2006). - `Spam filtering with Naive Bayes -- Which Naive Bayes? - `_ - 3rd Conf. on Email and Anti-Spam (CEAS). .. _categorical_naive_bayes: Categorical Naive Bayes ----------------------- -:class:`CategoricalNB` implements the categorical naive Bayes -algorithm for categorically distributed data. It assumes that each feature, -which is described by the index :math:`i`, has its own categorical -distribution. +:class:`CategoricalNB` implements the categorical naive Bayes +algorithm for categorically distributed data. It assumes that each feature, +which is described by the index :math:`i`, has its own categorical +distribution. For each feature :math:`i` in the training set :math:`X`, :class:`CategoricalNB` estimates a categorical distribution for each feature i of X conditioned on the class y. The index set of the samples is defined as :math:`J = \{ 1, \dots, m \}`, with :math:`m` as the number of samples. -The probability of category :math:`t` in feature :math:`i` given class -:math:`c` is estimated as: +.. dropdown:: Probability calculation -.. math:: + The probability of category :math:`t` in feature :math:`i` given class + :math:`c` is estimated as: + + .. 
math:: - P(x_i = t \mid y = c \: ;\, \alpha) = \frac{ N_{tic} + \alpha}{N_{c} + - \alpha n_i}, + P(x_i = t \mid y = c \: ;\, \alpha) = \frac{ N_{tic} + \alpha}{N_{c} + + \alpha n_i}, -where :math:`N_{tic} = |\{j \in J \mid x_{ij} = t, y_j = c\}|` is the number -of times category :math:`t` appears in the samples :math:`x_{i}`, which belong -to class :math:`c`, :math:`N_{c} = |\{ j \in J\mid y_j = c\}|` is the number -of samples with class c, :math:`\alpha` is a smoothing parameter and -:math:`n_i` is the number of available categories of feature :math:`i`. + where :math:`N_{tic} = |\{j \in J \mid x_{ij} = t, y_j = c\}|` is the number + of times category :math:`t` appears in the samples :math:`x_{i}`, which belong + to class :math:`c`, :math:`N_{c} = |\{ j \in J\mid y_j = c\}|` is the number + of samples with class c, :math:`\alpha` is a smoothing parameter and + :math:`n_i` is the number of available categories of feature :math:`i`. -:class:`CategoricalNB` assumes that the sample matrix :math:`X` is encoded -(for instance with the help of :class:`OrdinalEncoder`) such that all -categories for each feature :math:`i` are represented with numbers +:class:`CategoricalNB` assumes that the sample matrix :math:`X` is encoded (for +instance with the help of :class:`~sklearn.preprocessing.OrdinalEncoder`) such +that all categories for each feature :math:`i` are represented with numbers :math:`0, ..., n_i - 1` where :math:`n_i` is the number of available categories of feature :math:`i`. diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index bb84b79e8570a..82caa397b60d2 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -59,12 +59,12 @@ The choice of neighbors search algorithm is controlled through the keyword from the training data. For a discussion of the strengths and weaknesses of each option, see `Nearest Neighbor Algorithms`_. - .. warning:: +.. warning:: - Regarding the Nearest Neighbors algorithms, if two - neighbors :math:`k+1` and :math:`k` have identical distances - but different labels, the result will depend on the ordering of the - training data. + Regarding the Nearest Neighbors algorithms, if two + neighbors :math:`k+1` and :math:`k` have identical distances + but different labels, the result will depend on the ordering of the + training data. Finding the Nearest Neighbors ----------------------------- @@ -136,8 +136,13 @@ have the same interface; we'll show an example of using the KD Tree here: Refer to the :class:`KDTree` and :class:`BallTree` class documentation for more information on the options available for nearest neighbors searches, including specification of query strategies, distance metrics, etc. For a list -of available metrics, see the documentation of the :class:`DistanceMetric` -class. +of valid metrics use `KDTree.valid_metrics` and `BallTree.valid_metrics`: + + >>> from sklearn.neighbors import KDTree, BallTree + >>> KDTree.valid_metrics + ['euclidean', 'l2', 'minkowski', 'p', 'manhattan', 'cityblock', 'l1', 'chebyshev', 'infinity'] + >>> BallTree.valid_metrics + ['euclidean', 'l2', 'minkowski', 'p', 'manhattan', 'cityblock', 'l1', 'chebyshev', 'infinity', 'seuclidean', 'mahalanobis', 'hamming', 'canberra', 'braycurtis', 'jaccard', 'dice', 'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath', 'haversine', 'pyfunc'] .. _classification: @@ -183,18 +188,14 @@ distance can be supplied to compute the weights. .. 
|classification_1| image:: ../auto_examples/neighbors/images/sphx_glr_plot_classification_001.png :target: ../auto_examples/neighbors/plot_classification.html - :scale: 50 - -.. |classification_2| image:: ../auto_examples/neighbors/images/sphx_glr_plot_classification_002.png - :target: ../auto_examples/neighbors/plot_classification.html - :scale: 50 + :scale: 75 -.. centered:: |classification_1| |classification_2| +.. centered:: |classification_1| -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_neighbors_plot_classification.py`: an example of - classification using nearest neighbors. +* :ref:`sphx_glr_auto_examples_neighbors_plot_classification.py`: an example of + classification using nearest neighbors. .. _regression: @@ -240,13 +241,13 @@ the lower half of those faces. :align: center -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_neighbors_plot_regression.py`: an example of regression - using nearest neighbors. +* :ref:`sphx_glr_auto_examples_neighbors_plot_regression.py`: an example of regression + using nearest neighbors. - * :ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py`: an example of - multi-output regression using nearest neighbors. +* :ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py`: + an example of multi-output regression using nearest neighbors. Nearest Neighbor Algorithms @@ -304,11 +305,11 @@ keyword ``algorithm = 'kd_tree'``, and are computed using the class :class:`KDTree`. -.. topic:: References: +.. dropdown:: References - * `"Multidimensional binary search trees used for associative searching" - `_, - Bentley, J.L., Communications of the ACM (1975) + * `"Multidimensional binary search trees used for associative searching" + `_, + Bentley, J.L., Communications of the ACM (1975) .. _ball_tree: @@ -342,127 +343,142 @@ neighbors searches are specified using the keyword ``algorithm = 'ball_tree'``, and are computed using the class :class:`BallTree`. Alternatively, the user can work with the :class:`BallTree` class directly. -.. topic:: References: - - * `"Five balltree construction algorithms" - `_, - Omohundro, S.M., International Computer Science Institute - Technical Report (1989) - -Choice of Nearest Neighbors Algorithm -------------------------------------- -The optimal algorithm for a given dataset is a complicated choice, and -depends on a number of factors: - -* number of samples :math:`N` (i.e. ``n_samples``) and dimensionality - :math:`D` (i.e. ``n_features``). - - * *Brute force* query time grows as :math:`O[D N]` - * *Ball tree* query time grows as approximately :math:`O[D \log(N)]` - * *KD tree* query time changes with :math:`D` in a way that is difficult - to precisely characterise. For small :math:`D` (less than 20 or so) - the cost is approximately :math:`O[D\log(N)]`, and the KD tree - query can be very efficient. - For larger :math:`D`, the cost increases to nearly :math:`O[DN]`, and - the overhead due to the tree - structure can lead to queries which are slower than brute force. - - For small data sets (:math:`N` less than 30 or so), :math:`\log(N)` is - comparable to :math:`N`, and brute force algorithms can be more efficient - than a tree-based approach. Both :class:`KDTree` and :class:`BallTree` - address this through providing a *leaf size* parameter: this controls the - number of samples at which a query switches to brute-force. 
This allows both - algorithms to approach the efficiency of a brute-force computation for small - :math:`N`. - -* data structure: *intrinsic dimensionality* of the data and/or *sparsity* - of the data. Intrinsic dimensionality refers to the dimension - :math:`d \le D` of a manifold on which the data lies, which can be linearly - or non-linearly embedded in the parameter space. Sparsity refers to the - degree to which the data fills the parameter space (this is to be - distinguished from the concept as used in "sparse" matrices. The data - matrix may have no zero entries, but the **structure** can still be - "sparse" in this sense). - - * *Brute force* query time is unchanged by data structure. - * *Ball tree* and *KD tree* query times can be greatly influenced - by data structure. In general, sparser data with a smaller intrinsic - dimensionality leads to faster query times. Because the KD tree - internal representation is aligned with the parameter axes, it will not - generally show as much improvement as ball tree for arbitrarily - structured data. - - Datasets used in machine learning tend to be very structured, and are - very well-suited for tree-based queries. - -* number of neighbors :math:`k` requested for a query point. - - * *Brute force* query time is largely unaffected by the value of :math:`k` - * *Ball tree* and *KD tree* query time will become slower as :math:`k` - increases. This is due to two effects: first, a larger :math:`k` leads - to the necessity to search a larger portion of the parameter space. - Second, using :math:`k > 1` requires internal queueing of results - as the tree is traversed. - - As :math:`k` becomes large compared to :math:`N`, the ability to prune - branches in a tree-based query is reduced. In this situation, Brute force - queries can be more efficient. - -* number of query points. Both the ball tree and the KD Tree - require a construction phase. The cost of this construction becomes - negligible when amortized over many queries. If only a small number of - queries will be performed, however, the construction can make up - a significant fraction of the total cost. If very few query points - will be required, brute force is better than a tree-based method. - -Currently, ``algorithm = 'auto'`` selects ``'brute'`` if any of the following -conditions are verified: - -* input data is sparse -* ``metric = 'precomputed'`` -* :math:`D > 15` -* :math:`k >= N/2` -* ``effective_metric_`` isn't in the ``VALID_METRICS`` list for either - ``'kd_tree'`` or ``'ball_tree'`` - -Otherwise, it selects the first out of ``'kd_tree'`` and ``'ball_tree'`` that -has ``effective_metric_`` in its ``VALID_METRICS`` list. This heuristic is -based on the following assumptions: - -* the number of query points is at least the same order as the number of - training points -* ``leaf_size`` is close to its default value of ``30`` -* when :math:`D > 15`, the intrinsic dimensionality of the data is generally - too high for tree-based methods - -Effect of ``leaf_size`` ------------------------ -As noted above, for small sample sizes a brute force search can be more -efficient than a tree-based query. This fact is accounted for in the ball -tree and KD tree by internally switching to brute force searches within -leaf nodes. The level of this switch can be specified with the parameter -``leaf_size``. 
This parameter choice has many effects: - -**construction time** - A larger ``leaf_size`` leads to a faster tree construction time, because - fewer nodes need to be created - -**query time** - Both a large or small ``leaf_size`` can lead to suboptimal query cost. - For ``leaf_size`` approaching 1, the overhead involved in traversing - nodes can significantly slow query times. For ``leaf_size`` approaching - the size of the training set, queries become essentially brute force. - A good compromise between these is ``leaf_size = 30``, the default value - of the parameter. - -**memory** - As ``leaf_size`` increases, the memory required to store a tree structure - decreases. This is especially important in the case of ball tree, which - stores a :math:`D`-dimensional centroid for each node. The required - storage space for :class:`BallTree` is approximately ``1 / leaf_size`` times - the size of the training set. - -``leaf_size`` is not referenced for brute force queries. + +.. dropdown:: References + + * `"Five Balltree Construction Algorithms" + `_, + Omohundro, S.M., International Computer Science Institute + Technical Report (1989) + +.. dropdown:: Choice of Nearest Neighbors Algorithm + + The optimal algorithm for a given dataset is a complicated choice, and + depends on a number of factors: + + * number of samples :math:`N` (i.e. ``n_samples``) and dimensionality + :math:`D` (i.e. ``n_features``). + + * *Brute force* query time grows as :math:`O[D N]` + * *Ball tree* query time grows as approximately :math:`O[D \log(N)]` + * *KD tree* query time changes with :math:`D` in a way that is difficult + to precisely characterise. For small :math:`D` (less than 20 or so) + the cost is approximately :math:`O[D\log(N)]`, and the KD tree + query can be very efficient. + For larger :math:`D`, the cost increases to nearly :math:`O[DN]`, and + the overhead due to the tree + structure can lead to queries which are slower than brute force. + + For small data sets (:math:`N` less than 30 or so), :math:`\log(N)` is + comparable to :math:`N`, and brute force algorithms can be more efficient + than a tree-based approach. Both :class:`KDTree` and :class:`BallTree` + address this through providing a *leaf size* parameter: this controls the + number of samples at which a query switches to brute-force. This allows both + algorithms to approach the efficiency of a brute-force computation for small + :math:`N`. + + * data structure: *intrinsic dimensionality* of the data and/or *sparsity* + of the data. Intrinsic dimensionality refers to the dimension + :math:`d \le D` of a manifold on which the data lies, which can be linearly + or non-linearly embedded in the parameter space. Sparsity refers to the + degree to which the data fills the parameter space (this is to be + distinguished from the concept as used in "sparse" matrices. The data + matrix may have no zero entries, but the **structure** can still be + "sparse" in this sense). + + * *Brute force* query time is unchanged by data structure. + * *Ball tree* and *KD tree* query times can be greatly influenced + by data structure. In general, sparser data with a smaller intrinsic + dimensionality leads to faster query times. Because the KD tree + internal representation is aligned with the parameter axes, it will not + generally show as much improvement as ball tree for arbitrarily + structured data. + + Datasets used in machine learning tend to be very structured, and are + very well-suited for tree-based queries. 
+ + * number of neighbors :math:`k` requested for a query point. + + * *Brute force* query time is largely unaffected by the value of :math:`k` + * *Ball tree* and *KD tree* query time will become slower as :math:`k` + increases. This is due to two effects: first, a larger :math:`k` leads + to the necessity to search a larger portion of the parameter space. + Second, using :math:`k > 1` requires internal queueing of results + as the tree is traversed. + + As :math:`k` becomes large compared to :math:`N`, the ability to prune + branches in a tree-based query is reduced. In this situation, Brute force + queries can be more efficient. + + * number of query points. Both the ball tree and the KD Tree + require a construction phase. The cost of this construction becomes + negligible when amortized over many queries. If only a small number of + queries will be performed, however, the construction can make up + a significant fraction of the total cost. If very few query points + will be required, brute force is better than a tree-based method. + + Currently, ``algorithm = 'auto'`` selects ``'brute'`` if any of the following + conditions are verified: + + * input data is sparse + * ``metric = 'precomputed'`` + * :math:`D > 15` + * :math:`k >= N/2` + * ``effective_metric_`` isn't in the ``VALID_METRICS`` list for either + ``'kd_tree'`` or ``'ball_tree'`` + + Otherwise, it selects the first out of ``'kd_tree'`` and ``'ball_tree'`` that + has ``effective_metric_`` in its ``VALID_METRICS`` list. This heuristic is + based on the following assumptions: + + * the number of query points is at least the same order as the number of + training points + * ``leaf_size`` is close to its default value of ``30`` + * when :math:`D > 15`, the intrinsic dimensionality of the data is generally + too high for tree-based methods + +.. dropdown:: Effect of ``leaf_size`` + + As noted above, for small sample sizes a brute force search can be more + efficient than a tree-based query. This fact is accounted for in the ball + tree and KD tree by internally switching to brute force searches within + leaf nodes. The level of this switch can be specified with the parameter + ``leaf_size``. This parameter choice has many effects: + + **construction time** + A larger ``leaf_size`` leads to a faster tree construction time, because + fewer nodes need to be created + + **query time** + Both a large or small ``leaf_size`` can lead to suboptimal query cost. + For ``leaf_size`` approaching 1, the overhead involved in traversing + nodes can significantly slow query times. For ``leaf_size`` approaching + the size of the training set, queries become essentially brute force. + A good compromise between these is ``leaf_size = 30``, the default value + of the parameter. + + **memory** + As ``leaf_size`` increases, the memory required to store a tree structure + decreases. This is especially important in the case of ball tree, which + stores a :math:`D`-dimensional centroid for each node. The required + storage space for :class:`BallTree` is approximately ``1 / leaf_size`` times + the size of the training set. + + ``leaf_size`` is not referenced for brute force queries. + +.. dropdown:: Valid Metrics for Nearest Neighbor Algorithms + + For a list of available metrics, see the documentation of the + :class:`~sklearn.metrics.DistanceMetric` class and the metrics listed in + `sklearn.metrics.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`. Note that the "cosine" + metric uses :func:`~sklearn.metrics.pairwise.cosine_distances`. 
+ + A list of valid metrics for any of the above algorithms can be obtained by using their + ``valid_metric`` attribute. For example, valid metrics for ``KDTree`` can be generated by: + + >>> from sklearn.neighbors import KDTree + >>> print(sorted(KDTree.valid_metrics)) + ['chebyshev', 'cityblock', 'euclidean', 'infinity', 'l1', 'l2', 'manhattan', 'minkowski', 'p'] .. _nearest_centroid_classifier: @@ -515,10 +531,10 @@ the model from 0.81 to 0.82. .. centered:: |nearest_centroid_1| |nearest_centroid_2| -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_neighbors_plot_nearest_centroid.py`: an example of - classification using nearest centroid with different shrink thresholds. +* :ref:`sphx_glr_auto_examples_neighbors_plot_nearest_centroid.py`: an example of + classification using nearest centroid with different shrink thresholds. .. _neighbors_transformer: @@ -544,17 +560,24 @@ a scikit-learn pipeline, one can also use the corresponding classes :class:`KNeighborsTransformer` and :class:`RadiusNeighborsTransformer`. The benefits of this sparse graph API are multiple. -First, the precomputed graph can be re-used multiple times, for instance while +First, the precomputed graph can be reused multiple times, for instance while varying a parameter of the estimator. This can be done manually by the user, or using the caching properties of the scikit-learn pipeline: + >>> import tempfile >>> from sklearn.manifold import Isomap >>> from sklearn.neighbors import KNeighborsTransformer >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.datasets import make_regression + >>> cache_path = tempfile.gettempdir() # we use a temporary folder here + >>> X, _ = make_regression(n_samples=50, n_features=25, random_state=0) >>> estimator = make_pipeline( - ... KNeighborsTransformer(n_neighbors=5, mode='distance'), - ... Isomap(neighbors_algorithm='precomputed'), - ... memory='/path/to/cache') + ... KNeighborsTransformer(mode='distance'), + ... Isomap(n_components=3, metric='precomputed'), + ... memory=cache_path) + >>> X_embedded = estimator.fit_transform(X) + >>> X_embedded.shape + (50, 3) Second, precomputing the graph can give finer control on the nearest neighbors estimation, for instance enabling multiprocessing though the parameter @@ -596,17 +619,17 @@ implementation with special data types. The precomputed neighbors include one extra neighbor in a custom nearest neighbors estimator, since unnecessary neighbors will be filtered by following estimators. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_neighbors_approximate_nearest_neighbors.py`: - an example of pipelining :class:`KNeighborsTransformer` and - :class:`~sklearn.manifold.TSNE`. Also proposes two custom nearest neighbors - estimators based on external packages. +* :ref:`sphx_glr_auto_examples_neighbors_approximate_nearest_neighbors.py`: + an example of pipelining :class:`KNeighborsTransformer` and + :class:`~sklearn.manifold.TSNE`. Also proposes two custom nearest neighbors + estimators based on external packages. - * :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py`: - an example of pipelining :class:`KNeighborsTransformer` and - :class:`KNeighborsClassifier` to enable caching of the neighbors graph - during a hyper-parameter grid-search. 
+* :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py`: + an example of pipelining :class:`KNeighborsTransformer` and + :class:`KNeighborsClassifier` to enable caching of the neighbors graph + during a hyper-parameter grid-search. .. _nca: @@ -730,11 +753,11 @@ by each method. Each data sample belongs to one of 10 classes. .. centered:: |nca_dim_reduction_1| |nca_dim_reduction_2| |nca_dim_reduction_3| -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_neighbors_plot_nca_classification.py` - * :ref:`sphx_glr_auto_examples_neighbors_plot_nca_dim_reduction.py` - * :ref:`sphx_glr_auto_examples_manifold_plot_lle_digits.py` +* :ref:`sphx_glr_auto_examples_neighbors_plot_nca_classification.py` +* :ref:`sphx_glr_auto_examples_neighbors_plot_nca_dim_reduction.py` +* :ref:`sphx_glr_auto_examples_manifold_plot_lle_digits.py` .. _nca_mathematical_formulation: @@ -767,18 +790,16 @@ space: p_{i j} = \frac{\exp(-||L x_i - L x_j||^2)}{\sum\limits_{k \ne i} {\exp{-(||L x_i - L x_k||^2)}}} , \quad p_{i i} = 0 +.. dropdown:: Mahalanobis distance -Mahalanobis distance -^^^^^^^^^^^^^^^^^^^^ + NCA can be seen as learning a (squared) Mahalanobis distance metric: -NCA can be seen as learning a (squared) Mahalanobis distance metric: - -.. math:: + .. math:: - || L(x_i - x_j)||^2 = (x_i - x_j)^TM(x_i - x_j), + || L(x_i - x_j)||^2 = (x_i - x_j)^TM(x_i - x_j), -where :math:`M = L^T L` is a symmetric positive semi-definite matrix of size -``(n_features, n_features)``. + where :math:`M = L^T L` is a symmetric positive semi-definite matrix of size + ``(n_features, n_features)``. Implementation @@ -811,12 +832,12 @@ complexity equals ``n_components * n_features * n_samples_test``. There is no added space complexity in the operation. -.. topic:: References: +.. rubric:: References - .. [1] `"Neighbourhood Components Analysis" - `_, - J. Goldberger, S. Roweis, G. Hinton, R. Salakhutdinov, Advances in - Neural Information Processing Systems, Vol. 17, May 2005, pp. 513-520. +.. [1] `"Neighbourhood Components Analysis" + `_, + J. Goldberger, S. Roweis, G. Hinton, R. Salakhutdinov, Advances in + Neural Information Processing Systems, Vol. 17, May 2005, pp. 513-520. - `Wikipedia entry on Neighborhood Components Analysis - `_ +* `Wikipedia entry on Neighborhood Components Analysis + `_ diff --git a/doc/modules/neural_networks_supervised.rst b/doc/modules/neural_networks_supervised.rst index b9f4e17c6a575..13611b7f52775 100644 --- a/doc/modules/neural_networks_supervised.rst +++ b/doc/modules/neural_networks_supervised.rst @@ -20,7 +20,7 @@ Multi-layer Perceptron ====================== **Multi-layer Perceptron (MLP)** is a supervised learning algorithm that learns -a function :math:`f(\cdot): R^m \rightarrow R^o` by training on a dataset, +a function :math:`f: R^m \rightarrow R^o` by training on a dataset, where :math:`m` is the number of dimensions for input and :math:`o` is the number of dimensions for output. Given a set of features :math:`X = {x_1, x_2, ..., x_m}` and a target :math:`y`, it can learn a non-linear function approximator for either @@ -49,27 +49,29 @@ The module contains the public attributes ``coefs_`` and ``intercepts_``. :math:`i+1`. ``intercepts_`` is a list of bias vectors, where the vector at index :math:`i` represents the bias values added to layer :math:`i+1`. -The advantages of Multi-layer Perceptron are: +.. dropdown:: Advantages and disadvantages of Multi-layer Perceptron - + Capability to learn non-linear models. 
+ The advantages of Multi-layer Perceptron are: - + Capability to learn models in real-time (on-line learning) - using ``partial_fit``. + + Capability to learn non-linear models. + + Capability to learn models in real-time (on-line learning) + using ``partial_fit``. -The disadvantages of Multi-layer Perceptron (MLP) include: - + MLP with hidden layers have a non-convex loss function where there exists - more than one local minimum. Therefore different random weight - initializations can lead to different validation accuracy. + The disadvantages of Multi-layer Perceptron (MLP) include: - + MLP requires tuning a number of hyperparameters such as the number of - hidden neurons, layers, and iterations. + + MLP with hidden layers has a non-convex loss function where there exists + more than one local minimum. Therefore, different random weight + initializations can lead to different validation accuracy. - + MLP is sensitive to feature scaling. + + MLP requires tuning a number of hyperparameters such as the number of + hidden neurons, layers, and iterations. -Please see :ref:`Tips on Practical Use ` section that addresses -some of these disadvantages. + + MLP is sensitive to feature scaling. + + Please see :ref:`Tips on Practical Use ` section that addresses + some of these disadvantages. Classification @@ -114,8 +116,8 @@ classification, it minimizes the Cross-Entropy loss function, giving a vector of probability estimates :math:`P(y|x)` per sample :math:`x`:: >>> clf.predict_proba([[2., 2.], [1., 2.]]) - array([[1.967...e-04, 9.998...-01], - [1.967...e-04, 9.998...-01]]) + array([[1.967e-04, 9.998e-01], + [1.967e-04, 9.998e-01]]) :class:`MLPClassifier` supports multi-class classification by applying `Softmax `_ @@ -125,7 +127,7 @@ Further, the model supports :ref:`multi-label classification ` in which a sample can belong to more than one class. For each class, the raw output passes through the logistic function. Values larger or equal to `0.5` are rounded to `1`, otherwise to `0`. For a predicted output of a sample, the -indices where the value is `1` represents the assigned classes of that sample:: +indices where the value is `1` represent the assigned classes of that sample:: >>> X = [[0., 0.], [1., 1.]] >>> y = [[0, 1], [1, 1]] @@ -143,10 +145,11 @@ indices where the value is `1` represents the assigned classes of that sample:: See the examples below and the docstring of :meth:`MLPClassifier.fit` for further information. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_training_curves.py` - * :ref:`sphx_glr_auto_examples_neural_networks_plot_mnist_filters.py` +* :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_training_curves.py` +* See :ref:`sphx_glr_auto_examples_neural_networks_plot_mnist_filters.py` for + visualized representation of trained weights. Regression ========== @@ -175,16 +178,16 @@ decision function with value of alpha. See the examples below for further information. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_alpha.py` +* :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_alpha.py` Algorithms ========== MLP trains using `Stochastic Gradient Descent `_, -`Adam `_, or +:arxiv:`Adam <1412.6980>`, or `L-BFGS `__. Stochastic Gradient Descent (SGD) updates parameters using the gradient of the loss function with respect to a parameter that needs adaptation, i.e. @@ -199,7 +202,7 @@ the parameter space search. :math:`Loss` is the loss function used for the network. 
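A minimal numeric sketch of a single such stochastic gradient step (the values are arbitrary, and the L2 penalty is assumed to be :math:`R(w) = \frac{1}{2}||w||_2^2` so that its gradient is simply :math:`w`)::

    >>> import numpy as np
    >>> w = np.array([0.5, -0.2])           # current weights
    >>> grad_loss = np.array([0.1, 0.3])    # gradient of the loss w.r.t. w
    >>> alpha, eta = 1e-4, 0.01             # L2 strength and learning rate
    >>> w = w - eta * (alpha * w + grad_loss)  # one SGD update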
More details can be found in the documentation of -`SGD `_ +`SGD `_ Adam is similar to SGD in a sense that it is a stochastic optimizer, but it can automatically adjust the amount to update parameters based on adaptive estimates @@ -223,86 +226,82 @@ Complexity Suppose there are :math:`n` training samples, :math:`m` features, :math:`k` hidden layers, each containing :math:`h` neurons - for simplicity, and :math:`o` output neurons. The time complexity of backpropagation is -:math:`O(n\cdot m \cdot h^k \cdot o \cdot i)`, where :math:`i` is the number +:math:`O(i \cdot n \cdot (m \cdot h + (k - 1) \cdot h \cdot h + h \cdot o))`, where :math:`i` is the number of iterations. Since backpropagation has a high time complexity, it is advisable to start with smaller number of hidden neurons and few hidden layers for training. +.. dropdown:: Mathematical formulation -Mathematical formulation -======================== - -Given a set of training examples :math:`(x_1, y_1), (x_2, y_2), \ldots, (x_n, y_n)` -where :math:`x_i \in \mathbf{R}^n` and :math:`y_i \in \{0, 1\}`, a one hidden -layer one hidden neuron MLP learns the function :math:`f(x) = W_2 g(W_1^T x + b_1) + b_2` -where :math:`W_1 \in \mathbf{R}^m` and :math:`W_2, b_1, b_2 \in \mathbf{R}` are -model parameters. :math:`W_1, W_2` represent the weights of the input layer and -hidden layer, respectively; and :math:`b_1, b_2` represent the bias added to -the hidden layer and the output layer, respectively. -:math:`g(\cdot) : R \rightarrow R` is the activation function, set by default as -the hyperbolic tan. It is given as, - -.. math:: - g(z)= \frac{e^z-e^{-z}}{e^z+e^{-z}} - -For binary classification, :math:`f(x)` passes through the logistic function -:math:`g(z)=1/(1+e^{-z})` to obtain output values between zero and one. A -threshold, set to 0.5, would assign samples of outputs larger or equal 0.5 -to the positive class, and the rest to the negative class. - -If there are more than two classes, :math:`f(x)` itself would be a vector of -size (n_classes,). Instead of passing through logistic function, it passes -through the softmax function, which is written as, + Given a set of training examples :math:`(x_1, y_1), (x_2, y_2), \ldots, (x_n, y_n)` + where :math:`x_i \in \mathbf{R}^n` and :math:`y_i \in \{0, 1\}`, a one hidden + layer one hidden neuron MLP learns the function :math:`f(x) = W_2 g(W_1^T x + b_1) + b_2` + where :math:`W_1 \in \mathbf{R}^m` and :math:`W_2, b_1, b_2 \in \mathbf{R}` are + model parameters. :math:`W_1, W_2` represent the weights of the input layer and + hidden layer, respectively; and :math:`b_1, b_2` represent the bias added to + the hidden layer and the output layer, respectively. + :math:`g(\cdot) : R \rightarrow R` is the activation function, set by default as + the hyperbolic tan. It is given as, -.. math:: - \text{softmax}(z)_i = \frac{\exp(z_i)}{\sum_{l=1}^k\exp(z_l)} + .. math:: + g(z)= \frac{e^z-e^{-z}}{e^z+e^{-z}} -where :math:`z_i` represents the :math:`i` th element of the input to softmax, -which corresponds to class :math:`i`, and :math:`K` is the number of classes. -The result is a vector containing the probabilities that sample :math:`x` -belong to each class. The output is the class with the highest probability. + For binary classification, :math:`f(x)` passes through the logistic function + :math:`g(z)=1/(1+e^{-z})` to obtain output values between zero and one. A + threshold, set to 0.5, would assign samples of outputs larger or equal 0.5 + to the positive class, and the rest to the negative class. 
-In regression, the output remains as :math:`f(x)`; therefore, output activation -function is just the identity function. + If there are more than two classes, :math:`f(x)` itself would be a vector of + size (n_classes,). Instead of passing through logistic function, it passes + through the softmax function, which is written as, -MLP uses different loss functions depending on the problem type. The loss -function for classification is Cross-Entropy, which in binary case is given as, + .. math:: + \text{softmax}(z)_i = \frac{\exp(z_i)}{\sum_{l=1}^k\exp(z_l)} -.. math:: + where :math:`z_i` represents the :math:`i` th element of the input to softmax, + which corresponds to class :math:`i`, and :math:`K` is the number of classes. + The result is a vector containing the probabilities that sample :math:`x` + belongs to each class. The output is the class with the highest probability. - Loss(\hat{y},y,W) = -y \ln {\hat{y}} - (1-y) \ln{(1-\hat{y})} + \alpha ||W||_2^2 + In regression, the output remains as :math:`f(x)`; therefore, output activation + function is just the identity function. -where :math:`\alpha ||W||_2^2` is an L2-regularization term (aka penalty) -that penalizes complex models; and :math:`\alpha > 0` is a non-negative -hyperparameter that controls the magnitude of the penalty. + MLP uses different loss functions depending on the problem type. The loss + function for classification is Average Cross-Entropy, which in binary case is + given as, -For regression, MLP uses the Square Error loss function; written as, + .. math:: -.. math:: + Loss(\hat{y},y,W) = -\dfrac{1}{n}\sum_{i=0}^n(y_i \ln {\hat{y_i}} + (1-y_i) \ln{(1-\hat{y_i})}) + \dfrac{\alpha}{2n} ||W||_2^2 - Loss(\hat{y},y,W) = \frac{1}{2}||\hat{y} - y ||_2^2 + \frac{\alpha}{2} ||W||_2^2 + where :math:`\alpha ||W||_2^2` is an L2-regularization term (aka penalty) + that penalizes complex models; and :math:`\alpha > 0` is a non-negative + hyperparameter that controls the magnitude of the penalty. + For regression, MLP uses the Mean Square Error loss function; written as, -Starting from initial random weights, multi-layer perceptron (MLP) minimizes -the loss function by repeatedly updating these weights. After computing the -loss, a backward pass propagates it from the output layer to the previous -layers, providing each weight parameter with an update value meant to decrease -the loss. + .. math:: -In gradient descent, the gradient :math:`\nabla Loss_{W}` of the loss with respect -to the weights is computed and deducted from :math:`W`. -More formally, this is expressed as, + Loss(\hat{y},y,W) = \frac{1}{2n}\sum_{i=0}^n||\hat{y}_i - y_i ||_2^2 + \frac{\alpha}{2n} ||W||_2^2 -.. math:: - W^{i+1} = W^i - \epsilon \nabla {Loss}_{W}^{i} + Starting from initial random weights, multi-layer perceptron (MLP) minimizes + the loss function by repeatedly updating these weights. After computing the + loss, a backward pass propagates it from the output layer to the previous + layers, providing each weight parameter with an update value meant to decrease + the loss. + In gradient descent, the gradient :math:`\nabla Loss_{W}` of the loss with respect + to the weights is computed and deducted from :math:`W`. + More formally, this is expressed as, -where :math:`i` is the iteration step, and :math:`\epsilon` is the learning rate -with a value larger than 0. + .. 
math:: + W^{i+1} = W^i - \epsilon \nabla {Loss}_{W}^{i} -The algorithm stops when it reaches a preset maximum number of iterations; or -when the improvement in loss is below a certain, small number. + where :math:`i` is the iteration step, and :math:`\epsilon` is the learning rate + with a value larger than 0. + The algorithm stops when it reaches a preset maximum number of iterations; or + when the improvement in loss is below a certain, small number. .. _mlp_tips: @@ -310,34 +309,35 @@ when the improvement in loss is below a certain, small number. Tips on Practical Use ===================== - * Multi-layer Perceptron is sensitive to feature scaling, so it - is highly recommended to scale your data. For example, scale each - attribute on the input vector X to [0, 1] or [-1, +1], or standardize - it to have mean 0 and variance 1. Note that you must apply the *same* - scaling to the test set for meaningful results. - You can use :class:`StandardScaler` for standardization. - - >>> from sklearn.preprocessing import StandardScaler # doctest: +SKIP - >>> scaler = StandardScaler() # doctest: +SKIP - >>> # Don't cheat - fit only on training data - >>> scaler.fit(X_train) # doctest: +SKIP - >>> X_train = scaler.transform(X_train) # doctest: +SKIP - >>> # apply same transformation to test data - >>> X_test = scaler.transform(X_test) # doctest: +SKIP - - An alternative and recommended approach is to use :class:`StandardScaler` - in a :class:`Pipeline` - - * Finding a reasonable regularization parameter :math:`\alpha` is - best done using :class:`GridSearchCV`, usually in the - range ``10.0 ** -np.arange(1, 7)``. - - * Empirically, we observed that `L-BFGS` converges faster and - with better solutions on small datasets. For relatively large - datasets, however, `Adam` is very robust. It usually converges - quickly and gives pretty good performance. `SGD` with momentum or - nesterov's momentum, on the other hand, can perform better than - those two algorithms if learning rate is correctly tuned. +* Multi-layer Perceptron is sensitive to feature scaling, so it + is highly recommended to scale your data. For example, scale each + attribute on the input vector X to [0, 1] or [-1, +1], or standardize + it to have mean 0 and variance 1. Note that you must apply the *same* + scaling to the test set for meaningful results. + You can use :class:`~sklearn.preprocessing.StandardScaler` for standardization. + + >>> from sklearn.preprocessing import StandardScaler # doctest: +SKIP + >>> scaler = StandardScaler() # doctest: +SKIP + >>> # Don't cheat - fit only on training data + >>> scaler.fit(X_train) # doctest: +SKIP + >>> X_train = scaler.transform(X_train) # doctest: +SKIP + >>> # apply same transformation to test data + >>> X_test = scaler.transform(X_test) # doctest: +SKIP + + An alternative and recommended approach is to use + :class:`~sklearn.preprocessing.StandardScaler` in a + :class:`~sklearn.pipeline.Pipeline` + +* Finding a reasonable regularization parameter :math:`\alpha` is best done + using :class:`~sklearn.model_selection.GridSearchCV`, usually in the range + ``10.0 ** -np.arange(1, 7)``. + +* Empirically, we observed that `L-BFGS` converges faster and + with better solutions on small datasets. For relatively large + datasets, however, `Adam` is very robust. It usually converges + quickly and gives pretty good performance. `SGD` with momentum or + nesterov's momentum, on the other hand, can perform better than + those two algorithms if learning rate is correctly tuned. 
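The tips above can be combined in a single estimator. The following is a minimal, illustrative sketch (the synthetic dataset, solver choice and grid are assumptions, not recommendations) that standardizes the input inside a :class:`~sklearn.pipeline.Pipeline` and tunes :math:`\alpha` with :class:`~sklearn.model_selection.GridSearchCV`::

    >>> import numpy as np
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.model_selection import GridSearchCV
    >>> from sklearn.neural_network import MLPClassifier
    >>> from sklearn.pipeline import make_pipeline
    >>> from sklearn.preprocessing import StandardScaler
    >>> X, y = make_classification(n_samples=200, random_state=0)
    >>> pipe = make_pipeline(
    ...     StandardScaler(),
    ...     MLPClassifier(solver='lbfgs', max_iter=500, random_state=0))
    >>> param_grid = {'mlpclassifier__alpha': 10.0 ** -np.arange(1, 7)}
    >>> search = GridSearchCV(pipe, param_grid, cv=3)
    >>> search.fit(X, y)  # doctest: +SKIP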
More control with warm_start ============================ @@ -353,21 +353,19 @@ or want to do additional monitoring, using ``warm_start=True`` and ... # additional monitoring / inspection MLPClassifier(... -.. topic:: References: +.. dropdown:: References - * `"Learning representations by back-propagating errors." - `_ - Rumelhart, David E., Geoffrey E. Hinton, and Ronald J. Williams. + * `"Learning representations by back-propagating errors." + `_ + Rumelhart, David E., Geoffrey E. Hinton, and Ronald J. Williams. - * `"Stochastic Gradient Descent" `_ L. Bottou - Website, 2010. + * `"Stochastic Gradient Descent" `_ L. Bottou - Website, 2010. - * `"Backpropagation" `_ - Andrew Ng, Jiquan Ngiam, Chuan Yu Foo, Yifan Mai, Caroline Suen - Website, 2011. + * `"Backpropagation" `_ + Andrew Ng, Jiquan Ngiam, Chuan Yu Foo, Yifan Mai, Caroline Suen - Website, 2011. - * `"Efficient BackProp" `_ - Y. LeCun, L. Bottou, G. Orr, K. MÃŧller - In Neural Networks: Tricks - of the Trade 1998. + * `"Efficient BackProp" `_ + Y. LeCun, L. Bottou, G. Orr, K. MÃŧller - In Neural Networks: Tricks of the Trade 1998. - * `"Adam: A method for stochastic optimization." - `_ - Kingma, Diederik, and Jimmy Ba. arXiv preprint arXiv:1412.6980 (2014). + * :arxiv:`"Adam: A method for stochastic optimization." <1412.6980>` + Kingma, Diederik, and Jimmy Ba (2014) diff --git a/doc/modules/neural_networks_unsupervised.rst b/doc/modules/neural_networks_unsupervised.rst index aca56ae8aaf2e..7f6c0016d183b 100644 --- a/doc/modules/neural_networks_unsupervised.rst +++ b/doc/modules/neural_networks_unsupervised.rst @@ -37,9 +37,9 @@ weights of independent RBMs. This method is known as unsupervised pre-training. :align: center :scale: 100% -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_neural_networks_plot_rbm_logistic_classification.py` +* :ref:`sphx_glr_auto_examples_neural_networks_plot_rbm_logistic_classification.py` Graphical model and parametrization @@ -57,7 +57,7 @@ visible and hidden unit, omitted from the image for simplicity. The energy function measures the quality of a joint assignment: -.. math:: +.. math:: E(\mathbf{v}, \mathbf{h}) = -\sum_i \sum_j w_{ij}v_ih_j - \sum_i b_iv_i - \sum_j c_jh_j @@ -149,13 +149,13 @@ step, in PCD we keep a number of chains (fantasy particles) that are updated :math:`k` Gibbs steps after each weight update. This allows the particles to explore the space more thoroughly. -.. topic:: References: +.. rubric:: References - * `"A fast learning algorithm for deep belief nets" - `_ - G. Hinton, S. Osindero, Y.-W. Teh, 2006 +* `"A fast learning algorithm for deep belief nets" + `_, + G. Hinton, S. Osindero, Y.-W. Teh, 2006 - * `"Training Restricted Boltzmann Machines using Approximations to - the Likelihood Gradient" - `_ - T. Tieleman, 2008 +* `"Training Restricted Boltzmann Machines using Approximations to + the Likelihood Gradient" + `_, + T. Tieleman, 2008 diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst index 5d2008f3c3f58..7de2da4f1818e 100644 --- a/doc/modules/outlier_detection.rst +++ b/doc/modules/outlier_detection.rst @@ -74,20 +74,22 @@ not available. When ``novelty`` is set to ``True`` be aware that you must only use ``predict``, ``decision_function`` and ``score_samples`` on new unseen data and not on the training samples as this would lead to wrong results. + I.e., the result of ``predict`` will not be the same as ``fit_predict``. 
The scores of abnormality of the training samples are always accessible through the ``negative_outlier_factor_`` attribute. The behavior of :class:`neighbors.LocalOutlierFactor` is summarized in the following table. -===================== ================================ ===================== -Method Outlier detection Novelty detection -===================== ================================ ===================== -``fit_predict`` OK Not available -``predict`` Not available Use only on new data -``decision_function`` Not available Use only on new data -``score_samples`` Use ``negative_outlier_factor_`` Use only on new data -===================== ================================ ===================== +============================ ================================ ===================== +Method Outlier detection Novelty detection +============================ ================================ ===================== +``fit_predict`` OK Not available +``predict`` Not available Use only on new data +``decision_function`` Not available Use only on new data +``score_samples`` Use ``negative_outlier_factor_`` Use only on new data +``negative_outlier_factor_`` OK OK +============================ ================================ ===================== Overview of outlier detection methods @@ -110,19 +112,30 @@ does not perform very well for outlier detection. That being said, outlier detection in high-dimension, or without any assumptions on the distribution of the inlying data is very challenging. :class:`svm.OneClassSVM` may still be used with outlier detection but requires fine-tuning of its hyperparameter -`nu` to handle outliers and prevent overfitting. Finally, -:class:`covariance.EllipticEnvelope` assumes the data is Gaussian and learns -an ellipse. For more details on the different estimators refer to the example +`nu` to handle outliers and prevent overfitting. +:class:`linear_model.SGDOneClassSVM` provides an implementation of a +linear One-Class SVM with a linear complexity in the number of samples. This +implementation is here used with a kernel approximation technique to obtain +results similar to :class:`svm.OneClassSVM` which uses a Gaussian kernel +by default. Finally, :class:`covariance.EllipticEnvelope` assumes the data is +Gaussian and learns an ellipse. For more details on the different estimators +refer to the example :ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` and the sections hereunder. -.. topic:: Examples: +.. rubric:: Examples - * See :ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` - for a comparison of the :class:`svm.OneClassSVM`, the - :class:`ensemble.IsolationForest`, the - :class:`neighbors.LocalOutlierFactor` and - :class:`covariance.EllipticEnvelope`. +* See :ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` + for a comparison of the :class:`svm.OneClassSVM`, the + :class:`ensemble.IsolationForest`, the + :class:`neighbors.LocalOutlierFactor` and + :class:`covariance.EllipticEnvelope`. + +* See :ref:`sphx_glr_auto_examples_miscellaneous_plot_outlier_detection_bench.py` + for an example showing how to evaluate outlier detection estimators, + the :class:`neighbors.LocalOutlierFactor` and the + :class:`ensemble.IsolationForest`, using ROC curves from + :class:`metrics.RocCurveDisplay`. 
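As a rough illustration of the scalable variant mentioned above, the sketch below (the random data and hyperparameters are arbitrary assumptions) chains :class:`kernel_approximation.Nystroem` with :class:`linear_model.SGDOneClassSVM` in a pipeline to approximate a kernelized :class:`svm.OneClassSVM`::

    >>> import numpy as np
    >>> from sklearn.kernel_approximation import Nystroem
    >>> from sklearn.linear_model import SGDOneClassSVM
    >>> from sklearn.pipeline import make_pipeline
    >>> rng = np.random.RandomState(42)
    >>> X = 0.3 * rng.randn(100, 2)  # inliers clustered around the origin
    >>> clf = make_pipeline(
    ...     Nystroem(gamma=2.0, n_components=50, random_state=42),
    ...     SGDOneClassSVM(nu=0.05, random_state=42))
    >>> labels = clf.fit(X).predict(X)  # +1 for inliers, -1 for outliers
    >>> labels.shape
    (100,)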
Novelty Detection ================= @@ -140,7 +153,7 @@ In general, it is about to learn a rough, close frontier delimiting the contour of the initial observations distribution, plotted in embedding :math:`p`-dimensional space. Then, if further observations lay within the frontier-delimited subspace, they are considered as -coming from the same population than the initial +coming from the same population as the initial observations. Otherwise, if they lay outside the frontier, we can say that they are abnormal with a given confidence in our assessment. @@ -154,18 +167,18 @@ implementation. The `nu` parameter, also known as the margin of the One-Class SVM, corresponds to the probability of finding a new, but regular, observation outside the frontier. -.. topic:: References: +.. rubric:: References + +* `Estimating the support of a high-dimensional distribution + `_ + SchÃļlkopf, Bernhard, et al. Neural computation 13.7 (2001): 1443-1471. - * `Estimating the support of a high-dimensional distribution - `_ - SchÃļlkopf, Bernhard, et al. Neural computation 13.7 (2001): 1443-1471. +.. rubric:: Examples -.. topic:: Examples: +* See :ref:`sphx_glr_auto_examples_svm_plot_oneclass.py` for visualizing the + frontier learned around some data by a :class:`svm.OneClassSVM` object. - * See :ref:`sphx_glr_auto_examples_svm_plot_oneclass.py` for visualizing the - frontier learned around some data by a - :class:`svm.OneClassSVM` object. - * :ref:`sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py` +* :ref:`sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py` .. figure:: ../auto_examples/svm/images/sphx_glr_plot_oneclass_001.png :target: ../auto_examples/svm/plot_oneclass.html @@ -173,6 +186,23 @@ but regular, observation outside the frontier. :scale: 75% +Scaling up the One-Class SVM +---------------------------- + +An online linear version of the One-Class SVM is implemented in +:class:`linear_model.SGDOneClassSVM`. This implementation scales linearly with +the number of samples and can be used with a kernel approximation to +approximate the solution of a kernelized :class:`svm.OneClassSVM` whose +complexity is at best quadratic in the number of samples. See section +:ref:`sgd_online_one_class_svm` for more details. + +.. rubric:: Examples + +* See :ref:`sphx_glr_auto_examples_linear_model_plot_sgdocsvm_vs_ocsvm.py` + for an illustration of the approximation of a kernelized One-Class SVM + with the `linear_model.SGDOneClassSVM` combined with kernel approximation. + + Outlier Detection ================= @@ -200,7 +230,7 @@ points, ignoring points outside the central mode. For instance, assuming that the inlier data are Gaussian distributed, it will estimate the inlier location and covariance in a robust way (i.e. without being influenced by outliers). The Mahalanobis distances -obtained from this estimate is used to derive a measure of outlyingness. +obtained from this estimate are used to derive a measure of outlyingness. This strategy is illustrated below. .. figure:: ../auto_examples/covariance/images/sphx_glr_plot_mahalanobis_distances_001.png @@ -208,18 +238,22 @@ This strategy is illustrated below. :align: center :scale: 75% -.. topic:: Examples: +.. 
rubric:: Examples + +* See :ref:`sphx_glr_auto_examples_covariance_plot_mahalanobis_distances.py` for + an illustration of the difference between using a standard + (:class:`covariance.EmpiricalCovariance`) or a robust estimate + (:class:`covariance.MinCovDet`) of location and covariance to + assess the degree of outlyingness of an observation. - * See :ref:`sphx_glr_auto_examples_covariance_plot_mahalanobis_distances.py` for - an illustration of the difference between using a standard - (:class:`covariance.EmpiricalCovariance`) or a robust estimate - (:class:`covariance.MinCovDet`) of location and covariance to - assess the degree of outlyingness of an observation. +* See :ref:`sphx_glr_auto_examples_applications_plot_outlier_detection_wine.py` + for an example of robust covariance estimation on a real data set. -.. topic:: References: - * Rousseeuw, P.J., Van Driessen, K. "A fast algorithm for the minimum - covariance determinant estimator" Technometrics 41(3), 212 (1999) +.. rubric:: References + +* Rousseeuw, P.J., Van Driessen, K. "A fast algorithm for the minimum + covariance determinant estimator" Technometrics 41(3), 212 (1999) .. _isolation_forest: @@ -251,7 +285,7 @@ the maximum depth of each tree is set to :math:`\lceil \log_2(n) \rceil` where This algorithm is illustrated below. -.. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_isolation_forest_001.png +.. figure:: ../auto_examples/ensemble/images/sphx_glr_plot_isolation_forest_003.png :target: ../auto_examples/ensemble/plot_isolation_forest.html :align: center :scale: 75% @@ -269,23 +303,24 @@ allows you to add more trees to an already fitted model:: >>> clf.set_params(n_estimators=20) # add 10 more trees # doctest: +SKIP >>> clf.fit(X) # fit the added trees # doctest: +SKIP -.. topic:: Examples: +.. rubric:: Examples - * See :ref:`sphx_glr_auto_examples_ensemble_plot_isolation_forest.py` for - an illustration of the use of IsolationForest. +* See :ref:`sphx_glr_auto_examples_ensemble_plot_isolation_forest.py` for + an illustration of the use of IsolationForest. - * See :ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` - for a comparison of :class:`ensemble.IsolationForest` with - :class:`neighbors.LocalOutlierFactor`, - :class:`svm.OneClassSVM` (tuned to perform like an outlier detection - method) and a covariance-based outlier detection with - :class:`covariance.EllipticEnvelope`. +* See :ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` + for a comparison of :class:`ensemble.IsolationForest` with + :class:`neighbors.LocalOutlierFactor`, + :class:`svm.OneClassSVM` (tuned to perform like an outlier detection + method), :class:`linear_model.SGDOneClassSVM`, and a covariance-based + outlier detection with :class:`covariance.EllipticEnvelope`. -.. topic:: References: +.. rubric:: References - * Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest." - Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on. +* Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest." + Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on. +.. _local_outlier_factor: Local Outlier Factor -------------------- @@ -301,20 +336,18 @@ lower density than their neighbors. In practice the local density is obtained from the k-nearest neighbors. 
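A minimal usage sketch (toy one-dimensional data and an `n_neighbors` value chosen purely for illustration)::

    >>> import numpy as np
    >>> from sklearn.neighbors import LocalOutlierFactor
    >>> X = np.array([[0.0], [0.1], [0.2], [10.0]])
    >>> lof = LocalOutlierFactor(n_neighbors=2)
    >>> labels = lof.fit_predict(X)            # -1 flags the isolated sample at 10.0
    >>> scores = lof.negative_outlier_factor_  # the more negative, the more abnormal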
The LOF score of an observation is equal to the ratio of the -average local density of his k-nearest neighbors, and its own local density: +average local density of its k-nearest neighbors, and its own local density: a normal instance is expected to have a local density similar to that of its neighbors, while abnormal data are expected to have much smaller local density. -The number k of neighbors considered, (alias parameter n_neighbors) is typically -chosen 1) greater than the minimum number of objects a cluster has to contain, -so that other objects can be local outliers relative to this cluster, and 2) -smaller than the maximum number of close by objects that can potentially be -local outliers. -In practice, such informations are generally not available, and taking -n_neighbors=20 appears to work well in general. -When the proportion of outliers is high (i.e. greater than 10 \%, as in the -example below), n_neighbors should be greater (n_neighbors=35 in the example -below). +The number k of neighbors considered, (alias parameter `n_neighbors`) is +typically chosen 1) greater than the minimum number of objects a cluster has to +contain, so that other objects can be local outliers relative to this cluster, +and 2) smaller than the maximum number of close by objects that can potentially +be local outliers. In practice, such information is generally not available, and +taking `n_neighbors=20` appears to work well in general. When the proportion of +outliers is high (i.e. greater than 10 \%, as in the example below), +`n_neighbors` should be greater (`n_neighbors=35` in the example below). The strength of the LOF algorithm is that it takes both local and global properties of datasets into consideration: it can perform well even in datasets @@ -328,30 +361,31 @@ method. The scores of abnormality of the training samples are accessible through the ``negative_outlier_factor_`` attribute. Note that ``predict``, ``decision_function`` and ``score_samples`` can be used on new unseen data when LOF is applied for novelty detection, i.e. when the -``novelty`` parameter is set to ``True``. See :ref:`novelty_with_lof`. +``novelty`` parameter is set to ``True``, but the result of ``predict`` may +differ from that of ``fit_predict``. See :ref:`novelty_with_lof`. This strategy is illustrated below. .. figure:: ../auto_examples/neighbors/images/sphx_glr_plot_lof_outlier_detection_001.png - :target: ../auto_examples/neighbors/sphx_glr_plot_lof_outlier_detection.html + :target: ../auto_examples/neighbors/plot_lof_outlier_detection.html :align: center :scale: 75% -.. topic:: Examples: +.. rubric:: Examples - * See :ref:`sphx_glr_auto_examples_neighbors_plot_lof_outlier_detection.py` - for an illustration of the use of :class:`neighbors.LocalOutlierFactor`. +* See :ref:`sphx_glr_auto_examples_neighbors_plot_lof_outlier_detection.py` + for an illustration of the use of :class:`neighbors.LocalOutlierFactor`. - * See :ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` - for a comparison with other anomaly detection methods. +* See :ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` + for a comparison with other anomaly detection methods. -.. topic:: References: +.. rubric:: References - * Breunig, Kriegel, Ng, and Sander (2000) - `LOF: identifying density-based local outliers. - `_ - Proc. ACM SIGMOD +* Breunig, Kriegel, Ng, and Sander (2000) + `LOF: identifying density-based local outliers. + `_ + Proc. ACM SIGMOD .. 
_novelty_with_lof: @@ -366,19 +400,20 @@ set to ``True`` before fitting the estimator:: lof = LocalOutlierFactor(novelty=True) lof.fit(X_train) -Note that ``fit_predict`` is not available in this case. +Note that ``fit_predict`` is not available in this case to avoid inconsistencies. -.. warning:: **Novelty detection with Local Outlier Factor`** +.. warning:: **Novelty detection with Local Outlier Factor** When ``novelty`` is set to ``True`` be aware that you must only use ``predict``, ``decision_function`` and ``score_samples`` on new unseen data and not on the training samples as this would lead to wrong results. + I.e., the result of ``predict`` will not be the same as ``fit_predict``. The scores of abnormality of the training samples are always accessible through the ``negative_outlier_factor_`` attribute. Novelty detection with Local Outlier Factor is illustrated below. - .. figure:: ../auto_examples/neighbors/images/sphx_glr_plot_lof_novelty_detection_001.png - :target: ../auto_examples/neighbors/sphx_glr_plot_lof_novelty_detection.html - :align: center - :scale: 75% +.. figure:: ../auto_examples/neighbors/images/sphx_glr_plot_lof_novelty_detection_001.png + :target: ../auto_examples/neighbors/plot_lof_novelty_detection.html + :align: center + :scale: 75% diff --git a/doc/modules/partial_dependence.rst b/doc/modules/partial_dependence.rst index 89fd0f17e38bb..7f30a3a7e6731 100644 --- a/doc/modules/partial_dependence.rst +++ b/doc/modules/partial_dependence.rst @@ -11,10 +11,10 @@ Partial dependence plots (PDP) and individual conditional expectation (ICE) plots can be used to visualize and analyze interaction between the target response [1]_ and a set of input features of interest. -Both PDPs and ICEs assume that the input features of interest are independent -from the complement features, and this assumption is often violated in practice. -Thus, in the case of correlated features, we will create absurd data points to -compute the PDP/ICE. +Both PDPs [H2009]_ and ICEs [G2015]_ assume that the input features of interest +are independent from the complement features, and this assumption is often +violated in practice. Thus, in the case of correlated features, we will +create absurd data points to compute the PDP/ICE [M2019]_. Partial dependence plots ======================== @@ -25,66 +25,75 @@ of all other input features (the 'complement' features). Intuitively, we can interpret the partial dependence as the expected target response as a function of the input features of interest. -Due to the limits of human perception the size of the set of input feature of +Due to the limits of human perception, the size of the set of input features of interest must be small (usually, one or two) thus the input features of interest are usually chosen among the most important features. The figure below shows two one-way and one two-way partial dependence plots for -the California housing dataset, with a :class:`HistGradientBoostingRegressor -`: +the bike sharing dataset, with a +:class:`~sklearn.ensemble.HistGradientBoostingRegressor`: -.. figure:: ../auto_examples/inspection/images/sphx_glr_plot_partial_dependence_003.png +.. figure:: ../auto_examples/inspection/images/sphx_glr_plot_partial_dependence_006.png :target: ../auto_examples/inspection/plot_partial_dependence.html :align: center :scale: 70 -One-way PDPs tell us about the interaction between the target response and an -input feature of interest feature (e.g. linear, non-linear). 
The left plot -in the above figure shows the effect of the average occupancy on the median -house price; we can clearly see a linear relationship among them when the -average occupancy is inferior to 3 persons. Similarly, we could analyze the -effect of the house age on the median house price (middle plot). Thus, these -interpretations are marginal, considering a feature at a time. - -PDPs with two input features of interest show the interactions among the two -features. For example, the two-variable PDP in the above figure shows the -dependence of median house price on joint values of house age and average -occupants per household. We can clearly see an interaction between the two -features: for an average occupancy greater than two, the house price is nearly -independent of the house age, whereas for values less than 2 there is a strong -dependence on age. +One-way PDPs tell us about the interaction between the target response and an input +feature of interest (e.g. linear, non-linear). The left plot in the above figure +shows the effect of the temperature on the number of bike rentals; we can clearly see +that a higher temperature is related with a higher number of bike rentals. Similarly, we +could analyze the effect of the humidity on the number of bike rentals (middle plot). +Thus, these interpretations are marginal, considering a feature at a time. + +PDPs with two input features of interest show the interactions among the two features. +For example, the two-variable PDP in the above figure shows the dependence of the number +of bike rentals on joint values of temperature and humidity. We can clearly see an +interaction between the two features: with a temperature higher than 20 degrees Celsius, +mainly the humidity has a strong impact on the number of bike rentals. For lower +temperatures, both the temperature and the humidity have an impact on the number of bike +rentals. The :mod:`sklearn.inspection` module provides a convenience function -:func:`plot_partial_dependence` to create one-way and two-way partial +:func:`~PartialDependenceDisplay.from_estimator` to create one-way and two-way partial dependence plots. In the below example we show how to create a grid of partial dependence plots: two one-way PDPs for the features ``0`` and ``1`` and a two-way PDP between the two features:: >>> from sklearn.datasets import make_hastie_10_2 >>> from sklearn.ensemble import GradientBoostingClassifier - >>> from sklearn.inspection import plot_partial_dependence + >>> from sklearn.inspection import PartialDependenceDisplay >>> X, y = make_hastie_10_2(random_state=0) >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, ... max_depth=1, random_state=0).fit(X, y) >>> features = [0, 1, (0, 1)] - >>> plot_partial_dependence(clf, X, features) #doctest: +SKIP + >>> PartialDependenceDisplay.from_estimator(clf, X, features) + <...> You can access the newly created figure and Axes objects using ``plt.gcf()`` and ``plt.gca()``. -For multi-class classification, you need to set the class label for which -the PDPs should be created via the ``target`` argument:: +To make a partial dependence plot with categorical features, you need to specify +which features are categorical using the parameter `categorical_features`. This +parameter takes a list of indices, names of the categorical features or a boolean +mask. The graphical representation of partial dependence for categorical features is +a bar plot or a 2D heatmap. 
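A minimal sketch of the categorical support described above (illustrative only, not
part of this changeset; the data frame, column names and model are made up)::

    import pandas as pd
    from sklearn.compose import make_column_transformer
    from sklearn.ensemble import HistGradientBoostingRegressor
    from sklearn.inspection import PartialDependenceDisplay
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import OrdinalEncoder

    # Toy frame with one categorical and one numerical column.
    X = pd.DataFrame({
        "season": ["spring", "summer", "fall", "winter"] * 25,
        "temperature": [10, 25, 15, 2] * 25,
    })
    y = X["temperature"] * 3 + (X["season"] == "summer") * 20

    # Encode the categorical column for the model, but pass the raw frame to
    # the display so the plot keeps the original category names.
    model = make_pipeline(
        make_column_transformer(
            (OrdinalEncoder(), ["season"]), remainder="passthrough"
        ),
        HistGradientBoostingRegressor(),
    ).fit(X, y)

    # `categorical_features` makes the display draw a bar plot for "season"
    # and a regular partial dependence curve for "temperature".
    PartialDependenceDisplay.from_estimator(
        model, X, features=["season", "temperature"],
        categorical_features=["season"],
    )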
- >>> from sklearn.datasets import load_iris - >>> iris = load_iris() - >>> mc_clf = GradientBoostingClassifier(n_estimators=10, - ... max_depth=1).fit(iris.data, iris.target) - >>> features = [3, 2, (3, 2)] - >>> plot_partial_dependence(mc_clf, X, features, target=0) #doctest: +SKIP +.. dropdown:: PDPs for multi-class classification -The same parameter ``target`` is used to specify the target in multi-output -regression settings. + For multi-class classification, you need to set the class label for which + the PDPs should be created via the ``target`` argument:: + + >>> from sklearn.datasets import load_iris + >>> iris = load_iris() + >>> mc_clf = GradientBoostingClassifier(n_estimators=10, + ... max_depth=1).fit(iris.data, iris.target) + >>> features = [3, 2, (3, 2)] + >>> PartialDependenceDisplay.from_estimator(mc_clf, X, features, target=0) + <...> + + The same parameter ``target`` is used to specify the target in multi-output + regression settings. If you need the raw values of the partial dependence function rather than the plots, you can use the @@ -92,10 +101,10 @@ the plots, you can use the >>> from sklearn.inspection import partial_dependence - >>> pdp, axes = partial_dependence(clf, X, [0]) - >>> pdp + >>> results = partial_dependence(clf, X, [0]) + >>> results["average"] array([[ 2.466..., 2.466..., ... - >>> axes + >>> results["grid_values"] [array([-1.624..., -1.592..., ... The values at which the partial dependence should be evaluated are directly @@ -118,12 +127,11 @@ feature for each sample separately with one line per sample. Due to the limits of human perception, only one input feature of interest is supported for ICE plots. -The figures below show four ICE plots for the California housing dataset, -with a :class:`HistGradientBoostingRegressor -`. The second figure plots +The figures below show two ICE plots for the bike sharing dataset, +with a :class:`~sklearn.ensemble.HistGradientBoostingRegressor`. The figures plot the corresponding PD line overlaid on ICE lines. -.. figure:: ../auto_examples/inspection/images/sphx_glr_plot_partial_dependence_002.png +.. figure:: ../auto_examples/inspection/images/sphx_glr_plot_partial_dependence_004.png :target: ../auto_examples/inspection/plot_partial_dependence.html :align: center :scale: 70 @@ -131,34 +139,49 @@ the corresponding PD line overlaid on ICE lines. While the PDPs are good at showing the average effect of the target features, they can obscure a heterogeneous relationship created by interactions. When interactions are present the ICE plot will provide many more insights. -For example, we could observe a linear relationship between the median income -and the house price in the PD line. However, the ICE lines show that there -are some exceptions, where the house price remains constant in some ranges of -the median income. +For example, we see that the ICE for the temperature feature gives us some +additional information: some of the ICE lines are flat while some others +show a decrease of the dependence for temperature above 35 degrees Celsius. +We observe a similar pattern for the humidity feature: some of the ICE +lines show a sharp decrease when the humidity is above 80%. -The :mod:`sklearn.inspection` module's :func:`plot_partial_dependence` +The :mod:`sklearn.inspection` module's :meth:`PartialDependenceDisplay.from_estimator` convenience function can be used to create ICE plots by setting ``kind='individual'``. 
In the example below, we show how to create a grid of ICE plots: >>> from sklearn.datasets import make_hastie_10_2 >>> from sklearn.ensemble import GradientBoostingClassifier - >>> from sklearn.inspection import plot_partial_dependence + >>> from sklearn.inspection import PartialDependenceDisplay >>> X, y = make_hastie_10_2(random_state=0) >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, ... max_depth=1, random_state=0).fit(X, y) >>> features = [0, 1] - >>> plot_partial_dependence(clf, X, features, - ... kind='individual') # doctest: +SKIP + >>> PartialDependenceDisplay.from_estimator(clf, X, features, + ... kind='individual') + <...> In ICE plots it might not be easy to see the average effect of the input feature of interest. Hence, it is recommended to use ICE plots alongside PDPs. They can be plotted together with ``kind='both'``. - >>> plot_partial_dependence(clf, X, features, - ... kind='both') # doctest: +SKIP + >>> PartialDependenceDisplay.from_estimator(clf, X, features, + ... kind='both') + <...> + +If there are too many lines in an ICE plot, it can be difficult to see +differences between individual samples and interpret the model. Centering the +ICE at the first value on the x-axis, produces centered Individual Conditional +Expectation (cICE) plots [G2015]_. This puts emphasis on the divergence of +individual conditional expectations from the mean line, thus making it easier +to explore heterogeneous relationships. cICE plots can be plotted by setting +`centered=True`: + + >>> PartialDependenceDisplay.from_estimator(clf, X, features, + ... kind='both', centered=True) + <...> Mathematical Definition ======================= @@ -188,11 +211,11 @@ Computation methods =================== There are two main methods to approximate the integral above, namely the -'brute' and 'recursion' methods. The `method` parameter controls which method +`'brute'` and `'recursion'` methods. The `method` parameter controls which method to use. -The 'brute' method is a generic method that works with any estimator. Note that -computing ICE plots is only supported with the 'brute' method. It +The `'brute'` method is a generic method that works with any estimator. Note that +computing ICE plots is only supported with the `'brute'` method. It approximates the above integral by computing an average over the data `X`: .. math:: @@ -206,23 +229,23 @@ over the dataset `X` which is computationally intensive. Each of the :math:`f(x_{S}, x_{C}^{(i)})` corresponds to one ICE line evaluated at :math:`x_{S}`. Computing this for multiple values of :math:`x_{S}`, one obtains a full ICE line. As one can see, the average of the ICE lines -correspond to the partial dependence line. +corresponds to the partial dependence line. -The 'recursion' method is faster than the 'brute' method, but it is only +The `'recursion'` method is faster than the `'brute'` method, but it is only supported for PDP plots by some tree-based estimators. It is computed as follows. For a given point :math:`x_S`, a weighted tree traversal is performed: if a split node involves an input feature of interest, the corresponding left or right branch is followed; otherwise both branches are followed, each branch being weighted by the fraction of training samples that entered that branch. Finally, the partial dependence is given by a weighted average of all the -visited leaves values. +visited leaves' values. 
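To make the choice between the two methods concrete, a small sketch (synthetic data,
not part of this changeset) selecting each method explicitly with
:func:`partial_dependence`::

    import numpy as np
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.inspection import partial_dependence

    rng = np.random.RandomState(0)
    X = rng.normal(size=(200, 3))
    y = X[:, 0] ** 2 + X[:, 1]
    est = GradientBoostingRegressor(random_state=0).fit(X, y)

    # 'recursion' is only available for average partial dependence on the
    # tree-based estimators that support it.
    pd_avg = partial_dependence(est, X, features=[0], method="recursion")
    # ICE curves always go through the generic 'brute' method.
    pd_ice = partial_dependence(est, X, features=[0], kind="individual",
                                method="brute")

    print(pd_avg["average"].shape)     # (1, n_grid_points)
    print(pd_ice["individual"].shape)  # (1, n_samples, n_grid_points)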
-With the 'brute' method, the parameter `X` is used both for generating the +With the `'brute'` method, the parameter `X` is used both for generating the grid of values :math:`x_S` and the complement feature values :math:`x_C`. However with the 'recursion' method, `X` is only used for the grid values: implicitly, the :math:`x_C` values are those of the training data. -By default, the 'recursion' method is used for plotting PDPs on tree-based +By default, the `'recursion'` method is used for plotting PDPs on tree-based estimators that support it, and 'brute' is used for the rest. .. _pdp_method_differences: @@ -230,18 +253,18 @@ estimators that support it, and 'brute' is used for the rest. .. note:: While both methods should be close in general, they might differ in some - specific settings. The 'brute' method assumes the existence of the + specific settings. The `'brute'` method assumes the existence of the data points :math:`(x_S, x_C^{(i)})`. When the features are correlated, - such artificial samples may have a very low probability mass. The 'brute' - and 'recursion' methods will likely disagree regarding the value of the + such artificial samples may have a very low probability mass. The `'brute'` + and `'recursion'` methods will likely disagree regarding the value of the partial dependence, because they will treat these unlikely samples differently. Remember, however, that the primary assumption for interpreting PDPs is that the features should be independent. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_inspection_plot_partial_dependence.py` +* :ref:`sphx_glr_auto_examples_inspection_plot_partial_dependence.py` .. rubric:: Footnotes @@ -249,17 +272,20 @@ estimators that support it, and 'brute' is used for the rest. class (the positive class for binary classification), or the decision function. -.. topic:: References +.. rubric:: References - T. Hastie, R. Tibshirani and J. Friedman, `The Elements of - Statistical Learning `_, +.. [H2009] T. Hastie, R. Tibshirani and J. Friedman, + `The Elements of Statistical Learning + `_, Second Edition, Section 10.13.2, Springer, 2009. - C. Molnar, `Interpretable Machine Learning - `_, Section 5.1, 2019. +.. [M2019] C. Molnar, + `Interpretable Machine Learning + `_, + Section 5.1, 2019. - A. Goldstein, A. Kapelner, J. Bleich, and E. Pitkin, `Peeking Inside the - Black Box: Visualizing Statistical Learning With Plots of Individual - Conditional Expectation `_, - Journal of Computational and Graphical Statistics, 24(1): 44-65, Springer, - 2015. +.. [G2015] :arxiv:`A. Goldstein, A. Kapelner, J. Bleich, and E. Pitkin, + "Peeking Inside the Black Box: Visualizing Statistical + Learning With Plots of Individual Conditional Expectation" + Journal of Computational and Graphical Statistics, + 24(1): 44-65, Springer, 2015. <1309.6392>` diff --git a/doc/modules/permutation_importance.rst b/doc/modules/permutation_importance.rst index 833c9fb9a696e..80bb5ef0eb650 100644 --- a/doc/modules/permutation_importance.rst +++ b/doc/modules/permutation_importance.rst @@ -6,15 +6,46 @@ Permutation feature importance .. currentmodule:: sklearn.inspection -Permutation feature importance is a model inspection technique that can be used -for any :term:`fitted` :term:`estimator` when the data is tabular. This is -especially useful for non-linear or opaque :term:`estimators`. The permutation -feature importance is defined to be the decrease in a model score when a single -feature value is randomly shuffled [1]_. 
This procedure breaks the relationship -between the feature and the target, thus the drop in the model score is -indicative of how much the model depends on the feature. This technique -benefits from being model agnostic and can be calculated many times with -different permutations of the feature. +Permutation feature importance is a model inspection technique that measures the +contribution of each feature to a :term:`fitted` model's statistical performance +on a given tabular dataset. This technique is particularly useful for non-linear +or opaque :term:`estimators`, and involves randomly shuffling the values of a +single feature and observing the resulting degradation of the model's score +[1]_. By breaking the relationship between the feature and the target, we +determine how much the model relies on such particular feature. + +In the following figures, we observe the effect of permuting features on the correlation +between the feature and the target and consequently on the model's statistical +performance. + +.. image:: ../images/permuted_predictive_feature.png + :align: center + +.. image:: ../images/permuted_non_predictive_feature.png + :align: center + +On the top figure, we observe that permuting a predictive feature breaks the +correlation between the feature and the target, and consequently the model's +statistical performance decreases. On the bottom figure, we observe that permuting +a non-predictive feature does not significantly degrade the model's statistical +performance. + +One key advantage of permutation feature importance is that it is +model-agnostic, i.e. it can be applied to any fitted estimator. Moreover, it can +be calculated multiple times with different permutations of the feature, further +providing a measure of the variance in the estimated feature importances for the +specific trained model. + +The figure below shows the permutation feature importance of a +:class:`~sklearn.ensemble.RandomForestClassifier` trained on an augmented +version of the titanic dataset that contains a `random_cat` and a `random_num` +features, i.e. a categorical and a numerical feature that are not correlated in +any way with the target variable: + +.. figure:: ../auto_examples/inspection/images/sphx_glr_plot_permutation_importance_002.png + :target: ../auto_examples/inspection/plot_permutation_importance.html + :align: center + :scale: 70 .. warning:: @@ -22,7 +53,7 @@ different permutations of the feature. cross-validation score) could be **very important for a good model**. Therefore it is always important to evaluate the predictive power of a model using a held-out set (or better with cross-validation) prior to computing - importances. Permutation importance does not reflect to the intrinsic + importances. Permutation importance does not reflect the intrinsic predictive value of a feature by itself but **how important this feature is for a particular model**. @@ -74,49 +105,50 @@ highlight which features contribute the most to the generalization power of the inspected model. Features that are important on the training set but not on the held-out set might cause the model to overfit. -The permutation feature importance is the decrease in a model score when a single -feature value is randomly shuffled. The score function to be used for the -computation of importances can be specified with the `scoring` argument, -which also accepts multiple scorers. 
Using multiple scorers is more computationally -efficient than sequentially calling :func:`permutation_importance` several times -with a different scorer, as it reuses model predictions. - -An example of using multiple scorers is shown below, employing a list of metrics, -but more input formats are possible, as documented in :ref:`multimetric_scoring`. - - >>> scoring = ['r2', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error'] - >>> r_multi = permutation_importance( - ... model, X_val, y_val, n_repeats=30, random_state=0, scoring=scoring) - ... - >>> for metric in r_multi: - ... print(f"{metric}") - ... r = r_multi[metric] - ... for i in r.importances_mean.argsort()[::-1]: - ... if r.importances_mean[i] - 2 * r.importances_std[i] > 0: - ... print(f" {diabetes.feature_names[i]:<8}" - ... f"{r.importances_mean[i]:.3f}" - ... f" +/- {r.importances_std[i]:.3f}") - ... - r2 - s5 0.204 +/- 0.050 - bmi 0.176 +/- 0.048 - bp 0.088 +/- 0.033 - sex 0.056 +/- 0.023 - neg_mean_absolute_percentage_error - s5 0.081 +/- 0.020 - bmi 0.064 +/- 0.015 - bp 0.029 +/- 0.010 - neg_mean_squared_error - s5 1013.903 +/- 246.460 - bmi 872.694 +/- 240.296 - bp 438.681 +/- 163.025 - sex 277.382 +/- 115.126 - -The ranking of the features is approximately the same for different metrics even -if the scales of the importance values are very different. However, this is not -guaranteed and different metrics might lead to significantly different feature -importances, in particular for models trained for imbalanced classification problems, -for which the choice of the classification metric can be critical. +The permutation feature importance depends on the score function that is +specified with the `scoring` argument. This argument accepts multiple scorers, +which is more computationally efficient than sequentially calling +:func:`permutation_importance` several times with a different scorer, as it +reuses model predictions. + +.. dropdown:: Example of permutation feature importance using multiple scorers + + In the example below we use a list of metrics, but more input formats are + possible, as documented in :ref:`multimetric_scoring`. + + >>> scoring = ['r2', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error'] + >>> r_multi = permutation_importance( + ... model, X_val, y_val, n_repeats=30, random_state=0, scoring=scoring) + ... + >>> for metric in r_multi: + ... print(f"{metric}") + ... r = r_multi[metric] + ... for i in r.importances_mean.argsort()[::-1]: + ... if r.importances_mean[i] - 2 * r.importances_std[i] > 0: + ... print(f" {diabetes.feature_names[i]:<8}" + ... f"{r.importances_mean[i]:.3f}" + ... f" +/- {r.importances_std[i]:.3f}") + ... + r2 + s5 0.204 +/- 0.050 + bmi 0.176 +/- 0.048 + bp 0.088 +/- 0.033 + sex 0.056 +/- 0.023 + neg_mean_absolute_percentage_error + s5 0.081 +/- 0.020 + bmi 0.064 +/- 0.015 + bp 0.029 +/- 0.010 + neg_mean_squared_error + s5 1013.866 +/- 246.445 + bmi 872.726 +/- 240.298 + bp 438.663 +/- 163.022 + sex 277.376 +/- 115.123 + + The ranking of the features is approximately the same for different metrics even + if the scales of the importance values are very different. However, this is not + guaranteed and different metrics might lead to significantly different feature + importances, in particular for models trained for imbalanced classification problems, + for which **the choice of the classification metric can be critical**. 
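Before the algorithm outline below, a hand-rolled sketch (synthetic data; this is
deliberately *not* scikit-learn's implementation) of the quantity that
:func:`permutation_importance` estimates, namely the score drop after shuffling a
single column of held-out data::

    import numpy as np
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import train_test_split

    rng = np.random.RandomState(0)
    X = rng.normal(size=(300, 3))
    # Only the first column carries signal.
    y = 3 * X[:, 0] + rng.normal(scale=0.1, size=300)

    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
    model = Ridge().fit(X_train, y_train)
    baseline = model.score(X_val, y_val)

    for j in range(X.shape[1]):
        X_perm = X_val.copy()
        # Shuffle one feature on the held-out set and measure the score drop.
        X_perm[:, j] = rng.permutation(X_perm[:, j])
        print(f"feature {j}: score drop {baseline - model.score(X_perm, y_val):.3f}")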
Outline of the permutation importance algorithm ----------------------------------------------- @@ -145,20 +177,20 @@ Relation to impurity-based importance in trees Tree-based models provide an alternative measure of :ref:`feature importances based on the mean decrease in impurity ` (MDI). Impurity is quantified by the splitting criterion of the decision trees -(Gini, Entropy or Mean Squared Error). However, this method can give high +(Gini, Log Loss or Mean Squared Error). However, this method can give high importance to features that may not be predictive on unseen data when the model is overfitting. Permutation-based feature importance, on the other hand, avoids this issue, since it can be computed on unseen data. -Furthermore, impurity-based feature importance for trees are **strongly +Furthermore, impurity-based feature importance for trees is **strongly biased** and **favor high cardinality features** (typically numerical features) over low cardinality features such as binary features or categorical variables with a small number of possible categories. Permutation-based feature importances do not exhibit such a bias. Additionally, -the permutation feature importance may be computed performance metric on the -model predictions predictions and can be used to analyze any model class (not -just tree-based models). +the permutation feature importance may be computed with any performance metric +on the model predictions and can be used to analyze any model class (not just +tree-based models). The following example highlights the limitations of impurity-based feature importance in contrast to permutation-based feature importance: @@ -168,21 +200,37 @@ Misleading values on strongly correlated features ------------------------------------------------- When two features are correlated and one of the features is permuted, the model -will still have access to the feature through its correlated feature. This will -result in a lower importance value for both features, where they might -*actually* be important. +still has access to the latter through its correlated feature. This results in a +lower reported importance value for both features, though they might *actually* +be important. + +The figure below shows the permutation feature importance of a +:class:`~sklearn.ensemble.RandomForestClassifier` trained using the +:ref:`breast_cancer_dataset`, which contains strongly correlated features. A +naive interpretation would suggest that all features are unimportant: + +.. figure:: ../auto_examples/inspection/images/sphx_glr_plot_permutation_importance_multicollinear_002.png + :target: ../auto_examples/inspection/plot_permutation_importance_multicollinear.html + :align: center + :scale: 70 + +One way to handle the issue is to cluster features that are correlated and only +keep one feature from each cluster. + +.. figure:: ../auto_examples/inspection/images/sphx_glr_plot_permutation_importance_multicollinear_004.png + :target: ../auto_examples/inspection/plot_permutation_importance_multicollinear.html + :align: center + :scale: 70 -One way to handle this is to cluster features that are correlated and only -keep one feature from each cluster. This strategy is explored in the following -example: +For more details on such strategy, see the example :ref:`sphx_glr_auto_examples_inspection_plot_permutation_importance_multicollinear.py`. -.. topic:: Examples: +.. 
rubric:: Examples - * :ref:`sphx_glr_auto_examples_inspection_plot_permutation_importance.py` - * :ref:`sphx_glr_auto_examples_inspection_plot_permutation_importance_multicollinear.py` +* :ref:`sphx_glr_auto_examples_inspection_plot_permutation_importance.py` +* :ref:`sphx_glr_auto_examples_inspection_plot_permutation_importance_multicollinear.py` -.. topic:: References: +.. rubric:: References - .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, - 2001. https://doi.org/10.1023/A:1010933404324 +.. [1] L. Breiman, :doi:`"Random Forests" <10.1023/A:1010933404324>`, + Machine Learning, 45(1), 5-32, 2001. diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index b87971ec4ae5a..69dff95518c41 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -10,10 +10,11 @@ The ``sklearn.preprocessing`` package provides several common utility functions and transformer classes to change raw feature vectors into a representation that is more suitable for the downstream estimators. -In general, learning algorithms benefit from standardization of the data set. If -some outliers are present in the set, robust scalers or transformers are more -appropriate. The behaviors of the different scalers, transformers, and -normalizers on a dataset containing marginal outliers is highlighted in +In general, many learning algorithms such as linear models benefit from standardization of the data set +(see :ref:`sphx_glr_auto_examples_preprocessing_plot_scaling_importance.py`). +If some outliers are present in the set, robust scalers or other transformers can +be more appropriate. The behaviors of the different scalers, transformers, and +normalizers on a dataset containing marginal outliers are highlighted in :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. @@ -34,8 +35,8 @@ standard deviation. For instance, many elements used in the objective function of a learning algorithm (such as the RBF kernel of Support Vector -Machines or the l1 and l2 regularizers of linear models) assume that -all features are centered around zero and have variance in the same +Machines or the l1 and l2 regularizers of linear models) may assume that +all features are centered around zero or have variance in the same order. If a feature has a variance that is orders of magnitude larger than others, it might dominate the objective function and make the estimator unable to learn from other features correctly as expected. @@ -56,16 +57,16 @@ dataset:: StandardScaler() >>> scaler.mean_ - array([1. ..., 0. ..., 0.33...]) + array([1., 0., 0.33]) >>> scaler.scale_ - array([0.81..., 0.81..., 1.24...]) + array([0.81, 0.81, 1.24]) >>> X_scaled = scaler.transform(X_train) >>> X_scaled - array([[ 0. ..., -1.22..., 1.33...], - [ 1.22..., 0. ..., -0.26...], - [-1.22..., 1.22..., -1.06...]]) + array([[ 0. , -1.22, 1.33 ], + [ 1.22, 0. , -0.267], + [-1.22, 1.22, -1.06 ]]) .. >>> import numpy as np @@ -117,7 +118,7 @@ or so that the maximum absolute value of each feature is scaled to unit size. This can be achieved using :class:`MinMaxScaler` or :class:`MaxAbsScaler`, respectively. -The motivation to use this scaling include robustness to very small +The motivation to use this scaling includes robustness to very small standard deviations of features and preserving zero entries in sparse data. 
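Before the ``[0, 1]`` example that follows, a minimal sketch (toy data, not part of
this changeset) showing how :class:`MaxAbsScaler` preserves sparsity: it only divides
by the per-feature maximum absolute value and never centers the data::

    import numpy as np
    from scipy import sparse
    from sklearn.preprocessing import MaxAbsScaler

    X_sparse = sparse.csr_matrix(np.array([[1., 0., -2.],
                                           [0., 4., 0.],
                                           [3., 0., 1.]]))
    scaler = MaxAbsScaler().fit(X_sparse)
    X_scaled = scaler.transform(X_sparse)

    print(scaler.max_abs_)  # per-feature max absolute values: [3. 4. 2.]
    print(X_scaled.nnz)     # zeros are untouched: still 5 stored entries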
Here is an example to scale a toy data matrix to the ``[0, 1]`` range:: @@ -146,10 +147,10 @@ It is possible to introspect the scaler attributes to find about the exact nature of the transformation learned on the training data:: >>> min_max_scaler.scale_ - array([0.5 , 0.5 , 0.33...]) + array([0.5 , 0.5 , 0.33]) >>> min_max_scaler.min_ - array([0. , 0.5 , 0.33...]) + array([0. , 0.5 , 0.33]) If :class:`MinMaxScaler` is given an explicit ``feature_range=(min, max)`` the full formula is:: @@ -219,13 +220,13 @@ of the data is likely to not work very well. In these cases, you can use more robust estimates for the center and range of your data. -.. topic:: References: +.. dropdown:: References Further discussion on the importance of centering and scaling data is available on this FAQ: `Should I normalize/standardize/rescale the data? `_ -.. topic:: Scaling vs Whitening +.. dropdown:: Scaling vs Whitening It is sometimes not enough to center and scale the features independently, since a downstream model can further make some assumption @@ -234,16 +235,72 @@ more robust estimates for the center and range of your data. To address this issue you can use :class:`~sklearn.decomposition.PCA` with ``whiten=True`` to further remove the linear correlation across features. + .. _kernel_centering: Centering kernel matrices ------------------------- If you have a kernel matrix of a kernel :math:`K` that computes a dot product -in a feature space defined by function :math:`\phi`, -a :class:`KernelCenterer` can transform the kernel matrix -so that it contains inner products in the feature space -defined by :math:`\phi` followed by removal of the mean in that space. +in a feature space (possibly implicitly) defined by a function +:math:`\phi(\cdot)`, a :class:`KernelCenterer` can transform the kernel matrix +so that it contains inner products in the feature space defined by :math:`\phi` +followed by the removal of the mean in that space. In other words, +:class:`KernelCenterer` computes the centered Gram matrix associated to a +positive semidefinite kernel :math:`K`. + +.. dropdown:: Mathematical formulation + + We can have a look at the mathematical formulation now that we have the + intuition. Let :math:`K` be a kernel matrix of shape `(n_samples, n_samples)` + computed from :math:`X`, a data matrix of shape `(n_samples, n_features)`, + during the `fit` step. :math:`K` is defined by + + .. math:: + K(X, X) = \phi(X) . \phi(X)^{T} + + :math:`\phi(X)` is a function mapping of :math:`X` to a Hilbert space. A + centered kernel :math:`\tilde{K}` is defined as: + + .. math:: + \tilde{K}(X, X) = \tilde{\phi}(X) . \tilde{\phi}(X)^{T} + + where :math:`\tilde{\phi}(X)` results from centering :math:`\phi(X)` in the + Hilbert space. + + Thus, one could compute :math:`\tilde{K}` by mapping :math:`X` using the + function :math:`\phi(\cdot)` and center the data in this new space. However, + kernels are often used because they allow some algebra calculations that + avoid computing explicitly this mapping using :math:`\phi(\cdot)`. Indeed, one + can implicitly center as shown in Appendix B in [Scholkopf1998]_: + + .. math:: + \tilde{K} = K - 1_{\text{n}_{samples}} K - K 1_{\text{n}_{samples}} + 1_{\text{n}_{samples}} K 1_{\text{n}_{samples}} + + :math:`1_{\text{n}_{samples}}` is a matrix of `(n_samples, n_samples)` where + all entries are equal to :math:`\frac{1}{\text{n}_{samples}}`. In the + `transform` step, the kernel becomes :math:`K_{test}(X, Y)` defined as: + + .. math:: + K_{test}(X, Y) = \phi(Y) . 
\phi(X)^{T} + + :math:`Y` is the test dataset of shape `(n_samples_test, n_features)` and thus + :math:`K_{test}` is of shape `(n_samples_test, n_samples)`. In this case, + centering :math:`K_{test}` is done as: + + .. math:: + \tilde{K}_{test}(X, Y) = K_{test} - 1'_{\text{n}_{samples}} K - K_{test} 1_{\text{n}_{samples}} + 1'_{\text{n}_{samples}} K 1_{\text{n}_{samples}} + + :math:`1'_{\text{n}_{samples}}` is a matrix of shape + `(n_samples_test, n_samples)` where all entries are equal to + :math:`\frac{1}{\text{n}_{samples}}`. + + .. rubric:: References + + .. [Scholkopf1998] B. SchÃļlkopf, A. Smola, and K.R. MÃŧller, + `"Nonlinear component analysis as a kernel eigenvalue problem." + `_ + Neural computation 10.5 (1998): 1299-1319. .. _preprocessing_transformer: @@ -289,21 +346,21 @@ with values between 0 and 1:: array([ 4.3, 5.1, 5.8, 6.5, 7.9]) This feature corresponds to the sepal length in cm. Once the quantile -transformation applied, those landmarks approach closely the percentiles +transformation is applied, those landmarks approach closely the percentiles previously defined:: >>> np.percentile(X_train_trans[:, 0], [0, 25, 50, 75, 100]) ... # doctest: +SKIP - array([ 0.00... , 0.24..., 0.49..., 0.73..., 0.99... ]) + array([ 0.00 , 0.24, 0.49, 0.73, 0.99 ]) -This can be confirmed on a independent testing set with similar remarks:: +This can be confirmed on an independent testing set with similar remarks:: >>> np.percentile(X_test[:, 0], [0, 25, 50, 75, 100]) ... # doctest: +SKIP array([ 4.4 , 5.125, 5.75 , 6.175, 7.3 ]) >>> np.percentile(X_test_trans[:, 0], [0, 25, 50, 75, 100]) ... # doctest: +SKIP - array([ 0.01..., 0.25..., 0.46..., 0.60... , 0.94...]) + array([ 0.01, 0.25, 0.46, 0.60 , 0.94]) Mapping to a Gaussian distribution ---------------------------------- @@ -316,46 +373,46 @@ possible in order to stabilize variance and minimize skewness. :class:`PowerTransformer` currently provides two such power transformations, the Yeo-Johnson transform and the Box-Cox transform. -The Yeo-Johnson transform is given by: - -.. math:: - x_i^{(\lambda)} = - \begin{cases} - [(x_i + 1)^\lambda - 1] / \lambda & \text{if } \lambda \neq 0, x_i \geq 0, \\[8pt] - \ln{(x_i + 1)} & \text{if } \lambda = 0, x_i \geq 0 \\[8pt] - -[(-x_i + 1)^{2 - \lambda} - 1] / (2 - \lambda) & \text{if } \lambda \neq 2, x_i < 0, \\[8pt] - - \ln (- x_i + 1) & \text{if } \lambda = 2, x_i < 0 - \end{cases} - -while the Box-Cox transform is given by: - -.. math:: - x_i^{(\lambda)} = - \begin{cases} - \dfrac{x_i^\lambda - 1}{\lambda} & \text{if } \lambda \neq 0, \\[8pt] - \ln{(x_i)} & \text{if } \lambda = 0, - \end{cases} - - -Box-Cox can only be applied to strictly positive data. In both methods, the -transformation is parameterized by :math:`\lambda`, which is determined through -maximum likelihood estimation. Here is an example of using Box-Cox to map -samples drawn from a lognormal distribution to a normal distribution:: - - >>> pt = preprocessing.PowerTransformer(method='box-cox', standardize=False) - >>> X_lognormal = np.random.RandomState(616).lognormal(size=(3, 3)) - >>> X_lognormal - array([[1.28..., 1.18..., 0.84...], - [0.94..., 1.60..., 0.38...], - [1.35..., 0.21..., 1.09...]]) - >>> pt.fit_transform(X_lognormal) - array([[ 0.49..., 0.17..., -0.15...], - [-0.05..., 0.58..., -0.57...], - [ 0.69..., -0.84..., 0.10...]]) - -While the above example sets the `standardize` option to `False`, -:class:`PowerTransformer` will apply zero-mean, unit-variance normalization -to the transformed output by default. +.. 
dropdown:: Yeo-Johnson transform + + .. math:: + x_i^{(\lambda)} = + \begin{cases} + [(x_i + 1)^\lambda - 1] / \lambda & \text{if } \lambda \neq 0, x_i \geq 0, \\[8pt] + \ln{(x_i + 1)} & \text{if } \lambda = 0, x_i \geq 0 \\[8pt] + -[(-x_i + 1)^{2 - \lambda} - 1] / (2 - \lambda) & \text{if } \lambda \neq 2, x_i < 0, \\[8pt] + - \ln (- x_i + 1) & \text{if } \lambda = 2, x_i < 0 + \end{cases} + +.. dropdown:: Box-Cox transform + + .. math:: + x_i^{(\lambda)} = + \begin{cases} + \dfrac{x_i^\lambda - 1}{\lambda} & \text{if } \lambda \neq 0, \\[8pt] + \ln{(x_i)} & \text{if } \lambda = 0, + \end{cases} + + Box-Cox can only be applied to strictly positive data. In both methods, the + transformation is parameterized by :math:`\lambda`, which is determined through + maximum likelihood estimation. Here is an example of using Box-Cox to map + samples drawn from a lognormal distribution to a normal distribution:: + + >>> pt = preprocessing.PowerTransformer(method='box-cox', standardize=False) + >>> X_lognormal = np.random.RandomState(616).lognormal(size=(3, 3)) + >>> X_lognormal + array([[1.28, 1.18 , 0.84 ], + [0.94, 1.60 , 0.388], + [1.35, 0.217, 1.09 ]]) + >>> pt.fit_transform(X_lognormal) + array([[ 0.49 , 0.179, -0.156], + [-0.051, 0.589, -0.576], + [ 0.69 , -0.849, 0.101]]) + + While the above example sets the `standardize` option to `False`, + :class:`PowerTransformer` will apply zero-mean, unit-variance normalization + to the transformed output by default. + Below are examples of Box-Cox and Yeo-Johnson applied to various probability distributions. Note that when applied to certain distributions, the power @@ -413,9 +470,9 @@ operation on a single array-like dataset, either using the ``l1``, ``l2``, or >>> X_normalized = preprocessing.normalize(X, norm='l2') >>> X_normalized - array([[ 0.40..., -0.40..., 0.81...], - [ 1. ..., 0. ..., 0. ...], - [ 0. ..., 0.70..., -0.70...]]) + array([[ 0.408, -0.408, 0.812], + [ 1. , 0. , 0. ], + [ 0. , 0.707, -0.707]]) The ``preprocessing`` module further provides a utility class :class:`Normalizer` that implements the same operation using the @@ -433,17 +490,17 @@ This class is hence suitable for use in the early steps of a The normalizer instance can then be used on sample vectors as any transformer:: >>> normalizer.transform(X) - array([[ 0.40..., -0.40..., 0.81...], - [ 1. ..., 0. ..., 0. ...], - [ 0. ..., 0.70..., -0.70...]]) + array([[ 0.408, -0.408, 0.812], + [ 1. , 0. , 0. ], + [ 0. , 0.707, -0.707]]) >>> normalizer.transform([[-1., 1., 0.]]) - array([[-0.70..., 0.70..., 0. ...]]) + array([[-0.707, 0.707, 0.]]) Note: L2 normalization is also known as spatial sign preprocessing. -.. topic:: Sparse input +.. dropdown:: Sparse input :func:`normalize` and :class:`Normalizer` accept **both dense array-like and sparse matrices from scipy.sparse as input**. @@ -457,6 +514,7 @@ Note: L2 normalization is also known as spatial sign preprocessing. Encoding categorical features ============================= + Often features are not given as continuous values but categorical. For example a person could have features ``["male", "female"]``, ``["from Europe", "from US", "from Asia"]``, @@ -482,8 +540,8 @@ scikit-learn estimators, as these expect continuous input, and would interpret the categories as being ordered, which is often not desired (i.e. the set of browsers was ordered arbitrarily). -:class:`OrdinalEncoder` will also passthrough missing values that are -indicated by `np.nan`. 
+By default, :class:`OrdinalEncoder` will also passthrough missing values that +are indicated by `np.nan`. >>> enc = preprocessing.OrdinalEncoder() >>> X = [['male'], ['female'], [np.nan], ['female']] @@ -493,6 +551,32 @@ indicated by `np.nan`. [nan], [ 0.]]) +:class:`OrdinalEncoder` provides a parameter `encoded_missing_value` to encode +the missing values without the need to create a pipeline and using +:class:`~sklearn.impute.SimpleImputer`. + + >>> enc = preprocessing.OrdinalEncoder(encoded_missing_value=-1) + >>> X = [['male'], ['female'], [np.nan], ['female']] + >>> enc.fit_transform(X) + array([[ 1.], + [ 0.], + [-1.], + [ 0.]]) + +The above processing is equivalent to the following pipeline:: + + >>> from sklearn.pipeline import Pipeline + >>> from sklearn.impute import SimpleImputer + >>> enc = Pipeline(steps=[ + ... ("encoder", preprocessing.OrdinalEncoder()), + ... ("imputer", SimpleImputer(strategy="constant", fill_value=-1)), + ... ]) + >>> enc.fit_transform(X) + array([[ 1.], + [ 0.], + [-1.], + [ 0.]]) + Another possibility to convert categorical features to features that can be used with scikit-learn estimators is to use a one-of-K, also known as one-hot or dummy encoding. @@ -539,17 +623,19 @@ dataset:: array([[1., 0., 0., 1., 0., 0., 1., 0., 0., 0.]]) If there is a possibility that the training data might have missing categorical -features, it can often be better to specify ``handle_unknown='ignore'`` instead -of setting the ``categories`` manually as above. When -``handle_unknown='ignore'`` is specified and unknown categories are encountered -during transform, no error will be raised but the resulting one-hot encoded -columns for this feature will be all zeros -(``handle_unknown='ignore'`` is only supported for one-hot encoding):: - - >>> enc = preprocessing.OneHotEncoder(handle_unknown='ignore') +features, it can often be better to specify +`handle_unknown='infrequent_if_exist'` instead of setting the `categories` +manually as above. When `handle_unknown='infrequent_if_exist'` is specified +and unknown categories are encountered during transform, no error will be +raised but the resulting one-hot encoded columns for this feature will be all +zeros or considered as an infrequent category if enabled. +(`handle_unknown='infrequent_if_exist'` is only supported for one-hot +encoding):: + + >>> enc = preprocessing.OneHotEncoder(handle_unknown='infrequent_if_exist') >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']] >>> enc.fit(X) - OneHotEncoder(handle_unknown='ignore') + OneHotEncoder(handle_unknown='infrequent_if_exist') >>> enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray() array([[1., 0., 0., 0., 0., 0.]]) @@ -560,15 +646,14 @@ parameter allows the user to specify a category for each feature to be dropped. This is useful to avoid co-linearity in the input matrix in some classifiers. Such functionality is useful, for example, when using non-regularized regression (:class:`LinearRegression `), -since co-linearity would cause the covariance matrix to be non-invertible. -When this parameter is not None, ``handle_unknown`` must be set to -``error``:: +since co-linearity would cause the covariance matrix to be non-invertible:: >>> X = [['male', 'from US', 'uses Safari'], ... 
['female', 'from Europe', 'uses Firefox']] >>> drop_enc = preprocessing.OneHotEncoder(drop='first').fit(X) >>> drop_enc.categories_ - [array(['female', 'male'], dtype=object), array(['from Europe', 'from US'], dtype=object), array(['uses Firefox', 'uses Safari'], dtype=object)] + [array(['female', 'male'], dtype=object), array(['from Europe', 'from US'], dtype=object), + array(['uses Firefox', 'uses Safari'], dtype=object)] >>> drop_enc.transform(X).toarray() array([[1., 1., 1.], [0., 0., 0.]]) @@ -581,46 +666,326 @@ categories. In this case, you can set the parameter `drop='if_binary'`. ... ['female', 'Asia', 'Chrome']] >>> drop_enc = preprocessing.OneHotEncoder(drop='if_binary').fit(X) >>> drop_enc.categories_ - [array(['female', 'male'], dtype=object), array(['Asia', 'Europe', 'US'], dtype=object), array(['Chrome', 'Firefox', 'Safari'], dtype=object)] + [array(['female', 'male'], dtype=object), array(['Asia', 'Europe', 'US'], dtype=object), + array(['Chrome', 'Firefox', 'Safari'], dtype=object)] >>> drop_enc.transform(X).toarray() array([[1., 0., 0., 1., 0., 0., 1.], [0., 0., 1., 0., 0., 1., 0.], [0., 1., 0., 0., 1., 0., 0.]]) In the transformed `X`, the first column is the encoding of the feature with -categories "male"/"female", while the remaining 6 columns is the encoding of +categories "male"/"female", while the remaining 6 columns are the encoding of the 2 features with respectively 3 categories each. -:class:`OneHotEncoder` supports categorical features with missing values by -considering the missing values as an additional category:: +When `handle_unknown='ignore'` and `drop` is not None, unknown categories will +be encoded as all zeros:: + + >>> drop_enc = preprocessing.OneHotEncoder(drop='first', + ... handle_unknown='ignore').fit(X) + >>> X_test = [['unknown', 'America', 'IE']] + >>> drop_enc.transform(X_test).toarray() + array([[0., 0., 0., 0., 0.]]) + +All the categories in `X_test` are unknown during transform and will be mapped +to all zeros. This means that unknown categories will have the same mapping as +the dropped category. :meth:`OneHotEncoder.inverse_transform` will map all zeros +to the dropped category if a category is dropped and `None` if a category is +not dropped:: + + >>> drop_enc = preprocessing.OneHotEncoder(drop='if_binary', sparse_output=False, + ... handle_unknown='ignore').fit(X) + >>> X_test = [['unknown', 'America', 'IE']] + >>> X_trans = drop_enc.transform(X_test) + >>> X_trans + array([[0., 0., 0., 0., 0., 0., 0.]]) + >>> drop_enc.inverse_transform(X_trans) + array([['female', None, None]], dtype=object) + +.. dropdown:: Support of categorical features with missing values + + :class:`OneHotEncoder` supports categorical features with missing values by + considering the missing values as an additional category:: + + >>> X = [['male', 'Safari'], + ... ['female', None], + ... 
[np.nan, 'Firefox']] + >>> enc = preprocessing.OneHotEncoder(handle_unknown='error').fit(X) + >>> enc.categories_ + [array(['female', 'male', nan], dtype=object), + array(['Firefox', 'Safari', None], dtype=object)] + >>> enc.transform(X).toarray() + array([[0., 1., 0., 0., 1., 0.], + [1., 0., 0., 0., 0., 1.], + [0., 0., 1., 1., 0., 0.]]) + + If a feature contains both `np.nan` and `None`, they will be considered + separate categories:: + + >>> X = [['Safari'], [None], [np.nan], ['Firefox']] + >>> enc = preprocessing.OneHotEncoder(handle_unknown='error').fit(X) + >>> enc.categories_ + [array(['Firefox', 'Safari', None, nan], dtype=object)] + >>> enc.transform(X).toarray() + array([[0., 1., 0., 0.], + [0., 0., 1., 0.], + [0., 0., 0., 1.], + [1., 0., 0., 0.]]) + + See :ref:`dict_feature_extraction` for categorical features that are + represented as a dict, not as scalars. + + +.. _encoder_infrequent_categories: + +Infrequent categories +--------------------- - >>> X = [['male', 'Safari'], - ... ['female', None], - ... [np.nan, 'Firefox']] - >>> enc = preprocessing.OneHotEncoder(handle_unknown='error').fit(X) - >>> enc.categories_ - [array(['female', 'male', nan], dtype=object), - array(['Firefox', 'Safari', None], dtype=object)] - >>> enc.transform(X).toarray() - array([[0., 1., 0., 0., 1., 0.], - [1., 0., 0., 0., 0., 1.], - [0., 0., 1., 1., 0., 0.]]) - -If a feature contains both `np.nan` and `None`, they will be considered -separate categories:: - - >>> X = [['Safari'], [None], [np.nan], ['Firefox']] - >>> enc = preprocessing.OneHotEncoder(handle_unknown='error').fit(X) - >>> enc.categories_ - [array(['Firefox', 'Safari', None, nan], dtype=object)] - >>> enc.transform(X).toarray() - array([[0., 1., 0., 0.], - [0., 0., 1., 0.], - [0., 0., 0., 1.], - [1., 0., 0., 0.]]) +:class:`OneHotEncoder` and :class:`OrdinalEncoder` support aggregating +infrequent categories into a single output for each feature. The parameters to +enable the gathering of infrequent categories are `min_frequency` and +`max_categories`. + +1. `min_frequency` is either an integer greater or equal to 1, or a float in + the interval `(0.0, 1.0)`. If `min_frequency` is an integer, categories with + a cardinality smaller than `min_frequency` will be considered infrequent. + If `min_frequency` is a float, categories with a cardinality smaller than + this fraction of the total number of samples will be considered infrequent. + The default value is 1, which means every category is encoded separately. + +2. `max_categories` is either `None` or any integer greater than 1. This + parameter sets an upper limit to the number of output features for each + input feature. `max_categories` includes the feature that combines + infrequent categories. + +In the following example with :class:`OrdinalEncoder`, the categories `'dog'` +and `'snake'` are considered infrequent:: + + >>> X = np.array([['dog'] * 5 + ['cat'] * 20 + ['rabbit'] * 10 + + ... ['snake'] * 3], dtype=object).T + >>> enc = preprocessing.OrdinalEncoder(min_frequency=6).fit(X) + >>> enc.infrequent_categories_ + [array(['dog', 'snake'], dtype=object)] + >>> enc.transform(np.array([['dog'], ['cat'], ['rabbit'], ['snake']])) + array([[2.], + [0.], + [1.], + [2.]]) + +:class:`OrdinalEncoder`'s `max_categories` do **not** take into account missing +or unknown categories. Setting `unknown_value` or `encoded_missing_value` to an +integer will increase the number of unique integer codes by one each. This can +result in up to `max_categories + 2` integer codes. 
In the following example, +"a" and "d" are considered infrequent and grouped together into a single +category, "b" and "c" are their own categories, unknown values are encoded as 3 +and missing values are encoded as 4. + + >>> X_train = np.array( + ... [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3 + [np.nan]], + ... dtype=object).T + >>> enc = preprocessing.OrdinalEncoder( + ... handle_unknown="use_encoded_value", unknown_value=3, + ... max_categories=3, encoded_missing_value=4) + >>> _ = enc.fit(X_train) + >>> X_test = np.array([["a"], ["b"], ["c"], ["d"], ["e"], [np.nan]], dtype=object) + >>> enc.transform(X_test) + array([[2.], + [0.], + [1.], + [2.], + [3.], + [4.]]) + +Similarly, :class:`OneHotEncoder` can be configured to group together infrequent +categories:: + + >>> enc = preprocessing.OneHotEncoder(min_frequency=6, sparse_output=False).fit(X) + >>> enc.infrequent_categories_ + [array(['dog', 'snake'], dtype=object)] + >>> enc.transform(np.array([['dog'], ['cat'], ['rabbit'], ['snake']])) + array([[0., 0., 1.], + [1., 0., 0.], + [0., 1., 0.], + [0., 0., 1.]]) + +By setting handle_unknown to `'infrequent_if_exist'`, unknown categories will +be considered infrequent:: + + >>> enc = preprocessing.OneHotEncoder( + ... handle_unknown='infrequent_if_exist', sparse_output=False, min_frequency=6) + >>> enc = enc.fit(X) + >>> enc.transform(np.array([['dragon']])) + array([[0., 0., 1.]]) + +:meth:`OneHotEncoder.get_feature_names_out` uses 'infrequent' as the infrequent +feature name:: + + >>> enc.get_feature_names_out() + array(['x0_cat', 'x0_rabbit', 'x0_infrequent_sklearn'], dtype=object) + +When `'handle_unknown'` is set to `'infrequent_if_exist'` and an unknown +category is encountered in transform: + +1. If infrequent category support was not configured or there was no + infrequent category during training, the resulting one-hot encoded columns + for this feature will be all zeros. In the inverse transform, an unknown + category will be denoted as `None`. + +2. If there is an infrequent category during training, the unknown category + will be considered infrequent. In the inverse transform, 'infrequent_sklearn' + will be used to represent the infrequent category. + +Infrequent categories can also be configured using `max_categories`. In the +following example, we set `max_categories=2` to limit the number of features in +the output. This will result in all but the `'cat'` category to be considered +infrequent, leading to two features, one for `'cat'` and one for infrequent +categories - which are all the others:: + + >>> enc = preprocessing.OneHotEncoder(max_categories=2, sparse_output=False) + >>> enc = enc.fit(X) + >>> enc.transform([['dog'], ['cat'], ['rabbit'], ['snake']]) + array([[0., 1.], + [1., 0.], + [0., 1.], + [0., 1.]]) + +If both `max_categories` and `min_frequency` are non-default values, then +categories are selected based on `min_frequency` first and `max_categories` +categories are kept. In the following example, `min_frequency=4` considers +only `snake` to be infrequent, but `max_categories=3`, forces `dog` to also be +infrequent:: + + >>> enc = preprocessing.OneHotEncoder(min_frequency=4, max_categories=3, sparse_output=False) + >>> enc = enc.fit(X) + >>> enc.transform([['dog'], ['cat'], ['rabbit'], ['snake']]) + array([[0., 0., 1.], + [1., 0., 0.], + [0., 1., 0.], + [0., 0., 1.]]) + +If there are infrequent categories with the same cardinality at the cutoff of +`max_categories`, then the first `max_categories` are taken based on lexicon +ordering. 
In the following example, "b", "c", and "d", have the same cardinality +and with `max_categories=2`, "b" and "c" are infrequent because they have a higher +lexicon order. + + >>> X = np.asarray([["a"] * 20 + ["b"] * 10 + ["c"] * 10 + ["d"] * 10], dtype=object).T + >>> enc = preprocessing.OneHotEncoder(max_categories=3).fit(X) + >>> enc.infrequent_categories_ + [array(['b', 'c'], dtype=object)] + +.. _target_encoder: + +Target Encoder +-------------- + +.. currentmodule:: sklearn.preprocessing + +The :class:`TargetEncoder` uses the target mean conditioned on the categorical +feature for encoding unordered categories, i.e. nominal categories [PAR]_ +[MIC]_. This encoding scheme is useful with categorical features with high +cardinality, where one-hot encoding would inflate the feature space making it +more expensive for a downstream model to process. A classical example of high +cardinality categories are location based such as zip code or region. + +.. dropdown:: Binary classification targets + + For the binary classification target, the target encoding is given by: + + .. math:: + S_i = \lambda_i\frac{n_{iY}}{n_i} + (1 - \lambda_i)\frac{n_Y}{n} + + where :math:`S_i` is the encoding for category :math:`i`, :math:`n_{iY}` is the + number of observations with :math:`Y=1` and category :math:`i`, :math:`n_i` is + the number of observations with category :math:`i`, :math:`n_Y` is the number of + observations with :math:`Y=1`, :math:`n` is the number of observations, and + :math:`\lambda_i` is a shrinkage factor for category :math:`i`. The shrinkage + factor is given by: + + .. math:: + \lambda_i = \frac{n_i}{m + n_i} + + where :math:`m` is a smoothing factor, which is controlled with the `smooth` + parameter in :class:`TargetEncoder`. Large smoothing factors will put more + weight on the global mean. When `smooth="auto"`, the smoothing factor is + computed as an empirical Bayes estimate: :math:`m=\sigma_i^2/\tau^2`, where + :math:`\sigma_i^2` is the variance of `y` with category :math:`i` and + :math:`\tau^2` is the global variance of `y`. + +.. dropdown:: Multiclass classification targets + + For multiclass classification targets, the formulation is similar to binary + classification: + + .. math:: + S_{ij} = \lambda_i\frac{n_{iY_j}}{n_i} + (1 - \lambda_i)\frac{n_{Y_j}}{n} + + where :math:`S_{ij}` is the encoding for category :math:`i` and class :math:`j`, + :math:`n_{iY_j}` is the number of observations with :math:`Y=j` and category + :math:`i`, :math:`n_i` is the number of observations with category :math:`i`, + :math:`n_{Y_j}` is the number of observations with :math:`Y=j`, :math:`n` is the + number of observations, and :math:`\lambda_i` is a shrinkage factor for category + :math:`i`. + +.. dropdown:: Continuous targets + + For continuous targets, the formulation is similar to binary classification: -See :ref:`dict_feature_extraction` for categorical features that are -represented as a dict, not as scalars. + .. math:: + S_i = \lambda_i\frac{\sum_{k\in L_i}Y_k}{n_i} + (1 - \lambda_i)\frac{\sum_{k=1}^{n}Y_k}{n} + + where :math:`L_i` is the set of observations with category :math:`i` and + :math:`n_i` is the number of observations with category :math:`i`. + + +:meth:`~TargetEncoder.fit_transform` internally relies on a :term:`cross fitting` +scheme to prevent target information from leaking into the train-time +representation, especially for non-informative high-cardinality categorical +variables, and help prevent the downstream model from overfitting spurious +correlations. 
Note that as a result, `fit(X, y).transform(X)` does not equal +`fit_transform(X, y)`. In :meth:`~TargetEncoder.fit_transform`, the training +data is split into *k* folds (determined by the `cv` parameter) and each fold is +encoded using the encodings learnt using the other *k-1* folds. The following +diagram shows the :term:`cross fitting` scheme in +:meth:`~TargetEncoder.fit_transform` with the default `cv=5`: + +.. image:: ../images/target_encoder_cross_validation.svg + :width: 600 + :align: center + +:meth:`~TargetEncoder.fit_transform` also learns a 'full data' encoding using +the whole training set. This is never used in +:meth:`~TargetEncoder.fit_transform` but is saved to the attribute `encodings_`, +for use when :meth:`~TargetEncoder.transform` is called. Note that the encodings +learned for each fold during the :term:`cross fitting` scheme are not saved to +an attribute. + +The :meth:`~TargetEncoder.fit` method does **not** use any :term:`cross fitting` +schemes and learns one encoding on the entire training set, which is used to +encode categories in :meth:`~TargetEncoder.transform`. +This encoding is the same as the 'full data' +encoding learned in :meth:`~TargetEncoder.fit_transform`. + +.. note:: + :class:`TargetEncoder` considers missing values, such as `np.nan` or `None`, + as another category and encodes them like any other category. Categories + that are not seen during `fit` are encoded with the target mean, i.e. + `target_mean_`. + +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py` +* :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder_cross_val.py` + +.. rubric:: References + +.. [MIC] :doi:`Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality + categorical attributes in classification and prediction problems" + SIGKDD Explor. Newsl. 3, 1 (July 2001), 27-32. <10.1145/507533.507538>` + +.. [PAR] :doi:`Pargent, F., Pfisterer, F., Thomas, J. et al. "Regularized target + encoding outperforms traditional methods in supervised machine learning with + high cardinality features" Comput Stat 37, 2671-2692 (2022) + <10.1007/s00180-022-01207-6>` .. _preprocessing_discretization: @@ -656,9 +1021,9 @@ For each feature, the bin edges are computed during ``fit`` and together with the number of bins, they will define the intervals. Therefore, for the current example, these intervals are defined as: - - feature 1: :math:`{[-\infty, -1), [-1, 2), [2, \infty)}` - - feature 2: :math:`{[-\infty, 5), [5, \infty)}` - - feature 3: :math:`{[-\infty, 14), [14, \infty)}` +- feature 1: :math:`{[-\infty, -1), [-1, 2), [2, \infty)}` +- feature 2: :math:`{[-\infty, 5), [5, \infty)}` +- feature 3: :math:`{[-\infty, 14), [14, \infty)}` Based on these bin intervals, ``X`` is transformed as follows:: @@ -686,6 +1051,8 @@ For instance, we can use the Pandas function :func:`pandas.cut`:: >>> import pandas as pd >>> import numpy as np + >>> from sklearn import preprocessing + >>> >>> bins = [0, 1, 13, 20, 60, np.inf] >>> labels = ['infant', 'kid', 'teen', 'adult', 'senior citizen'] >>> transformer = preprocessing.FunctionTransformer( @@ -696,11 +1063,11 @@ For instance, we can use the Pandas function :func:`pandas.cut`:: ['infant', 'kid', 'teen', 'adult', 'senior citizen'] Categories (5, object): ['infant' < 'kid' < 'teen' < 'adult' < 'senior citizen'] -.. topic:: Examples: +.. 
rubric:: Examples - * :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py` - * :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py` - * :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py` +* :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py` +* :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py` +* :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py` .. _preprocessing_binarization: @@ -847,23 +1214,23 @@ below. Some of the advantages of splines over polynomials are: - - B-splines are very flexible and robust if you keep a fixed low degree, - usually 3, and parsimoniously adapt the number of knots. Polynomials - would need a higher degree, which leads to the next point. - - B-splines do not have oscillatory behaviour at the boundaries as have - polynomials (the higher the degree, the worse). This is known as `Runge's - phenomenon `_. - - B-splines provide good options for extrapolation beyond the boundaries, - i.e. beyond the range of fitted values. Have a look at the option - ``extrapolation``. - - B-splines generate a feature matrix with a banded structure. For a single - feature, every row contains only ``degree + 1`` non-zero elements, which - occur consecutively and are even positive. This results in a matrix with - good numerical properties, e.g. a low condition number, in sharp contrast - to a matrix of polynomials, which goes under the name - `Vandermonde matrix `_. - A low condition number is important for stable algorithms of linear - models. +- B-splines are very flexible and robust if you keep a fixed low degree, + usually 3, and parsimoniously adapt the number of knots. Polynomials + would need a higher degree, which leads to the next point. +- B-splines do not have oscillatory behaviour at the boundaries as have + polynomials (the higher the degree, the worse). This is known as `Runge's + phenomenon `_. +- B-splines provide good options for extrapolation beyond the boundaries, + i.e. beyond the range of fitted values. Have a look at the option + ``extrapolation``. +- B-splines generate a feature matrix with a banded structure. For a single + feature, every row contains only ``degree + 1`` non-zero elements, which + occur consecutively and are even positive. This results in a matrix with + good numerical properties, e.g. a low condition number, in sharp contrast + to a matrix of polynomials, which goes under the name + `Vandermonde matrix `_. + A low condition number is important for stable algorithms of linear + models. The following code snippet shows splines in action:: @@ -893,20 +1260,20 @@ Interestingly, a :class:`SplineTransformer` of ``degree=0`` is the same as ``encode='onehot-dense'`` and ``n_bins = n_knots - 1`` if ``knots = strategy``. -.. topic:: Examples: +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_linear_model_plot_polynomial_interpolation.py` +* :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py` - * :ref:`sphx_glr_auto_examples_linear_model_plot_polynomial_interpolation.py` +.. dropdown:: References -.. topic:: References: + * Eilers, P., & Marx, B. (1996). :doi:`Flexible Smoothing with B-splines and + Penalties <10.1214/ss/1038425655>`. Statist. Sci. 11 (1996), no. 2, 89--121. - * Eilers, P., & Marx, B. (1996). Flexible Smoothing with B-splines and - Penalties. Statist. Sci. 11 (1996), no. 2, 89--121. 
- `doi:10.1214/ss/1038425655 `_ + * Perperoglou, A., Sauerbrei, W., Abrahamowicz, M. et al. :doi:`A review of + spline function procedures in R <10.1186/s12874-019-0666-3>`. + BMC Med Res Methodol 19, 46 (2019). - * Perperoglou, A., Sauerbrei, W., Abrahamowicz, M. et al. A review of - spline function procedures in R. BMC Med Res Methodol 19, 46 (2019). - `doi:10.1186/s12874-019-0666-3 - `_ .. _function_transformer: @@ -922,6 +1289,7 @@ a transformer that applies a log transformation in a pipeline, do:: >>> from sklearn.preprocessing import FunctionTransformer >>> transformer = FunctionTransformer(np.log1p, validate=True) >>> X = np.array([[0, 1], [2, 3]]) + >>> # Since FunctionTransformer is no-op during fit, we can call transform directly >>> transformer.transform(X) array([[0. , 0.69314718], [1.09861229, 1.38629436]]) @@ -937,4 +1305,5 @@ error with a ``filterwarnings``:: For a full code example that demonstrates using a :class:`FunctionTransformer` to extract features from text data see -:ref:`sphx_glr_auto_examples_compose_plot_column_transformer.py` +:ref:`sphx_glr_auto_examples_compose_plot_column_transformer.py` and +:ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py`. diff --git a/doc/modules/preprocessing_targets.rst b/doc/modules/preprocessing_targets.rst index b7e8802785257..f8035bc059af4 100644 --- a/doc/modules/preprocessing_targets.rst +++ b/doc/modules/preprocessing_targets.rst @@ -95,8 +95,8 @@ hashable and comparable) to numerical labels:: >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) LabelEncoder() >>> list(le.classes_) - ['amsterdam', 'paris', 'tokyo'] + [np.str_('amsterdam'), np.str_('paris'), np.str_('tokyo')] >>> le.transform(["tokyo", "tokyo", "paris"]) array([2, 2, 1]) >>> list(le.inverse_transform([2, 2, 1])) - ['tokyo', 'tokyo', 'paris'] + [np.str_('tokyo'), np.str_('tokyo'), np.str_('paris')] diff --git a/doc/modules/random_projection.rst b/doc/modules/random_projection.rst index b8a87fe602c9a..ec437c60c7d4c 100644 --- a/doc/modules/random_projection.rst +++ b/doc/modules/random_projection.rst @@ -19,19 +19,19 @@ samples of the dataset. Thus random projection is a suitable approximation technique for distance based method. -.. topic:: References: +.. rubric:: References - * Sanjoy Dasgupta. 2000. - `Experiments with random projection. `_ - In Proceedings of the Sixteenth conference on Uncertainty in artificial - intelligence (UAI'00), Craig Boutilier and MoisÊs Goldszmidt (Eds.). Morgan - Kaufmann Publishers Inc., San Francisco, CA, USA, 143-151. +* Sanjoy Dasgupta. 2000. + `Experiments with random projection. `_ + In Proceedings of the Sixteenth conference on Uncertainty in artificial + intelligence (UAI'00), Craig Boutilier and MoisÊs Goldszmidt (Eds.). Morgan + Kaufmann Publishers Inc., San Francisco, CA, USA, 143-151. - * Ella Bingham and Heikki Mannila. 2001. - `Random projection in dimensionality reduction: applications to image and text data. `_ - In Proceedings of the seventh ACM SIGKDD international conference on - Knowledge discovery and data mining (KDD '01). ACM, New York, NY, USA, - 245-250. +* Ella Bingham and Heikki Mannila. 2001. + `Random projection in dimensionality reduction: applications to image and text data. `_ + In Proceedings of the seventh ACM SIGKDD international conference on + Knowledge discovery and data mining (KDD '01). ACM, New York, NY, USA, + 245-250. .. 
_johnson_lindenstrauss: @@ -58,7 +58,7 @@ bounded distortion introduced by the random projection:: >>> from sklearn.random_projection import johnson_lindenstrauss_min_dim >>> johnson_lindenstrauss_min_dim(n_samples=1e6, eps=0.5) - 663 + np.int64(663) >>> johnson_lindenstrauss_min_dim(n_samples=1e6, eps=[0.5, 0.1, 0.01]) array([ 663, 11841, 1112658]) >>> johnson_lindenstrauss_min_dim(n_samples=[1e4, 1e5, 1e6], eps=0.1) @@ -74,17 +74,17 @@ bounded distortion introduced by the random projection:: :scale: 75 :align: center -.. topic:: Example: +.. rubric:: Examples - * See :ref:`sphx_glr_auto_examples_miscellaneous_plot_johnson_lindenstrauss_bound.py` - for a theoretical explication on the Johnson-Lindenstrauss lemma and an - empirical validation using sparse random matrices. +* See :ref:`sphx_glr_auto_examples_miscellaneous_plot_johnson_lindenstrauss_bound.py` + for a theoretical explication on the Johnson-Lindenstrauss lemma and an + empirical validation using sparse random matrices. -.. topic:: References: +.. rubric:: References - * Sanjoy Dasgupta and Anupam Gupta, 1999. - `An elementary proof of the Johnson-Lindenstrauss Lemma. - `_ +* Sanjoy Dasgupta and Anupam Gupta, 1999. + `An elementary proof of the Johnson-Lindenstrauss Lemma. + `_ .. _gaussian_random_matrix: @@ -95,7 +95,7 @@ dimensionality by projecting the original input space on a randomly generated matrix where components are drawn from the following distribution :math:`N(0, \frac{1}{n_{components}})`. -Here a small excerpt which illustrates how to use the Gaussian random +Here is a small excerpt which illustrates how to use the Gaussian random projection transformer:: >>> import numpy as np @@ -136,7 +136,7 @@ where :math:`n_{\text{components}}` is the size of the projected subspace. By default the density of non zero elements is set to the minimum density as recommended by Ping Li et al.: :math:`1 / \sqrt{n_{\text{features}}}`. -Here a small excerpt which illustrates how to use the sparse random +Here is a small excerpt which illustrates how to use the sparse random projection transformer:: >>> import numpy as np @@ -148,15 +148,53 @@ projection transformer:: (100, 3947) -.. topic:: References: +.. rubric:: References - * D. Achlioptas. 2003. - `Database-friendly random projections: Johnson-Lindenstrauss with binary - coins `_. - Journal of Computer and System Sciences 66 (2003) 671–687 +* D. Achlioptas. 2003. + `Database-friendly random projections: Johnson-Lindenstrauss with binary + coins `_. + Journal of Computer and System Sciences 66 (2003) 671-687. - * Ping Li, Trevor J. Hastie, and Kenneth W. Church. 2006. - `Very sparse random projections. `_ - In Proceedings of the 12th ACM SIGKDD international conference on - Knowledge discovery and data mining (KDD '06). ACM, New York, NY, USA, - 287-296. +* Ping Li, Trevor J. Hastie, and Kenneth W. Church. 2006. + `Very sparse random projections. `_ + In Proceedings of the 12th ACM SIGKDD international conference on + Knowledge discovery and data mining (KDD '06). ACM, New York, NY, USA, 287-296. + + +.. _random_projection_inverse_transform: + +Inverse Transform +================= +The random projection transformers have ``compute_inverse_components`` parameter. When +set to True, after creating the random ``components_`` matrix during fitting, +the transformer computes the pseudo-inverse of this matrix and stores it as +``inverse_components_``. 
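In matrix notation this can be sketched as follows, writing :math:`P` for ``components_`` and :math:`P^+` for its pseudo-inverse (these symbols are introduced here only for illustration):

.. math::

   X' = X P^T, \qquad \hat{X} = X' (P^+)^T \approx X

where :math:`X'` is the projected data returned by ``transform`` and :math:`\hat{X}` is the approximate reconstruction returned by ``inverse_transform``.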
The ``inverse_components_`` matrix has shape +:math:`n_{features} \times n_{components}`, and it is always a dense matrix, +regardless of whether the components matrix is sparse or dense. So depending on +the number of features and components, it may use a lot of memory. + +When the ``inverse_transform`` method is called, it computes the product of the +input ``X`` and the transpose of the inverse components. If the inverse components have +been computed during fit, they are reused at each call to ``inverse_transform``. +Otherwise they are recomputed each time, which can be costly. The result is always +dense, even if ``X`` is sparse. + +Here is a small code example which illustrates how to use the inverse transform +feature:: + + >>> import numpy as np + >>> from sklearn.random_projection import SparseRandomProjection + >>> X = np.random.rand(100, 10000) + >>> transformer = SparseRandomProjection( + ... compute_inverse_components=True + ... ) + ... + >>> X_new = transformer.fit_transform(X) + >>> X_new.shape + (100, 3947) + >>> X_new_inversed = transformer.inverse_transform(X_new) + >>> X_new_inversed.shape + (100, 10000) + >>> X_new_again = transformer.transform(X_new_inversed) + >>> np.allclose(X_new, X_new_again) + True diff --git a/doc/modules/semi_supervised.rst b/doc/modules/semi_supervised.rst index 7c1ea8f296a49..6c050b698f42c 100644 --- a/doc/modules/semi_supervised.rst +++ b/doc/modules/semi_supervised.rst @@ -40,8 +40,8 @@ this algorithm, a given supervised classifier can function as a semi-supervised classifier, allowing it to learn from unlabeled data. :class:`SelfTrainingClassifier` can be called with any classifier that -implements `predict_proba`, passed as the parameter `base_classifier`. In -each iteration, the `base_classifier` predicts labels for the unlabeled +implements `predict_proba`, passed as the parameter `estimator`. In +each iteration, the `estimator` predicts labels for the unlabeled samples and adds a subset of these labels to the labeled dataset. The choice of this subset is determined by the selection criterion. This @@ -60,18 +60,18 @@ until all samples have labels or no new samples are selected in that iteration. When using the self-training classifier, the :ref:`calibration ` of the classifier is important. -.. topic:: Examples +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_semi_supervised_plot_self_training_varying_threshold.py` - * :ref:`sphx_glr_auto_examples_semi_supervised_plot_semi_supervised_versus_svm_iris.py` +* :ref:`sphx_glr_auto_examples_semi_supervised_plot_self_training_varying_threshold.py` +* :ref:`sphx_glr_auto_examples_semi_supervised_plot_semi_supervised_versus_svm_iris.py` -.. topic:: References +.. rubric:: References - .. [1] David Yarowsky. 1995. Unsupervised word sense disambiguation rivaling - supervised methods. In Proceedings of the 33rd annual meeting on - Association for Computational Linguistics (ACL '95). Association for - Computational Linguistics, Stroudsburg, PA, USA, 189-196. DOI: - https://doi.org/10.3115/981658.981684 +.. [1] :doi:`"Unsupervised word sense disambiguation rivaling supervised methods" + <10.3115/981658.981684>` + David Yarowsky, Proceedings of the 33rd annual meeting on Association for + Computational Linguistics (ACL '95). Association for Computational Linguistics, + Stroudsburg, PA, USA, 189-196. .. _label_propagation: @@ -79,7 +79,7 @@ Label Propagation ================= Label propagation denotes a few variations of semi-supervised graph -inference algorithms. +inference algorithms. 
A few features available in this model: * Used for classification tasks @@ -87,7 +87,7 @@ A few features available in this model: `scikit-learn` provides two label propagation models: :class:`LabelPropagation` and :class:`LabelSpreading`. Both work by -constructing a similarity graph over all items in the input dataset. +constructing a similarity graph over all items in the input dataset. .. figure:: ../auto_examples/semi_supervised/images/sphx_glr_plot_label_propagation_structure_001.png :target: ../auto_examples/semi_supervised/plot_label_propagation_structure.html @@ -118,14 +118,14 @@ computing the normalized graph Laplacian matrix. This procedure is also used in :ref:`spectral_clustering`. Label propagation models have two built-in kernel methods. Choice of kernel -effects both scalability and performance of the algorithms. The following are +affects both scalability and performance of the algorithms. The following are available: - * rbf (:math:`\exp(-\gamma |x-y|^2), \gamma > 0`). :math:`\gamma` is - specified by keyword gamma. +* rbf (:math:`\exp(-\gamma |x-y|^2), \gamma > 0`). :math:`\gamma` is + specified by keyword gamma. - * knn (:math:`1[x' \in kNN(x)]`). :math:`k` is specified by keyword - n_neighbors. +* knn (:math:`1[x' \in kNN(x)]`). :math:`k` is specified by keyword + n_neighbors. The RBF kernel will produce a fully connected graph which is represented in memory by a dense matrix. This matrix may be very large and combined with the cost of @@ -134,18 +134,18 @@ algorithm can lead to prohibitively long running times. On the other hand, the KNN kernel will produce a much more memory-friendly sparse matrix which can drastically reduce running times. -.. topic:: Examples +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_semi_supervised_plot_semi_supervised_versus_svm_iris.py` - * :ref:`sphx_glr_auto_examples_semi_supervised_plot_label_propagation_structure.py` - * :ref:`sphx_glr_auto_examples_semi_supervised_plot_label_propagation_digits.py` - * :ref:`sphx_glr_auto_examples_semi_supervised_plot_label_propagation_digits_active_learning.py` +* :ref:`sphx_glr_auto_examples_semi_supervised_plot_semi_supervised_versus_svm_iris.py` +* :ref:`sphx_glr_auto_examples_semi_supervised_plot_label_propagation_structure.py` +* :ref:`sphx_glr_auto_examples_semi_supervised_plot_label_propagation_digits.py` +* :ref:`sphx_glr_auto_examples_semi_supervised_plot_label_propagation_digits_active_learning.py` -.. topic:: References +.. rubric:: References - [2] Yoshua Bengio, Olivier Delalleau, Nicolas Le Roux. In Semi-Supervised - Learning (2006), pp. 193-216 +[2] Yoshua Bengio, Olivier Delalleau, Nicolas Le Roux. In Semi-Supervised +Learning (2006), pp. 193-216 - [3] Olivier Delalleau, Yoshua Bengio, Nicolas Le Roux. Efficient - Non-Parametric Function Induction in Semi-Supervised Learning. AISTAT 2005 - https://research.microsoft.com/en-us/people/nicolasl/efficient_ssl.pdf +[3] Olivier Delalleau, Yoshua Bengio, Nicolas Le Roux. Efficient +Non-Parametric Function Induction in Semi-Supervised Learning. AISTAT 2005 +https://www.gatsby.ucl.ac.uk/aistats/fullpapers/204.pdf diff --git a/doc/modules/sgd.rst b/doc/modules/sgd.rst index 95a5111747509..84812a0cccf12 100644 --- a/doc/modules/sgd.rst +++ b/doc/modules/sgd.rst @@ -18,34 +18,34 @@ recently in the context of large-scale learning. SGD has been successfully applied to large-scale and sparse machine learning problems often encountered in text classification and natural language processing. 
Given that the data is sparse, the classifiers -in this module easily scale to problems with more than 10^5 training -examples and more than 10^5 features. +in this module easily scale to problems with more than :math:`10^5` training +examples and more than :math:`10^5` features. Strictly speaking, SGD is merely an optimization technique and does not correspond to a specific family of machine learning models. It is only a *way* to train a model. Often, an instance of :class:`SGDClassifier` or :class:`SGDRegressor` will have an equivalent estimator in the scikit-learn API, potentially using a different optimization technique. -For example, using `SGDClassifier(loss='log')` results in logistic regression, +For example, using `SGDClassifier(loss='log_loss')` results in logistic regression, i.e. a model equivalent to :class:`~sklearn.linear_model.LogisticRegression` which is fitted via SGD instead of being fitted by one of the other solvers in :class:`~sklearn.linear_model.LogisticRegression`. Similarly, -`SGDRegressor(loss='squared_loss', penalty='l2')` and +`SGDRegressor(loss='squared_error', penalty='l2')` and :class:`~sklearn.linear_model.Ridge` solve the same optimization problem, via different means. The advantages of Stochastic Gradient Descent are: - + Efficiency. ++ Efficiency. - + Ease of implementation (lots of opportunities for code tuning). ++ Ease of implementation (lots of opportunities for code tuning). The disadvantages of Stochastic Gradient Descent include: - + SGD requires a number of hyperparameters such as the regularization - parameter and the number of iterations. ++ SGD requires a number of hyperparameters such as the regularization + parameter and the number of iterations. - + SGD is sensitive to feature scaling. ++ SGD is sensitive to feature scaling. .. warning:: @@ -71,7 +71,7 @@ penalties for classification. Below is the decision boundary of a As other classifiers, SGD has to be fitted with two arrays: an array `X` of shape (n_samples, n_features) holding the training samples, and an -array y of shape (n_samples,) holding the target values (class labels) +array `y` of shape (n_samples,) holding the target values (class labels) for the training samples:: >>> from sklearn.linear_model import SGDClassifier @@ -91,12 +91,12 @@ SGD fits a linear model to the training data. The ``coef_`` attribute holds the model parameters:: >>> clf.coef_ - array([[9.9..., 9.9...]]) + array([[9.9, 9.9]]) The ``intercept_`` attribute holds the intercept (aka offset or bias):: >>> clf.intercept_ - array([-9.9...]) + array([-9.9]) Whether or not the model should use an intercept, i.e. a biased hyperplane, is controlled by the parameter ``fit_intercept``. @@ -106,46 +106,46 @@ the coefficients and the input sample, plus the intercept) is given by :meth:`SGDClassifier.decision_function`:: >>> clf.decision_function([[2., 2.]]) - array([29.6...]) + array([29.6]) The concrete loss function can be set via the ``loss`` parameter. :class:`SGDClassifier` supports the following loss functions: - * ``loss="hinge"``: (soft-margin) linear Support Vector Machine, - * ``loss="modified_huber"``: smoothed hinge loss, - * ``loss="log"``: logistic regression, - * and all regression losses below. In this case the target is encoded as -1 - or 1, and the problem is treated as a regression problem. The predicted - class then correspond to the sign of the predicted target. 
+* ``loss="hinge"``: (soft-margin) linear Support Vector Machine, +* ``loss="modified_huber"``: smoothed hinge loss, +* ``loss="log_loss"``: logistic regression, +* and all regression losses below. In this case the target is encoded as :math:`-1` + or :math:`1`, and the problem is treated as a regression problem. The predicted + class then corresponds to the sign of the predicted target. Please refer to the :ref:`mathematical section below ` for formulas. The first two loss functions are lazy, they only update the model parameters if an example violates the margin constraint, which makes training very efficient and may result in sparser models (i.e. with more zero -coefficents), even when L2 penalty is used. +coefficients), even when :math:`L_2` penalty is used. -Using ``loss="log"`` or ``loss="modified_huber"`` enables the +Using ``loss="log_loss"`` or ``loss="modified_huber"`` enables the ``predict_proba`` method, which gives a vector of probability estimates :math:`P(y|x)` per sample :math:`x`:: - >>> clf = SGDClassifier(loss="log", max_iter=5).fit(X, y) - >>> clf.predict_proba([[1., 1.]]) - array([[0.00..., 0.99...]]) + >>> clf = SGDClassifier(loss="log_loss", max_iter=5).fit(X, y) + >>> clf.predict_proba([[1., 1.]]) # doctest: +SKIP + array([[0.00, 0.99]]) The concrete penalty can be set via the ``penalty`` parameter. SGD supports the following penalties: - * ``penalty="l2"``: L2 norm penalty on ``coef_``. - * ``penalty="l1"``: L1 norm penalty on ``coef_``. - * ``penalty="elasticnet"``: Convex combination of L2 and L1; - ``(1 - l1_ratio) * L2 + l1_ratio * L1``. +* ``penalty="l2"``: :math:`L_2` norm penalty on ``coef_``. +* ``penalty="l1"``: :math:`L_1` norm penalty on ``coef_``. +* ``penalty="elasticnet"``: Convex combination of :math:`L_2` and :math:`L_1`; + ``(1 - l1_ratio) * L2 + l1_ratio * L1``. -The default setting is ``penalty="l2"``. The L1 penalty leads to sparse +The default setting is ``penalty="l2"``. The :math:`L_1` penalty leads to sparse solutions, driving most coefficients to zero. The Elastic Net [#5]_ solves -some deficiencies of the L1 penalty in the presence of highly correlated +some deficiencies of the :math:`L_1` penalty in the presence of highly correlated attributes. The parameter ``l1_ratio`` controls the convex combination -of L1 and L2 penalty. +of :math:`L_1` and :math:`L_2` penalty. :class:`SGDClassifier` supports multi-class classification by combining multiple binary classifiers in a "one versus all" (OVA) scheme. For each @@ -164,11 +164,11 @@ the decision surface induced by the three classifiers. In the case of multi-class classification ``coef_`` is a two-dimensional array of shape (n_classes, n_features) and ``intercept_`` is a -one-dimensional array of shape (n_classes,). The i-th row of ``coef_`` holds -the weight vector of the OVA classifier for the i-th class; classes are +one-dimensional array of shape (n_classes,). The :math:`i`-th row of ``coef_`` holds +the weight vector of the OVA classifier for the :math:`i`-th class; classes are indexed in ascending order (see attribute ``classes_``). Note that, in principle, since they allow to create a probability model, -``loss="log"`` and ``loss="modified_huber"`` are more suitable for +``loss="log_loss"`` and ``loss="modified_huber"`` are more suitable for one-vs-all classification. 
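The per-class layout described above can be inspected directly; here is a minimal sketch on a toy three-class problem (the data is invented for illustration)::

    >>> import numpy as np
    >>> from sklearn.linear_model import SGDClassifier
    >>> X = np.array([[0., 0.], [1., 1.], [2., 2.], [3., 3.], [4., 4.], [5., 5.]])
    >>> y = np.array([0, 0, 1, 1, 2, 2])
    >>> clf = SGDClassifier(loss="log_loss", max_iter=1000).fit(X, y)
    >>> clf.coef_.shape, clf.intercept_.shape
    ((3, 2), (3,))
    >>> clf.predict_proba([[2., 2.]]).shape
    (1, 3)

Each row of ``coef_`` and each entry of ``intercept_`` belongs to one of the three one-vs-all classifiers, and ``predict_proba`` returns one probability per class.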
:class:`SGDClassifier` supports both weighted classes and weighted @@ -189,14 +189,13 @@ For classification with a logistic loss, another variant of SGD with an averaging strategy is available with Stochastic Average Gradient (SAG) algorithm, available as a solver in :class:`LogisticRegression`. -.. topic:: Examples: +.. rubric:: Examples - - :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_separating_hyperplane.py`, - - :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_iris.py` - - :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_weighted_samples.py` - - :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_comparison.py` - - :ref:`sphx_glr_auto_examples_svm_plot_separating_hyperplane_unbalanced.py` - (See the Note in the example) +- :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_separating_hyperplane.py` +- :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_iris.py` +- :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_weighted_samples.py` +- :ref:`sphx_glr_auto_examples_svm_plot_separating_hyperplane_unbalanced.py` + (See the Note in the example) Regression ========== @@ -211,9 +210,9 @@ samples (> 10.000), for other problems we recommend :class:`Ridge`, The concrete loss function can be set via the ``loss`` parameter. :class:`SGDRegressor` supports the following loss functions: - * ``loss="squared_loss"``: Ordinary least squares, - * ``loss="huber"``: Huber loss for robust regression, - * ``loss="epsilon_insensitive"``: linear Support Vector Regression. +* ``loss="squared_error"``: Ordinary least squares, +* ``loss="huber"``: Huber loss for robust regression, +* ``loss="epsilon_insensitive"``: linear Support Vector Regression. Please refer to the :ref:`mathematical section below ` for formulas. @@ -228,10 +227,72 @@ description above in the classification section). :class:`SGDRegressor` also supports averaged SGD [#4]_ (here again, see description above in the classification section). -For regression with a squared loss and a l2 penalty, another variant of +For regression with a squared loss and a :math:`L_2` penalty, another variant of SGD with an averaging strategy is available with Stochastic Average Gradient (SAG) algorithm, available as a solver in :class:`Ridge`. +.. rubric:: Examples + +- :ref:`sphx_glr_auto_examples_applications_plot_prediction_latency.py` + +.. _sgd_online_one_class_svm: + +Online One-Class SVM +==================== + +The class :class:`sklearn.linear_model.SGDOneClassSVM` implements an online +linear version of the One-Class SVM using a stochastic gradient descent. +Combined with kernel approximation techniques, +:class:`sklearn.linear_model.SGDOneClassSVM` can be used to approximate the +solution of a kernelized One-Class SVM, implemented in +:class:`sklearn.svm.OneClassSVM`, with a linear complexity in the number of +samples. Note that the complexity of a kernelized One-Class SVM is at best +quadratic in the number of samples. +:class:`sklearn.linear_model.SGDOneClassSVM` is thus well suited for datasets +with a large number of training samples (over 10,000) for which the SGD +variant can be several orders of magnitude faster. + +.. dropdown:: Mathematical details + + Its implementation is based on the implementation of the stochastic + gradient descent. Indeed, the original optimization problem of the One-Class + SVM is given by + + .. 
math:: + + \begin{aligned} + \min_{w, \rho, \xi} & \quad \frac{1}{2}\Vert w \Vert^2 - \rho + \frac{1}{\nu n} \sum_{i=1}^n \xi_i \\ + \text{s.t.} & \quad \langle w, x_i \rangle \geq \rho - \xi_i \quad 1 \leq i \leq n \\ + & \quad \xi_i \geq 0 \quad 1 \leq i \leq n + \end{aligned} + + where :math:`\nu \in (0, 1]` is the user-specified parameter controlling the + proportion of outliers and the proportion of support vectors. Getting rid of + the slack variables :math:`\xi_i` this problem is equivalent to + + .. math:: + + \min_{w, \rho} \frac{1}{2}\Vert w \Vert^2 - \rho + \frac{1}{\nu n} \sum_{i=1}^n \max(0, \rho - \langle w, x_i \rangle) \, . + + Multiplying by the constant :math:`\nu` and introducing the intercept + :math:`b = 1 - \rho` we obtain the following equivalent optimization problem + + .. math:: + + \min_{w, b} \frac{\nu}{2}\Vert w \Vert^2 + b\nu + \frac{1}{n} \sum_{i=1}^n \max(0, 1 - (\langle w, x_i \rangle + b)) \, . + + This is similar to the optimization problems studied in section + :ref:`sgd_mathematical_formulation` with :math:`y_i = 1, 1 \leq i \leq n` and + :math:`\alpha = \nu/2`, :math:`L` being the hinge loss function and :math:`R` + being the :math:`L_2` norm. We just need to add the term :math:`b\nu` in the + optimization loop. + +As :class:`SGDClassifier` and :class:`SGDRegressor`, :class:`SGDOneClassSVM` +supports averaged SGD. Averaging can be enabled by setting ``average=True``. + +.. rubric:: Examples + +- :ref:`sphx_glr_auto_examples_linear_model_plot_sgdocsvm_vs_ocsvm.py` Stochastic Gradient Descent for sparse data =========================================== @@ -247,16 +308,17 @@ efficiency, however, use the CSR matrix format as defined in `scipy.sparse.csr_matrix `_. -.. topic:: Examples: +.. rubric:: Examples - - :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py` +- :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py` Complexity ========== The major advantage of SGD is its efficiency, which is basically -linear in the number of training examples. If X is a matrix of size (n, p) -training has a cost of :math:`O(k n \bar p)`, where k is the number +linear in the number of training examples. If :math:`X` is a matrix of size +:math:`n \times p` (with :math:`n` samples and :math:`p` features), +training has a cost of :math:`O(k n \bar p)`, where :math:`k` is the number of iterations (epochs) and :math:`\bar p` is the average number of non-zero attributes per sample. @@ -269,69 +331,71 @@ Stopping criterion The classes :class:`SGDClassifier` and :class:`SGDRegressor` provide two criteria to stop the algorithm when a given level of convergence is reached: - * With ``early_stopping=True``, the input data is split into a training set - and a validation set. The model is then fitted on the training set, and the - stopping criterion is based on the prediction score (using the `score` - method) computed on the validation set. The size of the validation set - can be changed with the parameter ``validation_fraction``. - * With ``early_stopping=False``, the model is fitted on the entire input data - and the stopping criterion is based on the objective function computed on - the training data. +* With ``early_stopping=True``, the input data is split into a training set + and a validation set. The model is then fitted on the training set, and the + stopping criterion is based on the prediction score (using the `score` + method) computed on the validation set. 
The size of the validation set + can be changed with the parameter ``validation_fraction``. +* With ``early_stopping=False``, the model is fitted on the entire input data + and the stopping criterion is based on the objective function computed on + the training data. In both cases, the criterion is evaluated once by epoch, and the algorithm stops when the criterion does not improve ``n_iter_no_change`` times in a row. The improvement is evaluated with absolute tolerance ``tol``, and the algorithm -stops in any case after a maximum number of iteration ``max_iter``. +stops in any case after a maximum number of iterations ``max_iter``. +See :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_early_stopping.py` for an +example of the effects of early stopping. Tips on Practical Use ===================== - * Stochastic Gradient Descent is sensitive to feature scaling, so it - is highly recommended to scale your data. For example, scale each - attribute on the input vector X to [0,1] or [-1,+1], or standardize - it to have mean 0 and variance 1. Note that the *same* scaling - must be applied to the test vector to obtain meaningful - results. This can be easily done using :class:`StandardScaler`:: +* Stochastic Gradient Descent is sensitive to feature scaling, so it + is highly recommended to scale your data. For example, scale each + attribute on the input vector :math:`X` to :math:`[0,1]` or :math:`[-1,1]`, or standardize + it to have mean :math:`0` and variance :math:`1`. Note that the *same* scaling must be + applied to the test vector to obtain meaningful results. This can be easily + done using :class:`~sklearn.preprocessing.StandardScaler`:: - from sklearn.preprocessing import StandardScaler - scaler = StandardScaler() - scaler.fit(X_train) # Don't cheat - fit only on training data - X_train = scaler.transform(X_train) - X_test = scaler.transform(X_test) # apply same transformation to test data + from sklearn.preprocessing import StandardScaler + scaler = StandardScaler() + scaler.fit(X_train) # Don't cheat - fit only on training data + X_train = scaler.transform(X_train) + X_test = scaler.transform(X_test) # apply same transformation to test data - # Or better yet: use a pipeline! - from sklearn.pipeline import make_pipeline - est = make_pipeline(StandardScaler(), SGDClassifier()) - est.fit(X_train) - est.predict(X_test) + # Or better yet: use a pipeline! + from sklearn.pipeline import make_pipeline + est = make_pipeline(StandardScaler(), SGDClassifier()) + est.fit(X_train) + est.predict(X_test) - If your attributes have an intrinsic scale (e.g. word frequencies or - indicator features) scaling is not needed. + If your attributes have an intrinsic scale (e.g. word frequencies or + indicator features) scaling is not needed. - * Finding a reasonable regularization term :math:`\alpha` is - best done using automatic hyper-parameter search, e.g. - :class:`~sklearn.model_selection.GridSearchCV` or - :class:`~sklearn.model_selection.RandomizedSearchCV`, usually in the - range ``10.0**-np.arange(1,7)``. +* Finding a reasonable regularization term :math:`\alpha` is + best done using automatic hyper-parameter search, e.g. + :class:`~sklearn.model_selection.GridSearchCV` or + :class:`~sklearn.model_selection.RandomizedSearchCV`, usually in the + range ``10.0**-np.arange(1,7)``. - * Empirically, we found that SGD converges after observing - approximately 10^6 training samples. 
Thus, a reasonable first guess - for the number of iterations is ``max_iter = np.ceil(10**6 / n)``, - where ``n`` is the size of the training set. +* Empirically, we found that SGD converges after observing + approximately :math:`10^6` training samples. Thus, a reasonable first guess + for the number of iterations is ``max_iter = np.ceil(10**6 / n)``, + where ``n`` is the size of the training set. - * If you apply SGD to features extracted using PCA we found that - it is often wise to scale the feature values by some constant `c` - such that the average L2 norm of the training data equals one. +* If you apply SGD to features extracted using PCA we found that + it is often wise to scale the feature values by some constant `c` + such that the average :math:`L_2` norm of the training data equals one. - * We found that Averaged SGD works best with a larger number of features - and a higher eta0 +* We found that Averaged SGD works best with a larger number of features + and a higher `eta0`. -.. topic:: References: +.. rubric:: References - * `"Efficient BackProp" `_ - Y. LeCun, L. Bottou, G. Orr, K. MÃŧller - In Neural Networks: Tricks - of the Trade 1998. +* `"Efficient BackProp" `_ + Y. LeCun, L. Bottou, G. Orr, K. MÃŧller - In Neural Networks: Tricks + of the Trade 1998. .. _sgd_mathematical_formulation: @@ -342,8 +406,9 @@ We describe here the mathematical details of the SGD procedure. A good overview with convergence rates can be found in [#6]_. Given a set of training examples :math:`(x_1, y_1), \ldots, (x_n, y_n)` where -:math:`x_i \in \mathbf{R}^m` and :math:`y_i \in \mathcal{R}` (:math:`y_i \in -{-1, 1}` for classification), our goal is to learn a linear scoring function +:math:`x_i \in \mathbf{R}^m` and :math:`y_i \in \mathbf{R}` +(:math:`y_i \in \{-1, 1\}` for classification), +our goal is to learn a linear scoring function :math:`f(x) = w^T x + b` with model parameters :math:`w \in \mathbf{R}^m` and intercept :math:`b \in \mathbf{R}`. In order to make predictions for binary classification, we simply look at the sign of :math:`f(x)`. To find the model @@ -356,28 +421,30 @@ parameters, we minimize the regularized training error given by where :math:`L` is a loss function that measures model (mis)fit and :math:`R` is a regularization term (aka penalty) that penalizes model complexity; :math:`\alpha > 0` is a non-negative hyperparameter that controls -the regularization stength. - -Different choices for :math:`L` entail different classifiers or regressors: - -- Hinge (soft-margin): equivalent to Support Vector Classification. - :math:`L(y_i, f(x_i)) = \max(0, 1 - y_i f(x_i))`. -- Perceptron: - :math:`L(y_i, f(x_i)) = \max(0, - y_i f(x_i))`. -- Modified Huber: - :math:`L(y_i, f(x_i)) = \max(0, 1 - y_i f(x_i))^2` if :math:`y_i f(x_i) > - 1`, and :math:`L(y_i, f(x_i)) = -4 y_i f(x_i)` otherwise. -- Log: equivalent to Logistic Regression. - :math:`L(y_i, f(x_i)) = \log(1 + \exp (-y_i f(x_i)))`. -- Least-Squares: Linear regression (Ridge or Lasso depending on - :math:`R`). - :math:`L(y_i, f(x_i)) = \frac{1}{2}(y_i - f(x_i))^2`. -- Huber: less sensitive to outliers than least-squares. It is equivalent to - least squares when :math:`|y_i - f(x_i)| \leq \varepsilon`, and - :math:`L(y_i, f(x_i)) = \varepsilon |y_i - f(x_i)| - \frac{1}{2} - \varepsilon^2` otherwise. -- Epsilon-Insensitive: (soft-margin) equivalent to Support Vector Regression. - :math:`L(y_i, f(x_i)) = \max(0, |y_i - f(x_i)| - \varepsilon)`. +the regularization strength. + +.. 
dropdown:: Loss functions details + + Different choices for :math:`L` entail different classifiers or regressors: + + - Hinge (soft-margin): equivalent to Support Vector Classification. + :math:`L(y_i, f(x_i)) = \max(0, 1 - y_i f(x_i))`. + - Perceptron: + :math:`L(y_i, f(x_i)) = \max(0, - y_i f(x_i))`. + - Modified Huber: + :math:`L(y_i, f(x_i)) = \max(0, 1 - y_i f(x_i))^2` if :math:`y_i f(x_i) > + -1`, and :math:`L(y_i, f(x_i)) = -4 y_i f(x_i)` otherwise. + - Log Loss: equivalent to Logistic Regression. + :math:`L(y_i, f(x_i)) = \log(1 + \exp (-y_i f(x_i)))`. + - Squared Error: Linear regression (Ridge or Lasso depending on + :math:`R`). + :math:`L(y_i, f(x_i)) = \frac{1}{2}(y_i - f(x_i))^2`. + - Huber: less sensitive to outliers than least-squares. It is equivalent to + least squares when :math:`|y_i - f(x_i)| \leq \varepsilon`, and + :math:`L(y_i, f(x_i)) = \varepsilon |y_i - f(x_i)| - \frac{1}{2} + \varepsilon^2` otherwise. + - Epsilon-Insensitive: (soft-margin) equivalent to Support Vector Regression. + :math:`L(y_i, f(x_i)) = \max(0, |y_i - f(x_i)| - \varepsilon)`. All of the above loss functions can be regarded as an upper bound on the misclassification error (Zero-one loss) as shown in the Figure below. @@ -390,12 +457,12 @@ misclassification error (Zero-one loss) as shown in the Figure below. Popular choices for the regularization term :math:`R` (the `penalty` parameter) include: - - L2 norm: :math:`R(w) := \frac{1}{2} \sum_{j=1}^{m} w_j^2 = ||w||_2^2`, - - L1 norm: :math:`R(w) := \sum_{j=1}^{m} |w_j|`, which leads to sparse - solutions. - - Elastic Net: :math:`R(w) := \frac{\rho}{2} \sum_{j=1}^{n} w_j^2 + - (1-\rho) \sum_{j=1}^{m} |w_j|`, a convex combination of L2 and L1, where - :math:`\rho` is given by ``1 - l1_ratio``. +- :math:`L_2` norm: :math:`R(w) := \frac{1}{2} \sum_{j=1}^{m} w_j^2 = ||w||_2^2`, +- :math:`L_1` norm: :math:`R(w) := \sum_{j=1}^{m} |w_j|`, which leads to sparse + solutions. +- Elastic Net: :math:`R(w) := \frac{\rho}{2} \sum_{j=1}^{n} w_j^2 + + (1-\rho) \sum_{j=1}^{m} |w_j|`, a convex combination of :math:`L_2` and :math:`L_1`, where + :math:`\rho` is given by ``1 - l1_ratio``. The Figure below shows the contours of the different regularization terms in a 2-dimensional parameter space (:math:`m=2`) when :math:`R(w) = 1`. @@ -438,8 +505,8 @@ is given by where :math:`t` is the time step (there are a total of `n_samples * n_iter` time steps), :math:`t_0` is determined based on a heuristic proposed by LÊon Bottou such that the expected initial updates are comparable with the expected -size of the weights (this assuming that the norm of the training samples is -approx. 1). The exact definition can be found in ``_init_t`` in :class:`BaseSGD`. +size of the weights (this assumes that the norm of the training samples is +approximately 1). The exact definition can be found in ``_init_t`` in `BaseSGD`. For regression the default learning rate schedule is inverse scaling @@ -450,7 +517,7 @@ For regression the default learning rate schedule is inverse scaling \eta^{(t)} = \frac{eta_0}{t^{power\_t}} where :math:`eta_0` and :math:`power\_t` are hyperparameters chosen by the -user via ``eta0`` and ``power_t``, resp. +user via ``eta0`` and ``power_t``, respectively. For a constant learning rate use ``learning_rate='constant'`` and use ``eta0`` to specify the learning rate. @@ -458,7 +525,7 @@ to specify the learning rate. For an adaptively decreasing learning rate, use ``learning_rate='adaptive'`` and use ``eta0`` to specify the starting learning rate. 
When the stopping criterion is reached, the learning rate is divided by 5, and the algorithm -does not stop. The algorithm stops when the learning rate goes below 1e-6. +does not stop. The algorithm stops when the learning rate goes below `1e-6`. The model parameters can be accessed through the ``coef_`` and ``intercept_`` attributes: ``coef_`` holds the weights :math:`w` and @@ -478,7 +545,7 @@ The implementation of SGD is influenced by the `Stochastic Gradient SVM` of [#1]_. Similar to SvmSGD, the weight vector is represented as the product of a scalar and a vector -which allows an efficient weight update in the case of L2 regularization. +which allows an efficient weight update in the case of :math:`L_2` regularization. In the case of sparse input `X`, the intercept is updated with a smaller learning rate (multiplied by 0.01) to account for the fact that it is updated more frequently. Training examples are picked up sequentially @@ -486,35 +553,32 @@ and the learning rate is lowered after each observed example. We adopted the learning rate schedule from [#2]_. For multi-class classification, a "one versus all" approach is used. We use the truncated gradient algorithm proposed in [#3]_ -for L1 regularization (and the Elastic Net). +for :math:`L_1` regularization (and the Elastic Net). The code is written in Cython. -.. topic:: References: +.. rubric:: References - .. [#1] `"Stochastic Gradient Descent" - `_ L. Bottou - Website, 2010. +.. [#1] `"Stochastic Gradient Descent" + `_ L. Bottou - Website, 2010. - .. [#2] `"Pegasos: Primal estimated sub-gradient solver for svm" - `_ - S. Shalev-Shwartz, Y. Singer, N. Srebro - In Proceedings of ICML '07. +.. [#2] :doi:`"Pegasos: Primal estimated sub-gradient solver for svm" + <10.1145/1273496.1273598>` + S. Shalev-Shwartz, Y. Singer, N. Srebro - In Proceedings of ICML '07. - .. [#3] `"Stochastic gradient descent training for l1-regularized - log-linear models with cumulative penalty" - `_ - Y. Tsuruoka, J. Tsujii, S. Ananiadou - In Proceedings of the AFNLP/ACL - '09. +.. [#3] `"Stochastic gradient descent training for l1-regularized + log-linear models with cumulative penalty" + `_ + Y. Tsuruoka, J. Tsujii, S. Ananiadou - In Proceedings of the AFNLP/ACL'09. - .. [#4] `"Towards Optimal One Pass Large Scale Learning with - Averaged Stochastic Gradient Descent" - `_ - Xu, Wei +.. [#4] :arxiv:`"Towards Optimal One Pass Large Scale Learning with + Averaged Stochastic Gradient Descent" + <1107.2490v2>`. Xu, Wei (2011) - .. [#5] `"Regularization and variable selection via the elastic net" - `_ - H. Zou, T. Hastie - Journal of the Royal Statistical Society Series B, - 67 (2), 301-320. +.. [#5] :doi:`"Regularization and variable selection via the elastic net" + <10.1111/j.1467-9868.2005.00503.x>` + H. Zou, T. Hastie - Journal of the Royal Statistical Society Series B, + 67 (2), 301-320. - .. [#6] `"Solving large scale linear prediction problems using stochastic - gradient descent algorithms" - `_ - T. Zhang - In Proceedings of ICML '04. +.. [#6] :doi:`"Solving large scale linear prediction problems using stochastic + gradient descent algorithms" <10.1145/1015330.1015332>` + T. Zhang - In Proceedings of ICML '04. diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index 57d2cfb3cb7a7..ac9fbdb12e58d 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -16,27 +16,27 @@ methods used for :ref:`classification `, The advantages of support vector machines are: - - Effective in high dimensional spaces. +- Effective in high dimensional spaces. 
- - Still effective in cases where number of dimensions is greater - than the number of samples. +- Still effective in cases where number of dimensions is greater + than the number of samples. - - Uses a subset of training points in the decision function (called - support vectors), so it is also memory efficient. +- Uses a subset of training points in the decision function (called + support vectors), so it is also memory efficient. - - Versatile: different :ref:`svm_kernels` can be - specified for the decision function. Common kernels are - provided, but it is also possible to specify custom kernels. +- Versatile: different :ref:`svm_kernels` can be + specified for the decision function. Common kernels are + provided, but it is also possible to specify custom kernels. The disadvantages of support vector machines include: - - If the number of features is much greater than the number of - samples, avoid over-fitting in choosing :ref:`svm_kernels` and regularization - term is crucial. +- If the number of features is much greater than the number of + samples, avoid over-fitting in choosing :ref:`svm_kernels` and regularization + term is crucial. - - SVMs do not directly provide probability estimates, these are - calculated using an expensive five-fold cross-validation - (see :ref:`Scores and probabilities `, below). +- SVMs do not directly provide probability estimates, these are + calculated using an expensive five-fold cross-validation + (see :ref:`Scores and probabilities `, below). The support vector machines in scikit-learn support both dense (``numpy.ndarray`` and convertible to that by ``numpy.asarray``) and @@ -60,14 +60,19 @@ capable of performing binary and multi-class classification on a dataset. :align: center -:class:`SVC` and :class:`NuSVC` are similar methods, but accept -slightly different sets of parameters and have different mathematical -formulations (see section :ref:`svm_mathematical_formulation`). On the -other hand, :class:`LinearSVC` is another (faster) implementation of Support -Vector Classification for the case of a linear kernel. Note that -:class:`LinearSVC` does not accept parameter ``kernel``, as this is -assumed to be linear. It also lacks some of the attributes of -:class:`SVC` and :class:`NuSVC`, like ``support_``. +:class:`SVC` and :class:`NuSVC` are similar methods, but accept slightly +different sets of parameters and have different mathematical formulations (see +section :ref:`svm_mathematical_formulation`). On the other hand, +:class:`LinearSVC` is another (faster) implementation of Support Vector +Classification for the case of a linear kernel. It also +lacks some of the attributes of :class:`SVC` and :class:`NuSVC`, like +`support_`. :class:`LinearSVC` uses `squared_hinge` loss and due to its +implementation in `liblinear` it also regularizes the intercept, if considered. +This effect can however be reduced by carefully fine tuning its +`intercept_scaling` parameter, which allows the intercept term to have a +different regularization behavior compared to the other features. The +classification results and score can therefore differ from the other two +classifiers. As other classifiers, :class:`SVC`, :class:`NuSVC` and :class:`LinearSVC` take as input two arrays: an array `X` of shape @@ -103,11 +108,11 @@ properties of these support vectors can be found in attributes >>> clf.n_support_ array([1, 1]...) -.. topic:: Examples: +.. 
rubric:: Examples - * :ref:`sphx_glr_auto_examples_svm_plot_separating_hyperplane.py`, - * :ref:`sphx_glr_auto_examples_svm_plot_svm_nonlinear.py` - * :ref:`sphx_glr_auto_examples_svm_plot_svm_anova.py`, +* :ref:`sphx_glr_auto_examples_svm_plot_separating_hyperplane.py` +* :ref:`sphx_glr_auto_examples_svm_plot_svm_anova.py` +* :ref:`sphx_glr_auto_examples_classification_plot_classification_probability.py` .. _svm_multi_class: @@ -121,7 +126,8 @@ classifiers are constructed and each one trains data from two classes. To provide a consistent interface with other classifiers, the ``decision_function_shape`` option allows to monotonically transform the results of the "one-versus-one" classifiers to a "one-vs-rest" decision -function of shape ``(n_samples, n_classes)``. +function of shape ``(n_samples, n_classes)``, which is the default setting +of the parameter (``decision_function_shape='ovr'``). >>> X = [[0], [1], [2], [3]] >>> Y = [0, 1, 2, 3] @@ -129,7 +135,7 @@ function of shape ``(n_samples, n_classes)``. >>> clf.fit(X, Y) SVC(decision_function_shape='ovo') >>> dec = clf.decision_function([[1]]) - >>> dec.shape[1] # 4 classes: 4*3/2 = 6 + >>> dec.shape[1] # 4 classes: 4*3/2 = 6 pairwise classifiers 6 >>> clf.decision_function_shape = "ovr" >>> dec = clf.decision_function([[1]]) @@ -149,64 +155,61 @@ multi-class strategy, thus training `n_classes` models. See :ref:`svm_mathematical_formulation` for a complete description of the decision function. -Note that the :class:`LinearSVC` also implements an alternative multi-class -strategy, the so-called multi-class SVM formulated by Crammer and Singer -[#8]_, by using the option ``multi_class='crammer_singer'``. In practice, -one-vs-rest classification is usually preferred, since the results are mostly -similar, but the runtime is significantly less. - -For "one-vs-rest" :class:`LinearSVC` the attributes ``coef_`` and ``intercept_`` -have the shape ``(n_classes, n_features)`` and ``(n_classes,)`` respectively. -Each row of the coefficients corresponds to one of the ``n_classes`` -"one-vs-rest" classifiers and similar for the intercepts, in the -order of the "one" class. - -In the case of "one-vs-one" :class:`SVC` and :class:`NuSVC`, the layout of -the attributes is a little more involved. In the case of a linear -kernel, the attributes ``coef_`` and ``intercept_`` have the shape -``(n_classes * (n_classes - 1) / 2, n_features)`` and ``(n_classes * -(n_classes - 1) / 2)`` respectively. This is similar to the layout for -:class:`LinearSVC` described above, with each row now corresponding -to a binary classifier. The order for classes -0 to n is "0 vs 1", "0 vs 2" , ... "0 vs n", "1 vs 2", "1 vs 3", "1 vs n", . . -. "n-1 vs n". - -The shape of ``dual_coef_`` is ``(n_classes-1, n_SV)`` with -a somewhat hard to grasp layout. -The columns correspond to the support vectors involved in any -of the ``n_classes * (n_classes - 1) / 2`` "one-vs-one" classifiers. -Each of the support vectors is used in ``n_classes - 1`` classifiers. -The ``n_classes - 1`` entries in each row correspond to the dual coefficients -for these classifiers. - -This might be clearer with an example: consider a three class problem with -class 0 having three support vectors -:math:`v^{0}_0, v^{1}_0, v^{2}_0` and class 1 and 2 having two support vectors -:math:`v^{0}_1, v^{1}_1` and :math:`v^{0}_2, v^{1}_2` respectively. For each -support vector :math:`v^{j}_i`, there are two dual coefficients.
Let's call -the coefficient of support vector :math:`v^{j}_i` in the classifier between -classes :math:`i` and :math:`k` :math:`\alpha^{j}_{i,k}`. -Then ``dual_coef_`` looks like this: - -+------------------------+------------------------+------------------+ -|:math:`\alpha^{0}_{0,1}`|:math:`\alpha^{0}_{0,2}`|Coefficients | -+------------------------+------------------------+for SVs of class 0| -|:math:`\alpha^{1}_{0,1}`|:math:`\alpha^{1}_{0,2}`| | -+------------------------+------------------------+ | -|:math:`\alpha^{2}_{0,1}`|:math:`\alpha^{2}_{0,2}`| | -+------------------------+------------------------+------------------+ -|:math:`\alpha^{0}_{1,0}`|:math:`\alpha^{0}_{1,2}`|Coefficients | -+------------------------+------------------------+for SVs of class 1| -|:math:`\alpha^{1}_{1,0}`|:math:`\alpha^{1}_{1,2}`| | -+------------------------+------------------------+------------------+ -|:math:`\alpha^{0}_{2,0}`|:math:`\alpha^{0}_{2,1}`|Coefficients | -+------------------------+------------------------+for SVs of class 2| -|:math:`\alpha^{1}_{2,0}`|:math:`\alpha^{1}_{2,1}`| | -+------------------------+------------------------+------------------+ - -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_svm_plot_iris_svc.py`, +.. dropdown:: Details on multi-class strategies + + Note that the :class:`LinearSVC` also implements an alternative multi-class + strategy, the so-called multi-class SVM formulated by Crammer and Singer + [#8]_, by using the option ``multi_class='crammer_singer'``. In practice, + one-vs-rest classification is usually preferred, since the results are mostly + similar, but the runtime is significantly less. + + For "one-vs-rest" :class:`LinearSVC` the attributes ``coef_`` and ``intercept_`` + have the shape ``(n_classes, n_features)`` and ``(n_classes,)`` respectively. + Each row of the coefficients corresponds to one of the ``n_classes`` + "one-vs-rest" classifiers and similar for the intercepts, in the + order of the "one" class. + + In the case of "one-vs-one" :class:`SVC` and :class:`NuSVC`, the layout of + the attributes is a little more involved. In the case of a linear + kernel, the attributes ``coef_`` and ``intercept_`` have the shape + ``(n_classes * (n_classes - 1) / 2, n_features)`` and ``(n_classes * + (n_classes - 1) / 2)`` respectively. This is similar to the layout for + :class:`LinearSVC` described above, with each row now corresponding + to a binary classifier. The order for classes + 0 to n is "0 vs 1", "0 vs 2" , ... "0 vs n", "1 vs 2", "1 vs 3", "1 vs n", . . + . "n-1 vs n". + + The shape of ``dual_coef_`` is ``(n_classes-1, n_SV)`` with + a somewhat hard to grasp layout. + The columns correspond to the support vectors involved in any + of the ``n_classes * (n_classes - 1) / 2`` "one-vs-one" classifiers. + Each support vector ``v`` has a dual coefficient in each of the + ``n_classes - 1`` classifiers comparing the class of ``v`` against another class. + Note that some, but not all, of these dual coefficients, may be zero. + The ``n_classes - 1`` entries in each column are these dual coefficients, + ordered by the opposing class. + + This might be clearer with an example: consider a three class problem with + class 0 having three support vectors + :math:`v^{0}_0, v^{1}_0, v^{2}_0` and class 1 and 2 having two support vectors + :math:`v^{0}_1, v^{1}_1` and :math:`v^{0}_2, v^{1}_2` respectively. For each + support vector :math:`v^{j}_i`, there are two dual coefficients. 
Let's call + the coefficient of support vector :math:`v^{j}_i` in the classifier between + classes :math:`i` and :math:`k` :math:`\alpha^{j}_{i,k}`. + Then ``dual_coef_`` looks like this: + + +------------------------+------------------------+------------------------+------------------------+------------------------+------------------------+------------------------+ + |:math:`\alpha^{0}_{0,1}`|:math:`\alpha^{1}_{0,1}`|:math:`\alpha^{2}_{0,1}`|:math:`\alpha^{0}_{1,0}`|:math:`\alpha^{1}_{1,0}`|:math:`\alpha^{0}_{2,0}`|:math:`\alpha^{1}_{2,0}`| + +------------------------+------------------------+------------------------+------------------------+------------------------+------------------------+------------------------+ + |:math:`\alpha^{0}_{0,2}`|:math:`\alpha^{1}_{0,2}`|:math:`\alpha^{2}_{0,2}`|:math:`\alpha^{0}_{1,2}`|:math:`\alpha^{1}_{1,2}`|:math:`\alpha^{0}_{2,1}`|:math:`\alpha^{1}_{2,1}`| + +------------------------+------------------------+------------------------+------------------------+------------------------+------------------------+------------------------+ + |Coefficients |Coefficients |Coefficients | + |for SVs of class 0 |for SVs of class 1 |for SVs of class 2 | + +--------------------------------------------------------------------------+-------------------------------------------------+-------------------------------------------------+ + +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_svm_plot_iris_svc.py` .. _scores_probabilities: @@ -227,7 +230,7 @@ In the multiclass case, this is extended as per [#2]_. The same probability calibration procedure is available for all estimators via the :class:`~sklearn.calibration.CalibratedClassifierCV` (see :ref:`calibration`). In the case of :class:`SVC` and :class:`NuSVC`, this - procedure is builtin in `libsvm`_ which is used under the hood, so it does + procedure is builtin to `libsvm`_ which is used under the hood, so it does not rely on scikit-learn's :class:`~sklearn.calibration.CalibratedClassifierCV`. @@ -289,10 +292,10 @@ to the sample weights: :align: center :scale: 75 -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_svm_plot_separating_hyperplane_unbalanced.py` - * :ref:`sphx_glr_auto_examples_svm_plot_weighted_samples.py`, +* :ref:`sphx_glr_auto_examples_svm_plot_separating_hyperplane_unbalanced.py` +* :ref:`sphx_glr_auto_examples_svm_plot_weighted_samples.py` .. _svm_regression: @@ -313,10 +316,15 @@ target. There are three different implementations of Support Vector Regression: :class:`SVR`, :class:`NuSVR` and :class:`LinearSVR`. :class:`LinearSVR` -provides a faster implementation than :class:`SVR` but only considers -the linear kernel, while :class:`NuSVR` implements a slightly different -formulation than :class:`SVR` and :class:`LinearSVR`. See -:ref:`svm_implementation_details` for further details. +provides a faster implementation than :class:`SVR` but only considers the +linear kernel, while :class:`NuSVR` implements a slightly different formulation +than :class:`SVR` and :class:`LinearSVR`. Due to its implementation in +`liblinear` :class:`LinearSVR` also regularizes the intercept, if considered. +This effect can however be reduced by carefully fine tuning its +`intercept_scaling` parameter, which allows the intercept term to have a +different regularization behavior compared to the other features. The +classification results and score can therefore differ from the other two +classifiers. See :ref:`svm_implementation_details` for further details. 
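If the intercept should effectively be regularized less, one option mentioned above is to increase `intercept_scaling`; a rough sketch (the value ``10.0`` is an arbitrary illustration, not a recommendation) combines this with the usual feature scaling::

    >>> from sklearn.pipeline import make_pipeline
    >>> from sklearn.preprocessing import StandardScaler
    >>> from sklearn.svm import LinearSVR
    >>> reg = make_pipeline(StandardScaler(), LinearSVR(intercept_scaling=10.0))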
As with classification classes, the fit method will take as argument vectors X, y, only that in this case y is expected to have @@ -332,9 +340,9 @@ floating point values instead of integer values:: array([1.5]) -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_svm_plot_svm_regression.py` +* :ref:`sphx_glr_auto_examples_svm_plot_svm_regression.py` .. _svm_outlier_detection: @@ -370,95 +378,95 @@ Tips on Practical Use ===================== - * **Avoiding data copy**: For :class:`SVC`, :class:`SVR`, :class:`NuSVC` and - :class:`NuSVR`, if the data passed to certain methods is not C-ordered - contiguous and double precision, it will be copied before calling the - underlying C implementation. You can check whether a given numpy array is - C-contiguous by inspecting its ``flags`` attribute. - - For :class:`LinearSVC` (and :class:`LogisticRegression - `) any input passed as a numpy - array will be copied and converted to the `liblinear`_ internal sparse data - representation (double precision floats and int32 indices of non-zero - components). If you want to fit a large-scale linear classifier without - copying a dense numpy C-contiguous double precision array as input, we - suggest to use the :class:`SGDClassifier - ` class instead. The objective - function can be configured to be almost the same as the :class:`LinearSVC` - model. - - * **Kernel cache size**: For :class:`SVC`, :class:`SVR`, :class:`NuSVC` and - :class:`NuSVR`, the size of the kernel cache has a strong impact on run - times for larger problems. If you have enough RAM available, it is - recommended to set ``cache_size`` to a higher value than the default of - 200(MB), such as 500(MB) or 1000(MB). - - - * **Setting C**: ``C`` is ``1`` by default and it's a reasonable default - choice. If you have a lot of noisy observations you should decrease it: - decreasing C corresponds to more regularization. - - :class:`LinearSVC` and :class:`LinearSVR` are less sensitive to ``C`` when - it becomes large, and prediction results stop improving after a certain - threshold. Meanwhile, larger ``C`` values will take more time to train, - sometimes up to 10 times longer, as shown in [#3]_. - - * Support Vector Machine algorithms are not scale invariant, so **it - is highly recommended to scale your data**. For example, scale each - attribute on the input vector X to [0,1] or [-1,+1], or standardize it - to have mean 0 and variance 1. Note that the *same* scaling must be - applied to the test vector to obtain meaningful results. This can be done - easily by using a :class:`~sklearn.pipeline.Pipeline`:: - - >>> from sklearn.pipeline import make_pipeline - >>> from sklearn.preprocessing import StandardScaler - >>> from sklearn.svm import SVC - - >>> clf = make_pipeline(StandardScaler(), SVC()) - - See section :ref:`preprocessing` for more details on scaling and - normalization. - - .. _shrinking_svm: - - * Regarding the `shrinking` parameter, quoting [#4]_: *We found that if the - number of iterations is large, then shrinking can shorten the training - time. However, if we loosely solve the optimization problem (e.g., by - using a large stopping tolerance), the code without using shrinking may - be much faster* - - * Parameter ``nu`` in :class:`NuSVC`/:class:`OneClassSVM`/:class:`NuSVR` - approximates the fraction of training errors and support vectors. - - * In :class:`SVC`, if the data is unbalanced (e.g. many - positive and few negative), set ``class_weight='balanced'`` and/or try - different penalty parameters ``C``. 
- - * **Randomness of the underlying implementations**: The underlying - implementations of :class:`SVC` and :class:`NuSVC` use a random number - generator only to shuffle the data for probability estimation (when - ``probability`` is set to ``True``). This randomness can be controlled - with the ``random_state`` parameter. If ``probability`` is set to ``False`` - these estimators are not random and ``random_state`` has no effect on the - results. The underlying :class:`OneClassSVM` implementation is similar to - the ones of :class:`SVC` and :class:`NuSVC`. As no probability estimation - is provided for :class:`OneClassSVM`, it is not random. - - The underlying :class:`LinearSVC` implementation uses a random number - generator to select features when fitting the model with a dual coordinate - descent (i.e when ``dual`` is set to ``True``). It is thus not uncommon - to have slightly different results for the same input data. If that - happens, try with a smaller `tol` parameter. This randomness can also be - controlled with the ``random_state`` parameter. When ``dual`` is - set to ``False`` the underlying implementation of :class:`LinearSVC` is - not random and ``random_state`` has no effect on the results. - - * Using L1 penalization as provided by ``LinearSVC(penalty='l1', - dual=False)`` yields a sparse solution, i.e. only a subset of feature - weights is different from zero and contribute to the decision function. - Increasing ``C`` yields a more complex model (more features are selected). - The ``C`` value that yields a "null" model (all weights equal to zero) can - be calculated using :func:`l1_min_c`. +* **Avoiding data copy**: For :class:`SVC`, :class:`SVR`, :class:`NuSVC` and + :class:`NuSVR`, if the data passed to certain methods is not C-ordered + contiguous and double precision, it will be copied before calling the + underlying C implementation. You can check whether a given numpy array is + C-contiguous by inspecting its ``flags`` attribute. + + For :class:`LinearSVC` (and :class:`LogisticRegression + `) any input passed as a numpy + array will be copied and converted to the `liblinear`_ internal sparse data + representation (double precision floats and int32 indices of non-zero + components). If you want to fit a large-scale linear classifier without + copying a dense numpy C-contiguous double precision array as input, we + suggest to use the :class:`SGDClassifier + ` class instead. The objective + function can be configured to be almost the same as the :class:`LinearSVC` + model. + +* **Kernel cache size**: For :class:`SVC`, :class:`SVR`, :class:`NuSVC` and + :class:`NuSVR`, the size of the kernel cache has a strong impact on run + times for larger problems. If you have enough RAM available, it is + recommended to set ``cache_size`` to a higher value than the default of + 200(MB), such as 500(MB) or 1000(MB). + + +* **Setting C**: ``C`` is ``1`` by default and it's a reasonable default + choice. If you have a lot of noisy observations you should decrease it: + decreasing C corresponds to more regularization. + + :class:`LinearSVC` and :class:`LinearSVR` are less sensitive to ``C`` when + it becomes large, and prediction results stop improving after a certain + threshold. Meanwhile, larger ``C`` values will take more time to train, + sometimes up to 10 times longer, as shown in [#3]_. + +* Support Vector Machine algorithms are not scale invariant, so **it + is highly recommended to scale your data**. 
For example, scale each + attribute on the input vector X to [0,1] or [-1,+1], or standardize it + to have mean 0 and variance 1. Note that the *same* scaling must be + applied to the test vector to obtain meaningful results. This can be done + easily by using a :class:`~sklearn.pipeline.Pipeline`:: + + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.svm import SVC + + >>> clf = make_pipeline(StandardScaler(), SVC()) + + See section :ref:`preprocessing` for more details on scaling and + normalization. + +.. _shrinking_svm: + +* Regarding the `shrinking` parameter, quoting [#4]_: *We found that if the + number of iterations is large, then shrinking can shorten the training + time. However, if we loosely solve the optimization problem (e.g., by + using a large stopping tolerance), the code without using shrinking may + be much faster* + +* Parameter ``nu`` in :class:`NuSVC`/:class:`OneClassSVM`/:class:`NuSVR` + approximates the fraction of training errors and support vectors. + +* In :class:`SVC`, if the data is unbalanced (e.g. many + positive and few negative), set ``class_weight='balanced'`` and/or try + different penalty parameters ``C``. + +* **Randomness of the underlying implementations**: The underlying + implementations of :class:`SVC` and :class:`NuSVC` use a random number + generator only to shuffle the data for probability estimation (when + ``probability`` is set to ``True``). This randomness can be controlled + with the ``random_state`` parameter. If ``probability`` is set to ``False`` + these estimators are not random and ``random_state`` has no effect on the + results. The underlying :class:`OneClassSVM` implementation is similar to + the ones of :class:`SVC` and :class:`NuSVC`. As no probability estimation + is provided for :class:`OneClassSVM`, it is not random. + + The underlying :class:`LinearSVC` implementation uses a random number + generator to select features when fitting the model with a dual coordinate + descent (i.e. when ``dual`` is set to ``True``). It is thus not uncommon + to have slightly different results for the same input data. If that + happens, try with a smaller `tol` parameter. This randomness can also be + controlled with the ``random_state`` parameter. When ``dual`` is + set to ``False`` the underlying implementation of :class:`LinearSVC` is + not random and ``random_state`` has no effect on the results. + +* Using L1 penalization as provided by ``LinearSVC(penalty='l1', + dual=False)`` yields a sparse solution, i.e. only a subset of feature + weights is different from zero and contribute to the decision function. + Increasing ``C`` yields a more complex model (more features are selected). + The ``C`` value that yields a "null" model (all weights equal to zero) can + be calculated using :func:`l1_min_c`. .. _svm_kernels: @@ -468,16 +476,16 @@ Kernel functions The *kernel function* can be any of the following: - * linear: :math:`\langle x, x'\rangle`. +* linear: :math:`\langle x, x'\rangle`. - * polynomial: :math:`(\gamma \langle x, x'\rangle + r)^d`, where - :math:`d` is specified by parameter ``degree``, :math:`r` by ``coef0``. +* polynomial: :math:`(\gamma \langle x, x'\rangle + r)^d`, where + :math:`d` is specified by parameter ``degree``, :math:`r` by ``coef0``. - * rbf: :math:`\exp(-\gamma \|x-x'\|^2)`, where :math:`\gamma` is - specified by parameter ``gamma``, must be greater than 0. 
+* rbf: :math:`\exp(-\gamma \|x-x'\|^2)`, where :math:`\gamma` is + specified by parameter ``gamma``, must be greater than 0. - * sigmoid :math:`\tanh(\gamma \langle x,x'\rangle + r)`, - where :math:`r` is specified by ``coef0``. +* sigmoid :math:`\tanh(\gamma \langle x,x'\rangle + r)`, + where :math:`r` is specified by ``coef0``. Different kernels are specified by the `kernel` parameter:: @@ -488,6 +496,8 @@ Different kernels are specified by the `kernel` parameter:: >>> rbf_svc.kernel 'rbf' +See also :ref:`kernel_approximation` for a solution to use RBF kernels that is much faster and more scalable. + Parameters of the RBF Kernel ---------------------------- @@ -503,11 +513,10 @@ Proper choice of ``C`` and ``gamma`` is critical to the SVM's performance. One is advised to use :class:`~sklearn.model_selection.GridSearchCV` with ``C`` and ``gamma`` spaced exponentially far apart to choose good values. -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_svm_plot_rbf_parameters.py` - * :ref:`sphx_glr_auto_examples_svm_plot_svm_nonlinear.py` +.. rubric:: Examples +* :ref:`sphx_glr_auto_examples_svm_plot_rbf_parameters.py` +* :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py` Custom Kernels -------------- @@ -518,62 +527,60 @@ python function or by precomputing the Gram matrix. Classifiers with custom kernels behave the same way as any other classifiers, except that: - * Field ``support_vectors_`` is now empty, only indices of support - vectors are stored in ``support_`` +* Field ``support_vectors_`` is now empty, only indices of support + vectors are stored in ``support_`` - * A reference (and not a copy) of the first argument in the ``fit()`` - method is stored for future reference. If that array changes between the - use of ``fit()`` and ``predict()`` you will have unexpected results. +* A reference (and not a copy) of the first argument in the ``fit()`` + method is stored for future reference. If that array changes between the + use of ``fit()`` and ``predict()`` you will have unexpected results. -Using Python functions as kernels -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. dropdown:: Using Python functions as kernels -You can use your own defined kernels by passing a function to the -``kernel`` parameter. + You can use your own defined kernels by passing a function to the + ``kernel`` parameter. -Your kernel must take as arguments two matrices of shape -``(n_samples_1, n_features)``, ``(n_samples_2, n_features)`` -and return a kernel matrix of shape ``(n_samples_1, n_samples_2)``. + Your kernel must take as arguments two matrices of shape + ``(n_samples_1, n_features)``, ``(n_samples_2, n_features)`` + and return a kernel matrix of shape ``(n_samples_1, n_samples_2)``. -The following code defines a linear kernel and creates a classifier -instance that will use that kernel:: + The following code defines a linear kernel and creates a classifier + instance that will use that kernel:: - >>> import numpy as np - >>> from sklearn import svm - >>> def my_kernel(X, Y): - ... return np.dot(X, Y.T) - ... - >>> clf = svm.SVC(kernel=my_kernel) + >>> import numpy as np + >>> from sklearn import svm + >>> def my_kernel(X, Y): + ... return np.dot(X, Y.T) + ... + >>> clf = svm.SVC(kernel=my_kernel) -.. topic:: Examples: - * :ref:`sphx_glr_auto_examples_svm_plot_custom_kernel.py`. +.. dropdown:: Using the Gram matrix -Using the Gram matrix -~~~~~~~~~~~~~~~~~~~~~ + You can pass pre-computed kernels by using the ``kernel='precomputed'`` + option. 
You should then pass Gram matrix instead of X to the `fit` and + `predict` methods. The kernel values between *all* training vectors and the + test vectors must be provided: -You can pass pre-computed kernels by using the ``kernel='precomputed'`` -option. You should then pass Gram matrix instead of X to the `fit` and -`predict` methods. The kernel values between *all* training vectors and the -test vectors must be provided: + >>> import numpy as np + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split + >>> from sklearn import svm + >>> X, y = make_classification(n_samples=10, random_state=0) + >>> X_train , X_test , y_train, y_test = train_test_split(X, y, random_state=0) + >>> clf = svm.SVC(kernel='precomputed') + >>> # linear kernel computation + >>> gram_train = np.dot(X_train, X_train.T) + >>> clf.fit(gram_train, y_train) + SVC(kernel='precomputed') + >>> # predict on training examples + >>> gram_test = np.dot(X_test, X_train.T) + >>> clf.predict(gram_test) + array([0, 1, 0]) - >>> import numpy as np - >>> from sklearn.datasets import make_classification - >>> from sklearn.model_selection import train_test_split - >>> from sklearn import svm - >>> X, y = make_classification(n_samples=10, random_state=0) - >>> X_train , X_test , y_train, y_test = train_test_split(X, y, random_state=0) - >>> clf = svm.SVC(kernel='precomputed') - >>> # linear kernel computation - >>> gram_train = np.dot(X_train, X_train.T) - >>> clf.fit(gram_train, y_train) - SVC(kernel='precomputed') - >>> # predict on training examples - >>> gram_test = np.dot(X_test, X_train.T) - >>> clf.predict(gram_test) - array([0, 1, 0]) +.. rubric:: Examples +* :ref:`sphx_glr_auto_examples_svm_plot_custom_kernel.py` .. _svm_mathematical_formulation: @@ -623,7 +630,7 @@ misclassified or within the margin boundary. Ideally, the value :math:`y_i (w^T \phi (x_i) + b)` would be :math:`\geq 1` for all samples, which indicates a perfect prediction. But problems are usually not always perfectly separable with a hyperplane, so we allow some samples to be at a distance :math:`\zeta_i` from -their correct margin boundary. The penalty term `C` controls the strengh of +their correct margin boundary. The penalty term `C` controls the strength of this penalty, and as a result, acts as an inverse regularization parameter (see note below). @@ -652,14 +659,14 @@ Once the optimization problem is solved, the output of .. math:: \sum_{i\in SV} y_i \alpha_i K(x_i, x) + b, -and the predicted class correspond to its sign. We only need to sum over the +and the predicted class corresponds to its sign. We only need to sum over the support vectors (i.e. the samples that lie within the margin) because the dual coefficients :math:`\alpha_i` are zero for the other samples. These parameters can be accessed through the attributes ``dual_coef_`` which holds the product :math:`y_i \alpha_i`, ``support_vectors_`` which holds the support vectors, and ``intercept_`` which holds the independent -term :math:`b` +term :math:`b`. .. note:: @@ -668,40 +675,37 @@ term :math:`b` equivalence between the amount of regularization of two models depends on the exact objective function optimized by the model. For example, when the estimator used is :class:`~sklearn.linear_model.Ridge` regression, - the relation between them is given as :math:`C = \frac{1}{alpha}`. + the relation between them is given as :math:`C = \frac{1}{\alpha}`. -LinearSVC ---------- +.. 
dropdown:: LinearSVC -The primal problem can be equivalently formulated as + The primal problem can be equivalently formulated as -.. math:: + .. math:: - \min_ {w, b} \frac{1}{2} w^T w + C \sum_{i=1}\max(0, y_i (w^T \phi(x_i) + b)), + \min_ {w, b} \frac{1}{2} w^T w + C \sum_{i=1}^{n}\max(0, 1 - y_i (w^T \phi(x_i) + b)), -where we make use of the `hinge loss -`_. This is the form that is -directly optimized by :class:`LinearSVC`, but unlike the dual form, this one -does not involve inner products between samples, so the famous kernel trick -cannot be applied. This is why only the linear kernel is supported by -:class:`LinearSVC` (:math:`\phi` is the identity function). + where we make use of the `hinge loss + `_. This is the form that is + directly optimized by :class:`LinearSVC`, but unlike the dual form, this one + does not involve inner products between samples, so the famous kernel trick + cannot be applied. This is why only the linear kernel is supported by + :class:`LinearSVC` (:math:`\phi` is the identity function). .. _nu_svc: -NuSVC ------ +.. dropdown:: NuSVC -The :math:`\nu`-SVC formulation [#7]_ is a reparameterization of the -:math:`C`-SVC and therefore mathematically equivalent. - -We introduce a new parameter :math:`\nu` (instead of :math:`C`) which -controls the number of support vectors and *margin errors*: -:math:`\nu \in (0, 1]` is an upper bound on the fraction of margin errors and -a lower bound of the fraction of support vectors. A margin error corresponds -to a sample that lies on the wrong side of its margin boundary: it is either -misclassified, or it is correctly classified but does not lie beyond the -margin. + The :math:`\nu`-SVC formulation [#7]_ is a reparameterization of the + :math:`C`-SVC and therefore mathematically equivalent. + We introduce a new parameter :math:`\nu` (instead of :math:`C`) which + controls the number of support vectors and *margin errors*: + :math:`\nu \in (0, 1]` is an upper bound on the fraction of margin errors and + a lower bound of the fraction of support vectors. A margin error corresponds + to a sample that lies on the wrong side of its margin boundary: it is either + misclassified, or it is correctly classified but does not lie beyond the + margin. SVR --- @@ -750,18 +754,17 @@ which holds the difference :math:`\alpha_i - \alpha_i^*`, ``support_vectors_`` w holds the support vectors, and ``intercept_`` which holds the independent term :math:`b` -LinearSVR ---------- +.. dropdown:: LinearSVR -The primal problem can be equivalently formulated as + The primal problem can be equivalently formulated as -.. math:: + .. math:: - \min_ {w, b} \frac{1}{2} w^T w + C \sum_{i=1}\max(0, |y_i - (w^T \phi(x_i) + b)| - \varepsilon), + \min_ {w, b} \frac{1}{2} w^T w + C \sum_{i=1}^{n}\max(0, |y_i - (w^T \phi(x_i) + b)| - \varepsilon), -where we make use of the epsilon-insensitive loss, i.e. errors of less than -:math:`\varepsilon` are ignored. This is the form that is directly optimized -by :class:`LinearSVR`. + where we make use of the epsilon-insensitive loss, i.e. errors of less than + :math:`\varepsilon` are ignored. This is the form that is directly optimized + by :class:`LinearSVR`. .. _svm_implementation_details: @@ -777,38 +780,38 @@ used, please refer to their respective papers. .. _`libsvm`: https://www.csie.ntu.edu.tw/~cjlin/libsvm/ .. _`liblinear`: https://www.csie.ntu.edu.tw/~cjlin/liblinear/ -.. topic:: References: - - .. [#1] Platt `"Probabilistic outputs for SVMs and comparisons to - regularized likelihood methods" - `_. - - .. 
[#2] Wu, Lin and Weng, `"Probability estimates for multi-class
-      classification by pairwise coupling"
-      `_, JMLR
-      5:975-1005, 2004.
-
-   .. [#3] Fan, Rong-En, et al.,
-      `"LIBLINEAR: A library for large linear classification."
-      `_,
-      Journal of machine learning research 9.Aug (2008): 1871-1874.
-
-   .. [#4] Chang and Lin, `LIBSVM: A Library for Support Vector Machines
-      `_.
-
-   .. [#5] Bishop, `Pattern recognition and machine learning
-      `_,
-      chapter 7 Sparse Kernel Machines
-
-   .. [#6] `"A Tutorial on Support Vector Regression"
-      `_,
-      Alex J. Smola, Bernhard Schölkopf - Statistics and Computing archive
-      Volume 14 Issue 3, August 2004, p. 199-222.
-
-   .. [#7] Schölkopf et. al `New Support Vector Algorithms
-      `_
-
-   .. [#8] Crammer and Singer `On the Algorithmic Implementation ofMulticlass
-      Kernel-based Vector Machines
-      `_,
-      JMLR 2001.
+.. rubric:: References
+
+.. [#1] Platt `"Probabilistic outputs for SVMs and comparisons to
+   regularized likelihood methods"
+   `_.
+
+.. [#2] Wu, Lin and Weng, `"Probability estimates for multi-class
+   classification by pairwise coupling"
+   `_,
+   JMLR 5:975-1005, 2004.
+
+.. [#3] Fan, Rong-En, et al.,
+   `"LIBLINEAR: A library for large linear classification."
+   `_,
+   Journal of machine learning research 9.Aug (2008): 1871-1874.
+
+.. [#4] Chang and Lin, `LIBSVM: A Library for Support Vector Machines
+   `_.
+
+.. [#5] Bishop, `Pattern recognition and machine learning
+   `_,
+   chapter 7 Sparse Kernel Machines.
+
+.. [#6] :doi:`"A Tutorial on Support Vector Regression"
+   <10.1023/B:STCO.0000035301.49549.88>`
+   Alex J. Smola, Bernhard Schölkopf - Statistics and Computing archive
+   Volume 14 Issue 3, August 2004, p. 199-222.
+
+.. [#7] Schölkopf et al. `New Support Vector Algorithms
+   `_,
+   Neural Computation 12, 1207-1245 (2000).
+
+.. [#8] Crammer and Singer `On the Algorithmic Implementation of Multiclass
+   Kernel-based Vector Machines
+   `_, JMLR 2001.
diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst
index 6d30fdcc6bf2f..ee36d9f6af1b2 100644
--- a/doc/modules/tree.rst
+++ b/doc/modules/tree.rst
@@ -23,68 +23,68 @@ the tree, the more complex the decision rules and the fitter the model.
 
 Some advantages of decision trees are:
 
-  - Simple to understand and to interpret. Trees can be visualised.
+- Simple to understand and to interpret. Trees can be visualized.
 
-  - Requires little data preparation. Other techniques often require data
-    normalisation, dummy variables need to be created and blank values to
-    be removed. Note however that this module does not support missing
-    values.
+- Requires little data preparation. Other techniques often require data
+  normalization, the creation of dummy variables, and the removal of blank
+  values. Some tree and algorithm combinations support
+  :ref:`missing values `.
 
-  - The cost of using the tree (i.e., predicting data) is logarithmic in the
-    number of data points used to train the tree.
+- The cost of using the tree (i.e., predicting data) is logarithmic in the
+  number of data points used to train the tree.
 
-  - Able to handle both numerical and categorical data. However scikit-learn
-    implementation does not support categorical variables for now. Other
-    techniques are usually specialised in analysing datasets that have only one type
-    of variable. See :ref:`algorithms ` for more
-    information.
+- Able to handle both numerical and categorical data. However, the scikit-learn
+  implementation does not support categorical variables for now. 
Other + techniques are usually specialized in analyzing datasets that have only one type + of variable. See :ref:`algorithms ` for more + information. - - Able to handle multi-output problems. +- Able to handle multi-output problems. - - Uses a white box model. If a given situation is observable in a model, - the explanation for the condition is easily explained by boolean logic. - By contrast, in a black box model (e.g., in an artificial neural - network), results may be more difficult to interpret. +- Uses a white box model. If a given situation is observable in a model, + the explanation for the condition is easily explained by boolean logic. + By contrast, in a black box model (e.g., in an artificial neural + network), results may be more difficult to interpret. - - Possible to validate a model using statistical tests. That makes it - possible to account for the reliability of the model. +- Possible to validate a model using statistical tests. That makes it + possible to account for the reliability of the model. - - Performs well even if its assumptions are somewhat violated by - the true model from which the data were generated. +- Performs well even if its assumptions are somewhat violated by + the true model from which the data were generated. The disadvantages of decision trees include: - - Decision-tree learners can create over-complex trees that do not - generalise the data well. This is called overfitting. Mechanisms - such as pruning, setting the minimum number of samples required - at a leaf node or setting the maximum depth of the tree are - necessary to avoid this problem. +- Decision-tree learners can create over-complex trees that do not + generalize the data well. This is called overfitting. Mechanisms + such as pruning, setting the minimum number of samples required + at a leaf node or setting the maximum depth of the tree are + necessary to avoid this problem. - - Decision trees can be unstable because small variations in the - data might result in a completely different tree being generated. - This problem is mitigated by using decision trees within an - ensemble. +- Decision trees can be unstable because small variations in the + data might result in a completely different tree being generated. + This problem is mitigated by using decision trees within an + ensemble. - - Predictions of decision trees are neither smooth nor continuous, but - piecewise constant approximations as seen in the above figure. Therefore, - they are not good at extrapolation. +- Predictions of decision trees are neither smooth nor continuous, but + piecewise constant approximations as seen in the above figure. Therefore, + they are not good at extrapolation. - - The problem of learning an optimal decision tree is known to be - NP-complete under several aspects of optimality and even for simple - concepts. Consequently, practical decision-tree learning algorithms - are based on heuristic algorithms such as the greedy algorithm where - locally optimal decisions are made at each node. Such algorithms - cannot guarantee to return the globally optimal decision tree. This - can be mitigated by training multiple trees in an ensemble learner, - where the features and samples are randomly sampled with replacement. +- The problem of learning an optimal decision tree is known to be + NP-complete under several aspects of optimality and even for simple + concepts. 
Consequently, practical decision-tree learning algorithms + are based on heuristic algorithms such as the greedy algorithm where + locally optimal decisions are made at each node. Such algorithms + cannot guarantee to return the globally optimal decision tree. This + can be mitigated by training multiple trees in an ensemble learner, + where the features and samples are randomly sampled with replacement. - - There are concepts that are hard to learn because decision trees - do not express them easily, such as XOR, parity or multiplexer problems. +- There are concepts that are hard to learn because decision trees + do not express them easily, such as XOR, parity or multiplexer problems. - - Decision tree learners create biased trees if some classes dominate. - It is therefore recommended to balance the dataset prior to fitting - with the decision tree. +- Decision tree learners create biased trees if some classes dominate. + It is therefore recommended to balance the dataset prior to fitting + with the decision tree. .. _tree_classification: @@ -130,90 +130,94 @@ Using the Iris dataset, we can construct a tree as follows:: >>> from sklearn.datasets import load_iris >>> from sklearn import tree - >>> X, y = load_iris(return_X_y=True) + >>> iris = load_iris() + >>> X, y = iris.data, iris.target >>> clf = tree.DecisionTreeClassifier() >>> clf = clf.fit(X, y) Once trained, you can plot the tree with the :func:`plot_tree` function:: - >>> tree.plot_tree(clf) # doctest: +SKIP + >>> tree.plot_tree(clf) + [...] .. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_002.png :target: ../auto_examples/tree/plot_iris_dtc.html :scale: 75 :align: center -We can also export the tree in `Graphviz -`_ format using the :func:`export_graphviz` -exporter. If you use the `conda `_ package manager, the graphviz binaries -and the python package can be installed with `conda install python-graphviz`. +.. dropdown:: Alternative ways to export trees -Alternatively binaries for graphviz can be downloaded from the graphviz project homepage, -and the Python wrapper installed from pypi with `pip install graphviz`. + We can also export the tree in `Graphviz + `_ format using the :func:`export_graphviz` + exporter. If you use the `conda `_ package manager, the graphviz binaries + and the python package can be installed with `conda install python-graphviz`. -Below is an example graphviz export of the above tree trained on the entire -iris dataset; the results are saved in an output file `iris.pdf`:: + Alternatively binaries for graphviz can be downloaded from the graphviz project homepage, + and the Python wrapper installed from pypi with `pip install graphviz`. + Below is an example graphviz export of the above tree trained on the entire + iris dataset; the results are saved in an output file `iris.pdf`:: - >>> import graphviz # doctest: +SKIP - >>> dot_data = tree.export_graphviz(clf, out_file=None) # doctest: +SKIP - >>> graph = graphviz.Source(dot_data) # doctest: +SKIP - >>> graph.render("iris") # doctest: +SKIP -The :func:`export_graphviz` exporter also supports a variety of aesthetic -options, including coloring nodes by their class (or value for regression) and -using explicit variable and class names if desired. 
Jupyter notebooks also -render these plots inline automatically:: + >>> import graphviz # doctest: +SKIP + >>> dot_data = tree.export_graphviz(clf, out_file=None) # doctest: +SKIP + >>> graph = graphviz.Source(dot_data) # doctest: +SKIP + >>> graph.render("iris") # doctest: +SKIP - >>> dot_data = tree.export_graphviz(clf, out_file=None, # doctest: +SKIP - ... feature_names=iris.feature_names, # doctest: +SKIP - ... class_names=iris.target_names, # doctest: +SKIP - ... filled=True, rounded=True, # doctest: +SKIP - ... special_characters=True) # doctest: +SKIP - >>> graph = graphviz.Source(dot_data) # doctest: +SKIP - >>> graph # doctest: +SKIP + The :func:`export_graphviz` exporter also supports a variety of aesthetic + options, including coloring nodes by their class (or value for regression) and + using explicit variable and class names if desired. Jupyter notebooks also + render these plots inline automatically:: -.. only:: html + >>> dot_data = tree.export_graphviz(clf, out_file=None, # doctest: +SKIP + ... feature_names=iris.feature_names, # doctest: +SKIP + ... class_names=iris.target_names, # doctest: +SKIP + ... filled=True, rounded=True, # doctest: +SKIP + ... special_characters=True) # doctest: +SKIP + >>> graph = graphviz.Source(dot_data) # doctest: +SKIP + >>> graph # doctest: +SKIP - .. figure:: ../images/iris.svg - :align: center + .. only:: html -.. only:: latex + .. figure:: ../images/iris.svg + :align: center - .. figure:: ../images/iris.pdf - :align: center + .. only:: latex -.. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_001.png - :target: ../auto_examples/tree/plot_iris_dtc.html - :align: center - :scale: 75 + .. figure:: ../images/iris.pdf + :align: center -Alternatively, the tree can also be exported in textual format with the -function :func:`export_text`. This method doesn't require the installation -of external libraries and is more compact: + .. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_001.png + :target: ../auto_examples/tree/plot_iris_dtc.html + :align: center + :scale: 75 - >>> from sklearn.datasets import load_iris - >>> from sklearn.tree import DecisionTreeClassifier - >>> from sklearn.tree import export_text - >>> iris = load_iris() - >>> decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2) - >>> decision_tree = decision_tree.fit(iris.data, iris.target) - >>> r = export_text(decision_tree, feature_names=iris['feature_names']) - >>> print(r) - |--- petal width (cm) <= 0.80 - | |--- class: 0 - |--- petal width (cm) > 0.80 - | |--- petal width (cm) <= 1.75 - | | |--- class: 1 - | |--- petal width (cm) > 1.75 - | | |--- class: 2 - - -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_tree_plot_iris_dtc.py` - * :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` + Alternatively, the tree can also be exported in textual format with the + function :func:`export_text`. 
This method doesn't require the installation + of external libraries and is more compact: + + >>> from sklearn.datasets import load_iris + >>> from sklearn.tree import DecisionTreeClassifier + >>> from sklearn.tree import export_text + >>> iris = load_iris() + >>> decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2) + >>> decision_tree = decision_tree.fit(iris.data, iris.target) + >>> r = export_text(decision_tree, feature_names=iris['feature_names']) + >>> print(r) + |--- petal width (cm) <= 0.80 + | |--- class: 0 + |--- petal width (cm) > 0.80 + | |--- petal width (cm) <= 1.75 + | | |--- class: 1 + | |--- petal width (cm) > 1.75 + | | |--- class: 2 + + +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_tree_plot_iris_dtc.py` +* :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` .. _tree_regression: @@ -240,9 +244,9 @@ instead of integer values:: >>> clf.predict([[1, 1]]) array([0.5]) -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_tree_plot_tree_regression.py` +* :ref:`sphx_glr_auto_examples_tree_plot_tree_regression.py` .. _tree_multioutput: @@ -265,27 +269,26 @@ generalization accuracy of the resulting estimator may often be increased. With regard to decision trees, this strategy can readily be used to support multi-output problems. This requires the following changes: - - Store n output values in leaves, instead of 1; - - Use splitting criteria that compute the average reduction across all - n outputs. +- Store n output values in leaves, instead of 1; +- Use splitting criteria that compute the average reduction across all + n outputs. This module offers support for multi-output problems by implementing this strategy in both :class:`DecisionTreeClassifier` and :class:`DecisionTreeRegressor`. If a decision tree is fit on an output array Y of shape ``(n_samples, n_outputs)`` then the resulting estimator will: - * Output n_output values upon ``predict``; - - * Output a list of n_output arrays of class probabilities upon - ``predict_proba``. +* Output n_output values upon ``predict``; +* Output a list of n_output arrays of class probabilities upon + ``predict_proba``. The use of multi-output trees for regression is demonstrated in -:ref:`sphx_glr_auto_examples_tree_plot_tree_regression_multioutput.py`. In this example, the input +:ref:`sphx_glr_auto_examples_tree_plot_tree_regression.py`. In this example, the input X is a single real value and the outputs Y are the sine and cosine of X. -.. figure:: ../auto_examples/tree/images/sphx_glr_plot_tree_regression_multioutput_001.png - :target: ../auto_examples/tree/plot_tree_regression_multioutput.html +.. figure:: ../auto_examples/tree/images/sphx_glr_plot_tree_regression_002.png + :target: ../auto_examples/tree/plot_tree_regression.html :scale: 75 :align: center @@ -299,17 +302,16 @@ the lower half of those faces. :scale: 75 :align: center -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_tree_plot_tree_regression_multioutput.py` - * :ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py` +* :ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py` -.. topic:: References: +.. rubric:: References - * M. Dumont et al, `Fast multi-class image annotation with random subwindows - and multiple output randomized trees - `_, International Conference on - Computer Vision Theory and Applications 2009 +* M. 
Dumont et al, `Fast multi-class image annotation with random subwindows + and multiple output randomized trees + `_, + International Conference on Computer Vision Theory and Applications 2009 .. _tree_complexity: @@ -322,7 +324,8 @@ In general, the run time cost to construct a balanced binary tree is to generate balanced trees, they will not always be balanced. Assuming that the subtrees remain approximately balanced, the cost at each node consists of searching through :math:`O(n_{features})` to find the feature that offers the -largest reduction in entropy. This has a cost of +largest reduction in the impurity criterion, e.g. log loss (which is equivalent to an +information gain). This has a cost of :math:`O(n_{features}n_{samples}\log(n_{samples}))` at each node, leading to a total cost over the entire trees (by summing the cost at each node) of :math:`O(n_{features}n_{samples}^{2}\log(n_{samples}))`. @@ -331,65 +334,65 @@ total cost over the entire trees (by summing the cost at each node) of Tips on practical use ===================== - * Decision trees tend to overfit on data with a large number of features. - Getting the right ratio of samples to number of features is important, since - a tree with few samples in high dimensional space is very likely to overfit. - - * Consider performing dimensionality reduction (:ref:`PCA `, - :ref:`ICA `, or :ref:`feature_selection`) beforehand to - give your tree a better chance of finding features that are discriminative. - - * :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` will help - in gaining more insights about how the decision tree makes predictions, which is - important for understanding the important features in the data. - - * Visualise your tree as you are training by using the ``export`` - function. Use ``max_depth=3`` as an initial tree depth to get a feel for - how the tree is fitting to your data, and then increase the depth. - - * Remember that the number of samples required to populate the tree doubles - for each additional level the tree grows to. Use ``max_depth`` to control - the size of the tree to prevent overfitting. - - * Use ``min_samples_split`` or ``min_samples_leaf`` to ensure that multiple - samples inform every decision in the tree, by controlling which splits will - be considered. A very small number will usually mean the tree will overfit, - whereas a large number will prevent the tree from learning the data. Try - ``min_samples_leaf=5`` as an initial value. If the sample size varies - greatly, a float number can be used as percentage in these two parameters. - While ``min_samples_split`` can create arbitrarily small leaves, - ``min_samples_leaf`` guarantees that each leaf has a minimum size, avoiding - low-variance, over-fit leaf nodes in regression problems. For - classification with few classes, ``min_samples_leaf=1`` is often the best - choice. - - Note that ``min_samples_split`` considers samples directly and independent of - ``sample_weight``, if provided (e.g. a node with m weighted samples is still - treated as having exactly m samples). Consider ``min_weight_fraction_leaf`` or - ``min_impurity_decrease`` if accounting for sample weights is required at splits. - - * Balance your dataset before training to prevent the tree from being biased - toward the classes that are dominant. Class balancing can be done by - sampling an equal number of samples from each class, or preferably by - normalizing the sum of the sample weights (``sample_weight``) for each - class to the same value. 
Also note that weight-based pre-pruning criteria, - such as ``min_weight_fraction_leaf``, will then be less biased toward - dominant classes than criteria that are not aware of the sample weights, - like ``min_samples_leaf``. - - * If the samples are weighted, it will be easier to optimize the tree - structure using weight-based pre-pruning criterion such as - ``min_weight_fraction_leaf``, which ensure that leaf nodes contain at least - a fraction of the overall sum of the sample weights. - - * All decision trees use ``np.float32`` arrays internally. - If training data is not in this format, a copy of the dataset will be made. - - * If the input matrix X is very sparse, it is recommended to convert to sparse - ``csc_matrix`` before calling fit and sparse ``csr_matrix`` before calling - predict. Training time can be orders of magnitude faster for a sparse - matrix input compared to a dense matrix when features have zero values in - most of the samples. +* Decision trees tend to overfit on data with a large number of features. + Getting the right ratio of samples to number of features is important, since + a tree with few samples in high dimensional space is very likely to overfit. + +* Consider performing dimensionality reduction (:ref:`PCA `, + :ref:`ICA `, or :ref:`feature_selection`) beforehand to + give your tree a better chance of finding features that are discriminative. + +* :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` will help + in gaining more insights about how the decision tree makes predictions, which is + important for understanding the important features in the data. + +* Visualize your tree as you are training by using the ``export`` + function. Use ``max_depth=3`` as an initial tree depth to get a feel for + how the tree is fitting to your data, and then increase the depth. + +* Remember that the number of samples required to populate the tree doubles + for each additional level the tree grows to. Use ``max_depth`` to control + the size of the tree to prevent overfitting. + +* Use ``min_samples_split`` or ``min_samples_leaf`` to ensure that multiple + samples inform every decision in the tree, by controlling which splits will + be considered. A very small number will usually mean the tree will overfit, + whereas a large number will prevent the tree from learning the data. Try + ``min_samples_leaf=5`` as an initial value. If the sample size varies + greatly, a float number can be used as percentage in these two parameters. + While ``min_samples_split`` can create arbitrarily small leaves, + ``min_samples_leaf`` guarantees that each leaf has a minimum size, avoiding + low-variance, over-fit leaf nodes in regression problems. For + classification with few classes, ``min_samples_leaf=1`` is often the best + choice. + + Note that ``min_samples_split`` considers samples directly and independent of + ``sample_weight``, if provided (e.g. a node with m weighted samples is still + treated as having exactly m samples). Consider ``min_weight_fraction_leaf`` or + ``min_impurity_decrease`` if accounting for sample weights is required at splits. + +* Balance your dataset before training to prevent the tree from being biased + toward the classes that are dominant. Class balancing can be done by + sampling an equal number of samples from each class, or preferably by + normalizing the sum of the sample weights (``sample_weight``) for each + class to the same value. 
Also note that weight-based pre-pruning criteria, + such as ``min_weight_fraction_leaf``, will then be less biased toward + dominant classes than criteria that are not aware of the sample weights, + like ``min_samples_leaf``. + +* If the samples are weighted, it will be easier to optimize the tree + structure using weight-based pre-pruning criterion such as + ``min_weight_fraction_leaf``, which ensures that leaf nodes contain at least + a fraction of the overall sum of the sample weights. + +* All decision trees use ``np.float32`` arrays internally. + If training data is not in this format, a copy of the dataset will be made. + +* If the input matrix X is very sparse, it is recommended to convert to sparse + ``csc_matrix`` before calling fit and sparse ``csr_matrix`` before calling + predict. Training time can be orders of magnitude faster for a sparse + matrix input compared to a dense matrix when features have zero values in + most of the samples. .. _tree_algorithms: @@ -400,36 +403,37 @@ Tree algorithms: ID3, C4.5, C5.0 and CART What are all the various decision tree algorithms and how do they differ from each other? Which one is implemented in scikit-learn? -ID3_ (Iterative Dichotomiser 3) was developed in 1986 by Ross Quinlan. -The algorithm creates a multiway tree, finding for each node (i.e. in -a greedy manner) the categorical feature that will yield the largest -information gain for categorical targets. Trees are grown to their -maximum size and then a pruning step is usually applied to improve the -ability of the tree to generalise to unseen data. - -C4.5 is the successor to ID3 and removed the restriction that features -must be categorical by dynamically defining a discrete attribute (based -on numerical variables) that partitions the continuous attribute value -into a discrete set of intervals. C4.5 converts the trained trees -(i.e. the output of the ID3 algorithm) into sets of if-then rules. -These accuracy of each rule is then evaluated to determine the order -in which they should be applied. Pruning is done by removing a rule's -precondition if the accuracy of the rule improves without it. - -C5.0 is Quinlan's latest version release under a proprietary license. -It uses less memory and builds smaller rulesets than C4.5 while being -more accurate. - -CART_ (Classification and Regression Trees) is very similar to C4.5, but -it differs in that it supports numerical target variables (regression) and -does not compute rule sets. CART constructs binary trees using the feature -and threshold that yield the largest information gain at each node. - -scikit-learn uses an optimised version of the CART algorithm; however, scikit-learn -implementation does not support categorical variables for now. +.. dropdown:: Various decision tree algorithms + + ID3_ (Iterative Dichotomiser 3) was developed in 1986 by Ross Quinlan. + The algorithm creates a multiway tree, finding for each node (i.e. in + a greedy manner) the categorical feature that will yield the largest + information gain for categorical targets. Trees are grown to their + maximum size and then a pruning step is usually applied to improve the + ability of the tree to generalize to unseen data. + + C4.5 is the successor to ID3 and removed the restriction that features + must be categorical by dynamically defining a discrete attribute (based + on numerical variables) that partitions the continuous attribute value + into a discrete set of intervals. C4.5 converts the trained trees + (i.e. 
the output of the ID3 algorithm) into sets of if-then rules. + The accuracy of each rule is then evaluated to determine the order + in which they should be applied. Pruning is done by removing a rule's + precondition if the accuracy of the rule improves without it. + + C5.0 is Quinlan's latest version release under a proprietary license. + It uses less memory and builds smaller rulesets than C4.5 while being + more accurate. + + CART (Classification and Regression Trees) is very similar to C4.5, but + it differs in that it supports numerical target variables (regression) and + does not compute rule sets. CART constructs binary trees using the feature + and threshold that yield the largest information gain at each node. + +scikit-learn uses an optimized version of the CART algorithm; however, the +scikit-learn implementation does not support categorical variables for now. .. _ID3: https://en.wikipedia.org/wiki/ID3_algorithm -.. _CART: https://en.wikipedia.org/wiki/Predictive_analytics#Classification_and_regression_trees_.28CART.29 .. _tree_mathematical_formulation: @@ -442,14 +446,14 @@ Given training vectors :math:`x_i \in R^n`, i=1,..., l and a label vector such that the samples with the same labels or similar target values are grouped together. -Let the data at node :math:`m` be represented by :math:`Q_m` with :math:`N_m` +Let the data at node :math:`m` be represented by :math:`Q_m` with :math:`n_m` samples. For each candidate split :math:`\theta = (j, t_m)` consisting of a feature :math:`j` and threshold :math:`t_m`, partition the data into :math:`Q_m^{left}(\theta)` and :math:`Q_m^{right}(\theta)` subsets .. math:: - Q_m^{left}(\theta) = \{(x, y) | x_j <= t_m\} + Q_m^{left}(\theta) = \{(x, y) | x_j \leq t_m\} Q_m^{right}(\theta) = Q_m \setminus Q_m^{left}(\theta) @@ -459,8 +463,8 @@ the task being solved (classification or regression) .. math:: - G(Q_m, \theta) = \frac{N_m^{left}}{N_m} H(Q_m^{left}(\theta)) - + \frac{N_m^{right}}{N_m} H(Q_m^{right}(\theta)) + G(Q_m, \theta) = \frac{n_m^{left}}{n_m} H(Q_m^{left}(\theta)) + + \frac{n_m^{right}}{n_m} H(Q_m^{right}(\theta)) Select the parameters that minimises the impurity @@ -470,7 +474,7 @@ Select the parameters that minimises the impurity Recurse for subsets :math:`Q_m^{left}(\theta^*)` and :math:`Q_m^{right}(\theta^*)` until the maximum allowable depth is reached, -:math:`N_m < \min_{samples}` or :math:`N_m = 1`. +:math:`n_m < \min_{samples}` or :math:`n_m = 1`. Classification criteria ----------------------- @@ -480,7 +484,7 @@ for node :math:`m`, let .. math:: - p_{mk} = 1/ N_m \sum_{y \in Q_m} I(y = k) + p_{mk} = \frac{1}{n_m} \sum_{y \in Q_m} I(y = k) be the proportion of class k observations in node :math:`m`. If :math:`m` is a terminal node, `predict_proba` for this region is set to :math:`p_{mk}`. @@ -492,17 +496,41 @@ Gini: H(Q_m) = \sum_k p_{mk} (1 - p_{mk}) -Entropy: +Log Loss or Entropy: .. math:: H(Q_m) = - \sum_k p_{mk} \log(p_{mk}) -Misclassification: +.. dropdown:: Shannon entropy -.. math:: + The entropy criterion computes the Shannon entropy of the possible classes. It + takes the class frequencies of the training data points that reached a given + leaf :math:`m` as their probability. Using the **Shannon entropy as tree node + splitting criterion is equivalent to minimizing the log loss** (also known as + cross-entropy and multinomial deviance) between the true labels :math:`y_i` + and the probabilistic predictions :math:`T_k(x_i)` of the tree model :math:`T` for class :math:`k`. 
+
+  To see this, first recall that the log loss of a tree model :math:`T`
+  computed on a dataset :math:`D` is defined as follows:
+
+  .. math::
+
+    \mathrm{LL}(D, T) = -\frac{1}{n} \sum_{(x_i, y_i) \in D} \sum_k I(y_i = k) \log(T_k(x_i))
 
-    H(Q_m) = 1 - \max(p_{mk})
+
+  where :math:`D` is a training dataset of :math:`n` pairs :math:`(x_i, y_i)`.
+
+  In a classification tree, the predicted class probabilities within leaf nodes
+  are constant, that is: for all :math:`(x_i, y_i) \in Q_m`, one has:
+  :math:`T_k(x_i) = p_{mk}` for each class :math:`k`.
+
+  This property makes it possible to rewrite :math:`\mathrm{LL}(D, T)` as the
+  sum of the Shannon entropies computed for each leaf of :math:`T` weighted by
+  the number of training data points that reached each leaf:
+
+  .. math::
+
+    \mathrm{LL}(D, T) = \sum_{m \in T} \frac{n_m}{n} H(Q_m)
 
 Regression criteria
 -------------------
@@ -519,21 +547,22 @@ Mean Squared Error:
 
 .. math::
 
-        \bar{y}_m = \frac{1}{N_m} \sum_{y \in Q_m} y
+    \bar{y}_m = \frac{1}{n_m} \sum_{y \in Q_m} y
 
-        H(Q_m) = \frac{1}{N_m} \sum_{y \in Q_m} (y - \bar{y}_m)^2
+    H(Q_m) = \frac{1}{n_m} \sum_{y \in Q_m} (y - \bar{y}_m)^2
 
-Half Poisson deviance:
+Mean Poisson deviance:
 
 .. math::
 
-        H(Q_m) = \frac{1}{N_m} \sum_{y \in Q_m} (y \log\frac{y}{\bar{y}_m}
+    H(Q_m) = \frac{2}{n_m} \sum_{y \in Q_m} (y \log\frac{y}{\bar{y}_m}
-        - y + \bar{y}_m)
+    - y + \bar{y}_m)
 
 Setting `criterion="poisson"` might be a good choice if your target is a count
 or a frequency (count per some unit). In any case, :math:`y >= 0` is a
 necessary condition to use this criterion. Note that it fits much slower than
-the MSE criterion.
+the MSE criterion. For performance reasons, the actual implementation minimizes
+the half mean Poisson deviance, i.e. the mean Poisson deviance divided by 2.
 
 Mean Absolute Error:
 
@@ -541,10 +570,95 @@ Mean Absolute Error:
 
     median(y)_m = \underset{y \in Q_m}{\mathrm{median}}(y)
 
-    H(Q_m) = \frac{1}{N_m} \sum_{y \in Q_m} |y - median(y)_m|
+    H(Q_m) = \frac{1}{n_m} \sum_{y \in Q_m} |y - median(y)_m|
 
 Note that it fits much slower than the MSE criterion.
 
+.. _tree_missing_value_support:
+
+Missing Values Support
+======================
+
+:class:`DecisionTreeClassifier` and :class:`DecisionTreeRegressor`
+have built-in support for missing values using `splitter='best'`, where
+the splits are determined in a greedy fashion.
+:class:`ExtraTreeClassifier` and :class:`ExtraTreeRegressor` have built-in
+support for missing values with `splitter='random'`, where the splits
+are determined randomly. For more details on how the splitter differs on
+non-missing values, see the :ref:`Forest section `.
+
+The criteria supported when there are missing values are
+`'gini'`, `'entropy'`, or `'log_loss'` for classification, and
+`'squared_error'`, `'friedman_mse'`, or `'poisson'` for regression.
+
+First we will describe how :class:`DecisionTreeClassifier` and :class:`DecisionTreeRegressor`
+handle missing values in the data.
+
+For each potential threshold on the non-missing data, the splitter will evaluate
+the split with all the missing values going to the left node or the right node. 
+ +Decisions are made as follows: + +- By default when predicting, the samples with missing values are classified + with the class used in the split found during training:: + + >>> from sklearn.tree import DecisionTreeClassifier + >>> import numpy as np + + >>> X = np.array([0, 1, 6, np.nan]).reshape(-1, 1) + >>> y = [0, 0, 1, 1] + + >>> tree = DecisionTreeClassifier(random_state=0).fit(X, y) + >>> tree.predict(X) + array([0, 0, 1, 1]) + +- If the criterion evaluation is the same for both nodes, + then the tie for missing value at predict time is broken by going to the + right node. The splitter also checks the split where all the missing + values go to one child and non-missing values go to the other:: + + >>> from sklearn.tree import DecisionTreeClassifier + >>> import numpy as np + + >>> X = np.array([np.nan, -1, np.nan, 1]).reshape(-1, 1) + >>> y = [0, 0, 1, 1] + + >>> tree = DecisionTreeClassifier(random_state=0).fit(X, y) + + >>> X_test = np.array([np.nan]).reshape(-1, 1) + >>> tree.predict(X_test) + array([1]) + +- If no missing values are seen during training for a given feature, then during + prediction missing values are mapped to the child with the most samples:: + + >>> from sklearn.tree import DecisionTreeClassifier + >>> import numpy as np + + >>> X = np.array([0, 1, 2, 3]).reshape(-1, 1) + >>> y = [0, 1, 1, 1] + + >>> tree = DecisionTreeClassifier(random_state=0).fit(X, y) + + >>> X_test = np.array([np.nan]).reshape(-1, 1) + >>> tree.predict(X_test) + array([1]) + +:class:`ExtraTreeClassifier`, and :class:`ExtraTreeRegressor` handle missing values +in a slightly different way. When splitting a node, a random threshold will be chosen +to split the non-missing values on. Then the non-missing values will be sent to the +left and right child based on the randomly selected threshold, while the missing +values will also be randomly sent to the left or right child. This is repeated for +every feature considered at each split. The best split among these is chosen. + +During prediction, the treatment of missing-values is the same as that of the +decision tree: + +- By default when predicting, the samples with missing values are classified + with the class used in the split found during training. + +- If no missing values are seen during training for a given feature, then during + prediction missing values are mapped to the child with the most samples. .. _minimal_cost_complexity_pruning: @@ -581,21 +695,21 @@ with the smallest value of :math:`\alpha_{eff}` is the weakest link and will be pruned. This process stops when the pruned tree's minimal :math:`\alpha_{eff}` is greater than the ``ccp_alpha`` parameter. -.. topic:: Examples: +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` +* :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` -.. topic:: References: +.. rubric:: References - .. [BRE] L. Breiman, J. Friedman, R. Olshen, and C. Stone. Classification - and Regression Trees. Wadsworth, Belmont, CA, 1984. +.. [BRE] L. Breiman, J. Friedman, R. Olshen, and C. Stone. Classification + and Regression Trees. Wadsworth, Belmont, CA, 1984. - * https://en.wikipedia.org/wiki/Decision_tree_learning +* https://en.wikipedia.org/wiki/Decision_tree_learning - * https://en.wikipedia.org/wiki/Predictive_analytics +* https://en.wikipedia.org/wiki/Predictive_analytics - * J.R. Quinlan. C4. 5: programs for machine learning. Morgan - Kaufmann, 1993. +* J.R. Quinlan. C4. 5: programs for machine learning. Morgan + Kaufmann, 1993. - * T. 
Hastie, R. Tibshirani and J. Friedman. Elements of Statistical - Learning, Springer, 2009. +* T. Hastie, R. Tibshirani and J. Friedman. Elements of Statistical + Learning, Springer, 2009. diff --git a/doc/modules/unsupervised_reduction.rst b/doc/modules/unsupervised_reduction.rst index 6e16886064cfc..12f3647454861 100644 --- a/doc/modules/unsupervised_reduction.rst +++ b/doc/modules/unsupervised_reduction.rst @@ -9,7 +9,7 @@ If your number of features is high, it may be useful to reduce it with an unsupervised step prior to supervised steps. Many of the :ref:`unsupervised-learning` methods implement a ``transform`` method that can be used to reduce the dimensionality. Below we discuss two specific -example of this pattern that are heavily used. +examples of this pattern that are heavily used. .. topic:: **Pipelining** @@ -24,20 +24,20 @@ PCA: principal component analysis :class:`decomposition.PCA` looks for a combination of features that capture well the variance of the original features. See :ref:`decompositions`. -.. topic:: **Examples** +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_applications_plot_face_recognition.py` +* :ref:`sphx_glr_auto_examples_applications_plot_face_recognition.py` Random projections ------------------- -The module: :mod:`random_projection` provides several tools for data +The module: :mod:`~sklearn.random_projection` provides several tools for data reduction by random projections. See the relevant section of the documentation: :ref:`random_projection`. -.. topic:: **Examples** +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_miscellaneous_plot_johnson_lindenstrauss_bound.py` +* :ref:`sphx_glr_auto_examples_miscellaneous_plot_johnson_lindenstrauss_bound.py` Feature agglomeration ------------------------ @@ -46,15 +46,14 @@ Feature agglomeration :ref:`hierarchical_clustering` to group together features that behave similarly. -.. topic:: **Examples** +.. rubric:: Examples - * :ref:`sphx_glr_auto_examples_cluster_plot_feature_agglomeration_vs_univariate_selection.py` - * :ref:`sphx_glr_auto_examples_cluster_plot_digits_agglomeration.py` +* :ref:`sphx_glr_auto_examples_cluster_plot_feature_agglomeration_vs_univariate_selection.py` +* :ref:`sphx_glr_auto_examples_cluster_plot_digits_agglomeration.py` .. topic:: **Feature scaling** Note that if features have very different scaling or statistical properties, :class:`cluster.FeatureAgglomeration` may not be able to - capture the links between related features. Using a + capture the links between related features. Using a :class:`preprocessing.StandardScaler` can be useful in these settings. - diff --git a/doc/preface.rst b/doc/preface.rst deleted file mode 100644 index 447083a3a8136..0000000000000 --- a/doc/preface.rst +++ /dev/null @@ -1,32 +0,0 @@ -.. This helps define the TOC ordering for "about us" sections. Particularly - useful for PDF output as this section is not linked from elsewhere. - -.. Places global toc into the sidebar - -:globalsidebartoc: True - -.. _preface_menu: - -.. include:: includes/big_toc_css.rst -.. include:: tune_toc.rst - -======================= -Welcome to scikit-learn -======================= - -| - -.. toctree:: - :maxdepth: 2 - - install - faq - support - related_projects - about - testimonials/testimonials - whats_new - roadmap - governance - -| diff --git a/doc/presentations.rst b/doc/presentations.rst index 15b02469d3a6c..25a947d180e00 100644 --- a/doc/presentations.rst +++ b/doc/presentations.rst @@ -1,15 +1,52 @@ +.. 
_external_resources: + =========================================== External Resources, Videos and Talks =========================================== -For written tutorials, see the :ref:`Tutorial section ` of -the documentation. +The scikit-learn MOOC +===================== + +If you are new to scikit-learn, or looking to strengthen your understanding, +we highly recommend the **scikit-learn MOOC (Massive Open Online Course)**. + +The MOOC, created and maintained by some of the scikit-learn core-contributors, +is **free of charge** and is designed to help learners of all levels master +machine learning using scikit-learn. It covers topics +from the fundamental machine learning concepts to more advanced areas like +predictive modeling pipelines and model evaluation. + +The course materials are available on the +`scikit-learn MOOC website `_. + +This course is also hosted on the `FUN platform +`_, +which additionally makes the content interactive without the need to install +anything, and gives access to a discussion forum. + +The videos are available on the +`Inria Learning Lab channel `_ +in a +`playlist `__. + +.. _videos: + +Videos +====== + +- The `scikit-learn YouTube channel `_ + features a + `playlist `__ + of videos + showcasing talks by maintainers + and community members. New to Scientific Python? ========================== + For those that are still new to the scientific Python ecosystem, we highly recommend the `Python Scientific Lecture Notes -`_. This will help you find your footing a +`_. This will help you find your footing a bit and will definitely improve your scikit-learn experience. A basic understanding of NumPy arrays is recommended to make the most of scikit-learn. @@ -21,58 +58,3 @@ specific subject areas: - `Machine Learning for NeuroImaging in Python `_ - `Machine Learning for Astronomical Data Analysis `_ - -.. _videos: - -Videos -====== - -- An introduction to scikit-learn `Part - I `_ and - `Part II `_ at Scipy 2013 - by `Gael Varoquaux`_, `Jake Vanderplas`_ and `Olivier Grisel`_. Notebooks on - `github `_. - -- `Introduction to scikit-learn - `_ by `Gael Varoquaux`_ at - ICML 2010 - - A three minute video from a very early stage of scikit-learn, explaining the - basic idea and approach we are following. - -- `Introduction to statistical learning with scikit-learn `_ - by `Gael Varoquaux`_ at SciPy 2011 - - An extensive tutorial, consisting of four sessions of one hour. - The tutorial covers the basics of machine learning, - many algorithms and how to apply them using scikit-learn. The - material corresponding is now in the scikit-learn documentation - section :ref:`stat_learn_tut_index`. - -- `Statistical Learning for Text Classification with scikit-learn and NLTK - `_ - (and `slides `_) - by `Olivier Grisel`_ at PyCon 2011 - - Thirty minute introduction to text classification. Explains how to - use NLTK and scikit-learn to solve real-world text classification - tasks and compares against cloud-based solutions. - -- `Introduction to Interactive Predictive Analytics in Python with scikit-learn `_ - by `Olivier Grisel`_ at PyCon 2012 - - 3-hours long introduction to prediction tasks using scikit-learn. - -- `scikit-learn - Machine Learning in Python `_ - by `Jake Vanderplas`_ at the 2012 PyData workshop at Google - - Interactive demonstration of some scikit-learn features. 75 minutes. - -- `scikit-learn tutorial `_ by `Jake Vanderplas`_ at PyData NYC 2012 - - Presentation using the online tutorial, 45 minutes. - - -.. 
_Gael Varoquaux: http://gael-varoquaux.info -.. _Jake Vanderplas: https://staff.washington.edu/jakevdp -.. _Olivier Grisel: https://twitter.com/ogrisel diff --git a/doc/related_projects.rst b/doc/related_projects.rst index acc2689388896..a7a10aef7929e 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -7,7 +7,7 @@ Related Projects Projects implementing the scikit-learn estimator API are encouraged to use the `scikit-learn-contrib template `_ which facilitates best practices for testing and documenting estimators. -The `scikit-learn-contrib GitHub organisation `_ +The `scikit-learn-contrib GitHub organization `_ also accepts high-quality contributions of repositories conforming to this template. @@ -19,17 +19,6 @@ Interoperability and framework enhancements These tools adapt scikit-learn for use with other technologies or otherwise enhance the functionality of scikit-learn's estimators. -**Data formats** - -- `Fast svmlight / libsvm file loader `_ - Fast and memory-efficient svmlight / libsvm file loader for Python. - -- `sklearn_pandas `_ bridge for - scikit-learn pipelines and pandas data frame with dedicated transformers. - -- `sklearn_xarray `_ provides - compatibility of scikit-learn estimators with xarray data structures. - **Auto-ML** - `auto-sklearn `_ @@ -45,74 +34,99 @@ enhance the functionality of scikit-learn's estimators. operators to design a machine learning pipeline, including data and feature preprocessors as well as the estimators. Works as a drop-in replacement for a scikit-learn estimator. - -- `Featuretools `_ - A framework to perform automated feature engineering. It can be used for - transforming temporal and relational datasets into feature matrices for + +- `Featuretools `_ + A framework to perform automated feature engineering. It can be used for + transforming temporal and relational datasets into feature matrices for machine learning. -- `Neuraxle `_ - A library for building neat pipelines, providing the right abstractions to - both ease research, development, and deployment of machine learning - applications. Compatible with deep learning frameworks and scikit-learn API, - it can stream minibatches, use data checkpoints, build funky pipelines, and - serialize models with custom per-step savers. +- `EvalML `_ + An AutoML library which builds, optimizes, and evaluates + machine learning pipelines using domain-specific objective functions. + It incorporates multiple modeling libraries under one API, and + the objects that EvalML creates use an sklearn-compatible API. + +- `MLJAR AutoML `_ + A Python package for AutoML on Tabular Data with Feature Engineering, + Hyper-Parameters Tuning, Explanations and Automatic Documentation. -**Experimentation frameworks** +**Experimentation and model registry frameworks** -- `REP `_ Environment for conducting data-driven - research in a consistent and reproducible way +- `MLFlow `_ An open source platform to manage the ML + lifecycle, including experimentation, reproducibility, deployment, and a central + model registry. + +- `Neptune `_ A metadata store for MLOps, + built for teams that run a lot of experiments. It gives you a single + place to log, store, display, organize, compare, and query all your + model building metadata. + +- `Sacred `_ A tool to help you configure, + organize, log and reproduce experiments - `Scikit-Learn Laboratory `_ A command-line wrapper around scikit-learn that makes it easy to run machine learning experiments with multiple learners and large feature sets. 
-**Model inspection and visualisation** +**Model inspection and visualization** -- `dtreeviz `_ A python library for +- `dtreeviz `_ A Python library for decision tree visualization and model interpretation. -- `eli5 `_ A library for - debugging/inspecting machine learning models and explaining their - predictions. +- `model-diagnostics `_ Tools for + diagnostics and assessment of (machine learning) models (in Python). -- `mlxtend `_ Includes model visualization - utilities. +- `sklearn-evaluation `_ + Machine learning model evaluation made easy: plots, tables, HTML reports, + experiment tracking and Jupyter notebook analysis. Visual analysis, model + selection, evaluation and diagnostics. - `yellowbrick `_ A suite of custom matplotlib visualizers for scikit-learn estimators to support visual feature analysis, model selection, evaluation, and diagnostics. -**Model selection** - -- `scikit-optimize `_ - A library to minimize (very) expensive and noisy black-box functions. It - implements several methods for sequential model-based optimization, and - includes a replacement for ``GridSearchCV`` or ``RandomizedSearchCV`` to do - cross-validated parameter search using any of these strategies. - -- `sklearn-deap `_ Use evolutionary - algorithms instead of gridsearch in scikit-learn. - **Model export for production** - `sklearn-onnx `_ Serialization of many Scikit-learn pipelines to `ONNX `_ for interchange and prediction. +- `skops.io `__ A + persistence model more secure than pickle, which can be used instead of + pickle in most common cases. + - `sklearn2pmml `_ Serialization of a wide variety of scikit-learn estimators and transformers into PMML with the help of `JPMML-SkLearn `_ library. -- `sklearn-porter `_ - Transpile trained scikit-learn models to C, Java, Javascript and others. - - `treelite `_ Compiles tree-based ensemble models into C code for minimizing prediction latency. +- `emlearn `_ + Implements scikit-learn estimators in C99 for embedded devices and microcontrollers. + Supports several classifier, regression and outlier detection models. + +**Model throughput** + +- `Intel(R) Extension for scikit-learn `_ + Mostly on high end Intel(R) hardware, accelerates some scikit-learn models + for both training and inference under certain circumstances. This project is + maintained by Intel(R) and scikit-learn's maintainers are not involved in the + development of this project. Also note that in some cases using the tools and + estimators under ``scikit-learn-intelex`` would give different results than + ``scikit-learn`` itself. If you encounter issues while using this project, + make sure you report potential issues in their respective repositories. + +**Interface to R with genomic applications** + +- `BiocSklearn `_ + Exposes a small number of dimension reduction facilities as an illustration + of the basilisk protocol for interfacing Python with R. Intended as a + springboard for more complete interop. + Other estimators and tasks -------------------------- @@ -122,37 +136,52 @@ project. The following are projects providing interfaces similar to scikit-learn for additional learning algorithms, infrastructures and tasks. -**Structured learning** +**Time series and forecasting** -- `tslearn `_ A machine learning library for time series - that offers tools for pre-processing and feature extraction as well as dedicated models for clustering, classification and regression. +- `aeon `_ A + scikit-learn compatible toolbox for machine learning with time series + (fork of `sktime`_). 
-- `sktime `_ A scikit-learn compatible toolbox for machine learning with time series including time series classification/regression and (supervised/panel) forecasting. +- `Darts `_ A Python library for + user-friendly forecasting and anomaly detection on time series. It contains a variety + of models, from classics such as ARIMA to deep neural networks. The forecasting + models can all be used in the same way, using fit() and predict() functions, similar + to scikit-learn. -- `HMMLearn `_ Implementation of hidden - markov models that was previously part of scikit-learn. +- `sktime `_ A scikit-learn compatible + toolbox for machine learning with time series including time series + classification/regression and (supervised/panel) forecasting. -- `PyStruct `_ General conditional random fields - and structured prediction. +- `skforecast `_ A Python library + that eases using scikit-learn regressors as multi-step forecasters. It also works + with any regressor compatible with the scikit-learn API. -- `pomegranate `_ Probabilistic modelling - for Python, with an emphasis on hidden Markov models. +- `tslearn `_ A machine learning library for + time series that offers tools for pre-processing and feature extraction as well as + dedicated models for clustering, classification and regression. -- `sklearn-crfsuite `_ - Linear-chain conditional random fields - (`CRFsuite `_ wrapper with - sklearn-like API). +**Gradient (tree) boosting** -**Deep neural networks etc.** +Note scikit-learn own modern gradient boosting estimators +:class:`~sklearn.ensemble.HistGradientBoostingClassifier` and +:class:`~sklearn.ensemble.HistGradientBoostingRegressor`. -- `nolearn `_ A number of wrappers and - abstractions around existing neural network libraries +- `XGBoost `_ XGBoost is an optimized distributed + gradient boosting library designed to be highly efficient, flexible and portable. -- `Keras `_ High-level API for - TensorFlow with a scikit-learn inspired API. +- `LightGBM `_ LightGBM is a gradient boosting + framework that uses tree based learning algorithms. It is designed to be distributed + and efficient. -- `lasagne `_ A lightweight library to - build and train neural networks in Theano. +**Structured learning** + +- `HMMLearn `_ Implementation of hidden + markov models that was previously part of scikit-learn. + +- `pomegranate `_ Probabilistic modelling + for Python, with an emphasis on hidden Markov models. + +**Deep neural networks etc.** - `skorch `_ A scikit-learn compatible neural network library that wraps PyTorch. @@ -161,47 +190,35 @@ and tasks. Keras to interface it with scikit-learn. SciKeras is the successor of `tf.keras.wrappers.scikit_learn`. +**Federated Learning** + +- `Flower `_ A friendly federated learning framework with a + unified approach that can federate any workload, any ML framework, and any programming language. + +**Privacy Preserving Machine Learning** + +- `Concrete ML `_ A privacy preserving + ML framework built on top of `Concrete + `_, with bindings to traditional ML + frameworks, thanks to fully homomorphic encryption. APIs of so-called + Concrete ML built-in models are very close to scikit-learn APIs. + **Broad scope** - `mlxtend `_ Includes a number of additional estimators as well as model visualization utilities. -- `scikit-lego `_ A number of scikit-learn compatible +- `scikit-lego `_ A number of scikit-learn compatible custom transformers, models and metrics, focusing on solving practical industry tasks. 
**Other regression and classification** -- `xgboost `_ Optimised gradient boosted decision - tree library. - -- `ML-Ensemble `_ Generalized - ensemble learning (stacking, blending, subsemble, deep ensembles, - etc.). - -- `lightning `_ Fast - state-of-the-art linear model solvers (SDCA, AdaGrad, SVRG, SAG, etc...). - -- `py-earth `_ Multivariate - adaptive regression splines - -- `Kernel Regression `_ - Implementation of Nadaraya-Watson kernel regression with automatic bandwidth - selection - - `gplearn `_ Genetic Programming for symbolic regression tasks. - `scikit-multilearn `_ Multi-label classification with focus on label space manipulation. -- `seglearn `_ Time series and sequence - learning using sliding window segmentation. - -- `libOPF `_ Optimal path forest classifier - -- `fastFM `_ Fast factorization machine - implementation compatible with scikit-learn - **Decomposition and clustering** - `lda `_: Fast implementation of latent @@ -218,16 +235,19 @@ and tasks. - `hdbscan `_ HDBSCAN and Robust Single Linkage clustering algorithms for robust variable density clustering. - -- `spherecluster `_ Spherical - K-means and mixture of von Mises Fisher clustering routines for data on the - unit hypersphere. + As of scikit-learn version 1.3.0, there is :class:`~sklearn.cluster.HDBSCAN`. **Pre-processing** - `categorical-encoding `_ A library of sklearn compatible categorical variable encoders. + As of scikit-learn version 1.3.0, there is + :class:`~sklearn.preprocessing.TargetEncoder`. + +- `skrub `_ : facilitate learning on dataframes, + with sklearn compatible encoders (of categories, dates, strings) and + more. - `imbalanced-learn `_ Various @@ -261,15 +281,16 @@ Other packages useful for data analysis and machine learning. statistical models. More focused on statistical tests and less on prediction than scikit-learn. -- `PyMC `_ Bayesian statistical models and +- `PyMC `_ Bayesian statistical models and fitting algorithms. -- `Sacred `_ Tool to help you configure, - organize, log and reproduce experiments - -- `Seaborn `_ Visualization library based on +- `Seaborn `_ A visualization library based on matplotlib. It provides a high-level interface for drawing attractive statistical graphics. +- `scikit-survival `_ A library implementing + models to learn from censored time-to-event data (also called survival analysis). + Models are fully compatible with scikit-learn. + Recommendation Engine packages ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -279,20 +300,16 @@ Recommendation Engine packages - `lightfm `_ A Python/Cython implementation of a hybrid recommender system. -- `OpenRec `_ TensorFlow-based - neural-network inspired recommendation algorithms. - -- `Spotlight `_ Pytorch-based - implementation of deep recommender models. - -- `Surprise Lib `_ Library for explicit feedback +- `Surprise Lib `_ Library for explicit feedback datasets. Domain specific packages ~~~~~~~~~~~~~~~~~~~~~~~~ +- `scikit-network `_ Machine learning on graphs. + - `scikit-image `_ Image processing and computer - vision in python. + vision in Python. - `Natural language toolkit (nltk) `_ Natural language processing and some machine learning. @@ -304,13 +321,10 @@ Domain specific packages - `AstroML `_ Machine learning for astronomy. -- `MSMBuilder `_ Machine learning for protein - conformational dynamics time series. 
- Translations of scikit-learn documentation ------------------------------------------ -Translation’s purpose is to ease reading and understanding in languages +Translation's purpose is to ease reading and understanding in languages other than English. Its aim is to help people who do not understand English or have doubts about its interpretation. Additionally, some people prefer to read documentation in their native language, but please bear in mind that @@ -327,9 +341,13 @@ and promote community efforts. (`source `__) - `Persian translation `_ (`source `__) +- `Spanish translation `_ + (`source `__) +- `Korean translation `_ + (`source `__) + .. rubric:: Footnotes .. [#f1] following `linux documentation Disclaimer `__ - diff --git a/doc/roadmap.rst b/doc/roadmap.rst index 30c9f58339502..a9e3e73d01deb 100644 --- a/doc/roadmap.rst +++ b/doc/roadmap.rst @@ -1,5 +1,3 @@ -īģŋ.. _roadmap: - .. |ss| raw:: html @@ -8,12 +6,14 @@ +.. _roadmap: + Roadmap ======= Purpose of this document ------------------------ -This document list general directions that core contributors are interested +This document lists general directions that core contributors are interested to see developed in scikit-learn. The fact that an item is listed here is in no way a promise that it will happen, as resources are limited. Rather, it is an indication that help is welcomed on this topic. @@ -51,7 +51,7 @@ external to the core library. (i.e. rectangular data largely invariant to column and row order; predicting targets with simple structure) * improve the ease for users to develop and publish external components -* improve inter-operability with modern data science tools (e.g. Pandas, Dask) +* improve interoperability with modern data science tools (e.g. Pandas, Dask) and infrastructures (e.g. distributed processing) Many of the more fine-grained goals can be found under the `API tag @@ -69,29 +69,17 @@ the document up to date as we work on these issues. #. Improved handling of Pandas DataFrames * document current handling - * column reordering issue :issue:`7242` - * avoiding unnecessary conversion to ndarray :issue:`12147` - * returning DataFrames from transformers :issue:`5523` - * getting DataFrames from dataset loaders :issue:`10733`, - |ss| :issue:`13902` |se| - * Sparse currently not considered :issue:`12800` #. Improved handling of categorical features * Tree-based models should be able to handle both continuous and categorical - features :issue:`12866` and :issue:`15550`. - * |ss| In dataset loaders :issue:`13902` |se| - * As generic transformers to be used with ColumnTransforms (e.g. ordinal - encoding supervised by correlation with target variable) :issue:`5853`, - :issue:`11805` + features :issue:`29437`. * Handling mixtures of categorical and continuous variables #. Improved handling of missing data - * Making sure meta-estimators are lenient towards missing data, - :issue:`15319` - * Non-trivial imputers |ss| :issue:`11977`, :issue:`12852` |se| - * Learners directly handling missing data |ss| :issue:`13911` |se| + * Making sure meta-estimators are lenient towards missing data by implementing + a common test. * An amputation sample generator to make parts of a dataset go missing :issue:`6284` @@ -101,16 +89,8 @@ the document up to date as we work on these issues. documentation is crowded which makes it hard for beginners to get the big picture. Some work could be done in prioritizing the information. -#. 
Passing around information that is not (X, y): Sample properties - - * We need to be able to pass sample weights to scorers in cross validation. - * We should have standard/generalised ways of passing sample-wise properties - around in meta-estimators. :issue:`4497` :issue:`7646` - #. Passing around information that is not (X, y): Feature properties - * Feature names or descriptions should ideally be available to fit for, e.g. - . :issue:`6425` :issue:`6424` * Per-feature handling (e.g. "is this a nominal / ordinal / English language text?") should also not need to be provided to estimator constructors, ideally, but should be available as metadata alongside X. :issue:`8480` @@ -124,27 +104,21 @@ the document up to date as we work on these issues. #. Make it easier for external users to write Scikit-learn-compatible components - * More flexible estimator checks that do not select by estimator name - :issue:`6599` :issue:`6715` - * Example of how to develop an estimator or a meta-estimator, :issue:`14582` * More self-sufficient running of scikit-learn-contrib or a similar resource #. Support resampling and sample reduction * Allow subsampling of majority classes (in a pipeline?) :issue:`3855` - * Implement random forests with resampling :issue:`8732` #. Better interfaces for interactive development - * |ss| __repr__ |se| and HTML visualisations of estimators - |ss| :issue:`6323` |se| and :pr:`14180`. - * Include plotting tools, not just as examples. :issue:`9173` + * Improve the HTML visualisations of estimators via the `estimator_html_repr`. + * Include more plotting tools, not just as examples. #. Improved tools for model diagnostics and basic inference - * |ss| alternative feature importances implementations, :issue:`13146` |se| + * work on a unified interface for "feature importance" * better ways to handle validation sets when fitting - * better ways to find thresholds / create decision rules :issue:`8614` #. Better tools for selecting hyperparameters with transductive estimators @@ -175,11 +149,6 @@ the document up to date as we work on these issues. learning is on smaller data than ETL, hence we can maybe adapt to very large scale while supporting only a fraction of the patterns. -#. Support for working with pre-trained models - - * Estimator "freezing". In particular, right now it's impossible to clone a - `CalibratedClassifierCV` with prefit. :issue:`8370`. :issue:`6451` - #. Backwards-compatible de/serialization of some estimators * Currently serialization (with pickle) breaks across versions. While we may @@ -201,15 +170,15 @@ the document up to date as we work on these issues. versions: * Try to load the old pickle, if it works, use the validation set - prediction snapshot to detect that the serialized model still behave + prediction snapshot to detect that the serialized model still behaves the same; - * If joblib.load / pickle.load not work, use the versioned control + * If joblib.load / pickle.load does not work, use the versioned control training script + historical training set to retrain the model and use the validation set prediction snapshot to assert that it is possible to recover the previous predictive performance: if this is not the case there is probably a bug in scikit-learn that needs to be reported. -#. Everything in Scikit-learn should probably conform to our API contract. +#. Everything in scikit-learn should probably conform to our API contract. We are still in the process of making decisions on some of these related issues. 
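The de/serialization check sketched a few items above could, in code, look
roughly like the following; the file names and the ``retrain_model`` stub are
hypothetical placeholders, not an agreed-upon scikit-learn API::

    import joblib
    import numpy as np

    def retrain_model():
        # Hypothetical stand-in for re-running the versioned training script
        # on the historical training set.
        raise NotImplementedError

    X_val = np.load("validation_set.npy")          # frozen validation set
    expected = np.load("prediction_snapshot.npy")  # predictions saved at training time

    try:
        model = joblib.load("model.joblib")
    except Exception:
        # The old pickle no longer loads under the new version: fall back to
        # retraining, then re-check against the same snapshot.
        model = retrain_model()

    if not np.allclose(model.predict(X_val), expected):
        raise AssertionError(
            "Predictions drifted from the recorded snapshot; this may point "
            "to a regression that should be reported."
        )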
@@ -229,43 +198,3 @@ the document up to date as we work on these issues. * Document good practices to detect temporal distribution drift for deployed model and good practices for re-training on fresh data without causing catastrophic predictive performance regressions. - - -Subpackage-specific goals -------------------------- - -:mod:`sklearn.ensemble` - -* |ss| a stacking implementation, :issue:`11047` |se| - -:mod:`sklearn.cluster` - -* kmeans variants for non-Euclidean distances, if we can show these have - benefits beyond hierarchical clustering. - -:mod:`sklearn.model_selection` - -* |ss| multi-metric scoring is slow :issue:`9326` |se| -* perhaps we want to be able to get back more than multiple metrics -* the handling of random states in CV splitters is a poor design and - contradicts the validation of similar parameters in estimators, - :issue:`15177` -* exploit warm-starting and path algorithms so the benefits of `EstimatorCV` - objects can be accessed via `GridSearchCV` and used in Pipelines. - :issue:`1626` -* Cross-validation should be able to be replaced by OOB estimates whenever a - cross-validation iterator is used. -* Redundant computations in pipelines should be avoided (related to point - above) cf `daskml - `_ - -:mod:`sklearn.neighbors` - -* |ss| Ability to substitute a custom/approximate/precomputed nearest neighbors - implementation for ours in all/most contexts that nearest neighbors are used - for learning. :issue:`10463` |se| - -:mod:`sklearn.pipeline` - -* Performance issues with `Pipeline.memory` -* see "Everything in Scikit-learn should conform to our API contract" above diff --git a/doc/scss/api-search.scss b/doc/scss/api-search.scss new file mode 100644 index 0000000000000..51cf15f92c1cb --- /dev/null +++ b/doc/scss/api-search.scss @@ -0,0 +1,111 @@ +/** + * This is the styling for the API index page (`api/index`), in particular for the API + * search table. 
It involves overriding the style sheet of DataTables which does not + * fit well into the theme, especially in dark theme; see https://datatables.net/ + */ + +.dt-container { + margin-bottom: 2rem; + + // Fix the selection box for entries per page + select.dt-input { + padding: 0 !important; + margin-right: 0.4rem !important; + + > option { + color: var(--pst-color-text-base); + background-color: var(--pst-color-background); + } + } + + // Fix the search box + input.dt-input { + width: 50%; + line-height: normal; + padding: 0.1rem 0.3rem !important; + margin-left: 0.4rem !important; + } + + table.dataTable { + th { + // Avoid table header being too tall + p { + margin-bottom: 0; + } + + // Fix the ascending/descending order buttons in the header + span.dt-column-order { + &::before, + &::after { + color: var(--pst-color-text-base); + line-height: 0.7rem !important; + } + } + } + + td { + // Fix color of text warning no records found + &.dt-empty { + color: var(--pst-color-text-base) !important; + } + } + + // Unset bottom border of the last row + tr:last-child > * { + border-bottom: unset !important; + } + } + + div.dt-paging button.dt-paging-button { + padding: 0 0.5rem; + + &.disabled { + color: var(--pst-color-border) !important; + + // Overwrite the !important color assigned by DataTables because we must keep + // the color of disabled buttons consistent with and without hovering + &:hover { + color: var(--pst-color-border) !important; + } + } + + // Fix colors of paging buttons + &.current, + &:not(.disabled):not(.current):hover { + color: var(--pst-color-on-surface) !important; + border-color: var(--pst-color-surface) !important; + background: var(--pst-color-surface) !important; + } + + // Highlight the border of the current selected paging button + &.current { + border-color: var(--pst-color-text-base) !important; + } + } +} + +// Styling the object description cells in the table +div.sk-apisearch-desc { + p { + margin-bottom: 0; + } + + div.caption > p { + a, + code { + color: var(--pst-color-text-muted); + } + + code { + padding: 0; + font-size: 0.7rem; + font-weight: var(--pst-font-weight-caption); + background-color: transparent; + } + + .sd-badge { + font-size: 0.7rem; + margin-left: 0.3rem; + } + } +} diff --git a/doc/scss/api.scss b/doc/scss/api.scss new file mode 100644 index 0000000000000..d7110def4ac09 --- /dev/null +++ b/doc/scss/api.scss @@ -0,0 +1,52 @@ +/** + * This is the styling for API reference pages, currently under `modules/generated`. + * Note that it should be applied *ONLY* to API reference pages, as the selectors are + * designed based on how `autodoc` and `autosummary` generate the stuff. + */ + +// Make the admonitions more compact +div.versionadded, +div.versionchanged, +div.deprecated { + margin: 1rem auto; + + > p { + margin: 0.3rem auto; + } +} + +// Make docstrings more compact +dd { + p:not(table *) { + margin-bottom: 0.5rem !important; + } + + ul { + margin-bottom: 0.5rem !important; + padding-left: 2rem !important; + } +} + +// The first method is too close the the docstring above +dl.py.method:first-of-type { + margin-top: 2rem; +} + +// https://github.com/pydata/pydata-sphinx-theme/blob/8cf45f835bfdafc5f3821014a18f3b7e0fc2d44b/src/pydata_sphinx_theme/assets/styles/content/_api.scss +dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) { + margin-bottom: 1.5rem; + + dd { + margin-left: 1.2rem; + } + + // "Parameters", "Returns", etc. 
in the docstring + dt.field-odd, + dt.field-even { + margin: 0.5rem 0; + + + dd > dl { + margin-bottom: 0.5rem; + } + } +} diff --git a/doc/scss/colors.scss b/doc/scss/colors.scss new file mode 100644 index 0000000000000..bbc6aa6c2a3d6 --- /dev/null +++ b/doc/scss/colors.scss @@ -0,0 +1,51 @@ +/** + * This is the style sheet for customized colors of scikit-learn. + * Tints and shades are generated by https://colorkit.co/color-shades-generator/ + * + * This file is compiled into styles/colors.css by sphinxcontrib.sass, see: + * https://sass-lang.com/guide/ + */ + +:root { + /* scikit-learn cyan */ + --sk-cyan-tint-9: #edf7fd; + --sk-cyan-tint-8: #daeffa; + --sk-cyan-tint-7: #c8e6f8; + --sk-cyan-tint-6: #b5def5; + --sk-cyan-tint-5: #a2d6f2; + --sk-cyan-tint-4: #8fcdef; + --sk-cyan-tint-3: #7ac5ec; + --sk-cyan-tint-2: #64bce9; + --sk-cyan-tint-1: #4bb4e5; + --sk-cyan: #29abe2; + --sk-cyan-shades-1: #2294c4; + --sk-cyan-shades-2: #1c7ea8; + --sk-cyan-shades-3: #15688c; + --sk-cyan-shades-4: #0f5471; + --sk-cyan-shades-5: #094057; + --sk-cyan-shades-6: #052d3e; + --sk-cyan-shades-7: #021b27; + --sk-cyan-shades-8: #010b12; + --sk-cyan-shades-9: #000103; + + /* scikit-learn orange */ + --sk-orange-tint-9: #fff5ec; + --sk-orange-tint-8: #ffead9; + --sk-orange-tint-7: #ffe0c5; + --sk-orange-tint-6: #ffd5b2; + --sk-orange-tint-5: #fecb9e; + --sk-orange-tint-4: #fdc08a; + --sk-orange-tint-3: #fcb575; + --sk-orange-tint-2: #fbaa5e; + --sk-orange-tint-1: #f99f44; + --sk-orange: #f7931e; + --sk-orange-shades-1: #d77f19; + --sk-orange-shades-2: #b76c13; + --sk-orange-shades-3: #99590e; + --sk-orange-shades-4: #7c4709; + --sk-orange-shades-5: #603605; + --sk-orange-shades-6: #452503; + --sk-orange-shades-7: #2c1601; + --sk-orange-shades-8: #150800; + --sk-orange-shades-9: #030100; +} diff --git a/doc/scss/custom.scss b/doc/scss/custom.scss new file mode 100644 index 0000000000000..cac81b03e7ce2 --- /dev/null +++ b/doc/scss/custom.scss @@ -0,0 +1,253 @@ +/** + * This is a general styling sheet. + * It should be used for customizations that affect multiple pages. 
+ * + * This file is compiled into styles/custom.css by sphinxcontrib.sass, see: + * https://sass-lang.com/guide/ + */ + +/* Global */ + +code.literal { + border: 0; +} + +/* Version switcher */ + +.version-switcher__menu.dropdown-menu { + // The version switcher is aligned right so we need to avoid the dropdown menu + // to be cut off by the right boundary + left: unset; + right: 0; + + a.list-group-item.sk-avail-docs-link { + display: flex; + align-items: center; + + &:after { + content: var(--pst-icon-external-link); + font: var(--fa-font-solid); + font-size: 0.75rem; + margin-left: 0.5rem; + } + } +} + +/* Primary sidebar */ + +.bd-sidebar-primary { + width: 22.5%; + min-width: 16rem; + + // The version switcher button in the sidebar is ill-styled + button.version-switcher__button { + margin-bottom: unset; + margin-left: 0.3rem; + font-size: 1rem; + } + + // The section navigation part is to close to the right boundary (originally an even + // larger negative right margin was used) + nav.bd-links { + margin-right: -0.5rem; + } +} + +/* Article content */ + +.bd-article { + h1 { + font-weight: 500; + margin-bottom: 2rem; + } + + h2 { + font-weight: 500; + margin-bottom: 1.5rem; + } + + // Avoid changing the aspect ratio of images; add some padding so that at least + // there is some space between image and background in dark mode + img { + height: unset !important; + padding: 1%; + } + + // Resize table of contents to make the top few levels of headings more visible + li.toctree-l1 { + padding-bottom: 0.5em; + + > a { + font-size: 150%; + font-weight: bold; + } + } + + li.toctree-l2, + li.toctree-l3, + li.toctree-l4 { + margin-left: 15px; + } +} + +/* Dropdowns (sphinx-design) */ + +details.sd-dropdown { + &:hover > summary.sd-summary-title { + > .sd-summary-text > a.headerlink { + visibility: visible; + } + + > .sk-toggle-all { + opacity: 1; + } + } + + > summary.sd-summary-title { + > .sd-summary-text > a.headerlink { + font-size: 1rem; + } + + // See `js/scripts/dropdown.js`: this is styling the "expand/collapse all" button + > .sk-toggle-all { + color: var(--pst-sd-dropdown-color); + margin-right: 0.5rem; + pointer-events: auto !important; + opacity: 0; + } + } +} + +/* Tabs (sphinx-design) */ + +.sd-tab-set { + --tab-caption-width: 0%; // No tab caption by default + margin-top: 1.5rem; + + &::before { + // Set `content` for tab caption + width: var(--tab-caption-width); + display: flex; + align-items: center; + font-weight: bold; + } + + .sd-tab-content { + padding: 0.5rem 0 0 0 !important; + background-color: transparent !important; + border: none !important; + + > p:first-child { + margin-top: 1rem !important; + } + } + + > label.sd-tab-label { + margin: 0 3px; + display: flex; + align-items: center; + justify-content: center; + border-radius: 5px !important; + + &.tab-6 { + width: calc((100% - var(--tab-caption-width)) / 2 - 6px) !important; + } + + &.tab-4 { + width: calc((100% - var(--tab-caption-width)) / 3 - 6px) !important; + } + } + + > input:checked + label.sd-tab-label { + transform: unset; + border: 2px solid var(--pst-color-primary); + } +} + +/* Download/launcher links and top hint (sphinx-gallery) */ + +// https://sphinx-gallery.github.io/stable/advanced.html#using-sphinx-gallery-sidebar-components +.sphx-glr-download-link-note, +.binder-badge, +.lite-badge, +.sphx-glr-download-jupyter, +.sphx-glr-download-python, +.sphx-glr-download-zip { + display: none; +} + +/* scikit-learn buttons */ + +a.btn { + &.sk-btn-orange { + background-color: var(--sk-orange-tint-1); + 
color: black !important; + + &:hover { + background-color: var(--sk-orange-tint-3); + } + } + + &.sk-btn-cyan { + background-color: var(--sk-cyan-shades-2); + color: white !important; + + &:hover { + background-color: var(--sk-cyan-shades-1); + } + } +} + +/* scikit-learn avatar grid, see build_tools/generate_authors_table.py */ + +div.sk-authors-container { + display: flex; + flex-wrap: wrap; + justify-content: center; + + > div { + width: 6rem; + margin: 0.5rem; + font-size: 0.9rem; + } +} + +/* scikit-learn text-image grid, used in testimonials and sponsors pages */ + +@mixin sk-text-image-grid($img-max-height) { + display: flex; + align-items: center; + flex-wrap: wrap; + + div.text-box, + div.image-box { + width: 50%; + + @media screen and (max-width: 500px) { + width: 100%; + } + } + + div.text-box .annotation { + font-size: 0.9rem; + font-style: italic; + color: var(--pst-color-text-muted); + } + + div.image-box { + text-align: center; + + img { + max-height: $img-max-height; + max-width: 50%; + } + } +} + +div.sk-text-image-grid-small { + @include sk-text-image-grid(60px); +} + +div.sk-text-image-grid-large { + @include sk-text-image-grid(100px); +} diff --git a/doc/scss/index.scss b/doc/scss/index.scss new file mode 100644 index 0000000000000..c3bb8e86b41c6 --- /dev/null +++ b/doc/scss/index.scss @@ -0,0 +1,176 @@ +/** + * Styling sheet for the scikit-learn landing page. This should be loaded only for the + * landing page. + * + * This file is compiled into styles/index.css by sphinxcontrib.sass, see: + * https://sass-lang.com/guide/ + */ + +/* Theme-aware colors for the landing page */ + +html { + &[data-theme="light"] { + --sk-landing-bg-1: var(--sk-cyan-shades-3); + --sk-landing-bg-2: var(--sk-cyan); + --sk-landing-bg-3: var(--sk-orange-tint-8); + --sk-landing-bg-4: var(--sk-orange-tint-3); + } + + &[data-theme="dark"] { + --sk-landing-bg-1: var(--sk-cyan-shades-5); + --sk-landing-bg-2: var(--sk-cyan-shades-2); + --sk-landing-bg-3: var(--sk-orange-tint-4); + --sk-landing-bg-4: var(--sk-orange-tint-1); + } +} + +/* General */ + +div.sk-landing-container { + max-width: 1400px; +} + +/* Top bar */ + +div.sk-landing-top-bar { + background-image: linear-gradient( + 160deg, + var(--sk-landing-bg-1) 0%, + var(--sk-landing-bg-2) 17%, + var(--sk-landing-bg-3) 59%, + var(--sk-landing-bg-4) 100% + ); + + .sk-landing-header, + .sk-landing-subheader { + color: white; + text-shadow: 0px 0px 8px var(--sk-landing-bg-1); + } + + .sk-landing-header { + font-size: 3.2rem; + margin-bottom: 0.5rem; + } + + .sk-landing-subheader { + letter-spacing: 0.17rem; + margin-top: 0; + font-weight: 500; + } + + a.sk-btn-orange { + font-size: 1.1rem; + font-weight: 500; + } + + ul.sk-landing-header-body { + margin-top: auto; + margin-bottom: auto; + font-size: 1.2rem; + font-weight: 500; + color: black; + } +} + +/* Body */ + +div.sk-landing-body { + div.card { + background-color: var(--pst-color-background); + border-color: var(--pst-color-border); + } + + .sk-px-xl-4 { + @media screen and (min-width: 1200px) { + padding-left: 1.3rem !important; + padding-right: 1.3rem !important; + } + } + + .card-body { + p { + margin-bottom: 0.8rem; + color: var(--pst-color-text-base); + } + + .sk-card-title { + font-weight: 700; + margin: 0 0 1rem 0; + } + } + + .sk-card-img-container { + display: flex; + justify-content: center; + align-items: end; + margin-bottom: 1rem; + + img { + max-width: unset; + height: 15rem; + } + } +} + +/* More info */ + +div.sk-landing-more-info { + font-size: 0.96rem; + background-color: 
var(--pst-color-surface); + + .sk-landing-call-header { + font-weight: 700; + margin-top: 0; + + html[data-theme="light"] & { + color: var(--sk-orange-shades-1); + } + + html[data-theme="dark"] & { + color: var(--sk-orange); + } + } + + ul.sk-landing-call-list > li { + margin-bottom: 0.25rem; + } + + .sk-who-uses-carousel { + min-height: 200px; + + .carousel-item img { + max-height: 100px; + max-width: 50%; + margin: 0.5rem; + } + } + + .sk-more-testimonials { + text-align: right !important; + } +} + +/* Footer */ + +div.sk-landing-footer { + a.sk-footer-funding-link { + text-decoration: none; + + p.sk-footer-funding-text { + color: var(--pst-color-link); + + &:hover { + color: var(--pst-color-secondary); + } + } + + div.sk-footer-funding-logos > img { + max-height: 40px; + max-width: 85px; + margin: 0 8px 8px 8px; + padding: 5px; + border-radius: 3px; + background-color: white; + } + } +} diff --git a/doc/sphinxext/add_toctree_functions.py b/doc/sphinxext/add_toctree_functions.py deleted file mode 100644 index 7cd0e7a29bb28..0000000000000 --- a/doc/sphinxext/add_toctree_functions.py +++ /dev/null @@ -1,154 +0,0 @@ -"""Inspired by https://github.com/pandas-dev/pydata-sphinx-theme - -BSD 3-Clause License - -Copyright (c) 2018, pandas -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -* Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -""" - -import docutils - - -def add_toctree_functions(app, pagename, templatename, context, doctree): - """Add functions so Jinja templates can add toctree objects. - - This converts the docutils nodes into a nested dictionary that Jinja can - use in our templating. - """ - from sphinx.environment.adapters.toctree import TocTree - - def get_nav_object(maxdepth=None, collapse=True, numbered=False, **kwargs): - """Return a list of nav links that can be accessed from Jinja. - - Parameters - ---------- - maxdepth: int - How many layers of TocTree will be returned - collapse: bool - Whether to only include sub-pages of the currently-active page, - instead of sub-pages of all top-level pages of the site. 
- numbered: bool - Whether to add section number to title - kwargs: key/val pairs - Passed to the `TocTree.get_toctree_for` Sphinx method - """ - # The TocTree will contain the full site TocTree including sub-pages. - # "collapse=True" collapses sub-pages of non-active TOC pages. - # maxdepth controls how many TOC levels are returned - toctree = TocTree(app.env).get_toctree_for( - pagename, app.builder, collapse=collapse, maxdepth=maxdepth, - **kwargs) - # If no toctree is defined (AKA a single-page site), skip this - if toctree is None: - return [] - - # toctree has this structure - # - # - # - # - # `list_item`s are the actual TOC links and are the only thing we want - toc_items = [item for child in toctree.children for item in child - if isinstance(item, docutils.nodes.list_item)] - - # Now convert our docutils nodes into dicts that Jinja can use - nav = [docutils_node_to_jinja(child, only_pages=True, - numbered=numbered) - for child in toc_items] - - return nav - - context["get_nav_object"] = get_nav_object - - -def docutils_node_to_jinja(list_item, only_pages=False, numbered=False): - """Convert a docutils node to a structure that can be read by Jinja. - - Parameters - ---------- - list_item : docutils list_item node - A parent item, potentially with children, corresponding to the level - of a TocTree. - only_pages : bool - Only include items for full pages in the output dictionary. Exclude - anchor links (TOC items with a URL that starts with #) - numbered: bool - Whether to add section number to title - - Returns - ------- - nav : dict - The TocTree, converted into a dictionary with key/values that work - within Jinja. - """ - if not list_item.children: - return None - - # We assume this structure of a list item: - # - # - # <-- the thing we want - reference = list_item.children[0].children[0] - title = reference.astext() - url = reference.attributes["refuri"] - active = "current" in list_item.attributes["classes"] - - secnumber = reference.attributes.get("secnumber", None) - if numbered and secnumber is not None: - secnumber = ".".join(str(n) for n in secnumber) - title = f"{secnumber}. {title}" - - # If we've got an anchor link, skip it if we wish - if only_pages and '#' in url: - return None - - # Converting the docutils attributes into jinja-friendly objects - nav = {} - nav["title"] = title - nav["url"] = url - nav["active"] = active - - # Recursively convert children as well - # If there are sub-pages for this list_item, there should be two children: - # a paragraph, and a bullet_list. - nav["children"] = [] - if len(list_item.children) > 1: - # The `.children` of the bullet_list has the nodes of the sub-pages. 
- subpage_list = list_item.children[1].children - for sub_page in subpage_list: - child_nav = docutils_node_to_jinja(sub_page, only_pages=only_pages, - numbered=numbered) - if child_nav is not None: - nav["children"].append(child_nav) - return nav - - -def setup(app): - app.connect("html-page-context", add_toctree_functions) - - return {'parallel_read_safe': True, 'parallel_write_safe': True} diff --git a/doc/sphinxext/allow_nan_estimators.py b/doc/sphinxext/allow_nan_estimators.py new file mode 100644 index 0000000000000..3b85ce6c87508 --- /dev/null +++ b/doc/sphinxext/allow_nan_estimators.py @@ -0,0 +1,58 @@ +from contextlib import suppress + +from docutils import nodes +from docutils.parsers.rst import Directive + +from sklearn.utils import all_estimators +from sklearn.utils._test_common.instance_generator import _construct_instances +from sklearn.utils._testing import SkipTest + + +class AllowNanEstimators(Directive): + @staticmethod + def make_paragraph_for_estimator_type(estimator_type): + intro = nodes.list_item() + intro += nodes.strong(text="Estimators that allow NaN values for type ") + intro += nodes.literal(text=f"{estimator_type}") + intro += nodes.strong(text=":\n") + exists = False + lst = nodes.bullet_list() + for name, est_class in all_estimators(type_filter=estimator_type): + with suppress(SkipTest): + # Here we generate the text only for one instance. This directive + # should not be used for meta-estimators where tags depend on the + # sub-estimator. + est = next(_construct_instances(est_class)) + + if est.__sklearn_tags__().input_tags.allow_nan: + module_name = ".".join(est_class.__module__.split(".")[:2]) + class_title = f"{est_class.__name__}" + class_url = f"./generated/{module_name}.{class_title}.html" + item = nodes.list_item() + para = nodes.paragraph() + para += nodes.reference( + class_title, text=class_title, internal=False, refuri=class_url + ) + exists = True + item += para + lst += item + intro += lst + return [intro] if exists else None + + def run(self): + lst = nodes.bullet_list() + for i in ["cluster", "regressor", "classifier", "transformer"]: + item = self.make_paragraph_for_estimator_type(i) + if item is not None: + lst += item + return [lst] + + +def setup(app): + app.add_directive("allow_nan_estimators", AllowNanEstimators) + + return { + "version": "0.1", + "parallel_read_safe": True, + "parallel_write_safe": True, + } diff --git a/doc/sphinxext/autoshortsummary.py b/doc/sphinxext/autoshortsummary.py new file mode 100644 index 0000000000000..8451f3133d05b --- /dev/null +++ b/doc/sphinxext/autoshortsummary.py @@ -0,0 +1,53 @@ +from sphinx.ext.autodoc import ModuleLevelDocumenter + + +class ShortSummaryDocumenter(ModuleLevelDocumenter): + """An autodocumenter that only renders the short summary of the object.""" + + # Defines the usage: .. 
autoshortsummary:: {{ object }} + objtype = "shortsummary" + + # Disable content indentation + content_indent = "" + + # Avoid being selected as the default documenter for some objects, because we are + # returning `can_document_member` as True for all objects + priority = -99 + + @classmethod + def can_document_member(cls, member, membername, isattr, parent): + """Allow documenting any object.""" + return True + + def get_object_members(self, want_all): + """Document no members.""" + return (False, []) + + def add_directive_header(self, sig): + """Override default behavior to add no directive header or options.""" + pass + + def add_content(self, more_content): + """Override default behavior to add only the first line of the docstring. + + Modified based on the part of processing docstrings in the original + implementation of this method. + + https://github.com/sphinx-doc/sphinx/blob/faa33a53a389f6f8bc1f6ae97d6015fa92393c4a/sphinx/ext/autodoc/__init__.py#L609-L622 + """ + sourcename = self.get_sourcename() + docstrings = self.get_doc() + + if docstrings is not None: + if not docstrings: + docstrings.append([]) + # Get the first non-empty line of the processed docstring; this could lead + # to unexpected results if the object does not have a short summary line. + short_summary = next( + (s for s in self.process_doc(docstrings) if s), "" + ) + self.add_line(short_summary, sourcename, 0) + + +def setup(app): + app.add_autodocumenter(ShortSummaryDocumenter) diff --git a/doc/sphinxext/custom_references_resolver.py b/doc/sphinxext/custom_references_resolver.py deleted file mode 100644 index 2fd32b7da785e..0000000000000 --- a/doc/sphinxext/custom_references_resolver.py +++ /dev/null @@ -1,122 +0,0 @@ -"""Adapted from -sphinx.transforms.post_transforms.ReferencesResolver.resolve_anyref - -If 'py' is one of the domains and `py:class` is defined, -the Python domain will be processed before the 'std' domain. - -License for Sphinx -================== - -Copyright (c) 2007-2019 by the Sphinx team (see AUTHORS file). -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -* Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-""" -from contextlib import suppress - -from docutils import nodes -from sphinx.transforms.post_transforms import ReferencesResolver - - -class CustomReferencesResolver(ReferencesResolver): - def resolve_anyref(self, refdoc, node, contnode): - """Resolve reference generated by the "any" role.""" - stddomain = self.env.get_domain('std') - target = node['reftarget'] - - # process 'py' domain first for python classes - if "py:class" in node: - with suppress(KeyError): - py_domain = self.env.domains['py'] - py_ref = py_domain.resolve_any_xref( - self.env, refdoc, self.app.builder, target, node, contnode) - if py_ref: - return self.create_node(py_ref[0]) - - # resolve :term: - term_ref = stddomain.resolve_xref(self.env, refdoc, self.app.builder, - 'term', target, node, contnode) - if term_ref: - # replace literal nodes with inline nodes - if not isinstance(term_ref[0], nodes.inline): - inline_node = nodes.inline(rawsource=term_ref[0].rawsource, - classes=term_ref[0].get('classes')) - if term_ref[0]: - inline_node.append(term_ref[0][0]) - term_ref[0] = inline_node - return self.create_node(("std:term", term_ref)) - - # next, do the standard domain - std_ref = stddomain.resolve_any_xref( - self.env, refdoc, self.app.builder, target, node, contnode) - if std_ref: - return self.create_node(std_ref[0]) - - for domain in self.env.domains.values(): - try: - ref = domain.resolve_any_xref( - self.env, refdoc, self.app.builder, target, node, contnode) - if ref: - return self.create_node(ref[0]) - except NotImplementedError: - # the domain doesn't yet support the new interface - # we have to manually collect possible references (SLOW) - for role in domain.roles: - res = domain.resolve_xref(self.env, refdoc, - self.app.builder, role, target, - node, contnode) - if res and isinstance(res[0], nodes.Element): - result = ('%s:%s' % (domain.name, role), res) - return self.create_node(result) - - # no results considered to be - contnode['classes'] = [] - return contnode - - def create_node(self, result): - res_role, newnode = result - # Override "any" class with the actual role type to get the styling - # approximately correct. - res_domain = res_role.split(':')[0] - if (len(newnode) > 0 and isinstance(newnode[0], nodes.Element) - and newnode[0].get('classes')): - newnode[0]['classes'].append(res_domain) - newnode[0]['classes'].append(res_role.replace(':', '-')) - return newnode - - -def setup(app): - if (hasattr(app.registry, "get_post_transforms") - and callable(app.registry.get_post_transforms)): - post_transforms = app.registry.get_post_transforms() - else: - # Support sphinx 1.6.* - post_transforms = app.post_transforms - - for i, transform_class in enumerate(post_transforms): - if transform_class == ReferencesResolver: - post_transforms[i] = CustomReferencesResolver - break - else: - raise RuntimeError("ReferencesResolver not found") diff --git a/doc/sphinxext/doi_role.py b/doc/sphinxext/doi_role.py new file mode 100644 index 0000000000000..9f117b07fa6a3 --- /dev/null +++ b/doc/sphinxext/doi_role.py @@ -0,0 +1,47 @@ +""" +doilinks +~~~~~~~~ +Extension to add links to DOIs. With this extension you can use e.g. +:doi:`10.1016/S0022-2836(05)80360-2` in your documents. This will +create a link to a DOI resolver +(``https://doi.org/10.1016/S0022-2836(05)80360-2``). +The link caption will be the raw DOI. +You can also give an explicit caption, e.g. +:doi:`Basic local alignment search tool <10.1016/S0022-2836(05)80360-2>`. + +:copyright: Copyright 2015 Jon Lund Steffensen. Based on extlinks by + the Sphinx team. 
+:license: BSD. +""" + +from docutils import nodes, utils +from sphinx.util.nodes import split_explicit_title + + +def reference_role(typ, rawtext, text, lineno, inliner, options={}, content=[]): + text = utils.unescape(text) + has_explicit_title, title, part = split_explicit_title(text) + if typ in ["arXiv", "arxiv"]: + full_url = "https://arxiv.org/abs/" + part + if not has_explicit_title: + title = "arXiv:" + part + pnode = nodes.reference(title, title, internal=False, refuri=full_url) + return [pnode], [] + if typ in ["doi", "DOI"]: + full_url = "https://doi.org/" + part + if not has_explicit_title: + title = "DOI:" + part + pnode = nodes.reference(title, title, internal=False, refuri=full_url) + return [pnode], [] + + +def setup_link_role(app): + app.add_role("arxiv", reference_role, override=True) + app.add_role("arXiv", reference_role, override=True) + app.add_role("doi", reference_role, override=True) + app.add_role("DOI", reference_role, override=True) + + +def setup(app): + app.connect("builder-inited", setup_link_role) + return {"version": "0.1", "parallel_read_safe": True} diff --git a/doc/sphinxext/dropdown_anchors.py b/doc/sphinxext/dropdown_anchors.py new file mode 100644 index 0000000000000..a001dfa11d403 --- /dev/null +++ b/doc/sphinxext/dropdown_anchors.py @@ -0,0 +1,58 @@ +import re + +from docutils import nodes +from sphinx.transforms.post_transforms import SphinxPostTransform +from sphinx_design.dropdown import dropdown_main + + +class DropdownAnchorAdder(SphinxPostTransform): + """Insert anchor links to the sphinx-design dropdowns. + + Some of the dropdowns were originally headers that had automatic anchors, so we + need to make sure that the old anchors still work. See the original implementation + (in JS): https://github.com/scikit-learn/scikit-learn/pull/27409 + + The anchor links are inserted at the end of the node with class "sd-summary-text" + which includes only the title text part of the dropdown (no icon, markers, etc). 
+ """ + + default_priority = 9999 # Apply later than everything else + formats = ["html"] + + def run(self): + """Run the post transformation.""" + # Counter to store the duplicated summary text to add it as a suffix in the + # anchor ID + anchor_id_counters = {} + + for sd_dropdown in self.document.findall(dropdown_main): + # Grab the summary text node + sd_summary_text = sd_dropdown.next_node( + lambda node: "sd-summary-text" in node.get("classes", []) + ) + + # Concatenate the text of relevant nodes as the title text + title_text = "".join(node.astext() for node in sd_summary_text.children) + + # The ID uses the first line, lowercased, with spaces replaced by dashes; + # suffix the anchor ID with a counter if it already exists + anchor_id = re.sub(r"\s+", "-", title_text.strip().split("\n")[0]).lower() + if anchor_id in anchor_id_counters: + anchor_id_counters[anchor_id] += 1 + anchor_id = f"{anchor_id}-{anchor_id_counters[anchor_id]}" + else: + anchor_id_counters[anchor_id] = 1 + sd_dropdown["ids"].append(anchor_id) + + # Create the anchor element and insert after the title text; we do this + # directly with raw HTML + anchor_html = ( + f'#' + ) + anchor_node = nodes.raw("", anchor_html, format="html") + sd_summary_text.append(anchor_node) + + +def setup(app): + app.add_post_transform(DropdownAnchorAdder) diff --git a/doc/sphinxext/github_link.py b/doc/sphinxext/github_link.py index 1592b266a548a..2cd1fbd83af47 100644 --- a/doc/sphinxext/github_link.py +++ b/doc/sphinxext/github_link.py @@ -1,20 +1,20 @@ -from operator import attrgetter import inspect -import subprocess import os +import subprocess import sys from functools import partial +from operator import attrgetter -REVISION_CMD = 'git rev-parse --short HEAD' +REVISION_CMD = "git rev-parse --short HEAD" def _get_git_revision(): try: revision = subprocess.check_output(REVISION_CMD.split()).strip() except (subprocess.CalledProcessError, OSError): - print('Failed to execute git to get revision') + print("Failed to execute git to get revision") return None - return revision.decode('utf-8') + return revision.decode("utf-8") def _linkcode_resolve(domain, info, package, url_fmt, revision): @@ -26,22 +26,22 @@ def _linkcode_resolve(domain, info, package, url_fmt, revision): >>> _linkcode_resolve('py', {'module': 'tty', ... 'fullname': 'setraw'}, ... package='tty', - ... url_fmt='http://hg.python.org/cpython/file/' + ... url_fmt='https://hg.python.org/cpython/file/' ... '{revision}/Lib/{package}/{path}#L{lineno}', ... 
revision='xxxx') - 'http://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18' + 'https://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18' """ if revision is None: return - if domain not in ('py', 'pyx'): + if domain not in ("py", "pyx"): return - if not info.get('module') or not info.get('fullname'): + if not info.get("module") or not info.get("fullname"): return - class_name = info['fullname'].split('.')[0] - module = __import__(info['module'], fromlist=[class_name]) - obj = attrgetter(info['fullname'])(module) + class_name = info["fullname"].split(".")[0] + module = __import__(info["module"], fromlist=[class_name]) + obj = attrgetter(info["fullname"])(module) # Unwrap the object to get the correct source # file in case that is wrapped by a decorator @@ -59,14 +59,12 @@ def _linkcode_resolve(domain, info, package, url_fmt, revision): if not fn: return - fn = os.path.relpath(fn, - start=os.path.dirname(__import__(package).__file__)) + fn = os.path.relpath(fn, start=os.path.dirname(__import__(package).__file__)) try: lineno = inspect.getsourcelines(obj)[1] except Exception: - lineno = '' - return url_fmt.format(revision=revision, package=package, - path=fn, lineno=lineno) + lineno = "" + return url_fmt.format(revision=revision, package=package, path=fn, lineno=lineno) def make_linkcode_resolve(package, url_fmt): @@ -81,5 +79,6 @@ def make_linkcode_resolve(package, url_fmt): '{path}#L{lineno}') """ revision = _get_git_revision() - return partial(_linkcode_resolve, revision=revision, package=package, - url_fmt=url_fmt) + return partial( + _linkcode_resolve, revision=revision, package=package, url_fmt=url_fmt + ) diff --git a/doc/sphinxext/override_pst_pagetoc.py b/doc/sphinxext/override_pst_pagetoc.py new file mode 100644 index 0000000000000..f5697de8ef155 --- /dev/null +++ b/doc/sphinxext/override_pst_pagetoc.py @@ -0,0 +1,84 @@ +from functools import cache + +from sphinx.util.logging import getLogger + +logger = getLogger(__name__) + + +def override_pst_pagetoc(app, pagename, templatename, context, doctree): + """Overrides the `generate_toc_html` function of pydata-sphinx-theme for API.""" + + @cache + def generate_api_toc_html(kind="html"): + """Generate the in-page toc for an API page. + + This relies on the `generate_toc_html` function added by pydata-sphinx-theme + into the context. We save the original function into `pst_generate_toc_html` + and override `generate_toc_html` with this function for generated API pages. + + The pagetoc of an API page would look like the following: + +
+        <ul>                                            <-- Unwrap
+          <li>                                          <-- Unwrap
+            <a>{{obj}}</a>                              <-- Decompose
+
+            <ul>
+              <li class="toc-h2">
+                <a>...object</a>
+                <ul>                                    <-- Set visible if exists
+                  <li class="toc-h3"><a>...method 1</a></li>   <-- Shorten
+                  <li class="toc-h3"><a>...method 2</a></li>   <-- Shorten
+                  ...more methods                              <-- Shorten
+                </ul>
+              </li>
+              <li class="toc-h2"><a>...gallery examples</a></li>
+            </ul>
+          </li>                                         <-- Unwrapped
+        </ul>
<-- Unwrapped + """ + soup = context["pst_generate_toc_html"](kind="soup") + + try: + # Unwrap the outermost level + soup.ul.unwrap() + soup.li.unwrap() + soup.a.decompose() + + # Get all toc-h2 level entries, where the first one should be the function + # or class, and the second one, if exists, should be the examples; there + # should be no more than two entries at this level for generated API pages + lis = soup.ul.select("li.toc-h2") + main_li = lis[0] + meth_list = main_li.ul + + if meth_list is not None: + # This is a class API page, we remove the class name from the method + # names to make them better fit into the secondary sidebar; also we + # make the toc-h3 level entries always visible to more easily navigate + # through the methods + meth_list["class"].append("visible") + for meth in meth_list.find_all("li", {"class": "toc-h3"}): + target = meth.a.code.span + target.string = target.string.split(".", 1)[1] + + # This corresponds to the behavior of `generate_toc_html` + return str(soup) if kind == "html" else soup + + except Exception as e: + # Upon any failure we return the original pagetoc + logger.warning( + f"Failed to generate API pagetoc for {pagename}: {e}; falling back" + ) + return context["pst_generate_toc_html"](kind=kind) + + # Override the pydata-sphinx-theme implementation for generate API pages + if pagename.startswith("modules/generated/"): + context["pst_generate_toc_html"] = context["generate_toc_html"] + context["generate_toc_html"] = generate_api_toc_html + + +def setup(app): + # Need to be triggered after `pydata_sphinx_theme.toctree.add_toctree_functions`, + # and since default priority is 500 we set 900 for safety + app.connect("html-page-context", override_pst_pagetoc, priority=900) diff --git a/doc/sphinxext/sphinx_issues.py b/doc/sphinxext/sphinx_issues.py index ba14de62d7a2e..206359a1bd703 100644 --- a/doc/sphinxext/sphinx_issues.py +++ b/doc/sphinxext/sphinx_issues.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """A Sphinx extension for linking to your project's issue tracker. Copyright 2014 Steven Loria @@ -19,6 +18,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ + import re from docutils import nodes, utils @@ -76,7 +76,6 @@ def cve_role(name, rawtext, text, lineno, inliner, options=None, content=None): class IssueRole(object): - EXTERNAL_REPO_REGEX = re.compile(r"^(\w+)/(.+)([#@])([\w]+)$") def __init__( @@ -120,8 +119,9 @@ def make_node(self, name, issue_no, config, options=None): ) else: raise ValueError( - "Neither {} nor issues_github_path " - "is set".format(self.uri_config_option) + "Neither {} nor issues_github_path is set".format( + self.uri_config_option + ) ) issue_text = self.format_text(issue_no) link = nodes.reference(text=issue_text, refuri=ref, **options) diff --git a/doc/supervised_learning.rst b/doc/supervised_learning.rst index e8578fa1e0307..ba24e8ee23c6f 100644 --- a/doc/supervised_learning.rst +++ b/doc/supervised_learning.rst @@ -1,13 +1,7 @@ -.. Places parent toc into the sidebar - -:parenttoc: True - -.. include:: includes/big_toc_css.rst - .. _supervised-learning: Supervised learning ------------------------ +------------------- .. toctree:: :maxdepth: 2 diff --git a/doc/support.rst b/doc/support.rst index f96609bbab937..eb90ff6dd3d94 100644 --- a/doc/support.rst +++ b/doc/support.rst @@ -2,96 +2,131 @@ Support ======= -There are several ways to get in touch with the developers. 
+There are several channels to connect with scikit-learn developers for assistance, feedback, or contributions. +**Note**: Communications on all channels should respect our `Code of Conduct `_. -.. _mailing_lists: -Mailing List -============ +.. _announcements_and_notification: -- The main mailing list is `scikit-learn - `_. +Mailing Lists +============= -- There is also a commit list `scikit-learn-commits - `_, - where updates to the main repository and test failures get notified. +- **Main Mailing List**: Join the primary discussion + platform for scikit-learn at `scikit-learn Mailing List + `_. +- **Commit Updates**: Stay informed about repository + updates and test failures on the `scikit-learn-commits list + `_. .. _user_questions: -User questions +User Questions ============== -- Some scikit-learn developers support users on StackOverflow using - the `[scikit-learn] `_ +If you have questions, this is our general workflow. + +- **Stack Overflow**: Some scikit-learn developers support users using the + `[scikit-learn] `_ tag. -- For general theoretical or methodological Machine Learning questions - `stack exchange `_ is probably a more - suitable venue. +- **General Machine Learning Queries**: For broader machine learning + discussions, visit `Stack Exchange `_. + +When posting questions: + +- Please use a descriptive question in the title field (e.g. no "Please + help with scikit-learn!" as this is not a question) + +- Provide detailed context, expected results, and actual observations. + +- Include code and data snippets (preferably minimalistic scripts, + up to ~20 lines). + +- Describe your data and preprocessing steps, including sample size, + feature types (categorical or numerical), and the target for supervised + learning tasks (classification type or regression). + +**Note**: Avoid asking user questions on the bug tracker to keep +the focus on development. -In both cases please use a descriptive question in the title field (e.g. -no "Please help with scikit-learn!" as this is not a question) and put -details on what you tried to achieve, what were the expected results and -what you observed instead in the details field. +- `GitHub Discussions `_ + Usage questions such as methodological -Code and data snippets are welcome. Minimalistic (up to ~20 lines long) -reproduction script very helpful. +- `Stack Overflow `_ + Programming/user questions with `[scikit-learn]` tag -Please describe the nature of your data and how you preprocessed it: -what is the number of samples, what is the number and type of features -(i.d. categorical or numerical) and for supervised learning tasks, -what target are your trying to predict: binary, multiclass (1 out of -``n_classes``) or multilabel (``k`` out of ``n_classes``) classification -or continuous variable regression. +- `GitHub Bug Tracker `_ + Bug reports - Please do not ask usage questions on the issue tracker. -User questions should **not be asked on the bug tracker**, as it crowds -the list of issues and makes the development of the project harder. +- `Discord Server `_ + Current pull requests - Post any specific PR-related questions on your PR, + and you can share a link to your PR on this server. .. _bug_tracker: -Bug tracker +Bug Tracker =========== -If you think you've encountered a bug, please report it to the issue tracker: +Encountered a bug? Report it on our `issue tracker +`_ -https://github.com/scikit-learn/scikit-learn/issues +Include in your report: -Don't forget to include: +- Steps or scripts to reproduce the bug. 
- - steps (or better script) to reproduce, +- Expected and observed outcomes. - - expected outcome, +- Python or gdb tracebacks, if applicable. - - observed outcome or Python (or gdb) tracebacks +- The ideal bug report contains a :ref:`short reproducible code snippet + `, this way anyone can try to reproduce the bug easily. -To help developers fix your bug faster, please link to a https://gist.github.com -holding a standalone minimalistic python script that reproduces your bug and -optionally a minimalistic subsample of your dataset (for instance, exported -as CSV files using ``numpy.savetxt``). +- If your snippet is longer than around 50 lines, please link to a + `gist `_ or a github repo. -Note: Gists are Git cloneable repositories and thus you can use Git to -push datafiles to them. +**Tip**: Gists are Git repositories; you can push data files to them using Git. +Paid support +============ + +The following companies (listed in alphabetical order) offer support services +related to scikit-learn and have a proven track record of employing long-term +maintainers of scikit-learn and related open source projects: + +- `:probabl. `__ +- `Quansight `__ + +.. _social_media: + +Social Media +============ + +scikit-learn has presence on various social media platforms to share +updates with the community. The platforms are not monitored for user +questions. .. _gitter: Gitter -=== - -Some developers like to hang out on scikit-learn Gitter room: -https://gitter.im/scikit-learn/scikit-learn. +====== +**Note**: The scikit-learn Gitter room is no longer an active community. +For live discussions and support, please refer to the other channels +mentioned in this document. .. _documentation_resources: -Documentation resources +Documentation Resources ======================= -This documentation is relative to |release|. Documentation for -other versions can be found `here -`__. +This documentation is for |release|. Documentation for other versions can be found `here +`__, including zip archives which can be +downloaded for offline access. -Printable pdf documentation for old versions can be found `here -`_. +We no longer provide a PDF version of the documentation, but you can still generate it +locally by following the :ref:`building documentation instructions `. +The most recent version with a PDF documentation is quite old, 0.23.2 (released +in August 2020), but the PDF is available `here +`__. diff --git a/doc/templates/base.rst b/doc/templates/base.rst new file mode 100644 index 0000000000000..ee86bd8a18dbe --- /dev/null +++ b/doc/templates/base.rst @@ -0,0 +1,36 @@ +{{ objname | escape | underline(line="=") }} + +{% if objtype == "module" -%} + +.. automodule:: {{ fullname }} + +{%- elif objtype == "function" -%} + +.. currentmodule:: {{ module }} + +.. autofunction:: {{ objname }} + +.. minigallery:: {{ module }}.{{ objname }} + :add-heading: Gallery examples + :heading-level: - + +{%- elif objtype == "class" -%} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + :members: + :inherited-members: + :special-members: __call__ + +.. minigallery:: {{ module }}.{{ objname }} {% for meth in methods %}{{ module }}.{{ objname }}.{{ meth }} {% endfor %} + :add-heading: Gallery examples + :heading-level: - + +{%- else -%} + +.. currentmodule:: {{ module }} + +.. 
auto{{ objtype }}:: {{ objname }} + +{%- endif -%} diff --git a/doc/templates/class.rst b/doc/templates/class.rst deleted file mode 100644 index 79ff2cf807794..0000000000000 --- a/doc/templates/class.rst +++ /dev/null @@ -1,12 +0,0 @@ -:mod:`{{module}}`.{{objname}} -{{ underline }}============== - -.. currentmodule:: {{ module }} - -.. autoclass:: {{ objname }} - -.. include:: {{module}}.{{objname}}.examples - -.. raw:: html - -
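As a brief aside on the doc/sphinxext/doi_role.py extension added earlier in this diff: the role simply prefixes the target with a resolver URL (https://doi.org/ for :doi: and https://arxiv.org/abs/ for :arxiv:) and uses the raw identifier as the caption when no explicit title is given. The snippet below is an illustrative, standalone sketch of that rule only; the resolve_link helper is hypothetical and the real extension returns docutils reference nodes rather than strings.

def resolve_link(role: str, target: str) -> str:
    """Return the external URL a :doi:/:arxiv: role target resolves to."""
    if role.lower() == "arxiv":
        return "https://arxiv.org/abs/" + target
    if role.lower() == "doi":
        return "https://doi.org/" + target
    raise ValueError(f"unsupported role: {role!r}")


# Example taken from the doi_role.py docstring above.
print(resolve_link("doi", "10.1016/S0022-2836(05)80360-2"))
# https://doi.org/10.1016/S0022-2836(05)80360-2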
diff --git a/doc/templates/class_with_call.rst b/doc/templates/class_with_call.rst deleted file mode 100644 index f98b7dbbf6578..0000000000000 --- a/doc/templates/class_with_call.rst +++ /dev/null @@ -1,16 +0,0 @@ -:mod:`{{module}}`.{{objname}} -{{ underline }}=============== - -.. currentmodule:: {{ module }} - -.. autoclass:: {{ objname }} - - {% block methods %} - .. automethod:: __call__ - {% endblock %} - -.. include:: {{module}}.{{objname}}.examples - -.. raw:: html - -
diff --git a/doc/templates/deprecated_class.rst b/doc/templates/deprecated_class.rst deleted file mode 100644 index 857e2c28ce1da..0000000000000 --- a/doc/templates/deprecated_class.rst +++ /dev/null @@ -1,23 +0,0 @@ -:mod:`{{module}}`.{{objname}} -{{ underline }}============== - -.. meta:: - :robots: noindex - -.. warning:: - **DEPRECATED** - - -.. currentmodule:: {{ module }} - -.. autoclass:: {{ objname }} - - {% block methods %} - .. automethod:: __init__ - {% endblock %} - -.. include:: {{module}}.{{objname}}.examples - -.. raw:: html - -
diff --git a/doc/templates/deprecated_class_with_call.rst b/doc/templates/deprecated_class_with_call.rst deleted file mode 100644 index a04efcb80be07..0000000000000 --- a/doc/templates/deprecated_class_with_call.rst +++ /dev/null @@ -1,24 +0,0 @@ -:mod:`{{module}}`.{{objname}} -{{ underline }}=============== - -.. meta:: - :robots: noindex - -.. warning:: - **DEPRECATED** - - -.. currentmodule:: {{ module }} - -.. autoclass:: {{ objname }} - - {% block methods %} - .. automethod:: __init__ - .. automethod:: __call__ - {% endblock %} - -.. include:: {{module}}.{{objname}}.examples - -.. raw:: html - -
diff --git a/doc/templates/deprecated_class_without_init.rst b/doc/templates/deprecated_class_without_init.rst deleted file mode 100644 index c019992493610..0000000000000 --- a/doc/templates/deprecated_class_without_init.rst +++ /dev/null @@ -1,19 +0,0 @@ -:mod:`{{module}}`.{{objname}} -{{ underline }}============== - -.. meta:: - :robots: noindex - -.. warning:: - **DEPRECATED** - - -.. currentmodule:: {{ module }} - -.. autoclass:: {{ objname }} - -.. include:: {{module}}.{{objname}}.examples - -.. raw:: html - -
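For reference, the anchor IDs assigned by the doc/sphinxext/dropdown_anchors.py extension above are derived from the first line of the dropdown summary text, lowercased with whitespace collapsed to dashes, and suffixed with a counter when the same title appears more than once. A minimal standalone sketch of that rule, assuming arbitrary example titles; the make_anchor_id helper below is illustrative and not part of the extension.

import re


def make_anchor_id(title_text: str, counters: dict) -> str:
    """Derive a dropdown anchor ID the same way DropdownAnchorAdder does."""
    anchor_id = re.sub(r"\s+", "-", title_text.strip().split("\n")[0]).lower()
    if anchor_id in counters:
        counters[anchor_id] += 1
        anchor_id = f"{anchor_id}-{counters[anchor_id]}"
    else:
        counters[anchor_id] = 1
    return anchor_id


counters = {}
print(make_anchor_id("Mathematical details", counters))  # mathematical-details
print(make_anchor_id("Mathematical details", counters))  # mathematical-details-2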
diff --git a/doc/templates/deprecated_function.rst b/doc/templates/deprecated_function.rst deleted file mode 100644 index 6d13ac6aca2de..0000000000000 --- a/doc/templates/deprecated_function.rst +++ /dev/null @@ -1,19 +0,0 @@ -:mod:`{{module}}`.{{objname}} -{{ underline }}==================== - -.. meta:: - :robots: noindex - -.. warning:: - **DEPRECATED** - - -.. currentmodule:: {{ module }} - -.. autofunction:: {{ objname }} - -.. include:: {{module}}.{{objname}}.examples - -.. raw:: html - -
diff --git a/doc/templates/function.rst b/doc/templates/function.rst deleted file mode 100644 index f4b11eda770e4..0000000000000 --- a/doc/templates/function.rst +++ /dev/null @@ -1,12 +0,0 @@ -:mod:`{{module}}`.{{objname}} -{{ underline }}==================== - -.. currentmodule:: {{ module }} - -.. autofunction:: {{ objname }} - -.. include:: {{module}}.{{objname}}.examples - -.. raw:: html - -
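Relatedly, the doc/sphinxext/override_pst_pagetoc.py extension above shortens the toc-h3 entries of class API pages by stripping the class prefix from each qualified method name via target.string.split(".", 1)[1]. A hedged, standalone illustration of just that string operation; the shorten_entry helper and the example names are hypothetical.

def shorten_entry(qualified_name: str) -> str:
    """Drop the leading class name from a 'Class.method' pagetoc entry."""
    return qualified_name.split(".", 1)[1]


print(shorten_entry("LogisticRegression.fit"))  # fit
print(shorten_entry("Pipeline.set_params"))     # set_params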
diff --git a/doc/templates/generate_deprecated.sh b/doc/templates/generate_deprecated.sh deleted file mode 100755 index a7301fb5dc419..0000000000000 --- a/doc/templates/generate_deprecated.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -for f in [^d]*; do (head -n2 < $f; echo ' -.. meta:: - :robots: noindex - -.. warning:: - **DEPRECATED** -'; tail -n+3 $f) > deprecated_$f; done diff --git a/doc/templates/index.html b/doc/templates/index.html index d333530ef8376..ff71b52ebd59c 100644 --- a/doc/templates/index.html +++ b/doc/templates/index.html @@ -1,15 +1,27 @@ {% extends "layout.html" %} {% set title = 'scikit-learn: machine learning in Python' %} -{% block content %} -
+ +{% if is_devrelease|tobool %} + {%- set contributing_link = pathto("developers/contributing") %} + {%- set contributing_attrs = "" %} +{%- else %} + {%- set contributing_link = "https://scikit-learn.org/dev/developers/contributing.html" %} + {%- set contributing_attrs = 'target="_blank" rel="noopener noreferrer"' %} +{%- endif %} + +{%- import "static/webpack-macros.html" as _webpack with context %} + +{% block docs_navbar %} +{{ super() }} + +
-

scikit-learn

-

Machine Learning in Python

- Getting Started - Release Highlights for {{ release_highlights_version }} - GitHub +

scikit-learn

+

Machine Learning in Python

+ Getting Started + Release Highlights for {{ release_highlights_version }}
    @@ -23,231 +35,279 @@

    Machine Learning in

-
+{% endblock docs_navbar %} + +{% block docs_main %} + +
+
-
+
-

Classification

-

Identifying which category an object belongs to.

-

Applications: Spam detection, image recognition.
- Algorithms: - SVM, - nearest neighbors, - random forest, - and more...

+

+ Classification +

+

Identifying which category an object belongs to.

+

+ Applications: Spam detection, image recognition.
+ Algorithms: + Gradient boosting, + nearest neighbors, + random forest, + logistic regression, + and more... +

-
+
-
+
-

Regression

-

Predicting a continuous-valued attribute associated with an object.

-

Applications: Drug response, Stock prices.
- Algorithms: - SVR, - nearest neighbors, - random forest, - and more...

+

+ Regression +

+

Predicting a continuous-valued attribute associated with an object.

+

+ Applications: Drug response, stock prices.
+ Algorithms: + Gradient boosting, + nearest neighbors, + random forest, + ridge, + and more... +

-
+
-
+
-

Clustering

-

Automatic grouping of similar objects into sets.

-

Applications: Customer segmentation, Grouping experiment outcomes
- Algorithms: - k-Means, - spectral clustering, - mean-shift, - and more...

+

+ Clustering +

+

Automatic grouping of similar objects into sets.

+

+ Applications: Customer segmentation, grouping experiment outcomes.
+ Algorithms: + k-Means, + HDBSCAN, + hierarchical clustering, + and more... +

-
+
-
+
-

Dimensionality reduction

-

Reducing the number of random variables to consider.

-

Applications: Visualization, Increased efficiency
- Algorithms: - k-Means, - feature selection, - non-negative matrix factorization, - and more...

+

+ Dimensionality reduction +

+

Reducing the number of random variables to consider.

+

+ Applications: Visualization, increased efficiency.
+ Algorithms: + PCA, + feature selection, + non-negative matrix factorization, + and more... +

-
+
-
+
-

Model selection

-

Comparing, validating and choosing parameters and models.

-

Applications: Improved accuracy via parameter tuning
- Algorithms: - grid search, - cross validation, - metrics, - and more...

+

+ Model selection +

+

Comparing, validating and choosing parameters and models.

+

+ Applications: Improved accuracy via parameter tuning.
+ Algorithms: + Grid search, + cross validation, + metrics, + and more... +

-
+
-
+
-

Preprocessing

-

Feature extraction and normalization.

-

Applications: Transforming input data such as text for use with machine learning algorithms.
- Algorithms: - preprocessing, - feature extraction, - and more...

+

+ Preprocessing +

+

Feature extraction and normalization.

+

+ Applications: Transforming input data such as text for use with machine learning algorithms.
+ Algorithms: + Preprocessing, + feature extraction, + and more... +

-
-
-
+{% endblock docs_main %} + +{% block footer %} + +
+
+

News

-          On-going development: What's new (Changelog)
-          December 2020. scikit-learn 0.24.0 is available for download (Changelog).
-          August 2020. scikit-learn 0.23.2 is available for download (Changelog).
-          May 2020. scikit-learn 0.23.1 is available for download (Changelog).
-          May 2020. scikit-learn 0.23.0 is available for download (Changelog).
-          Scikit-learn from 0.23 requires Python 3.6 or greater.
-          March 2020. scikit-learn 0.22.2 is available for download (Changelog).
-          January 2020. scikit-learn 0.22.1 is available for download (Changelog).
-          December 2019. scikit-learn 0.22 is available for download (Changelog and Release Highlights).
-          Scikit-learn from 0.21 requires Python 3.5 or greater.
-          July 2019. scikit-learn 0.21.3 (Changelog) and 0.20.4 (Changelog) are available for download.
-          May 2019. scikit-learn 0.21.0 to 0.21.2 are available for download (Changelog).
+          On-going development: scikit-learn 1.8 (Changelog).
+          June 2025. scikit-learn 1.7.0 is available for download (Changelog).
+          January 2025. scikit-learn 1.6.1 is available for download (Changelog).
+          December 2024. scikit-learn 1.6.0 is available for download (Changelog).
+          September 2024. scikit-learn 1.5.2 is available for download (Changelog).
+          July 2024. scikit-learn 1.5.1 is available for download (Changelog).
+          May 2024. scikit-learn 1.5.0 is available for download (Changelog).
+          All releases: What's new (Changelog).
+

Community

- - Help us, donate! - Cite us! +

+ Help us, donate! + Cite us! +

+

Who uses scikit-learn?

-
-
+ +