diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 11fcb5cd25d78..1f133d701ca53 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -84,6 +84,8 @@ jobs: - name: Setup Python uses: actions/setup-python@v2 + with: + python-version: '3.9' # update once build dependencies are available - name: Build and test wheels env: diff --git a/.travis.yml b/.travis.yml index 456d94301d4c1..7b37bc7c91f58 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,7 +22,7 @@ env: # Custom environment variables for the ARM wheel builder - CIBW_BUILD_VERBOSITY=1 - CIBW_TEST_COMMAND="bash {project}/build_tools/travis/test_wheels.sh" - - CIBW_ENVIRONMENT="CPU_COUNT=6 + - CIBW_ENVIRONMENT="CPU_COUNT=2 OMP_NUM_THREADS=2 OPENBLAS_NUM_THREADS=2 SKLEARN_BUILD_PARALLEL=10 @@ -39,7 +39,10 @@ jobs: # fast. - python: 3.7 os: linux - arch: arm64 + arch: arm64-graviton2 + dist: focal + virt: lxd + group: edge if: type = cron or commit_message =~ /\[cd build\]/ env: - BUILD_WHEEL=true @@ -47,7 +50,10 @@ jobs: - python: 3.8 os: linux - arch: arm64 + arch: arm64-graviton2 + dist: focal + virt: lxd + group: edge if: type = cron or commit_message =~ /\[cd build\]/ env: - BUILD_WHEEL=true @@ -55,7 +61,10 @@ jobs: - python: 3.9 os: linux - arch: arm64 + arch: arm64-graviton2 + dist: focal + virt: lxd + group: edge if: type = cron or commit_message =~ /\[cd build\]/ env: - BUILD_WHEEL=true diff --git a/README.rst b/README.rst index b41bb9c98daba..d8357c246543c 100644 --- a/README.rst +++ b/README.rst @@ -5,15 +5,15 @@ .. |Azure| image:: https://dev.azure.com/scikit-learn/scikit-learn/_apis/build/status/scikit-learn.scikit-learn?branchName=main .. _Azure: https://dev.azure.com/scikit-learn/scikit-learn/_build/latest?definitionId=1&branchName=main +.. |CircleCI| image:: https://circleci.com/gh/scikit-learn/scikit-learn/tree/main.svg?style=shield&circle-token=:circle-token +.. _CircleCI: https://circleci.com/gh/scikit-learn/scikit-learn + .. |Travis| image:: https://api.travis-ci.com/scikit-learn/scikit-learn.svg?branch=main -.. _Travis: https://travis-ci.com/scikit-learn/scikit-learn +.. _Travis: https://app.travis-ci.com/github/scikit-learn/scikit-learn .. |Codecov| image:: https://codecov.io/gh/scikit-learn/scikit-learn/branch/main/graph/badge.svg?token=Pk8G9gg3y9 .. _Codecov: https://codecov.io/gh/scikit-learn/scikit-learn -.. |CircleCI| image:: https://circleci.com/gh/scikit-learn/scikit-learn/tree/main.svg?style=shield&circle-token=:circle-token -.. _CircleCI: https://circleci.com/gh/scikit-learn/scikit-learn - .. |Nightly wheels| image:: https://github.com/scikit-learn/scikit-learn/workflows/Wheel%20builder/badge.svg?event=schedule .. _`Nightly wheels`: https://github.com/scikit-learn/scikit-learn/actions?query=workflow%3A%22Wheel+builder%22+event%3Aschedule @@ -186,6 +186,7 @@ Communication - Stack Overflow: https://stackoverflow.com/questions/tagged/scikit-learn - Github Discussions: https://github.com/scikit-learn/scikit-learn/discussions - Website: https://scikit-learn.org +- LinkedIn: https://www.linkedin.com/company/scikit-learn Citation ~~~~~~~~ diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000000000..251dbb054df52 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,17 @@ +# Security Policy + +## Supported Versions + +| Version | Supported | +| --------- | ------------------ | +| 1.0.1 | :white_check_mark: | +| < 1.0.1 | :x: | + +## Reporting a Vulnerability + +Please report security vulnerabilities by email to `security@scikit-learn.org`. 
+This email is an alias to a subset of the scikit-learn maintainers' team. + +If the security vulnerability is accepted, a patch will be crafted privately +in order to prepare a dedicated bugfix release as timely as possible (depending +on the complexity of the fix). diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 19bc8b4efe4df..60b1f2811f88d 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -48,7 +48,7 @@ jobs: pip install pytest flake8 mypy==0.782 black==21.6b0 displayName: Install linters - bash: | - black --check . + black --check --diff . displayName: Run black - bash: | ./build_tools/circle/linting.sh @@ -81,6 +81,7 @@ jobs: # Tests that require large downloads over the networks are skipped in CI. # Here we make sure, that they are still run on a regular basis. SKLEARN_SKIP_NETWORK_TESTS: '0' + CREATE_ISSUE_ON_TRACKER: 'true' # Check compilation with intel C++ compiler (ICC) - template: build_tools/azure/posix.yml @@ -105,6 +106,28 @@ jobs: COVERAGE: 'false' BUILD_WITH_ICC: 'true' +- template: build_tools/azure/posix-docker.yml + parameters: + name: Linux_Nightly_PyPy + vmImage: ubuntu-20.04 + dependsOn: [linting, git_commit] + condition: | + and( + succeeded(), + not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), + or( + eq(variables['Build.Reason'], 'Schedule'), + contains(dependencies['git_commit']['outputs']['commit.message'], '[pypy]') + ) + ) + matrix: + pypy3: + DISTRIB: 'conda-mamba-pypy3' + DOCKER_CONTAINER: 'condaforge/mambaforge-pypy3:4.10.3-5' + PILLOW_VERSION: 'none' + PANDAS_VERSION: 'none' + CREATE_ISSUE_ON_TRACKER: 'true' + # Will run all the time regardless of linting outcome. - template: build_tools/azure/posix.yml parameters: @@ -123,6 +146,7 @@ jobs: PYTHON_VERSION: '*' BLAS: 'mkl' COVERAGE: 'true' + SHOW_SHORT_SUMMARY: 'true' # Check compilation with Ubuntu bionic 18.04 LTS and scipy from conda-forge - template: build_tools/azure/posix.yml @@ -198,11 +222,6 @@ jobs: ne(variables['Build.Reason'], 'Schedule') ) matrix: - pypy3: - DISTRIB: 'conda-mamba-pypy3' - DOCKER_CONTAINER: 'condaforge/mambaforge-pypy3:4.10.3-5' - PILLOW_VERSION: 'none' - PANDAS_VERSION: 'none' debian_atlas_32bit: DISTRIB: 'debian-32' DOCKER_CONTAINER: 'i386/debian:10.9' @@ -237,7 +256,7 @@ jobs: - template: build_tools/azure/windows.yml parameters: name: Windows - vmImage: vs2017-win2016 + vmImage: windows-latest dependsOn: [linting, git_commit] condition: | and( diff --git a/build_tools/azure/posix-docker.yml b/build_tools/azure/posix-docker.yml index 18642f1f39b7b..2578453578f61 100644 --- a/build_tools/azure/posix-docker.yml +++ b/build_tools/azure/posix-docker.yml @@ -34,6 +34,7 @@ jobs: # Set in azure-pipelines.yml DISTRIB: '' DOCKER_CONTAINER: '' + SHOW_SHORT_SUMMARY: 'false' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} diff --git a/build_tools/azure/posix.yml b/build_tools/azure/posix.yml index 79101c8eef227..fb4e7f85f9527 100644 --- a/build_tools/azure/posix.yml +++ b/build_tools/azure/posix.yml @@ -33,6 +33,8 @@ jobs: THREADPOOLCTL_VERSION: 'latest' COVERAGE: 'true' TEST_DOCSTRINGS: 'false' + CREATE_ISSUE_ON_TRACKER: 'false' + SHOW_SHORT_SUMMARY: 'false' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} @@ -72,6 +74,30 @@ jobs: testRunTitle: ${{ format('{0}-$(Agent.JobName)', parameters.name) }} displayName: 'Publish Test Results' condition: succeededOrFailed() + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.9' + displayName: Place Python into path to update issue tracker + condition: 
and(succeededOrFailed(), eq(variables['CREATE_ISSUE_ON_TRACKER'], 'true'), + eq(variables['Build.Reason'], 'Schedule')) + - bash: | + set -ex + if [[ $(BOT_GITHUB_TOKEN) == "" ]]; then + echo "GitHub Token is not set. Issue tracker will not be updated." + exit + fi + + LINK_TO_RUN="https://dev.azure.com/$BUILD_REPOSITORY_NAME/_build/results?buildId=$BUILD_BUILDID&view=logs&j=$SYSTEM_JOBID" + CI_NAME="$SYSTEM_JOBIDENTIFIER" + ISSUE_REPO="$BUILD_REPOSITORY_NAME" + + pip install defusedxml PyGithub + python maint_tools/create_issue_from_juint.py $(BOT_GITHUB_TOKEN) $CI_NAME $ISSUE_REPO $LINK_TO_RUN $JUNIT_FILE + displayName: 'Update issue tracker' + env: + JUNIT_FILE: $(TEST_DIR)/$(JUNITXML) + condition: and(succeededOrFailed(), eq(variables['CREATE_ISSUE_ON_TRACKER'], 'true'), + eq(variables['Build.Reason'], 'Schedule')) - script: | build_tools/azure/upload_codecov.sh condition: and(succeeded(), eq(variables['COVERAGE'], 'true')) diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index 8e8110bcaef04..721cfc665a14a 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -17,7 +17,6 @@ cp setup.cfg $TEST_DIR cd $TEST_DIR python -c "import sklearn; sklearn.show_versions()" -python -m threadpoolctl -i sklearn if ! command -v conda &> /dev/null then @@ -36,18 +35,28 @@ if [[ "$COVERAGE" == "true" ]]; then # report that otherwise hides the test failures and forces long scrolls in # the CI logs. export COVERAGE_PROCESS_START="$BUILD_SOURCESDIRECTORY/.coveragerc" - TEST_CMD="$TEST_CMD --cov-config=$COVERAGE_PROCESS_START --cov sklearn --cov-report=" + TEST_CMD="$TEST_CMD --cov-config='$COVERAGE_PROCESS_START' --cov sklearn --cov-report=" fi if [[ -n "$CHECK_WARNINGS" ]]; then # numpy's 1.19.0's tostring() deprecation is ignored until scipy and joblib removes its usage TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning -Wignore:tostring:DeprecationWarning" + + # Python 3.10 deprecates disutils and is imported by numpy interally during import time + TEST_CMD="$TEST_CMD -Wignore:The\ distutils:DeprecationWarning" + + # Workaround for https://github.com/pypa/setuptools/issues/2885 + TEST_CMD="$TEST_CMD -Wignore:Creating\ a\ LegacyVersion:DeprecationWarning" fi if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then TEST_CMD="$TEST_CMD -n2" fi +if [[ "$SHOW_SHORT_SUMMARY" == "true" ]]; then + TEST_CMD="$TEST_CMD -ra" +fi + set -x -$TEST_CMD --pyargs sklearn +eval "$TEST_CMD --pyargs sklearn" set +x diff --git a/build_tools/azure/windows.yml b/build_tools/azure/windows.yml index f835fd02cf58e..6b4789e87627b 100644 --- a/build_tools/azure/windows.yml +++ b/build_tools/azure/windows.yml @@ -20,6 +20,7 @@ jobs: PYTEST_XDIST: 'true' PYTEST_XDIST_VERSION: 'latest' TEST_DIR: '$(Agent.WorkFolder)/tmp_folder' + SHOW_SHORT_SUMMARY: 'false' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 8facdc59a6c56..ddb54c840093a 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -145,8 +145,8 @@ fi MINICONDA_PATH=$HOME/miniconda # Install dependencies with miniconda -wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ - -O miniconda.sh +wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh \ + -O miniconda.sh chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH export PATH="/usr/lib/ccache:$MINICONDA_PATH/bin:$PATH" @@ -165,7 +165,7 @@ fi source 
build_tools/shared.sh # packaging won't be needed once setuptools starts shipping packaging>=17.0 -conda create -n $CONDA_ENV_NAME --yes --quiet \ +mamba create -n $CONDA_ENV_NAME --yes --quiet \ python="${PYTHON_VERSION:-*}" \ "$(get_dep numpy $NUMPY_VERSION)" \ "$(get_dep scipy $SCIPY_VERSION)" \ @@ -176,6 +176,11 @@ conda create -n $CONDA_ENV_NAME --yes --quiet \ joblib memory_profiler packaging seaborn pillow pytest coverage source activate testenv +# Pin PyWavelet to 1.1.1 that is the latest version that support our minumum +# NumPy version required. If PyWavelets 1.2+ is installed, it would require +# NumPy 1.17+ that trigger a bug with Pandas 0.25: +# https://github.com/numpy/numpy/issues/18355#issuecomment-774610226 +pip install PyWavelets==1.1.1 pip install "$(get_dep scikit-image $SCIKIT_IMAGE_VERSION)" pip install "$(get_dep sphinx-gallery $SPHINX_GALLERY_VERSION)" pip install "$(get_dep numpydoc $NUMPYDOC_VERSION)" diff --git a/build_tools/github/test_wheels.sh b/build_tools/github/test_wheels.sh index 58a05b6182006..1a984bc91dba8 100755 --- a/build_tools/github/test_wheels.sh +++ b/build_tools/github/test_wheels.sh @@ -9,7 +9,7 @@ if [[ "$OSTYPE" != "linux-gnu" ]]; then cp $CONFTEST_PATH $CONFTEST_NAME fi +# Test that there are no links to system libraries in the +# threadpoolctl output section of the show_versions output: +python -c "import sklearn; sklearn.show_versions()" pytest --pyargs sklearn - -# Test that there are no links to system libraries -python -m threadpoolctl -i sklearn diff --git a/build_tools/github/test_windows_wheels.sh b/build_tools/github/test_windows_wheels.sh index cf33252d551ba..a04e390b5cdc4 100755 --- a/build_tools/github/test_windows_wheels.sh +++ b/build_tools/github/test_windows_wheels.sh @@ -10,15 +10,17 @@ if [[ "$BITNESS" == "32" ]]; then # 32-bit architectures use the regular # test command (outside of the minimal Docker container) cp $CONFTEST_PATH $CONFTEST_NAME + python -c "import sklearn; sklearn.show_versions()" pytest --pyargs sklearn - python -m threadpoolctl -i sklearn else - docker container run -e SKLEARN_SKIP_NETWORK_TESTS=1 \ - -e OMP_NUM_THREADS=2 \ - -e OPENBLAS_NUM_THREADS=2 \ - --rm scikit-learn/minimal-windows \ - powershell -Command "pytest --pyargs sklearn" + docker container run \ + --rm scikit-learn/minimal-windows \ + powershell -Command "python -c 'import sklearn; sklearn.show_versions()'" - docker container run --rm scikit-learn/minimal-windows \ - powershell -Command "python -m threadpoolctl -i sklearn" + docker container run \ + -e SKLEARN_SKIP_NETWORK_TESTS=1 \ + -e OMP_NUM_THREADS=2 \ + -e OPENBLAS_NUM_THREADS=2 \ + --rm scikit-learn/minimal-windows \ + powershell -Command "pytest --pyargs sklearn" fi diff --git a/build_tools/travis/test_wheels.sh b/build_tools/travis/test_wheels.sh index aa3d0d8c0ef1b..11d4bd73cedd7 100755 --- a/build_tools/travis/test_wheels.sh +++ b/build_tools/travis/test_wheels.sh @@ -3,7 +3,7 @@ pip install --upgrade pip || travis_terminate $? pip install pytest pytest-xdist || travis_terminate $? +# Test that there are no links to system libraries in the threadpoolctl +# section of the show_versions output. +python -c "import sklearn; sklearn.show_versions()" || travis_terminate $? python -m pytest -n $CPU_COUNT --pyargs sklearn || travis_terminate $? - -# Test that there are no links to system libraries -python -m threadpoolctl -i sklearn || travis_terminate $? 
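The wheel-testing scripts above drop the standalone `python -m threadpoolctl -i sklearn` step in favour of printing `sklearn.show_versions()`, whose threadpoolctl section carries the same information. As an illustrative sketch (not part of the patch; it only assumes that `threadpoolctl` is installed, which the wheel test environments already require), the same inspection can be done directly from Python:

import json

# Importing sklearn first loads its BLAS/OpenMP runtimes so that threadpoolctl
# can see them; this mirrors what the removed `python -m threadpoolctl -i sklearn`
# command reported and what the threadpoolctl section of show_versions() prints.
import sklearn  # noqa: F401
from threadpoolctl import threadpool_info

for lib in threadpool_info():
    # Each entry describes one loaded threadpool-backed library (file path,
    # prefix, version, number of threads); the wheel tests look for unexpected
    # links to system libraries here.
    print(json.dumps(lib, indent=2))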
diff --git a/doc/authors.rst b/doc/authors.rst index 73631f2be0121..9a2b1f89d5d06 100644 --- a/doc/authors.rst +++ b/doc/authors.rst @@ -42,6 +42,10 @@

Adrin Jalali

+
+

Julien Jerphanion

+
+

Guillaume Lemaitre

diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 985a246eb73b4..54e7f66ffc3c9 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -254,33 +254,35 @@ how to set up your git repository: .. prompt:: bash $ - git clone git@github.com:YourLogin/scikit-learn.git # add --depth 1 if your connection is slow - cd scikit-learn + git clone git@github.com:YourLogin/scikit-learn.git # add --depth 1 if your connection is slow + cd scikit-learn -4. Install the development dependencies: - - .. prompt:: bash $ - - pip install cython pytest pytest-cov flake8 mypy black==21.6b0 - -5. Install scikit-learn in editable mode: +3. Follow steps 2-7 in :ref:`install_bleeding_edge` to build scikit-learn in + development mode and return to this document. - .. prompt:: bash $ +4. Install the development dependencies: - pip install --no-build-isolation --editable . + .. prompt:: bash $ - If you receive errors in building scikit-learn, see the - :ref:`install_bleeding_edge` section. + pip install pytest pytest-cov flake8 mypy black==21.6b0 .. _upstream: -6. Add the ``upstream`` remote. This saves a reference to the main +5. Add the ``upstream`` remote. This saves a reference to the main scikit-learn repository, which you can use to keep your repository synchronized with the latest changes: .. prompt:: bash $ - git remote add upstream https://github.com/scikit-learn/scikit-learn.git + git remote add upstream git@github.com:scikit-learn/scikit-learn.git + +6. Check that the `upstream` and `origin` remote aliases are configured correctly + by running `git remote -v` which should display:: + + origin git@github.com:YourLogin/scikit-learn.git (fetch) + origin git@github.com:YourLogin/scikit-learn.git (push) + upstream git@github.com:scikit-learn/scikit-learn.git (fetch) + upstream git@github.com:scikit-learn/scikit-learn.git (push) You should now have a working installation of scikit-learn, and your git repository properly configured. The next steps now describe the process of @@ -536,7 +538,7 @@ profiling and Cython optimizations. For two very well documented and more detailed guides on development workflow, please pay a visit to the `Scipy Development Workflow - `_ - + `_ - and the `Astropy Workflow for Developers `_ sections. @@ -547,7 +549,7 @@ Continuous Integration (CI) * Azure pipelines are used for testing scikit-learn on Linux, Mac and Windows, with different dependencies and settings. * CircleCI is used to build the docs for viewing, for linting with flake8, and - for testing with PyPy and ARM64 / aarch64 on Linux + for testing with ARM64 / aarch64 on Linux Please note that if one of the following markers appear in the latest commit message, the following actions are taken. @@ -558,8 +560,9 @@ message, the following actions are taken. [ci skip] CI is skipped completely [cd build] CD is run (wheels and source distribution are built) [lint skip] Azure pipeline skips linting - [scipy-dev] Add a Travis build with our dependencies (numpy, scipy, etc ...) development builds - [icc-build] Add a Travis build with the Intel C compiler (ICC) + [scipy-dev] Build & test with our dependencies (numpy, scipy, etc ...) 
development builds + [icc-build] Build & test with the Intel C compiler (ICC) + [pypy] Build & test with PyPy [doc skip] Docs are not built [doc quick] Docs built, but excludes example gallery plots [doc build] Docs built including example gallery plots (very long) diff --git a/doc/getting_started.rst b/doc/getting_started.rst index b22febb7f1359..298200f5a2afd 100644 --- a/doc/getting_started.rst +++ b/doc/getting_started.rst @@ -9,8 +9,8 @@ etc.). Please refer to our :ref:`installation instructions ``Scikit-learn`` is an open source machine learning library that supports supervised and unsupervised learning. It also provides various tools for -model fitting, data preprocessing, model selection and evaluation, and many -other utilities. +model fitting, data preprocessing, model selection, model evaluation, +and many other utilities. Fitting and predicting: estimator basics ---------------------------------------- diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 09637b5d938d1..ac4807e052f66 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -492,11 +492,15 @@ computed using a function of a gradient of the image. .. |coin_kmeans| image:: ../auto_examples/cluster/images/sphx_glr_plot_coin_segmentation_001.png :target: ../auto_examples/cluster/plot_coin_segmentation.html - :scale: 65 + :scale: 35 .. |coin_discretize| image:: ../auto_examples/cluster/images/sphx_glr_plot_coin_segmentation_002.png :target: ../auto_examples/cluster/plot_coin_segmentation.html - :scale: 65 + :scale: 35 + +.. |coin_cluster_qr| image:: ../auto_examples/cluster/images/sphx_glr_plot_coin_segmentation_003.png + :target: ../auto_examples/cluster/plot_coin_segmentation.html + :scale: 35 Different label assignment strategies ------------------------------------- @@ -508,12 +512,24 @@ In particular, unless you control the ``random_state``, it may not be reproducible from run-to-run, as it depends on random initialization. The alternative ``"discretize"`` strategy is 100% reproducible, but tends to create parcels of fairly even and geometrical shape. +The recently added ``"cluster_qr"`` option is a deterministic alternative that +tends to create the visually best partitioning on the example application +below. + +================================ ================================ ================================ + ``assign_labels="kmeans"`` ``assign_labels="discretize"`` ``assign_labels="cluster_qr"`` +================================ ================================ ================================ +|coin_kmeans| |coin_discretize| |coin_cluster_qr| +================================ ================================ ================================ + +.. topic:: References: + + * `"Multiclass spectral clustering" + `_ + Stella X. 
Yu, Jianbo Shi, 2003 -===================================== ===================================== - ``assign_labels="kmeans"`` ``assign_labels="discretize"`` -===================================== ===================================== -|coin_kmeans| |coin_discretize| -===================================== ===================================== + * :doi:`"Simple, direct, and efficient multi-way spectral clustering"<10.1093/imaiai/iay008>` + Anil Damle, Victor Minden, Lexing Ying, 2019 Spectral Clustering Graphs -------------------------- diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst index 54ed20cc36be4..eac8f063be258 100644 --- a/doc/modules/decomposition.rst +++ b/doc/modules/decomposition.rst @@ -231,7 +231,7 @@ problem solved is a PCA problem (dictionary learning) with an .. math:: (U^*, V^*) = \underset{U, V}{\operatorname{arg\,min\,}} & \frac{1}{2} ||X-UV||_{\text{Fro}}^2+\alpha||V||_{1,1} \\ - \text{subject to } & ||U_k||_2 = 1 \text{ for all } + \text{subject to } & ||U_k||_2 <= 1 \text{ for all } 0 \leq k < n_{components} :math:`||.||_{\text{Fro}}` stands for the Frobenius norm and :math:`||.||_{1,1}` @@ -273,26 +273,39 @@ Exact Kernel PCA ---------------- :class:`KernelPCA` is an extension of PCA which achieves non-linear -dimensionality reduction through the use of kernels (see :ref:`metrics`). It +dimensionality reduction through the use of kernels (see :ref:`metrics`) [Scholkopf1997]_. It has many applications including denoising, compression and structured prediction (kernel dependency estimation). :class:`KernelPCA` supports both ``transform`` and ``inverse_transform``. -.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_001.png +.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_002.png :target: ../auto_examples/decomposition/plot_kernel_pca.html :align: center :scale: 75% +.. note:: + :meth:`KernelPCA.inverse_transform` relies on a kernel ridge to learn the + function mapping samples from the PCA basis into the original feature + space [Bakir2004]_. Thus, the reconstruction obtained with + :meth:`KernelPCA.inverse_transform` is an approximation. See the example + linked below for more details. + .. topic:: Examples: * :ref:`sphx_glr_auto_examples_decomposition_plot_kernel_pca.py` .. topic:: References: - * Kernel PCA was introduced in "Kernel principal component analysis" - Bernhard Schoelkopf, Alexander J. Smola, and Klaus-Robert Mueller. 1999. - In Advances in kernel methods, MIT Press, Cambridge, MA, USA 327-352. + .. [Scholkopf1997] Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller. + `"Kernel principal component analysis." + `_ + International conference on artificial neural networks. + Springer, Berlin, Heidelberg, 1997. + .. [Bakir2004] Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf. + `"Learning to find pre-images." + `_ + Advances in neural information processing systems 16 (2004): 449-456. .. _kPCA_Solvers: @@ -513,7 +526,7 @@ dictionary fixed, and then updating the dictionary to best fit the sparse code. .. 
math:: (U^*, V^*) = \underset{U, V}{\operatorname{arg\,min\,}} & \frac{1}{2} ||X-UV||_{\text{Fro}}^2+\alpha||U||_{1,1} \\ - \text{subject to } & ||V_k||_2 = 1 \text{ for all } + \text{subject to } & ||V_k||_2 <= 1 \text{ for all } 0 \leq k < n_{\mathrm{atoms}} diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst index 6aa9cb417aa5d..94fc69305cf6d 100644 --- a/doc/modules/gaussian_process.rst +++ b/doc/modules/gaussian_process.rst @@ -88,14 +88,14 @@ estimate the noise level of data. An illustration of the log-marginal-likelihood (LML) landscape shows that there exist two local maxima of LML. -.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_001.png +.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_003.png :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html :align: center The first corresponds to a model with a high noise level and a large length scale, which explains all variations in the data by noise. -.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_002.png +.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_004.png :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html :align: center @@ -106,7 +106,7 @@ hyperparameters, the gradient-based optimization might also converge to the high-noise solution. It is thus important to repeat the optimization several times for different initializations. -.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_003.png +.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_noisy_005.png :target: ../auto_examples/gaussian_process/plot_gpr_noisy.html :align: center @@ -142,7 +142,7 @@ Moreover, the noise level of the data is learned explicitly by GPR by an additional WhiteKernel component in the kernel and by the regularization parameter alpha of KRR. -.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_compare_gpr_krr_001.png +.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_compare_gpr_krr_005.png :target: ../auto_examples/gaussian_process/plot_compare_gpr_krr.html :align: center @@ -220,7 +220,7 @@ overall noise level is very small, indicating that the data can be very well explained by the model. The figure shows also that the model makes very confident predictions until around 2015 -.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_co2_001.png +.. figure:: ../auto_examples/gaussian_process/images/sphx_glr_plot_gpr_co2_003.png :target: ../auto_examples/gaussian_process/plot_gpr_co2.html :align: center diff --git a/doc/modules/multiclass.rst b/doc/modules/multiclass.rst index 043d158825a6f..1cf85b08db66a 100644 --- a/doc/modules/multiclass.rst +++ b/doc/modules/multiclass.rst @@ -102,6 +102,7 @@ can provide additional strategies beyond what is built-in: - :class:`neural_network.MLPClassifier` - :class:`neighbors.RadiusNeighborsClassifier` - :class:`ensemble.RandomForestClassifier` + - :class:`linear_model.RidgeClassifier` - :class:`linear_model.RidgeClassifierCV` diff --git a/doc/triage_team.rst b/doc/triage_team.rst index ba8fba64c2414..b561272d4d380 100644 --- a/doc/triage_team.rst +++ b/doc/triage_team.rst @@ -6,10 +6,6 @@ img.avatar {border-radius: 10px;}
-
-

Julien Jerphanion

-
-

Juan Carlos Alfaro Jiménez

diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 17c2b845d40d9..af22f714081a9 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -2,6 +2,45 @@ .. currentmodule:: sklearn +.. _changes_1_0_2: + +Version 1.0.2 +============= + +**In Development** + +Changelog +--------- + +:mod:`sklearn.cluster` +...................... + +- |Fix| Fixed an infinite loop in :func:`cluster.SpectralClustering` by + moving an iteration counter from try to except. + :pr:`21271` by :user:`Tyler Martin ` + +:mod:`sklearn.decomposition` +............................ + +- |Fix| Fixed the constraint on the objective function of + :class:`decomposition.DictionaryLearning`, + :class:`decomposition.MiniBatchDictionaryLearning`, :class:`decomposition.SparsePCA` + and :class:`decomposition.MiniBatchSparsePCA` to be convex and match the referenced + article. :pr:`19210` by :user:`Jérémie du Boisberranger `. + +:mod:`sklearn.preprocessing` +............................ + +- |Fix| Fixes compatibility bug with NumPy 1.22 in :class:`preprocessing.OneHotEncoder`. + :pr:`21517` by `Thomas Fan`_. + +:mod:`sklearn.utils` +.................... + +- |Fix| :func:`utils.estimator_html_repr` now escapes all the estimator + descriptions in the generated HTML. :pr:`21493` by + :user:`Aurélien Geron `. + .. _changes_1_0_1: Version 1.0.1 @@ -1082,10 +1121,10 @@ Adrian Garcia Badaracco, Adrian Sadłocha, Adrin Jalali, Agamemnon Krasoulis, Alberto Rubiales, Albert Thomas, Albert Villanova del Moral, Alek Lefebvre, Alessia Marcolini, Alexandr Fonari, Alihan Zihna, Aline Ribeiro de Almeida, Amanda, Amanda Dsouza, Amol Deshmukh, Ana Pessoa, Anavelyz, Andreas Mueller, -Andrew Delong, Ashish, Ashvith Shetty, Atsushi Nukariya, Avi Gupta, Ayush -Singh, baam, BaptBillard, Benjamin Pedigo, Bertrand Thirion, Bharat -Raghunathan, bmalezieux, Brian Rice, Brian Sun, Bruno Charron, Bryan Chen, -bumblebee, caherrera-meli, Carsten Allefeld, CeeThinwa, Chiara Marmo, +Andrew Delong, Ashish, Ashvith Shetty, Atsushi Nukariya, Aurélien Geron, Avi +Gupta, Ayush Singh, baam, BaptBillard, Benjamin Pedigo, Bertrand Thirion, +Bharat Raghunathan, bmalezieux, Brian Rice, Brian Sun, Bruno Charron, Bryan +Chen, bumblebee, caherrera-meli, Carsten Allefeld, CeeThinwa, Chiara Marmo, chrissobel, Christian Lorentzen, Christopher Yeh, Chuliang Xiao, Clément Fauchereau, cliffordEmmanuel, Conner Shen, Connor Tann, David Dale, David Katz, David Poznik, Dimitri Papadopoulos Orfanos, Divyanshu Deoli, dmallia17, diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 7141080afbc06..dc63f8f341501 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -38,6 +38,17 @@ Changelog :pr:`123456` by :user:`Joe Bloggs `. where 123456 is the *pull request* number, not the issue number. +- |Enhancement| All scikit-learn models now generate a more informative + error message when some input contains unexpected `NaN` or infinite values. + In particular the message contains the input name ("X", "y" or + "sample_weight") and if an unexpected `NaN` value is found in `X`, the error + message suggests potential solutions. + :pr:`21219` by :user:`Olivier Grisel `. + +- |Enhancement| All scikit-learn models now generate a more informative + error message when setting invalid hyper-parameters with `set_params`. + :pr:`21542` by :user:`Olivier Grisel `. + :mod:`sklearn.calibration` .......................... @@ -45,6 +56,17 @@ Changelog `pos_label` to specify the positive class label. :pr:`21032` by :user:`Guillaume Lemaitre `. 
+- |Enhancement| :class:`CalibrationDisplay` accepts a parameter `pos_label` to + add this information to the plot. + :pr:`21038` by :user:`Guillaume Lemaitre `. + +- |Enhancement| :class:`cluster.SpectralClustering` and :func:`cluster.spectral` + now include the new `'cluster_qr'` method from :func:`cluster.cluster_qr` + that clusters samples in the embedding space as an alternative to the existing + `'kmeans'` and `'discrete'` methods. + See :func:`cluster.spectral_clustering` for more details. + :pr:`21148` by :user:`Andrew Knyazev ` + :mod:`sklearn.cross_decomposition` .................................. @@ -52,19 +74,69 @@ Changelog reconstruction of a `X` target when a `Y` parameter is given. :pr:`19680` by :user:`Robin Thibaut `. +:mod:`sklearn.datasets` +....................... + +- |Enhancement| :func:`datasets.make_swiss_roll` now supports the optional argument + hole; when set to True, it returns the swiss-hole dataset. :pr:`21482` by + :user:`Sebastian Pujalte `. + :mod:`sklearn.decomposition` ............................ +- |Enhancement| :class:`decomposition.PCA` exposes a parameter `n_oversamples` to tune + :func:`sklearn.decomposition.randomized_svd` and + get accurate results when the number of features is large. + :pr:`21109` by :user:`Smile `. + - |Fix| :class:`decomposition.FastICA` now validates input parameters in `fit` instead of `__init__`. :pr:`21432` by :user:`Hannah Bohle ` and :user:`Maren Westermann `. +- |API| Adds :term:`get_feature_names_out` to all transformers in the + :mod:`~sklearn.decomposition` module: + :class:`~sklearn.decomposition.DictionaryLearning`, + :class:`~sklearn.decomposition.FactorAnalysis`, + :class:`~sklearn.decomposition.FastICA`, + :class:`~sklearn.decomposition.IncrementalPCA`, + :class:`~sklearn.decomposition.KernelPCA`, + :class:`~sklearn.decomposition.LatentDirichletAllocation`, + :class:`~sklearn.decomposition.MiniBatchDictionaryLearning`, + :class:`~sklearn.decomposition.MiniBatchSparsePCA`, + :class:`~sklearn.decomposition.NMF`, + :class:`~sklearn.decomposition.PCA`, + :class:`~sklearn.decomposition.SparsePCA`, + and :class:`~sklearn.decomposition.TruncatedSVD`. :pr:`21334` by + `Thomas Fan`_. + +- |API| :func:`decomposition.FastICA` now supports unit variance for whitening. + The default value of its `whiten` argument will change from `True` + (which behaves like `'arbitrary-variance'`) to `'unit-variance'` in version 1.3. + :pr:`19490` by :user:`Facundo Ferrin ` and :user:`Julien Jerphanion ` + :mod:`sklearn.impute` ..................... +- |Enhancement| Added support for `pd.NA` in :class:`SimpleImputer`. + :pr:`21114` by :user:`Ying Xiong `. + - |API| Adds :meth:`get_feature_names_out` to :class:`impute.SimpleImputer`, :class:`impute.KNNImputer`, :class:`impute.IterativeImputer`, and :class:`impute.MissingIndicator`. :pr:`21078` by `Thomas Fan`_. +- |API| The `verbose` parameter was deprecated for :class:`impute.SimpleImputer`. + A warning will always be raised upon the removal of empty columns. + :pr:`21448` by :user:`Oleh Kozynets ` and + :user:`Christian Ritter `. + +- |Fix| Fix a bug in :class:`linear_model.RidgeClassifierCV` where the method + `predict` was performing an `argmax` on the scores obtained from + `decision_function` instead of returning the multilabel indicator matrix. + :pr:`19869` by :user:`Guillaume Lemaitre `. + +- |Enhancement| :class:`linear_model.RidgeClassifier` is now supporting + multilabel classification. + :pr:`19689` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.metrics` ...................... 
@@ -74,6 +146,20 @@ Changelog backward compatibility, but this alias will be removed in 1.3. :pr:`21177` by :user:`Julien Jerphanion `. +- |API| Parameters ``sample_weight`` and ``multioutput`` of :func:`metrics. + mean_absolute_percentage_error` are now keyword-only, in accordance with `SLEP009 + `. + A deprecation cycle was introduced. + :pr:`21576` by :user:`Paul-Emile Dugnat `. + +:mod:`sklearn.manifold` +....................... + +- |Enhancement| :func:`manifold.spectral_embedding` and + :class:`manifold.SpectralEmbedding` supports `np.float32` dtype and will + preserve this dtype. + :pr:`21534` by :user:`Andrew Knyazev `. + :mod:`sklearn.model_selection` .............................. @@ -88,6 +174,33 @@ Changelog Setting a transformer to "passthrough" will pass the features unchanged. :pr:`20860` by :user:`Shubhraneel Pal `. +:mod:`sklearn.preprocessing` +............................ + +- |Enhancement| Adds a `subsample` parameter to :class:`preprocessing.KBinsDiscretizer`. + This allows specifying a maximum number of samples to be used while fitting + the model. The option is only available when `strategy` is set to `quantile`. + :pr:`21445` by :user:`Felipe Bidu ` and :user:`Amanda Dsouza `. + +- |Fix| :class:`preprocessing.LabelBinarizer` now validates input parameters in `fit` + instead of `__init__`. + :pr:`21434` by :user:`Krum Arnaudov `. + + +:mod:`sklearn.decomposition.KernelPCA` +...................................... +- |Fix| :class:`decomposition.KernelPCA` now validates input parameters in + `fit` instead of `__init__`. + :pr:`21567` by :user:`Maggie Chege `. + +:mod:`sklearn.svm` +.................. + +- |Fix| :class:`smv.NuSVC`, :class:`svm.NuSVR`, :class:`svm.SVC`, + :class:`svm.SVR`, :class:`svm.OneClassSVM` now validate input + parameters in `fit` instead of `__init__`. + :pr:`21436` by :user:`Haidar Almubarak `. + :mod:`sklearn.utils` .................... @@ -102,6 +215,25 @@ Changelog instead of `__init__`. :pr:`21430` by :user:`Desislava Vasileva ` and :user:`Lucy Jimenez `. +- |Enhancement| `utils.validation.check_array` and `utils.validation.type_of_target` + now accept an `input_name` parameter to make the error message more + informative when passed invalid input data (e.g. with NaN or infinite + values). + :pr:`21219` by :user:`Olivier Grisel `. + +- |Enhancement| :func:`utils.validation.check_array` returns a float + ndarray with `np.nan` when passed a `Float32` or `Float64` pandas extension + array with `pd.NA`. :pr:`21278` by `Thomas Fan`_. + +:mod:`sklearn.random_projection` +................................ + +- |API| Adds :term:`get_feature_names_out` to all transformers in the + :mod:`~sklearn.random_projection` module: + :class:`~sklearn.random_projection.GaussianRandomProjection` and + :class:`~sklearn.random_projection.SparseRandomProjection`. :pr:`21330` by + :user:`Loïc Estève `. + Code and Documentation Contributors ----------------------------------- diff --git a/examples/cluster/plot_coin_segmentation.py b/examples/cluster/plot_coin_segmentation.py index 4d83d2bccf639..cf916df3167c2 100644 --- a/examples/cluster/plot_coin_segmentation.py +++ b/examples/cluster/plot_coin_segmentation.py @@ -10,16 +10,19 @@ This procedure (spectral clustering on an image) is an efficient approximate solution for finding normalized graph cuts. 
-There are two options to assign labels: +There are three options to assign labels: -* with 'kmeans' spectral clustering will cluster samples in the embedding space +* 'kmeans' spectral clustering clusters samples in the embedding space using a kmeans algorithm -* whereas 'discrete' will iteratively search for the closest partition - space to the embedding space. - +* 'discrete' iteratively searches for the closest partition + space to the embedding space of spectral clustering. +* 'cluster_qr' assigns labels using the QR factorization with pivoting + that directly determines the partition in the embedding space. """ -# Author: Gael Varoquaux , Brian Cheung +# Author: Gael Varoquaux +# Brian Cheung +# Andrew Knyazev # License: BSD 3 clause import time @@ -61,28 +64,51 @@ eps = 1e-6 graph.data = np.exp(-beta * graph.data / graph.data.std()) + eps -# Apply spectral clustering (this step goes much faster if you have pyamg -# installed) -N_REGIONS = 25 +# The number of segmented regions to display needs to be chosen manually. +# The current version of 'spectral_clustering' does not support determining +# the number of good quality clusters automatically. +n_regions = 26 # %% -# Visualize the resulting regions - -for assign_labels in ("kmeans", "discretize"): +# Compute and visualize the resulting regions + +# Computing a few extra eigenvectors may speed up the eigen_solver. +# The spectral clustering quality may also benefit from requesting +# extra regions for segmentation. +n_regions_plus = 3 + +# Apply spectral clustering using the default eigen_solver='arpack'. +# Any implemented solver can be used: eigen_solver='arpack', 'lobpcg', or 'amg'. +# Choosing eigen_solver='amg' requires an extra package called 'pyamg'. +# The quality of segmentation and the speed of calculations are mostly determined +# by the choice of the solver and the value of the tolerance 'eigen_tol'. +# TODO: varying eigen_tol seems to have no effect for 'lobpcg' and 'amg' #21243. +for assign_labels in ("kmeans", "discretize", "cluster_qr"): t0 = time.time() labels = spectral_clustering( - graph, n_clusters=N_REGIONS, assign_labels=assign_labels, random_state=42 + graph, + n_clusters=(n_regions + n_regions_plus), + eigen_tol=1e-7, + assign_labels=assign_labels, + random_state=42, ) + t1 = time.time() labels = labels.reshape(rescaled_coins.shape) - plt.figure(figsize=(5, 5)) plt.imshow(rescaled_coins, cmap=plt.cm.gray) - for l in range(N_REGIONS): - plt.contour(labels == l, colors=[plt.cm.nipy_spectral(l / float(N_REGIONS))]) + plt.xticks(()) plt.yticks(()) title = "Spectral clustering: %s, %.2fs" % (assign_labels, (t1 - t0)) print(title) plt.title(title) + for l in range(n_regions): + colors = [plt.cm.nipy_spectral((l + 4) / float(n_regions + 4))] + plt.contour(labels == l, colors=colors) + # To view individual segments as they appear, add a plt.pause(0.5) call here plt.show() + +# TODO: After #21194 is merged and #21243 is fixed, check which eigen_solver +# is the best and set eigen_solver='arpack', 'lobpcg', or 'amg' and eigen_tol +# explicitly in this example.
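Outside of the coin example, the new label-assignment strategy can be exercised directly through :class:`~sklearn.cluster.SpectralClustering`. The following is a minimal illustrative sketch, not part of the patch, assuming a scikit-learn build that already contains the `cluster_qr` addition:

import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.datasets import make_blobs

# Toy data with three well-separated groups.
X, _ = make_blobs(n_samples=200, centers=3, random_state=0)

model = SpectralClustering(
    n_clusters=3,
    affinity="nearest_neighbors",
    n_neighbors=10,
    assign_labels="cluster_qr",  # alternatives: "kmeans", "discretize"
    random_state=0,
)
labels = model.fit_predict(X)
print(np.bincount(labels))  # number of samples in each recovered cluster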
diff --git a/examples/decomposition/plot_faces_decomposition.py b/examples/decomposition/plot_faces_decomposition.py index ec6c4ebad226f..dc3b9cf8f4d7a 100644 --- a/examples/decomposition/plot_faces_decomposition.py +++ b/examples/decomposition/plot_faces_decomposition.py @@ -75,7 +75,7 @@ def plot_gallery(title, images, n_col=n_col, n_row=n_row, cmap=plt.cm.gray): ), ( "Non-negative components - NMF", - decomposition.NMF(n_components=n_components, init="nndsvda", tol=5e-3), + decomposition.NMF(n_components=n_components, tol=5e-3), False, ), ( diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index 3ec0958a9e602..fe6d63240523e 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -3,70 +3,163 @@ Kernel PCA ========== -This example shows that Kernel PCA is able to find a projection of the data -that makes data linearly separable. +This example shows the difference between the Principal Component Analysis +(:class:`~sklearn.decomposition.PCA`) and its kernelized version +(:class:`~sklearn.decomposition.KernelPCA`). +First, we show that :class:`~sklearn.decomposition.KernelPCA` is able +to find a projection of the data that makes it linearly separable, which is not the case +with :class:`~sklearn.decomposition.PCA`. + +Finally, we show that inverting this projection is an approximation with +:class:`~sklearn.decomposition.KernelPCA`, while it is exact with +:class:`~sklearn.decomposition.PCA`. """ # Authors: Mathieu Blondel # Andreas Mueller +# Guillaume Lemaitre # License: BSD 3 clause -import numpy as np +# %% +# Projecting data: `PCA` vs. `KernelPCA` +# -------------------------------------- +# +# In this section, we show the advantages of using a kernel when +# projecting data using a Principal Component Analysis (PCA). We create a +# dataset made of two nested circles. +from sklearn.datasets import make_circles +from sklearn.model_selection import train_test_split + +X, y = make_circles(n_samples=1_000, factor=0.3, noise=0.05, random_state=0) +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0) + +# %% +# Let's have a quick first look at the generated dataset. import matplotlib.pyplot as plt +_, (train_ax, test_ax) = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(8, 4)) + +train_ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train) +train_ax.set_ylabel("Feature #1") +train_ax.set_xlabel("Feature #0") +train_ax.set_title("Training data") + +test_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test) +test_ax.set_xlabel("Feature #0") +_ = test_ax.set_title("Testing data") + +# %% +# The samples from each class cannot be linearly separated: there is no +# straight line that can split the samples of the inner set from the outer +# set. +# +# Now, we will use PCA with and without a kernel to see what is the effect of +# using such a kernel. The kernel used here is a radial basis function (RBF) +# kernel.
from sklearn.decomposition import PCA, KernelPCA -from sklearn.datasets import make_circles -np.random.seed(0) - -X, y = make_circles(n_samples=400, factor=0.3, noise=0.05) - -kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10) -X_kpca = kpca.fit_transform(X) -X_back = kpca.inverse_transform(X_kpca) -pca = PCA() -X_pca = pca.fit_transform(X) - -# Plot results - -plt.figure() -plt.subplot(2, 2, 1, aspect="equal") -plt.title("Original space") -reds = y == 0 -blues = y == 1 - -plt.scatter(X[reds, 0], X[reds, 1], c="red", s=20, edgecolor="k") -plt.scatter(X[blues, 0], X[blues, 1], c="blue", s=20, edgecolor="k") -plt.xlabel("$x_1$") -plt.ylabel("$x_2$") - -X1, X2 = np.meshgrid(np.linspace(-1.5, 1.5, 50), np.linspace(-1.5, 1.5, 50)) -X_grid = np.array([np.ravel(X1), np.ravel(X2)]).T -# projection on the first principal component (in the phi space) -Z_grid = kpca.transform(X_grid)[:, 0].reshape(X1.shape) -plt.contour(X1, X2, Z_grid, colors="grey", linewidths=1, origin="lower") - -plt.subplot(2, 2, 2, aspect="equal") -plt.scatter(X_pca[reds, 0], X_pca[reds, 1], c="red", s=20, edgecolor="k") -plt.scatter(X_pca[blues, 0], X_pca[blues, 1], c="blue", s=20, edgecolor="k") -plt.title("Projection by PCA") -plt.xlabel("1st principal component") -plt.ylabel("2nd component") - -plt.subplot(2, 2, 3, aspect="equal") -plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", s=20, edgecolor="k") -plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=20, edgecolor="k") -plt.title("Projection by KPCA") -plt.xlabel(r"1st principal component in space induced by $\phi$") -plt.ylabel("2nd component") - -plt.subplot(2, 2, 4, aspect="equal") -plt.scatter(X_back[reds, 0], X_back[reds, 1], c="red", s=20, edgecolor="k") -plt.scatter(X_back[blues, 0], X_back[blues, 1], c="blue", s=20, edgecolor="k") -plt.title("Original space after inverse transform") -plt.xlabel("$x_1$") -plt.ylabel("$x_2$") - -plt.tight_layout() -plt.show() +pca = PCA(n_components=2) +kernel_pca = KernelPCA( + n_components=None, kernel="rbf", gamma=10, fit_inverse_transform=True, alpha=0.1 +) + +X_test_pca = pca.fit(X_train).transform(X_test) +X_test_kernel_pca = kernel_pca.fit(X_train).transform(X_test) + +# %% +fig, (orig_data_ax, pca_proj_ax, kernel_pca_proj_ax) = plt.subplots( + ncols=3, figsize=(14, 4) +) + +orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test) +orig_data_ax.set_ylabel("Feature #1") +orig_data_ax.set_xlabel("Feature #0") +orig_data_ax.set_title("Testing data") + +pca_proj_ax.scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test) +pca_proj_ax.set_ylabel("Principal component #1") +pca_proj_ax.set_xlabel("Principal component #0") +pca_proj_ax.set_title("Projection of testing data\n using PCA") + +kernel_pca_proj_ax.scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test) +kernel_pca_proj_ax.set_ylabel("Principal component #1") +kernel_pca_proj_ax.set_xlabel("Principal component #0") +_ = kernel_pca_proj_ax.set_title("Projection of testing data\n using KernelPCA") + +# %% +# We recall that PCA transforms the data linearly. Intuitively, it means that +# the coordinate system will be centered, rescaled on each component +# with respect to its variance, and finally rotated. +# The obtained data from this transformation is isotropic and can now be +# projected on its _principal components_. +# +# Thus, looking at the projection made using PCA (i.e.
the middle figure), we +# see that there is no change regarding the scaling; indeed, the data being two +# concentric circles centered at zero, the original data is already isotropic. +# However, we can see that the data have been rotated. As a +# conclusion, we see that such a projection would not help if we define a linear +# classifier to distinguish samples from both classes. +# +# Using a kernel allows making a non-linear projection. Here, by using an RBF +# kernel, we expect that the projection will unfold the dataset while +# approximately preserving the relative distances of pairs of data points that +# are close to one another in the original space. +# +# We observe such behaviour in the figure on the right: the samples of a given +# class are closer to each other than the samples from the opposite class, +# untangling both sample sets. Now, we can use a linear classifier to separate +# the samples from the two classes. +# +# Projecting into the original feature space +# ------------------------------------------ +# +# One particularity to keep in mind when using +# :class:`~sklearn.decomposition.KernelPCA` is related to the reconstruction +# (i.e. the back projection in the original feature space). With +# :class:`~sklearn.decomposition.PCA`, the reconstruction will be exact if +# `n_components` is the same as the number of original features. +# This is the case in this example. +# +# We can investigate if we get the original dataset when back projecting with +# :class:`~sklearn.decomposition.KernelPCA`. +X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test)) +X_reconstructed_kernel_pca = kernel_pca.inverse_transform(kernel_pca.transform(X_test)) + +# %% +fig, (orig_data_ax, pca_back_proj_ax, kernel_pca_back_proj_ax) = plt.subplots( + ncols=3, sharex=True, sharey=True, figsize=(13, 4) ) + +orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test) +orig_data_ax.set_ylabel("Feature #1") +orig_data_ax.set_xlabel("Feature #0") +orig_data_ax.set_title("Original test data") + +pca_back_proj_ax.scatter(X_reconstructed_pca[:, 0], X_reconstructed_pca[:, 1], c=y_test) +pca_back_proj_ax.set_xlabel("Feature #0") +pca_back_proj_ax.set_title("Reconstruction via PCA") + +kernel_pca_back_proj_ax.scatter( + X_reconstructed_kernel_pca[:, 0], X_reconstructed_kernel_pca[:, 1], c=y_test +) +kernel_pca_back_proj_ax.set_xlabel("Feature #0") +_ = kernel_pca_back_proj_ax.set_title("Reconstruction via KernelPCA") + +# %% +# While we see a perfect reconstruction with +# :class:`~sklearn.decomposition.PCA`, we observe a different result for +# :class:`~sklearn.decomposition.KernelPCA`. +# +# Indeed, :meth:`~sklearn.decomposition.KernelPCA.inverse_transform` cannot +# rely on an analytical back-projection, and thus cannot provide an exact reconstruction. +# Instead, a :class:`~sklearn.kernel_ridge.KernelRidge` is internally trained +# to learn a mapping from the kernelized PCA basis to the original feature +# space. This method therefore comes with an approximation introducing small +# differences when back projecting in the original feature space. +# +# To improve the reconstruction using +# :meth:`~sklearn.decomposition.KernelPCA.inverse_transform`, one can tune +# `alpha` in :class:`~sklearn.decomposition.KernelPCA`, the regularization term +# which controls the reliance on the training data during the training of +# the mapping.
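To make the reconstruction discussion above concrete, here is a small illustrative sketch (not part of the patch): it compares the reconstruction error of PCA with that of KernelPCA for two values of the `alpha` regularization, on the same kind of nested-circles data; the exact parameter values are arbitrary choices:

from sklearn.datasets import make_circles
from sklearn.decomposition import PCA, KernelPCA
from sklearn.metrics import mean_squared_error

X, _ = make_circles(n_samples=500, factor=0.3, noise=0.05, random_state=0)

# With as many components as original features, the PCA reconstruction is exact.
pca = PCA(n_components=2)
X_back_pca = pca.inverse_transform(pca.fit_transform(X))
print(f"PCA reconstruction MSE: {mean_squared_error(X, X_back_pca):.2e}")

# KernelPCA learns its inverse transform with a kernel ridge, so the
# reconstruction is approximate and depends on the `alpha` regularization.
for alpha in (1.0, 0.1):
    kernel_pca = KernelPCA(
        kernel="rbf", gamma=10, fit_inverse_transform=True, alpha=alpha
    )
    X_back_kpca = kernel_pca.inverse_transform(kernel_pca.fit_transform(X))
    print(
        f"KernelPCA (alpha={alpha}) reconstruction MSE: "
        f"{mean_squared_error(X, X_back_kpca):.2e}"
    )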
diff --git a/examples/gaussian_process/plot_compare_gpr_krr.py b/examples/gaussian_process/plot_compare_gpr_krr.py index d0aa462f5c3eb..42c013523f79c 100644 --- a/examples/gaussian_process/plot_compare_gpr_krr.py +++ b/examples/gaussian_process/plot_compare_gpr_krr.py @@ -3,119 +3,393 @@ Comparison of kernel ridge and Gaussian process regression ========================================================== -Both kernel ridge regression (KRR) and Gaussian process regression (GPR) learn -a target function by employing internally the "kernel trick". KRR learns a -linear function in the space induced by the respective kernel which corresponds -to a non-linear function in the original space. The linear function in the -kernel space is chosen based on the mean-squared error loss with -ridge regularization. GPR uses the kernel to define the covariance of -a prior distribution over the target functions and uses the observed training -data to define a likelihood function. Based on Bayes theorem, a (Gaussian) -posterior distribution over target functions is defined, whose mean is used -for prediction. - -A major difference is that GPR can choose the kernel's hyperparameters based -on gradient-ascent on the marginal likelihood function while KRR needs to -perform a grid search on a cross-validated loss function (mean-squared error -loss). A further difference is that GPR learns a generative, probabilistic -model of the target function and can thus provide meaningful confidence -intervals and posterior samples along with the predictions while KRR only -provides predictions. - -This example illustrates both methods on an artificial dataset, which -consists of a sinusoidal target function and strong noise. The figure compares -the learned model of KRR and GPR based on a ExpSineSquared kernel, which is -suited for learning periodic functions. The kernel's hyperparameters control -the smoothness (l) and periodicity of the kernel (p). Moreover, the noise level -of the data is learned explicitly by GPR by an additional WhiteKernel component -in the kernel and by the regularization parameter alpha of KRR. - -The figure shows that both methods learn reasonable models of the target -function. GPR correctly identifies the periodicity of the function to be -roughly 2*pi (6.28), while KRR chooses the doubled periodicity 4*pi. Besides -that, GPR provides reasonable confidence bounds on the prediction which are not -available for KRR. A major difference between the two methods is the time -required for fitting and predicting: while fitting KRR is fast in principle, -the grid-search for hyperparameter optimization scales exponentially with the -number of hyperparameters ("curse of dimensionality"). The gradient-based -optimization of the parameters in GPR does not suffer from this exponential -scaling and is thus considerable faster on this example with 3-dimensional -hyperparameter space. The time for predicting is similar; however, generating -the variance of the predictive distribution of GPR takes considerable longer -than just predicting the mean. +This example illustrates differences between a kernel ridge regression and a +Gaussian process regression. +Both kernel ridge regression and Gaussian process regression are using a +so-called "kernel trick" to make their models expressive enough to fit +the training data. However, the machine learning problems solved by the two +methods are drastically different. 
+ +Kernel ridge regression will find the target function that minimizes a loss +function (the mean squared error). + +Instead of finding a single target function, the Gaussian process regression +employs a probabilistic approach: a Gaussian posterior distribution over +target functions is defined based on Bayes' theorem. Prior +probabilities on target functions are combined with a likelihood function +defined by the observed training data to provide estimates of the posterior +distributions. + +We will illustrate these differences with an example and we will also focus on +tuning the kernel hyperparameters. """ # Authors: Jan Hendrik Metzen +# Guillaume Lemaitre # License: BSD 3 clause -import time - +# %% +# Generating a dataset +# -------------------- +# +# We create a synthetic dataset. The true generative process will take a 1-D +# vector and compute its sine. Note that the period of this sine is thus +# :math:`2 \pi`. We will reuse this information later in this example. import numpy as np +rng = np.random.RandomState(0) +data = np.linspace(0, 30, num=1_000).reshape(-1, 1) +target = np.sin(data).ravel() + +# %% +# Now, we can imagine a scenario where we get observations from this true +# process. However, we will add some challenges: +# +# - the measurements will be noisy; +# - only samples from the beginning of the signal will be available. +training_sample_indices = rng.choice(np.arange(0, 400), size=40, replace=False) +training_data = data[training_sample_indices] +training_noisy_target = target[training_sample_indices] + 0.5 * rng.randn( + len(training_sample_indices) +) + +# %% +# Let's plot the true signal and the noisy measurements available for training. import matplotlib.pyplot as plt +plt.plot(data, target, label="True signal", linewidth=2) +plt.scatter( + training_data, + training_noisy_target, + color="black", + label="Noisy measurements", +) +plt.legend() +plt.xlabel("data") +plt.ylabel("target") +_ = plt.title( + "Illustration of the true generative process and \n" + "noisy measurements available during training" ) + +# %% +# Limitations of a simple linear model +# ------------------------------------ +# +# First, we would like to highlight the limitations of a linear model given +# our dataset. We fit a :class:`~sklearn.linear_model.Ridge` and check the +# predictions of this model on our dataset. +from sklearn.linear_model import Ridge + +ridge = Ridge().fit(training_data, training_noisy_target) + +plt.plot(data, target, label="True signal", linewidth=2) +plt.scatter( + training_data, + training_noisy_target, + color="black", + label="Noisy measurements", +) +plt.plot(data, ridge.predict(data), label="Ridge regression") +plt.legend() +plt.xlabel("data") +plt.ylabel("target") +_ = plt.title("Limitation of a linear model such as ridge") + +# %% +# Such a ridge regressor underfits data since it is not expressive enough. +# +# Kernel methods: kernel ridge and Gaussian process +# ------------------------------------------------- +# +# Kernel ridge +# ............ +# +# We can make the previous linear model more expressive by using a so-called +# kernel. A kernel is an embedding from the original feature space to another +# one. Simply put, it is used to map our original data into a newer and more +# complex feature space. This new space is explicitly defined by the choice of +# kernel. +# +# In our case, we know that the true generative process is a periodic function.
+# We can use a :class:`~sklearn.gaussian_process.kernels.ExpSineSquared` kernel
+# which allows recovering the periodicity. The class
+# :class:`~sklearn.kernel_ridge.KernelRidge` will accept such a kernel.
+#
+# Using this model together with a kernel is equivalent to embedding the data
+# using the mapping function of the kernel and then applying a ridge
+# regression. In practice, the data are not mapped explicitly; instead the dot
+# product between samples in the higher dimensional feature space is computed
+# using the "kernel trick".
+#
+# Thus, let's use such a :class:`~sklearn.kernel_ridge.KernelRidge`.
+import time
+from sklearn.gaussian_process.kernels import ExpSineSquared
 from sklearn.kernel_ridge import KernelRidge
-from sklearn.model_selection import GridSearchCV
-from sklearn.gaussian_process import GaussianProcessRegressor
-from sklearn.gaussian_process.kernels import WhiteKernel, ExpSineSquared
-rng = np.random.RandomState(0)
+kernel_ridge = KernelRidge(kernel=ExpSineSquared())
+
+start_time = time.time()
+kernel_ridge.fit(training_data, training_noisy_target)
+print(
+    f"Fitting KernelRidge with default kernel: {time.time() - start_time:.3f} seconds"
+)
+
+# %%
+plt.plot(data, target, label="True signal", linewidth=2, linestyle="dashed")
+plt.scatter(
+    training_data,
+    training_noisy_target,
+    color="black",
+    label="Noisy measurements",
+)
+plt.plot(
+    data,
+    kernel_ridge.predict(data),
+    label="Kernel ridge",
+    linewidth=2,
+    linestyle="dashdot",
+)
+plt.legend(loc="lower right")
+plt.xlabel("data")
+plt.ylabel("target")
+_ = plt.title(
+    "Kernel ridge regression with an exponential sine squared\n "
+    "kernel using default hyperparameters"
+)
+
+# %%
+# This fitted model is not accurate. Indeed, we did not set the parameters of
+# the kernel and instead used the default ones. We can inspect them.
+kernel_ridge.kernel
+
+# %%
+# Our kernel has two parameters: the length-scale and the periodicity. For our
+# dataset, we use `sin` as the generative process, implying a
+# :math:`2 \pi`-periodicity for the signal. The default value of the
+# periodicity parameter being :math:`1`, this explains the high frequency
+# observed in the predictions of our model.
+# Similar conclusions could be drawn with the length-scale parameter. Thus, it
+# tells us that the kernel parameters need to be tuned. We will use a
+# randomized search to tune the different parameters of the kernel ridge
+# model: the `alpha` parameter and the kernel parameters, whose exact names
+# are listed below.
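+# (A quick sketch using the `kernel_ridge` estimator defined above: nested
+# kernel parameters follow scikit-learn's `kernel__<parameter>` naming
+# convention, which is what the randomized search below relies on.)
+for param_name in kernel_ridge.get_params():
+    print(param_name)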
+ +# %% +from sklearn.model_selection import RandomizedSearchCV +from sklearn.utils.fixes import loguniform -# Generate sample data -X = 15 * rng.rand(100, 1) -y = np.sin(X).ravel() -y += 3 * (0.5 - rng.rand(X.shape[0])) # add noise - -# Fit KernelRidge with parameter selection based on 5-fold cross validation -param_grid = { - "alpha": [1e0, 1e-1, 1e-2, 1e-3], - "kernel": [ - ExpSineSquared(l, p) - for l in np.logspace(-2, 2, 10) - for p in np.logspace(0, 2, 10) - ], +param_distributions = { + "alpha": loguniform(1e0, 1e3), + "kernel__length_scale": loguniform(1e-2, 1e2), + "kernel__periodicity": loguniform(1e0, 1e1), } -kr = GridSearchCV(KernelRidge(), param_grid=param_grid) -stime = time.time() -kr.fit(X, y) -print("Time for KRR fitting: %.3f" % (time.time() - stime)) - -gp_kernel = ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1)) + WhiteKernel(1e-1) -gpr = GaussianProcessRegressor(kernel=gp_kernel) -stime = time.time() -gpr.fit(X, y) -print("Time for GPR fitting: %.3f" % (time.time() - stime)) - -# Predict using kernel ridge -X_plot = np.linspace(0, 20, 10000)[:, None] -stime = time.time() -y_kr = kr.predict(X_plot) -print("Time for KRR prediction: %.3f" % (time.time() - stime)) - -# Predict using gaussian process regressor -stime = time.time() -y_gpr = gpr.predict(X_plot, return_std=False) -print("Time for GPR prediction: %.3f" % (time.time() - stime)) - -stime = time.time() -y_gpr, y_std = gpr.predict(X_plot, return_std=True) -print("Time for GPR prediction with standard-deviation: %.3f" % (time.time() - stime)) - -# Plot results -plt.figure(figsize=(10, 5)) -lw = 2 -plt.scatter(X, y, c="k", label="data") -plt.plot(X_plot, np.sin(X_plot), color="navy", lw=lw, label="True") -plt.plot(X_plot, y_kr, color="turquoise", lw=lw, label="KRR (%s)" % kr.best_params_) -plt.plot(X_plot, y_gpr, color="darkorange", lw=lw, label="GPR (%s)" % gpr.kernel_) +kernel_ridge_tuned = RandomizedSearchCV( + kernel_ridge, + param_distributions=param_distributions, + n_iter=500, + random_state=0, +) +start_time = time.time() +kernel_ridge_tuned.fit(training_data, training_noisy_target) +print(f"Time for KernelRidge fitting: {time.time() - start_time:.3f} seconds") + +# %% +# Fitting the model is now more computationally expensive since we have to try +# several combinations of hyperparameters. We can have a look at the +# hyperparameters found to get some intuitions. +kernel_ridge_tuned.best_params_ + +# %% +# Looking at the best parameters, we see that they are different from the +# defaults. We also see that the periodicity is closer to the expected value: +# :math:`2 \pi`. We can now inspect the predictions of our tuned kernel ridge. +start_time = time.time() +predictions_kr = kernel_ridge_tuned.predict(data) +print(f"Time for KernelRidge predict: {time.time() - start_time:.3f} seconds") + +# %% +plt.plot(data, target, label="True signal", linewidth=2, linestyle="dashed") +plt.scatter( + training_data, + training_noisy_target, + color="black", + label="Noisy measurements", +) +plt.plot( + data, + predictions_kr, + label="Kernel ridge", + linewidth=2, + linestyle="dashdot", +) +plt.legend(loc="lower right") +plt.xlabel("data") +plt.ylabel("target") +_ = plt.title( + "Kernel ridge regression with an exponential sine squared\n " + "kernel using tuned hyperparameters" +) + +# %% +# We get a much more accurate model. We still observe some errors mainly due to +# the noise added to the dataset. +# +# Gaussian process regression +# ........................... 
+#
+# Now, we will use a
+# :class:`~sklearn.gaussian_process.GaussianProcessRegressor` to fit the same
+# dataset. When training a Gaussian process, the hyperparameters of the kernel
+# are optimized during the fitting process. There is no need for an external
+# hyperparameter search. Here, we create a slightly more complex kernel than
+# for the kernel ridge regressor: we add a
+# :class:`~sklearn.gaussian_process.kernels.WhiteKernel` that is used to
+# estimate the noise in the dataset.
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import WhiteKernel
+
+kernel = 1.0 * ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1)) + WhiteKernel(
+    1e-1
+)
+gaussian_process = GaussianProcessRegressor(kernel=kernel)
+start_time = time.time()
+gaussian_process.fit(training_data, training_noisy_target)
+print(
+    f"Time for GaussianProcessRegressor fitting: {time.time() - start_time:.3f} seconds"
+)
+
+# %%
+# The computational cost of training a Gaussian process is much lower than that
+# of the kernel ridge with a randomized search. We can check the parameters of
+# the kernel that we computed.
+gaussian_process.kernel_
+
+# %%
+# Indeed, we see that the parameters have been optimized. Looking at the
+# `periodicity` parameter, we see that we found a period close to the
+# theoretical value :math:`2 \pi`. We can now have a look at the predictions of
+# our model.
+start_time = time.time()
+mean_predictions_gpr, std_predictions_gpr = gaussian_process.predict(
+    data,
+    return_std=True,
+)
+print(
+    f"Time for GaussianProcessRegressor predict: {time.time() - start_time:.3f} seconds"
+)
+
+# %%
+plt.plot(data, target, label="True signal", linewidth=2, linestyle="dashed")
+plt.scatter(
+    training_data,
+    training_noisy_target,
+    color="black",
+    label="Noisy measurements",
+)
+# Plot the predictions of the kernel ridge
+plt.plot(
+    data,
+    predictions_kr,
+    label="Kernel ridge",
+    linewidth=2,
+    linestyle="dashdot",
+)
+# Plot the predictions of the gaussian process regressor
+plt.plot(
+    data,
+    mean_predictions_gpr,
+    label="Gaussian process regressor",
+    linewidth=2,
+    linestyle="dotted",
+)
 plt.fill_between(
-    X_plot[:, 0], y_gpr - y_std, y_gpr + y_std, color="darkorange", alpha=0.2
+    data.ravel(),
+    mean_predictions_gpr - std_predictions_gpr,
+    mean_predictions_gpr + std_predictions_gpr,
+    color="tab:green",
+    alpha=0.2,
 )
+plt.legend(loc="lower right")
 plt.xlabel("data")
 plt.ylabel("target")
-plt.xlim(0, 20)
-plt.ylim(-4, 4)
-plt.title("GPR versus Kernel Ridge")
-plt.legend(loc="best", scatterpoints=1, prop={"size": 8})
-plt.show()
+_ = plt.title("Comparison between kernel ridge and gaussian process regressor")
+
+# %%
+# We observe that the results of the kernel ridge and the Gaussian process
+# regressor are close. However, the Gaussian process regressor also provides
+# uncertainty information that is not available with a kernel ridge.
+# Due to the probabilistic formulation of the target functions, the
+# Gaussian process can output the standard deviation (or the covariance)
+# together with the mean predictions of the target functions.
+#
+# However, it comes at a cost: the time to compute the predictions is higher
+# with a Gaussian process.
+#
+# Final conclusion
+# ----------------
+#
+# We can give a final word regarding the ability of the two models to
+# extrapolate. Indeed, we only provided the beginning of the signal as a
+# training set; a short quantitative check on the unseen part of the signal
+# follows.
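+# (This check is an illustrative sketch rather than part of the original
+# comparison: it reuses `predictions_kr` and `mean_predictions_gpr` computed
+# above, and index 400 marks the end of the region the training samples were
+# drawn from.)
+from sklearn.metrics import mean_squared_error
+
+extrapolation_slice = slice(400, None)
+print(
+    "KRR extrapolation MSE: "
+    f"{mean_squared_error(target[extrapolation_slice], predictions_kr[extrapolation_slice]):.3f}"
+)
+print(
+    "GPR extrapolation MSE: "
+    f"{mean_squared_error(target[extrapolation_slice], mean_predictions_gpr[extrapolation_slice]):.3f}"
+)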
+
+# %%
+# Using a periodic kernel forces our model to repeat the pattern found on the
+# training set. Using this kernel information together with the capacity of
+# both models to extrapolate, we observe that the models will continue to
+# predict the sine pattern.
+#
+# A Gaussian process allows combining kernels. Thus, we could associate the
+# exponential sine squared kernel with a radial basis function kernel.
+from sklearn.gaussian_process.kernels import RBF
+
+kernel = 1.0 * ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1)) * RBF(
+    length_scale=15, length_scale_bounds="fixed"
+) + WhiteKernel(1e-1)
+gaussian_process = GaussianProcessRegressor(kernel=kernel)
+gaussian_process.fit(training_data, training_noisy_target)
+mean_predictions_gpr, std_predictions_gpr = gaussian_process.predict(
+    data,
+    return_std=True,
+)
+
+# %%
+plt.plot(data, target, label="True signal", linewidth=2, linestyle="dashed")
+plt.scatter(
+    training_data,
+    training_noisy_target,
+    color="black",
+    label="Noisy measurements",
+)
+# Plot the predictions of the kernel ridge
+plt.plot(
+    data,
+    predictions_kr,
+    label="Kernel ridge",
+    linewidth=2,
+    linestyle="dashdot",
+)
+# Plot the predictions of the gaussian process regressor
+plt.plot(
+    data,
+    mean_predictions_gpr,
+    label="Gaussian process regressor",
+    linewidth=2,
+    linestyle="dotted",
+)
+plt.fill_between(
+    data.ravel(),
+    mean_predictions_gpr - std_predictions_gpr,
+    mean_predictions_gpr + std_predictions_gpr,
+    color="tab:green",
+    alpha=0.2,
+)
+plt.legend(loc="lower right")
+plt.xlabel("data")
+plt.ylabel("target")
+_ = plt.title("Effect of using a radial basis function kernel")
+
+# %%
+# Using a radial basis function kernel attenuates the periodicity effect once
+# no samples are available in the training region.
+# As test samples get further away from the training ones, the predictions
+# converge towards their mean and their standard deviation also increases.
diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py
index 988ec228817ff..5ce1aa0efb5ea 100644
--- a/examples/gaussian_process/plot_gpr_co2.py
+++ b/examples/gaussian_process/plot_gpr_co2.py
@@ -1,161 +1,218 @@
 """
-========================================================
-Gaussian process regression (GPR) on Mauna Loa CO2 data.
-========================================================
+=======================================================
+Gaussian process regression (GPR) on Mauna Loa CO2 data
+=======================================================
 
 This example is based on Section 5.4.3 of "Gaussian Processes for Machine
-Learning" [RW2006]. It illustrates an example of complex kernel engineering and
-hyperparameter optimization using gradient ascent on the
+Learning" [RW2006]_. It illustrates an example of complex kernel engineering
+and hyperparameter optimization using gradient ascent on the
 log-marginal-likelihood. The data consists of the monthly average atmospheric
-CO2 concentrations (in parts per million by volume (ppmv)) collected at the
+CO2 concentrations (in parts per million by volume (ppm)) collected at the
 Mauna Loa Observatory in Hawaii, between 1958 and 2001. The objective is to
-model the CO2 concentration as a function of the time t.
-
-The kernel is composed of several terms that are responsible for explaining
-different properties of the signal:
-
-- a long term, smooth rising trend is to be explained by an RBF kernel.
The - RBF kernel with a large length-scale enforces this component to be smooth; - it is not enforced that the trend is rising which leaves this choice to the - GP. The specific length-scale and the amplitude are free hyperparameters. - -- a seasonal component, which is to be explained by the periodic - ExpSineSquared kernel with a fixed periodicity of 1 year. The length-scale - of this periodic component, controlling its smoothness, is a free parameter. - In order to allow decaying away from exact periodicity, the product with an - RBF kernel is taken. The length-scale of this RBF component controls the - decay time and is a further free parameter. - -- smaller, medium term irregularities are to be explained by a - RationalQuadratic kernel component, whose length-scale and alpha parameter, - which determines the diffuseness of the length-scales, are to be determined. - According to [RW2006], these irregularities can better be explained by - a RationalQuadratic than an RBF kernel component, probably because it can - accommodate several length-scales. - -- a "noise" term, consisting of an RBF kernel contribution, which shall - explain the correlated noise components such as local weather phenomena, - and a WhiteKernel contribution for the white noise. The relative amplitudes - and the RBF's length scale are further free parameters. - -Maximizing the log-marginal-likelihood after subtracting the target's mean -yields the following kernel with an LML of -83.214:: - - 34.4**2 * RBF(length_scale=41.8) - + 3.27**2 * RBF(length_scale=180) * ExpSineSquared(length_scale=1.44, - periodicity=1) - + 0.446**2 * RationalQuadratic(alpha=17.7, length_scale=0.957) - + 0.197**2 * RBF(length_scale=0.138) + WhiteKernel(noise_level=0.0336) - -Thus, most of the target signal (34.4ppm) is explained by a long-term rising -trend (length-scale 41.8 years). The periodic component has an amplitude of -3.27ppm, a decay time of 180 years and a length-scale of 1.44. The long decay -time indicates that we have a locally very close to periodic seasonal -component. The correlated noise has an amplitude of 0.197ppm with a length -scale of 0.138 years and a white-noise contribution of 0.197ppm. Thus, the -overall noise level is very small, indicating that the data can be very well -explained by the model. The figure shows also that the model makes very -confident predictions until around 2015. +model the CO2 concentration as a function of the time :math:`t` and extrapolate +for years after 2001. +.. topic: References + + .. [RW2006] `Rasmussen, Carl Edward. + "Gaussian processes in machine learning." + Summer school on machine learning. Springer, Berlin, Heidelberg, 2003 + `_. """ +print(__doc__) + # Authors: Jan Hendrik Metzen -# +# Guillaume Lemaitre # License: BSD 3 clause -import numpy as np - -from matplotlib import pyplot as plt +# %% +# Build the dataset +# ----------------- +# +# We will derive a dataset from the Mauna Loa Observatory that collected air +# samples. We are interested in estimating the concentration of CO2 and +# extrapolate it for futher year. First, we load the original dataset available +# in OpenML. 
 from sklearn.datasets import fetch_openml
-from sklearn.gaussian_process import GaussianProcessRegressor
-from sklearn.gaussian_process.kernels import (
-    RBF,
-    WhiteKernel,
-    RationalQuadratic,
-    ExpSineSquared,
-)
-
-def load_mauna_loa_atmospheric_co2():
-    ml_data = fetch_openml(data_id=41187, as_frame=False)
-    months = []
-    ppmv_sums = []
-    counts = []
-
-    y = ml_data.data[:, 0]
-    m = ml_data.data[:, 1]
-    month_float = y + (m - 1) / 12
-    ppmvs = ml_data.target
-
-    for month, ppmv in zip(month_float, ppmvs):
-        if not months or month != months[-1]:
-            months.append(month)
-            ppmv_sums.append(ppmv)
-            counts.append(1)
-        else:
-            # aggregate monthly sum to produce average
-            ppmv_sums[-1] += ppmv
-            counts[-1] += 1
-
-    months = np.asarray(months).reshape(-1, 1)
-    avg_ppmvs = np.asarray(ppmv_sums) / counts
-    return months, avg_ppmvs
-
-
-X, y = load_mauna_loa_atmospheric_co2()
-
-# Kernel with parameters given in GPML book
-k1 = 66.0 ** 2 * RBF(length_scale=67.0)  # long term smooth rising trend
-k2 = (
-    2.4 ** 2
-    * RBF(length_scale=90.0)
-    * ExpSineSquared(length_scale=1.3, periodicity=1.0)
-)  # seasonal component
-# medium term irregularity
-k3 = 0.66 ** 2 * RationalQuadratic(length_scale=1.2, alpha=0.78)
-k4 = 0.18 ** 2 * RBF(length_scale=0.134) + WhiteKernel(
-    noise_level=0.19 ** 2
-)  # noise terms
-kernel_gpml = k1 + k2 + k3 + k4
-
-gp = GaussianProcessRegressor(
-    kernel=kernel_gpml, alpha=0, optimizer=None, normalize_y=True
+co2 = fetch_openml(data_id=41187, as_frame=True)
+co2.frame.head()
+
+# %%
+# First, we process the original dataframe to create a date index and select
+# only the CO2 column.
+import pandas as pd
+
+co2_data = co2.frame
+co2_data["date"] = pd.to_datetime(co2_data[["year", "month", "day"]])
+co2_data = co2_data[["date", "co2"]].set_index("date")
+co2_data.head()
+
+# %%
+co2_data.index.min(), co2_data.index.max()
+
+# %%
+# We see that we get CO2 concentrations for some days from March, 1958 to
+# December, 2001. We can plot this raw information to get a better
+# understanding.
+import matplotlib.pyplot as plt
+
+co2_data.plot()
+plt.ylabel("CO$_2$ concentration (ppm)")
+_ = plt.title("Raw air samples measurements from the Mauna Loa Observatory")
+
+# %%
+# We will preprocess the dataset by taking a monthly average and dropping
+# months for which no measurements were collected. Such processing will have a
+# smoothing effect on the data.
+co2_data = co2_data.resample("M").mean().dropna(axis="index", how="any")
+co2_data.plot()
+plt.ylabel("Monthly average of CO$_2$ concentration (ppm)")
+_ = plt.title(
+    "Monthly average of air samples measurements\nfrom the Mauna Loa Observatory"
 )
-gp.fit(X, y)
-
-print("GPML kernel: %s" % gp.kernel_)
-print("Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.kernel_.theta))
-# Kernel with optimized parameters
-k1 = 50.0 ** 2 * RBF(length_scale=50.0)  # long term smooth rising trend
-k2 = (
+# %%
+# The idea in this example will be to predict the CO2 concentration as a
+# function of the date. We are also interested in extrapolating for upcoming
+# years after 2001.
+#
+# As a first step, we will divide the data and the target to estimate. The data
+# being a date, we will convert it into a numeric value.
+X = (co2_data.index.year + co2_data.index.month / 12).to_numpy().reshape(-1, 1)
+y = co2_data["co2"].to_numpy()
+
+# %%
+# Design the proper kernel
+# ------------------------
+#
+# To design the kernel to use with our Gaussian process, we can make some
+# assumptions regarding the data at hand.
+# We observe that they have several
+# characteristics: we see a long term rising trend, a pronounced seasonal
+# variation and some smaller irregularities. We can use different appropriate
+# kernels that would capture these features.
+#
+# First, the long term rising trend could be fitted using a radial basis
+# function (RBF) kernel with a large length-scale parameter. The RBF kernel
+# with a large length-scale enforces this component to be smooth. A rising
+# trend is not enforced, so as to give a degree of freedom to our model. The
+# specific length-scale and the amplitude are free hyperparameters.
+from sklearn.gaussian_process.kernels import RBF
+
+long_term_trend_kernel = 50.0 ** 2 * RBF(length_scale=50.0)
+
+# %%
+# The seasonal variation is explained by the periodic exponential sine squared
+# kernel with a fixed periodicity of 1 year. The length-scale of this periodic
+# component, controlling its smoothness, is a free parameter. In order to allow
+# decaying away from exact periodicity, the product with an RBF kernel is
+# taken. The length-scale of this RBF component controls the decay time and is
+# a further free parameter. This type of kernel is also known as a locally
+# periodic kernel.
+from sklearn.gaussian_process.kernels import ExpSineSquared
+
+seasonal_kernel = (
     2.0 ** 2
     * RBF(length_scale=100.0)
     * ExpSineSquared(length_scale=1.0, periodicity=1.0, periodicity_bounds="fixed")
-)  # seasonal component
-# medium term irregularities
-k3 = 0.5 ** 2 * RationalQuadratic(length_scale=1.0, alpha=1.0)
-k4 = 0.1 ** 2 * RBF(length_scale=0.1) + WhiteKernel(
-    noise_level=0.1 ** 2, noise_level_bounds=(1e-5, np.inf)
-)  # noise terms
-kernel = k1 + k2 + k3 + k4
-
-gp = GaussianProcessRegressor(kernel=kernel, alpha=0, normalize_y=True)
-gp.fit(X, y)
-
-print("\nLearned kernel: %s" % gp.kernel_)
-print("Log-marginal-likelihood: %.3f" % gp.log_marginal_likelihood(gp.kernel_.theta))
-
-X_ = np.linspace(X.min(), X.max() + 30, 1000)[:, np.newaxis]
-y_pred, y_std = gp.predict(X_, return_std=True)
-
-# Illustration
-plt.scatter(X, y, c="k")
-plt.plot(X_, y_pred)
-plt.fill_between(X_[:, 0], y_pred - y_std, y_pred + y_std, alpha=0.5, color="k")
-plt.xlim(X_.min(), X_.max())
+)
+
+# %%
+# The small irregularities are to be explained by a rational quadratic kernel
+# component, whose length-scale and alpha parameter, which quantifies the
+# diffuseness of the length-scales, are to be determined. A rational quadratic
+# kernel is equivalent to an RBF kernel with several length-scales and will
+# better accommodate the different irregularities.
+from sklearn.gaussian_process.kernels import RationalQuadratic
+
+irregularities_kernel = 0.5 ** 2 * RationalQuadratic(length_scale=1.0, alpha=1.0)
+
+# %%
+# Finally, the noise in the dataset can be accounted for with a kernel
+# consisting of an RBF kernel contribution, which shall explain the correlated
+# noise components such as local weather phenomena, and a white kernel
+# contribution for the white noise. The relative amplitudes and the RBF's
+# length scale are further free parameters.
+from sklearn.gaussian_process.kernels import WhiteKernel
+
+noise_kernel = 0.1 ** 2 * RBF(length_scale=0.1) + WhiteKernel(
+    noise_level=0.1 ** 2, noise_level_bounds=(1e-5, 1e5)
+)
+
+# %%
+# Thus, our final kernel is the sum of all the previous kernels.
+co2_kernel = (
+    long_term_trend_kernel + seasonal_kernel + irregularities_kernel + noise_kernel
+)
+co2_kernel
+
+# %%
+# Model fitting and extrapolation
+# -------------------------------
+#
+# Now, we are ready to use a Gaussian process regressor and fit the available
+# data. To follow the example from the literature, we will subtract the mean
+# from the target. We could have used `normalize_y=True`. However, doing so
+# would have also scaled the target (dividing `y` by its standard deviation).
+# Thus, the hyperparameters of the different kernels would have had a different
+# meaning since they would not have been expressed in ppm.
+from sklearn.gaussian_process import GaussianProcessRegressor
+
+y_mean = y.mean()
+gaussian_process = GaussianProcessRegressor(kernel=co2_kernel, normalize_y=False)
+gaussian_process.fit(X, y - y_mean)
+
+# %%
+# Now, we will use the Gaussian process to predict on:
+#
+# - training data to inspect the goodness of fit;
+# - future data to see the extrapolation done by the model.
+#
+# Thus, we create synthetic data from 1958 to the current month. In addition,
+# we need to add the subtracted mean computed during training.
+import datetime
+import numpy as np
+
+today = datetime.datetime.now()
+current_month = today.year + today.month / 12
+X_test = np.linspace(start=1958, stop=current_month, num=1_000).reshape(-1, 1)
+mean_y_pred, std_y_pred = gaussian_process.predict(X_test, return_std=True)
+mean_y_pred += y_mean
+
+# %%
+plt.plot(X, y, color="black", linestyle="dashed", label="Measurements")
+plt.plot(X_test, mean_y_pred, color="tab:blue", alpha=0.4, label="Gaussian process")
+plt.fill_between(
+    X_test.ravel(),
+    mean_y_pred - std_y_pred,
+    mean_y_pred + std_y_pred,
+    color="tab:blue",
+    alpha=0.2,
+)
+plt.legend()
 plt.xlabel("Year")
-plt.ylabel(r"CO$_2$ in ppm")
-plt.title(r"Atmospheric CO$_2$ concentration at Mauna Loa")
-plt.tight_layout()
-plt.show()
+plt.ylabel("Monthly average of CO$_2$ concentration (ppm)")
+_ = plt.title(
+    "Monthly average of air samples measurements\nfrom the Mauna Loa Observatory"
)
+
+# %%
+# Our fitted model is capable of fitting the previous data properly and of
+# extrapolating to future years with confidence.
+#
+# Interpretation of kernel hyperparameters
+# ----------------------------------------
+#
+# Now, we can have a look at the hyperparameters of the kernel.
+gaussian_process.kernel_
+
+# %%
+# Thus, most of the target signal, with the mean subtracted, is explained by a
+# long-term rising trend of ~45 ppm and a length-scale of ~52 years. The
+# periodic component has an amplitude of ~2.6 ppm, a decay time of ~90 years and
+# a length-scale of ~1.5. The long decay time indicates that we have a
+# component very close to a seasonal periodicity. The correlated noise has an
+# amplitude of ~0.2 ppm with a length scale of ~0.12 years and a white-noise
+# contribution of ~0.04 ppm. Thus, the overall noise level is very small,
+# indicating that the data can be very well explained by the model.
diff --git a/examples/gaussian_process/plot_gpr_noisy.py b/examples/gaussian_process/plot_gpr_noisy.py
index c5d5a77780df3..04ea696e4319f 100644
--- a/examples/gaussian_process/plot_gpr_noisy.py
+++ b/examples/gaussian_process/plot_gpr_noisy.py
@@ -3,109 +3,186 @@
 Gaussian process regression (GPR) with noise-level estimation
 =============================================================
 
-This example illustrates that GPR with a sum-kernel including a WhiteKernel can
-estimate the noise level of data.
An illustration of the -log-marginal-likelihood (LML) landscape shows that there exist two local -maxima of LML. The first corresponds to a model with a high noise level and a -large length scale, which explains all variations in the data by noise. The -second one has a smaller noise level and shorter length scale, which explains -most of the variation by the noise-free functional relationship. The second -model has a higher likelihood; however, depending on the initial value for the -hyperparameters, the gradient-based optimization might also converge to the -high-noise solution. It is thus important to repeat the optimization several -times for different initializations. - +This example shows the ability of the +:class:`~sklearn.gaussian_process.kernels.WhiteKernel` to estimate the noise +level in the data. Moreover, we show the importance of kernel hyperparameters +initialization. """ # Authors: Jan Hendrik Metzen -# +# Guillaume Lemaitre # License: BSD 3 clause +# %% +# Data generation +# --------------- +# +# We will work in a setting where `X` will contain a single feature. We create a +# function that will generate the target to be predicted. We will add an +# option to add some noise to the generated target. import numpy as np -from matplotlib import pyplot as plt -from matplotlib.colors import LogNorm -from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels import RBF, WhiteKernel +def target_generator(X, add_noise=False): + target = 0.5 + np.sin(3 * X) + if add_noise: + rng = np.random.RandomState(1) + target += rng.normal(0, 0.3, size=target.shape) + return target.squeeze() + + +# %% +# Let's have a look to the target generator where we will not add any noise to +# observe the signal that we would like to predict. +X = np.linspace(0, 5, num=30).reshape(-1, 1) +y = target_generator(X, add_noise=False) +# %% +import matplotlib.pyplot as plt +plt.plot(X, y, label="Expected signal") +plt.legend() +plt.xlabel("X") +_ = plt.ylabel("y") + +# %% +# The target is transforming the input `X` using a sine function. Now, we will +# generate few noisy training samples. To illustrate the noise level, we will +# plot the true signal together with the noisy training samples. 
 rng = np.random.RandomState(0)
-X = rng.uniform(0, 5, 20)[:, np.newaxis]
-y = 0.5 * np.sin(3 * X[:, 0]) + rng.normal(0, 0.5, X.shape[0])
+X_train = rng.uniform(0, 5, size=20).reshape(-1, 1)
+y_train = target_generator(X_train, add_noise=True)
-# First run
-plt.figure()
-kernel = 1.0 * RBF(length_scale=100.0, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(
-    noise_level=1, noise_level_bounds=(1e-10, 1e1)
-)
-gp = GaussianProcessRegressor(kernel=kernel, alpha=0.0).fit(X, y)
-X_ = np.linspace(0, 5, 100)
-y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
-plt.plot(X_, y_mean, "k", lw=3, zorder=9)
-plt.fill_between(
-    X_,
-    y_mean - np.sqrt(np.diag(y_cov)),
-    y_mean + np.sqrt(np.diag(y_cov)),
-    alpha=0.5,
-    color="k",
+# %%
+plt.plot(X, y, label="Expected signal")
+plt.scatter(
+    x=X_train[:, 0],
+    y=y_train,
+    color="black",
+    alpha=0.4,
+    label="Observations",
 )
-plt.plot(X_, 0.5 * np.sin(3 * X_), "r", lw=3, zorder=9)
-plt.scatter(X[:, 0], y, c="r", s=50, zorder=10, edgecolors=(0, 0, 0))
-plt.title(
-    "Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
-    % (kernel, gp.kernel_, gp.log_marginal_likelihood(gp.kernel_.theta))
+plt.legend()
+plt.xlabel("X")
+_ = plt.ylabel("y")
+
+# %%
+# Optimisation of kernel hyperparameters in GPR
+# ---------------------------------------------
+#
+# Now, we will create a
+# :class:`~sklearn.gaussian_process.GaussianProcessRegressor`
+# using an additive kernel adding a
+# :class:`~sklearn.gaussian_process.kernels.RBF` and
+# :class:`~sklearn.gaussian_process.kernels.WhiteKernel` kernels.
+# The :class:`~sklearn.gaussian_process.kernels.WhiteKernel` is a kernel that
+# will be able to estimate the amount of noise present in the data while the
+# :class:`~sklearn.gaussian_process.kernels.RBF` will serve to fit the
+# non-linearity between the data and the target.
+#
+# However, we will show that the hyperparameter space contains several local
+# minima. This highlights the importance of initial hyperparameter values.
+#
+# We will create a model using a kernel with a high noise level and a large
+# length scale, which will explain all variations in the data by noise.
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import RBF, WhiteKernel
+
+kernel = 1.0 * RBF(length_scale=1e1, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(
+    noise_level=1, noise_level_bounds=(1e-5, 1e1)
 )
-gp = GaussianProcessRegressor(kernel=kernel, alpha=0.0).fit(X, y)
-X_ = np.linspace(0, 5, 100)
-y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
-plt.plot(X_, y_mean, "k", lw=3, zorder=9)
-plt.fill_between(
-    X_,
-    y_mean - np.sqrt(np.diag(y_cov)),
-    y_mean + np.sqrt(np.diag(y_cov)),
-    alpha=0.5,
-    color="k",
+gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.0)
+gpr.fit(X_train, y_train)
+y_mean, y_std = gpr.predict(X, return_std=True)
-# Second run
-plt.figure()
-kernel = 1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(
-    noise_level=1e-5, noise_level_bounds=(1e-10, 1e1)
+# %%
+plt.plot(X, y, label="Expected signal")
+plt.scatter(x=X_train[:, 0], y=y_train, color="black", alpha=0.4, label="Observations")
+plt.errorbar(X, y_mean, y_std)
+plt.legend()
+plt.xlabel("X")
+plt.ylabel("y")
+_ = plt.title(
+    f"Initial: {kernel}\nOptimum: {gpr.kernel_}\nLog-Marginal-Likelihood: "
+    f"{gpr.log_marginal_likelihood(gpr.kernel_.theta)}",
+    fontsize=8,
 )
-gp = GaussianProcessRegressor(kernel=kernel, alpha=0.0).fit(X, y)
-X_ = np.linspace(0, 5, 100)
-y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
-plt.plot(X_, y_mean, "k", lw=3, zorder=9)
-plt.fill_between(
-    X_,
-    y_mean - np.sqrt(np.diag(y_cov)),
-    y_mean + np.sqrt(np.diag(y_cov)),
-    alpha=0.5,
-    color="k",
+# %%
+# We see that the optimum kernel found still has a high noise level and
+# an even larger length scale.
+# Furthermore, we observe that the
+# model does not provide faithful predictions.
+#
+# Now, we will initialize the
+# :class:`~sklearn.gaussian_process.kernels.RBF` with a
+# smaller `length_scale` and the
+# :class:`~sklearn.gaussian_process.kernels.WhiteKernel`
+# with a smaller noise level lower bound.
+kernel = 1.0 * RBF(length_scale=1e-1, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(
+    noise_level=1e-2, noise_level_bounds=(1e-10, 1e1)
)
-plt.plot(X_, 0.5 * np.sin(3 * X_), "r", lw=3, zorder=9)
-plt.scatter(X[:, 0], y, c="r", s=50, zorder=10, edgecolors=(0, 0, 0))
-plt.title(
-    "Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
-    % (kernel, gp.kernel_, gp.log_marginal_likelihood(gp.kernel_.theta))
+gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.0)
+gpr.fit(X_train, y_train)
+y_mean, y_std = gpr.predict(X, return_std=True)
+
+# %%
+plt.plot(X, y, label="Expected signal")
+plt.scatter(x=X_train[:, 0], y=y_train, color="black", alpha=0.4, label="Observations")
+plt.errorbar(X, y_mean, y_std)
+plt.legend()
+plt.xlabel("X")
+plt.ylabel("y")
+_ = plt.title(
+    f"Initial: {kernel}\nOptimum: {gpr.kernel_}\nLog-Marginal-Likelihood: "
+    f"{gpr.log_marginal_likelihood(gpr.kernel_.theta)}",
+    fontsize=8,
 )
-plt.tight_layout()
-
-# Plot LML landscape
-plt.figure()
-theta0 = np.logspace(-2, 3, 49)
-theta1 = np.logspace(-2, 0, 50)
-Theta0, Theta1 = np.meshgrid(theta0, theta1)
-LML = [
-    [
-        gp.log_marginal_likelihood(np.log([0.36, Theta0[i, j], Theta1[i, j]]))
-        for i in range(Theta0.shape[0])
-    ]
-    for j in range(Theta0.shape[1])
+
+# %%
+# First, we see that the model's predictions are more precise than the
+# previous model's: this new model is able to estimate the noise-free
+# functional relationship.
+#
+# Looking at the kernel hyperparameters, we see that the best combination found
+# has a smaller noise level and shorter length scale than the first model.
+#
+# We can inspect the Log-Marginal-Likelihood (LML) of
+# :class:`~sklearn.gaussian_process.GaussianProcessRegressor`
+# for different hyperparameters to get a sense of the local minima.
+from matplotlib.colors import LogNorm
+
+length_scale = np.logspace(-2, 4, num=50)
+noise_level = np.logspace(-2, 1, num=50)
+length_scale_grid, noise_level_grid = np.meshgrid(length_scale, noise_level)
+
+log_marginal_likelihood = [
+    gpr.log_marginal_likelihood(theta=np.log([0.36, scale, noise]))
+    for scale, noise in zip(length_scale_grid.ravel(), noise_level_grid.ravel())
 ]
-LML = np.array(LML).T
+log_marginal_likelihood = np.reshape(
+    log_marginal_likelihood, newshape=noise_level_grid.shape
 )
-vmin, vmax = (-LML).min(), (-LML).max()
-vmax = 50
-level = np.around(np.logspace(np.log10(vmin), np.log10(vmax), 50), decimals=1)
-plt.contour(Theta0, Theta1, -LML, levels=level, norm=LogNorm(vmin=vmin, vmax=vmax))
+# %%
+vmin, vmax = (-log_marginal_likelihood).min(), 50
+level = np.around(np.logspace(np.log10(vmin), np.log10(vmax), num=50), decimals=1)
+plt.contour(
+    length_scale_grid,
+    noise_level_grid,
+    -log_marginal_likelihood,
+    levels=level,
+    norm=LogNorm(vmin=vmin, vmax=vmax),
+)
 plt.colorbar()
 plt.xscale("log")
 plt.yscale("log")
 plt.xlabel("Length-scale")
 plt.ylabel("Noise-level")
 plt.title("Log-marginal-likelihood")
-plt.tight_layout()
-
 plt.show()
+
+# %%
+# We see that there are two local minima that correspond to the combination
+# of hyperparameters previously found. Depending on the initial values for the
+# hyperparameters, the gradient-based optimization might or might not
+# converge to the best model.
It is thus important to repeat the optimization +# several times for different initializations. diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py index 34b404b48eb2c..2e11957ae7045 100644 --- a/examples/gaussian_process/plot_gpr_noisy_targets.py +++ b/examples/gaussian_process/plot_gpr_noisy_targets.py @@ -11,111 +11,143 @@ In both cases, the kernel's parameters are estimated using the maximum likelihood principle. -The figures illustrate the interpolating property of the Gaussian Process -model as well as its probabilistic nature in the form of a pointwise 95% -confidence interval. - -Note that the parameter ``alpha`` is applied as a Tikhonov -regularization of the assumed covariance between the training points. +The figures illustrate the interpolating property of the Gaussian Process model +as well as its probabilistic nature in the form of a pointwise 95% confidence +interval. +Note that `alpha` is a parameter to control the strength of the Tikhonov +regularization on the assumed training points' covariance matrix. """ # Author: Vincent Dubourg # Jake Vanderplas -# Jan Hendrik Metzen s +# Jan Hendrik Metzen +# Guillaume Lemaitre # License: BSD 3 clause +# %% +# Dataset generation +# ------------------ +# +# We will start by generating a synthetic dataset. The true generative process +# is defined as :math:`f(x) = x \sin(x)`. import numpy as np -from matplotlib import pyplot as plt - -from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C - -np.random.seed(1) - - -def f(x): - """The function to predict.""" - return x * np.sin(x) - - -# ---------------------------------------------------------------------- -# First the noiseless case -X = np.atleast_2d([1.0, 3.0, 5.0, 6.0, 7.0, 8.0]).T -# Observations -y = f(X).ravel() +X = np.linspace(start=0, stop=10, num=1_000).reshape(-1, 1) +y = np.squeeze(X * np.sin(X)) -# Mesh the input space for evaluations of the real function, the prediction and -# its MSE -x = np.atleast_2d(np.linspace(0, 10, 1000)).T +# %% +import matplotlib.pyplot as plt -# Instantiate a Gaussian Process model -kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2)) -gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9) - -# Fit to data using Maximum Likelihood Estimation of the parameters -gp.fit(X, y) - -# Make the prediction on the meshed x-axis (ask for MSE as well) -y_pred, sigma = gp.predict(x, return_std=True) - -# Plot the function, the prediction and the 95% confidence interval based on -# the MSE -plt.figure() -plt.plot(x, f(x), "r:", label=r"$f(x) = x\,\sin(x)$") -plt.plot(X, y, "r.", markersize=10, label="Observations") -plt.plot(x, y_pred, "b-", label="Prediction") -plt.fill( - np.concatenate([x, x[::-1]]), - np.concatenate([y_pred - 1.9600 * sigma, (y_pred + 1.9600 * sigma)[::-1]]), +plt.plot(X, y, label=r"$f(x) = x \sin(x)$", linestyle="dotted") +plt.legend() +plt.xlabel("$x$") +plt.ylabel("$f(x)$") +_ = plt.title("True generative process") + +# %% +# We will use this dataset in the next experiment to illustrate how Gaussian +# Process regression is working. +# +# Example with noise-free target +# ------------------------------ +# +# In this first example, we will use the true generative process without +# adding any noise. For training the Gaussian Process regression, we will only +# select few samples. 
+rng = np.random.RandomState(1) +training_indices = rng.choice(np.arange(y.size), size=6, replace=False) +X_train, y_train = X[training_indices], y[training_indices] + +# %% +# Now, we fit a Gaussian process on these few training data samples. We will +# use a radial basis function (RBF) kernel and a constant parameter to fit the +# amplitude. +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import RBF + +kernel = 1 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2)) +gaussian_process = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9) +gaussian_process.fit(X_train, y_train) +gaussian_process.kernel_ + +# %% +# After fitting our model, we see that the hyperparameters of the kernel have +# been optimized. Now, we will use our kernel to compute the mean prediction +# of the full dataset and plot the 95% confidence interval. +mean_prediction, std_prediction = gaussian_process.predict(X, return_std=True) + +plt.plot(X, y, label=r"$f(x) = x \sin(x)$", linestyle="dotted") +plt.scatter(X_train, y_train, label="Observations") +plt.plot(X, mean_prediction, label="Mean prediction") +plt.fill_between( + X.ravel(), + mean_prediction - 1.96 * std_prediction, + mean_prediction + 1.96 * std_prediction, alpha=0.5, - fc="b", - ec="None", - label="95% confidence interval", + label=r"95% confidence interval", ) +plt.legend() plt.xlabel("$x$") plt.ylabel("$f(x)$") -plt.ylim(-10, 20) -plt.legend(loc="upper left") - -# ---------------------------------------------------------------------- -# now the noisy case -X = np.linspace(0.1, 9.9, 20) -X = np.atleast_2d(X).T - -# Observations and noise -y = f(X).ravel() -dy = 0.5 + 1.0 * np.random.random(y.shape) -noise = np.random.normal(0, dy) -y += noise - -# Instantiate a Gaussian Process model -gp = GaussianProcessRegressor(kernel=kernel, alpha=dy ** 2, n_restarts_optimizer=10) - -# Fit to data using Maximum Likelihood Estimation of the parameters -gp.fit(X, y) - -# Make the prediction on the meshed x-axis (ask for MSE as well) -y_pred, sigma = gp.predict(x, return_std=True) - -# Plot the function, the prediction and the 95% confidence interval based on -# the MSE -plt.figure() -plt.plot(x, f(x), "r:", label=r"$f(x) = x\,\sin(x)$") -plt.errorbar(X.ravel(), y, dy, fmt="r.", markersize=10, label="Observations") -plt.plot(x, y_pred, "b-", label="Prediction") -plt.fill( - np.concatenate([x, x[::-1]]), - np.concatenate([y_pred - 1.9600 * sigma, (y_pred + 1.9600 * sigma)[::-1]]), +_ = plt.title("Gaussian process regression on noise-free dataset") + +# %% +# We see that for a prediction made on a data point close to the one from the +# training set, the 95% confidence has a small amplitude. Whenever a sample +# falls far from training data, our model's prediction is less accurate and the +# model prediction is less precise (higher uncertainty). +# +# Example with noisy targets +# -------------------------- +# +# We can repeat a similar experiment adding an additional noise to the target +# this time. It will allow seeing the effect of the noise on the fitted model. +# +# We add some random Gaussian noise to the target with an arbitrary +# standard deviation. +noise_std = 0.75 +y_train_noisy = y_train + rng.normal(loc=0.0, scale=noise_std, size=y_train.shape) + +# %% +# We create a similar Gaussian process model. In addition to the kernel, this +# time, we specify the parameter `alpha` which can be interpreted as the +# variance of a Gaussian noise. 
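+# (Side note, sketched here for illustration and not used below: with a known,
+# homoscedastic noise level, specifying `alpha` is essentially equivalent to
+# adding a :class:`~sklearn.gaussian_process.kernels.WhiteKernel` with a fixed
+# `noise_level` to the kernel; the example keeps the `alpha` formulation.)
+from sklearn.gaussian_process.kernels import WhiteKernel
+
+kernel_with_fixed_noise = kernel + WhiteKernel(
+    noise_level=noise_std ** 2, noise_level_bounds="fixed"
+)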
+gaussian_process = GaussianProcessRegressor( + kernel=kernel, alpha=noise_std ** 2, n_restarts_optimizer=9 +) +gaussian_process.fit(X_train, y_train_noisy) +mean_prediction, std_prediction = gaussian_process.predict(X, return_std=True) + +# %% +# Let's plot the mean prediction and the uncertainty region as before. +plt.plot(X, y, label=r"$f(x) = x \sin(x)$", linestyle="dotted") +plt.errorbar( + X_train, + y_train_noisy, + noise_std, + linestyle="None", + color="tab:blue", + marker=".", + markersize=10, + label="Observations", +) +plt.plot(X, mean_prediction, label="Mean prediction") +plt.fill_between( + X.ravel(), + mean_prediction - 1.96 * std_prediction, + mean_prediction + 1.96 * std_prediction, + color="tab:orange", alpha=0.5, - fc="b", - ec="None", - label="95% confidence interval", + label=r"95% confidence interval", ) +plt.legend() plt.xlabel("$x$") plt.ylabel("$f(x)$") -plt.ylim(-10, 20) -plt.legend(loc="upper left") +_ = plt.title("Gaussian process regression on a noisy dataset") -plt.show() +# %% +# The noise affects the predictions close to the training samples: the +# predictive uncertainty near to the training samples is larger because we +# explicitly model a given level target noise independent of the input +# variable. diff --git a/examples/linear_model/plot_logistic.py b/examples/linear_model/plot_logistic.py index fac3fe1a4c039..f0f5dbf710714 100644 --- a/examples/linear_model/plot_logistic.py +++ b/examples/linear_model/plot_logistic.py @@ -16,10 +16,10 @@ import numpy as np import matplotlib.pyplot as plt -from sklearn import linear_model +from sklearn.linear_model import LogisticRegression, LinearRegression from scipy.special import expit -# General a toy dataset:s it's just a straight line with some Gaussian noise: +# Generate a toy dataset, it's just a straight line with some Gaussian noise: xmin, xmax = -5, 5 n_samples = 100 np.random.seed(0) @@ -31,7 +31,7 @@ X = X[:, np.newaxis] # Fit the classifier -clf = linear_model.LogisticRegression(C=1e5) +clf = LogisticRegression(C=1e5) clf.fit(X, y) # and plot the result @@ -43,7 +43,7 @@ loss = expit(X_test * clf.coef_ + clf.intercept_).ravel() plt.plot(X_test, loss, color="red", linewidth=3) -ols = linear_model.LinearRegression() +ols = LinearRegression() ols.fit(X, y) plt.plot(X_test, ols.coef_ * X_test + ols.intercept_, linewidth=1) plt.axhline(0.5, color=".5") diff --git a/examples/linear_model/plot_polynomial_interpolation.py b/examples/linear_model/plot_polynomial_interpolation.py index 49a69ce894a52..ac2fe28de870d 100644 --- a/examples/linear_model/plot_polynomial_interpolation.py +++ b/examples/linear_model/plot_polynomial_interpolation.py @@ -51,7 +51,7 @@ # %% -# We start by defining a function that we intent to approximate and prepare +# We start by defining a function that we intend to approximate and prepare # plotting it. diff --git a/examples/manifold/plot_swissroll.py b/examples/manifold/plot_swissroll.py index ee5fe196fa6ba..41f7aa5d337d6 100644 --- a/examples/manifold/plot_swissroll.py +++ b/examples/manifold/plot_swissroll.py @@ -1,46 +1,119 @@ """ =================================== -Swiss Roll reduction with LLE +Swiss Roll And Swiss-Hole Reduction =================================== +This notebook seeks to compare two popular non-linear dimensionality +techniques, T-distributed Stochastic Neighbor Embedding (t-SNE) and +Locally Linear Embedding (LLE), on the classic Swiss Roll dataset. +Then, we will explore how they both deal with the addition of a hole +in the data. 
+""" +# %% +# Swiss Roll +# --------------------------------------------------- +# +# We start by generating the Swiss Roll dataset. -An illustration of Swiss Roll reduction -with locally linear embedding +import matplotlib.pyplot as plt +from sklearn import manifold, datasets -""" -# Author: Fabian Pedregosa -- -# License: BSD 3 clause (C) INRIA 2011 +sr_points, sr_color = datasets.make_swiss_roll(n_samples=1500, random_state=0) -import matplotlib.pyplot as plt +# %% +# Now, let's take a look at our data: -# This import is needed to modify the way figure behaves -from mpl_toolkits.mplot3d import Axes3D +fig = plt.figure(figsize=(8, 6)) +ax = fig.add_subplot(111, projection="3d") +fig.add_axes(ax) +ax.scatter( + sr_points[:, 0], sr_points[:, 1], sr_points[:, 2], c=sr_color, s=50, alpha=0.8 +) +ax.set_title("Swiss Roll in Ambient Space") +ax.view_init(azim=-66, elev=12) +_ = ax.text2D(0.8, 0.05, s="n_samples=1500", transform=ax.transAxes) -Axes3D +# %% +# Computing the LLE and t-SNE embeddings, we find that LLE seems to unroll the +# Swiss Roll pretty effectively. t-SNE on the other hand, is able +# to preserve the general structure of the data, but, poorly represents the +# continous nature of our original data. Instead, it seems to unnecessarily +# clump sections of points together. -# ---------------------------------------------------------------------- -# Locally linear embedding of the swiss roll +sr_lle, sr_err = manifold.locally_linear_embedding( + sr_points, n_neighbors=12, n_components=2 +) -from sklearn import manifold, datasets +sr_tsne = manifold.TSNE( + n_components=2, learning_rate="auto", perplexity=40, init="pca", random_state=0 +).fit_transform(sr_points) + +fig, axs = plt.subplots(figsize=(8, 8), nrows=2) +axs[0].scatter(sr_lle[:, 0], sr_lle[:, 1], c=sr_color) +axs[0].set_title("LLE Embedding of Swiss Roll") +axs[1].scatter(sr_tsne[:, 0], sr_tsne[:, 1], c=sr_color) +_ = axs[1].set_title("t-SNE Embedding of Swiss Roll") + +# %% +# .. note:: +# +# LLE seems to be stretching the points from the center (purple) +# of the swiss roll. However, we observe that this is simply a byproduct +# of how the data was generated. There is a higher density of points near the +# center of the roll, which ultimately affects how LLE reconstructs the +# data in a lower dimension. + +# %% +# Swiss-Hole +# --------------------------------------------------- +# +# Now let's take a look at how both algorithms deal with us adding a hole to +# the data. First, we generate the Swiss-Hole dataset and plot it: + +sh_points, sh_color = datasets.make_swiss_roll( + n_samples=1500, hole=True, random_state=0 +) + +fig = plt.figure(figsize=(8, 6)) +ax = fig.add_subplot(111, projection="3d") +fig.add_axes(ax) +ax.scatter( + sh_points[:, 0], sh_points[:, 1], sh_points[:, 2], c=sh_color, s=50, alpha=0.8 +) +ax.set_title("Swiss-Hole in Ambient Space") +ax.view_init(azim=-66, elev=12) +_ = ax.text2D(0.8, 0.05, s="n_samples=1500", transform=ax.transAxes) -X, color = datasets.make_swiss_roll(n_samples=1500) +# %% +# Computing the LLE and t-SNE embeddings, we obtain similar results to the +# Swiss Roll. LLE very capably unrolls the data and even preserves +# the hole. t-SNE, again seems to clump sections of points together, but, we +# note that it preserves the general topology of the original data. -print("Computing LLE embedding") -X_r, err = manifold.locally_linear_embedding(X, n_neighbors=12, n_components=2) -print("Done. 
Reconstruction error: %g" % err) -# ---------------------------------------------------------------------- -# Plot result +sh_lle, sh_err = manifold.locally_linear_embedding( + sh_points, n_neighbors=12, n_components=2 +) -fig = plt.figure() +sh_tsne = manifold.TSNE( + n_components=2, learning_rate="auto", perplexity=40, init="random", random_state=0 +).fit_transform(sh_points) -ax = fig.add_subplot(211, projection="3d") -ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral) +fig, axs = plt.subplots(figsize=(8, 8), nrows=2) +axs[0].scatter(sh_lle[:, 0], sh_lle[:, 1], c=sh_color) +axs[0].set_title("LLE Embedding of Swiss-Hole") +axs[1].scatter(sh_tsne[:, 0], sh_tsne[:, 1], c=sh_color) +_ = axs[1].set_title("t-SNE Embedding of Swiss-Hole") -ax.set_title("Original data") -ax = fig.add_subplot(212) -ax.scatter(X_r[:, 0], X_r[:, 1], c=color, cmap=plt.cm.Spectral) -plt.axis("tight") -plt.xticks([]), plt.yticks([]) -plt.title("Projected data") -plt.show() +# %% +# +# Concluding remarks +# ------------------ +# +# We note that t-SNE benefits from testing more combinations of parameters. +# Better results could probably have been obtained by better tuning these +# parameters. +# +# We observe that, as seen in the "Manifold learning on +# handwritten digits" example, t-SNE generally performs better than LLE +# on real world data. diff --git a/examples/neural_networks/plot_mlp_alpha.py b/examples/neural_networks/plot_mlp_alpha.py index 3d316e4e85db0..443d41f4707bf 100644 --- a/examples/neural_networks/plot_mlp_alpha.py +++ b/examples/neural_networks/plot_mlp_alpha.py @@ -45,7 +45,7 @@ random_state=1, max_iter=2000, early_stopping=True, - hidden_layer_sizes=[100, 100], + hidden_layer_sizes=[10, 10], ), ) ) @@ -69,7 +69,9 @@ # iterate over datasets for X, y in datasets: # split into training and test part - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.4, random_state=42 + ) x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5 y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5 @@ -98,9 +100,9 @@ # Plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, x_max] x [y_min, y_max]. if hasattr(clf, "decision_function"): - Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) + Z = clf.decision_function(np.column_stack([xx.ravel(), yy.ravel()])) else: - Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] + Z = clf.predict_proba(np.column_stack([xx.ravel(), yy.ravel()]))[:, 1] # Put the result into a color plot Z = Z.reshape(xx.shape) @@ -134,7 +136,7 @@ ax.text( xx.max() - 0.3, yy.min() + 0.3, - ("%.2f" % score).lstrip("0"), + f"{score:.3f}".lstrip("0"), size=15, horizontalalignment="right", ) diff --git a/maint_tools/create_issue_from_juint.py b/maint_tools/create_issue_from_juint.py new file mode 100644 index 0000000000000..95823aad16c21 --- /dev/null +++ b/maint_tools/create_issue_from_juint.py @@ -0,0 +1,114 @@ +"""Creates or updates an issue if the CI fails. This is useful to keep track of +scheduled jobs that are failing repeatedly. + +This script depends on: +- `defusedxml` for safer parsing for xml +- `PyGithub` for interacting with GitHub + +The GitHub token only requires the `repo:public_repo` scope are described in +https://docs.github.com/en/developers/apps/building-oauth-apps/scopes-for-oauth-apps#available-scopes. +This scope allows the bot to create and edit its own issues. 
It is best to use a +github account that does **not** have commit access to the public repo. +""" + +from pathlib import Path +import sys +import argparse + +import defusedxml.ElementTree as ET +from github import Github + +parser = argparse.ArgumentParser( + description="Create or update issue from JUnit test results from pytest" +) +parser.add_argument( + "bot_github_token", help="Github token for creating or updating an issue" +) +parser.add_argument("ci_name", help="Name of CI run instance") +parser.add_argument("issue_repo", help="Repo to track issues") +parser.add_argument("link_to_ci_run", help="URL to link to") +parser.add_argument("junit_file", help="JUnit file") + +args = parser.parse_args() +gh = Github(args.bot_github_token) +issue_repo = gh.get_repo(args.issue_repo) +title = f"⚠️ CI failed on {args.ci_name} ⚠️" + + +def get_issue(): + login = gh.get_user().login + issues = gh.search_issues( + f"repo:{args.issue_repo} {title} in:title state:open author:{login}" + ) + first_page = issues.get_page(0) + # Return issue if it exist + return first_page[0] if first_page else None + + +def create_or_update_issue(body): + # Interact with GitHub API to create issue + header = f"**CI Failed on [{args.ci_name}]({args.link_to_ci_run})**" + body_text = f"{header}\n{body}" + issue = get_issue() + + if issue is None: + # Create new issue + issue = issue_repo.create_issue(title=title, body=body_text) + print(f"Created issue in {args.issue_repo}#{issue.number}") + sys.exit() + else: + # Update existing issue + issue.edit(title=title, body=body_text) + print(f"Updated issue in {args.issue_repo}#{issue.number}") + sys.exit() + + +junit_path = Path(args.junit_file) +if not junit_path.exists(): + body = "Unable to find junit file. Please see link for details." + create_or_update_issue(body) + sys.exit() + +# Find failures in junit file +tree = ET.parse(args.junit_file) +failure_cases = [] + +# Check if test collection failed +error = tree.find("./testsuite/testcase/error") +if error is not None: + # Get information for test collection error + failure_cases.append({"title": "Test Collection Failure", "body": error.text}) + +for item in tree.iter("testcase"): + failure = item.find("failure") + if failure is None: + continue + + failure_cases.append( + { + "title": item.attrib["name"], + "body": failure.text, + } + ) + +if not failure_cases: + print("Test has no failures!") + issue = get_issue() + if issue is not None: + print(f"Closing issue #{issue.number}") + new_body = ( + "## Closed issue because CI is no longer failing! ✅\n\n" + f"[Successful run]({args.link_to_ci_run})\n\n" + "## Previous failing issue\n\n" + f"{issue.body}" + ) + issue.edit(state="closed", body=new_body) + sys.exit() + +# Create content for issue +issue_summary = ( + "
{title}\n\n```python\n{body}\n```\n
\n" +) +body_list = [issue_summary.format(**case) for case in failure_cases] +body = "\n".join(body_list) +create_or_update_issue(body) diff --git a/maint_tools/test_docstrings.py b/maint_tools/test_docstrings.py index edbc05d260dee..d4d39aad47fc3 100644 --- a/maint_tools/test_docstrings.py +++ b/maint_tools/test_docstrings.py @@ -11,14 +11,7 @@ numpydoc_validation = pytest.importorskip("numpydoc.validate") -# List of modules ignored when checking for numpydoc validation. -DOCSTRING_IGNORE_LIST = [ - "SpectralCoclustering", -] - FUNCTION_DOCSTRING_IGNORE_LIST = [ - "sklearn._config.config_context", - "sklearn._config.get_config", "sklearn.base.clone", "sklearn.cluster._affinity_propagation.affinity_propagation", "sklearn.cluster._kmeans.kmeans_plusplus", @@ -37,7 +30,6 @@ "sklearn.datasets._base.get_data_home", "sklearn.datasets._base.load_boston", "sklearn.datasets._base.load_breast_cancer", - "sklearn.datasets._base.load_diabetes", "sklearn.datasets._base.load_digits", "sklearn.datasets._base.load_files", "sklearn.datasets._base.load_iris", @@ -93,19 +85,15 @@ "sklearn.linear_model._ridge.ridge_regression", "sklearn.manifold._locally_linear.locally_linear_embedding", "sklearn.manifold._t_sne.trustworthiness", - "sklearn.metrics._classification.balanced_accuracy_score", "sklearn.metrics._classification.brier_score_loss", "sklearn.metrics._classification.classification_report", "sklearn.metrics._classification.cohen_kappa_score", - "sklearn.metrics._classification.confusion_matrix", "sklearn.metrics._classification.f1_score", "sklearn.metrics._classification.fbeta_score", "sklearn.metrics._classification.hinge_loss", "sklearn.metrics._classification.jaccard_score", "sklearn.metrics._classification.log_loss", "sklearn.metrics._classification.precision_recall_fscore_support", - "sklearn.metrics._classification.precision_score", - "sklearn.metrics._classification.recall_score", "sklearn.metrics._plot.confusion_matrix.plot_confusion_matrix", "sklearn.metrics._plot.det_curve.plot_det_curve", "sklearn.metrics._plot.precision_recall_curve.plot_precision_recall_curve", @@ -150,7 +138,6 @@ "sklearn.metrics.pairwise.haversine_distances", "sklearn.metrics.pairwise.kernel_metrics", "sklearn.metrics.pairwise.laplacian_kernel", - "sklearn.metrics.pairwise.linear_kernel", "sklearn.metrics.pairwise.manhattan_distances", "sklearn.metrics.pairwise.nan_euclidean_distances", "sklearn.metrics.pairwise.paired_cosine_distances", @@ -165,7 +152,6 @@ "sklearn.metrics.pairwise.rbf_kernel", "sklearn.metrics.pairwise.sigmoid_kernel", "sklearn.model_selection._split.check_cv", - "sklearn.model_selection._validation.cross_val_score", "sklearn.model_selection._validation.cross_validate", "sklearn.model_selection._validation.learning_curve", "sklearn.model_selection._validation.permutation_test_score", @@ -185,8 +171,6 @@ "sklearn.svm._bounds.l1_min_c", "sklearn.tree._export.plot_tree", "sklearn.utils.axis0_safe_slice", - "sklearn.utils.check_pandas_support", - "sklearn.utils.extmath.cartesian", "sklearn.utils.extmath.density", "sklearn.utils.extmath.fast_logdet", "sklearn.utils.extmath.randomized_range_finder", @@ -228,13 +212,10 @@ "sklearn.utils.sparsefuncs.mean_variance_axis", "sklearn.utils.sparsefuncs.min_max_axis", "sklearn.utils.tosequence", - "sklearn.utils.validation.as_float_array", "sklearn.utils.validation.assert_all_finite", "sklearn.utils.validation.check_is_fitted", "sklearn.utils.validation.check_memory", "sklearn.utils.validation.check_random_state", - "sklearn.utils.validation.column_or_1d", - 
"sklearn.utils.validation.has_fit_parameter", ] FUNCTION_DOCSTRING_IGNORE_LIST = set(FUNCTION_DOCSTRING_IGNORE_LIST) @@ -424,11 +405,6 @@ def test_docstring(Estimator, method, request): import_path = ".".join(import_path) - if Estimator.__name__ in DOCSTRING_IGNORE_LIST: - request.applymarker( - pytest.mark.xfail(run=False, reason="TODO pass numpydoc validation") - ) - res = numpydoc_validation.validate(import_path) res["errors"] = list(filter_errors(res["errors"], method, Estimator=Estimator)) diff --git a/setup.cfg b/setup.cfg index 823d6b1e55266..9eca7fad87b4b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -15,11 +15,14 @@ addopts = --doctest-modules --disable-pytest-warnings --color=yes - -rxXs + -rN filterwarnings = ignore:the matrix subclass:PendingDeprecationWarning + # Workaround for https://github.com/pypa/setuptools/issues/2885 + ignore::DeprecationWarning:pkg_resources + [wheelhouse_uploader] artifact_indexes= # Wheels built by the "Wheel builder" workflow in GitHub actions: diff --git a/sklearn/_build_utils/__init__.py b/sklearn/_build_utils/__init__.py index be83b4c4d8baf..67b5f2c662eb0 100644 --- a/sklearn/_build_utils/__init__.py +++ b/sklearn/_build_utils/__init__.py @@ -76,7 +76,14 @@ def cythonize_extensions(top_path, config): compile_time_env={ "SKLEARN_OPENMP_PARALLELISM_ENABLED": sklearn._OPENMP_SUPPORTED }, - compiler_directives={"language_level": 3}, + compiler_directives={ + "language_level": 3, + "boundscheck": False, + "wraparound": False, + "initializedcheck": False, + "nonecheck": False, + "cdivision": True, + }, ) diff --git a/sklearn/_config.py b/sklearn/_config.py index fe2d27f64857c..c41c180012056 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -22,7 +22,7 @@ def _get_threadlocal_config(): def get_config(): - """Retrieve current values for configuration set by :func:`set_config` + """Retrieve current values for configuration set by :func:`set_config`. Returns ------- @@ -98,40 +98,55 @@ def set_config( @contextmanager -def config_context(**new_config): - """Context manager for global scikit-learn configuration +def config_context( + *, assume_finite=None, working_memory=None, print_changed_only=None, display=None +): + """Context manager for global scikit-learn configuration. Parameters ---------- - assume_finite : bool, default=False + assume_finite : bool, default=None If True, validation for finiteness will be skipped, saving time, but leading to potential crashes. If False, validation for finiteness will be performed, - avoiding error. Global default: False. + avoiding error. If None, the existing value won't change. + The default value is False. - working_memory : int, default=1024 + working_memory : int, default=None If set, scikit-learn will attempt to limit the size of temporary arrays to this number of MiB (per job when parallelised), often saving both computation time and memory on expensive operations that can be - performed in chunks. Global default: 1024. + performed in chunks. If None, the existing value won't change. + The default value is 1024. - print_changed_only : bool, default=True + print_changed_only : bool, default=None If True, only the parameters that were set to non-default values will be printed when printing an estimator. For example, ``print(SVC())`` while True will only print 'SVC()', but would print 'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters - when False. Default is True. + when False. If None, the existing value won't change. + The default value is True. .. 
versionchanged:: 0.23 Default changed from False to True. - display : {'text', 'diagram'}, default='text' + display : {'text', 'diagram'}, default=None If 'diagram', estimators will be displayed as a diagram in a Jupyter lab or notebook context. If 'text', estimators will be displayed as - text. Default is 'text'. + text. If None, the existing value won't change. + The default value is 'text'. .. versionadded:: 0.23 + Yields + ------ + None. + + See Also + -------- + set_config : Set global scikit-learn configuration. + get_config : Retrieve current values of the global configuration. + Notes ----- All settings, not just those presently modified, will be returned to @@ -148,15 +163,15 @@ def config_context(**new_config): ... assert_all_finite([float('nan')]) Traceback (most recent call last): ... - ValueError: Input contains NaN, ... - - See Also - -------- - set_config : Set global scikit-learn configuration. - get_config : Retrieve current values of the global configuration. + ValueError: Input contains NaN... """ old_config = get_config() - set_config(**new_config) + set_config( + assume_finite=assume_finite, + working_memory=working_memory, + print_changed_only=print_changed_only, + display=display, + ) try: yield diff --git a/sklearn/_isotonic.pyx b/sklearn/_isotonic.pyx index 7f60b889fa284..1f9364cd92940 100644 --- a/sklearn/_isotonic.pyx +++ b/sklearn/_isotonic.pyx @@ -3,8 +3,6 @@ # Uses the pool adjacent violators algorithm (PAVA), with the # enhancement of searching for the longest decreasing subsequence to # pool at each step. -# -# cython: boundscheck=False, wraparound=False, cdivision=True import numpy as np cimport numpy as np diff --git a/sklearn/_min_dependencies.py b/sklearn/_min_dependencies.py index bee904fffa558..468488ac276ea 100644 --- a/sklearn/_min_dependencies.py +++ b/sklearn/_min_dependencies.py @@ -7,6 +7,13 @@ if platform.python_implementation() == "PyPy": NUMPY_MIN_VERSION = "1.19.0" else: + # We pinned PyWavelet (a scikit-image dependence) to 1.1.1 in the minimum + # documentation CI builds that is the latest version that support our + # minimum NumPy version required. If PyWavelets 1.2+ is installed, it would + # require NumPy 1.17+ that trigger a bug with Pandas 0.25: + # https://github.com/numpy/numpy/issues/18355#issuecomment-774610226 + # When upgrading NumPy, we can unpin PyWavelets but we need to update the + # minimum version of Pandas >= 1.0.5. NUMPY_MIN_VERSION = "1.14.6" SCIPY_MIN_VERSION = "1.1.0" diff --git a/sklearn/base.py b/sklearn/base.py index 241dac26dd3ca..06e9a63630923 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -24,6 +24,8 @@ from .utils.validation import _check_y from .utils.validation import _num_features from .utils.validation import _check_feature_names_in +from .utils.validation import _generate_get_feature_names_out +from .utils.validation import check_is_fitted from .utils._estimator_html_repr import estimator_html_repr from .utils.validation import _get_feature_names @@ -210,8 +212,7 @@ def get_params(self, deep=True): return out def set_params(self, **params): - """ - Set the parameters of this estimator. + """Set the parameters of this estimator. The method works on simple estimators as well as on nested objects (such as :class:`~sklearn.pipeline.Pipeline`). 
The latter have @@ -237,10 +238,10 @@ def set_params(self, **params): for key, value in params.items(): key, delim, sub_key = key.partition("__") if key not in valid_params: + local_valid_params = self._get_param_names() raise ValueError( - "Invalid parameter %s for estimator %s. " - "Check the list of available parameters " - "with `estimator.get_params().keys()`." % (key, self) + f"Invalid parameter {key!r} for estimator {self}. " + f"Valid parameters are: {local_valid_params!r}." ) if delim: @@ -529,15 +530,23 @@ def _validate_data( It is recommended to call reset=True in `fit` and in the first call to `partial_fit`. All other methods that validate `X` should set `reset=False`. + validate_separately : False or tuple of dicts, default=False Only used if y is not None. If False, call validate_X_y(). Else, it must be a tuple of kwargs to be used for calling check_array() on X and y respectively. + + `estimator=self` is automatically added to these dicts to generate + more informative error message in case of invalid input data. + **check_params : kwargs Parameters passed to :func:`sklearn.utils.check_array` or :func:`sklearn.utils.check_X_y`. Ignored if validate_separately is not False. + `estimator=self` is automatically added to these params to generate + more informative error message in case of invalid input data. + Returns ------- out : {ndarray, sparse matrix} or tuple of these @@ -555,10 +564,13 @@ def _validate_data( no_val_X = isinstance(X, str) and X == "no_validation" no_val_y = y is None or isinstance(y, str) and y == "no_validation" + default_check_params = {"estimator": self} + check_params = {**default_check_params, **check_params} + if no_val_X and no_val_y: raise ValueError("Validation should be done on X, y or both.") elif not no_val_X and no_val_y: - X = check_array(X, **check_params) + X = check_array(X, input_name="X", **check_params) out = X elif no_val_X and not no_val_y: y = _check_y(y, **check_params) @@ -570,8 +582,12 @@ def _validate_data( # on X and y isn't equivalent to just calling check_X_y() # :( check_X_params, check_y_params = validate_separately - X = check_array(X, **check_X_params) - y = check_array(y, **check_y_params) + if "estimator" not in check_X_params: + check_X_params = {**default_check_params, **check_X_params} + X = check_array(X, input_name="X", **check_X_params) + if "estimator" not in check_y_params: + check_y_params = {**default_check_params, **check_y_params} + y = check_array(y, input_name="y", **check_y_params) else: X, y = check_X_y(X, y, **check_params) out = X, y @@ -879,6 +895,31 @@ def get_feature_names_out(self, input_features=None): return _check_feature_names_in(self, input_features) +class _ClassNamePrefixFeaturesOutMixin: + """Mixin class for transformers that generate their own names by prefixing. + + Assumes that `_n_features_out` is defined for the estimator. + """ + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Only used to validate feature names with the names seen in :meth:`fit`. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + check_is_fitted(self, "_n_features_out") + return _generate_get_feature_names_out( + self, self._n_features_out, input_features=input_features + ) + + class DensityMixin: """Mixin class for all density estimators in scikit-learn.""" diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 6d9abf82d3470..6131a8f759d1a 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -30,7 +30,6 @@ from .preprocessing import label_binarize, LabelEncoder from .utils import ( column_or_1d, - deprecated, indexable, check_matplotlib_support, ) @@ -666,16 +665,6 @@ class _CalibratedClassifier: The method to use for calibration. Can be 'sigmoid' which corresponds to Platt's method or 'isotonic' which is a non-parametric approach based on isotonic regression. - - Attributes - ---------- - calibrators_ : list of fitted estimator instances - Same as `calibrators`. Exposed for backward-compatibility. Use - `calibrators` instead. - - .. deprecated:: 0.24 - `calibrators_` is deprecated from 0.24 and will be removed in - 1.1 (renaming of 0.26). Use `calibrators` instead. """ def __init__(self, base_estimator, calibrators, *, classes, method="sigmoid"): @@ -684,16 +673,6 @@ def __init__(self, base_estimator, calibrators, *, classes, method="sigmoid"): self.classes = classes self.method = method - # TODO: Remove in 1.1 - # mypy error: Decorated property not supported - @deprecated( # type: ignore - "`calibrators_` is deprecated in 0.24 and will be removed in 1.1" - "(renaming of 0.26). Use `calibrators` instead." - ) - @property - def calibrators_(self): - return self.calibrators - def predict_proba(self, X): """Calculate calibrated probabilities. @@ -1015,6 +994,13 @@ class CalibrationDisplay: estimator_name : str, default=None Name of estimator. If None, the estimator name is not shown. + pos_label : str or int, default=None + The positive class when computing the calibration curve. + By default, `estimators.classes_[1]` is considered as the + positive class. + + .. versionadded:: 1.1 + Attributes ---------- line_ : matplotlib Artist @@ -1054,11 +1040,14 @@ class CalibrationDisplay: <...> """ - def __init__(self, prob_true, prob_pred, y_prob, *, estimator_name=None): + def __init__( + self, prob_true, prob_pred, y_prob, *, estimator_name=None, pos_label=None + ): self.prob_true = prob_true self.prob_pred = prob_pred self.y_prob = y_prob self.estimator_name = estimator_name + self.pos_label = pos_label def plot(self, *, ax=None, name=None, ref_line=True, **kwargs): """Plot visualization. @@ -1095,6 +1084,9 @@ def plot(self, *, ax=None, name=None, ref_line=True, **kwargs): fig, ax = plt.subplots() name = self.estimator_name if name is None else name + info_pos_label = ( + f"(Positive class: {self.pos_label})" if self.pos_label is not None else "" + ) line_kwargs = {} if name is not None: @@ -1110,7 +1102,9 @@ def plot(self, *, ax=None, name=None, ref_line=True, **kwargs): if "label" in line_kwargs: ax.legend(loc="lower right") - ax.set(xlabel="Mean predicted probability", ylabel="Fraction of positives") + xlabel = f"Mean predicted probability {info_pos_label}" + ylabel = f"Fraction of positives {info_pos_label}" + ax.set(xlabel=xlabel, ylabel=ylabel) self.ax_ = ax self.figure_ = ax.figure @@ -1125,6 +1119,7 @@ def from_estimator( *, n_bins=5, strategy="uniform", + pos_label=None, name=None, ref_line=True, ax=None, @@ -1170,6 +1165,13 @@ def from_estimator( - `'quantile'`: The bins have the same number of samples and depend on predicted probabilities. 
+ pos_label : str or int, default=None + The positive class when computing the calibration curve. + By default, `estimators.classes_[1]` is considered as the + positive class. + + .. versionadded:: 1.1 + name : str, default=None Name for labeling curve. If `None`, the name of the estimator is used. @@ -1217,10 +1219,8 @@ def from_estimator( if not is_classifier(estimator): raise ValueError("'estimator' should be a fitted classifier.") - # FIXME: `pos_label` should not be set to None - # We should allow any int or string in `calibration_curve`. - y_prob, _ = _get_response( - X, estimator, response_method="predict_proba", pos_label=None + y_prob, pos_label = _get_response( + X, estimator, response_method="predict_proba", pos_label=pos_label ) name = name if name is not None else estimator.__class__.__name__ @@ -1229,6 +1229,7 @@ def from_estimator( y_prob, n_bins=n_bins, strategy=strategy, + pos_label=pos_label, name=name, ref_line=ref_line, ax=ax, @@ -1243,6 +1244,7 @@ def from_predictions( *, n_bins=5, strategy="uniform", + pos_label=None, name=None, ref_line=True, ax=None, @@ -1283,6 +1285,13 @@ def from_predictions( - `'quantile'`: The bins have the same number of samples and depend on predicted probabilities. + pos_label : str or int, default=None + The positive class when computing the calibration curve. + By default, `estimators.classes_[1]` is considered as the + positive class. + + .. versionadded:: 1.1 + name : str, default=None Name for labeling curve. @@ -1328,11 +1337,16 @@ def from_predictions( check_matplotlib_support(method_name) prob_true, prob_pred = calibration_curve( - y_true, y_prob, n_bins=n_bins, strategy=strategy + y_true, y_prob, n_bins=n_bins, strategy=strategy, pos_label=pos_label ) - name = name if name is not None else "Classifier" + name = "Classifier" if name is None else name + pos_label = _check_pos_label_consistency(pos_label, y_true) disp = cls( - prob_true=prob_true, prob_pred=prob_pred, y_prob=y_prob, estimator_name=name + prob_true=prob_true, + prob_pred=prob_pred, + y_prob=y_prob, + estimator_name=name, + pos_label=pos_label, ) return disp.plot(ax=ax, ref_line=ref_line, **kwargs) diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index a4a22c73e182f..0a57278e99ba2 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -914,7 +914,7 @@ def fit(self, X, y=None): self : object Returns the fitted instance. """ - X = self._validate_data(X, ensure_min_samples=2, estimator=self) + X = self._validate_data(X, ensure_min_samples=2) return self._fit(X) def _fit(self, X): @@ -1234,7 +1234,7 @@ def fit(self, X, y=None): self : object Returns the transformer. """ - X = self._validate_data(X, ensure_min_features=2, estimator=self) + X = self._validate_data(X, ensure_min_features=2) super()._fit(X.T) return self diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index bd1c4e0da003b..6b1c824fc32ec 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -27,7 +27,6 @@ def _scale_normalize(X): Returns the normalized matrix and the row and column scaling factors. - """ X = make_nonnegative(X) row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze() @@ -48,7 +47,6 @@ def _bistochastic_normalize(X, max_iter=1000, tol=1e-5): """Normalize rows and columns of ``X`` simultaneously so that all rows sum to one constant and all columns sum to a different constant. 
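Editorial aside (not part of the patch): the `pos_label` parameter added to `CalibrationDisplay` above can be exercised as in the following sketch. It assumes a scikit-learn build that already contains this change; the dataset, estimator, and split are arbitrary placeholders and matplotlib must be installed.

```python
# Minimal sketch of the new CalibrationDisplay pos_label behaviour shown in
# the hunks above; data and classifier are placeholders, not from the patch.
from sklearn.calibration import CalibrationDisplay
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LogisticRegression().fit(X_train, y_train)

# Per the diff, the positive class now propagates into the axis labels,
# e.g. "Mean predicted probability (Positive class: 1)".
disp = CalibrationDisplay.from_estimator(clf, X_test, y_test, pos_label=1)
```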
- """ # According to paper, this can also be done more efficiently with # deviation reduction and balancing algorithms. @@ -137,7 +135,6 @@ def fit(self, X, y=None): def _svd(self, array, n_components, n_discard): """Returns first `n_components` left and right singular vectors u and v, discarding the first `n_discard`. - """ if self.svd_method == "randomized": kwargs = {} @@ -290,6 +287,17 @@ class SpectralCoclustering(BaseSpectral): .. versionadded:: 1.0 + See Also + -------- + SpectralBiclustering : Partitions rows and columns under the assumption + that the data has an underlying checkerboard structure. + + References + ---------- + * Dhillon, Inderjit S, 2001. `Co-clustering documents and words using + bipartite spectral graph partitioning + `__. + Examples -------- >>> from sklearn.cluster import SpectralCoclustering @@ -303,14 +311,6 @@ class SpectralCoclustering(BaseSpectral): array([0, 0], dtype=int32) >>> clustering SpectralCoclustering(n_clusters=2, random_state=0) - - References - ---------- - - * Dhillon, Inderjit S, 2001. `Co-clustering documents and words using - bipartite spectral graph partitioning - `__. - """ def __init__( diff --git a/sklearn/cluster/_dbscan_inner.pyx b/sklearn/cluster/_dbscan_inner.pyx index b9a80686a76f8..63125f60d3af1 100644 --- a/sklearn/cluster/_dbscan_inner.pyx +++ b/sklearn/cluster/_dbscan_inner.pyx @@ -1,8 +1,6 @@ # Fast inner loop for DBSCAN. # Author: Lars Buitinck # License: 3-clause BSD -# -# cython: boundscheck=False, wraparound=False cimport cython from libcpp.vector cimport vector diff --git a/sklearn/cluster/_hierarchical_fast.pyx b/sklearn/cluster/_hierarchical_fast.pyx index 11ea3294c086a..b7d4343d7fdd2 100644 --- a/sklearn/cluster/_hierarchical_fast.pyx +++ b/sklearn/cluster/_hierarchical_fast.pyx @@ -8,9 +8,6 @@ ctypedef np.float64_t DOUBLE ctypedef np.npy_intp INTP ctypedef np.int8_t INT8 -# Numpy must be initialized. When using numpy from C or Cython you must -# _always_ do that, or you will have segfaults - np.import_array() from ..metrics._dist_metrics cimport DistanceMetric @@ -32,9 +29,6 @@ from numpy.math cimport INFINITY ############################################################################### # Utilities for computing the ward momentum -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.cdivision(True) def compute_ward_dist(np.ndarray[DOUBLE, ndim=1, mode='c'] m_1, np.ndarray[DOUBLE, ndim=2, mode='c'] m_2, np.ndarray[INTP, ndim=1, mode='c'] coord_row, @@ -101,8 +95,6 @@ def _hc_get_descendent(INTP node, children, INTP n_leaves): return descendent -@cython.boundscheck(False) -@cython.wraparound(False) def hc_get_heads(np.ndarray[INTP, ndim=1] parents, copy=True): """Returns the heads of the forest, as defined by parents. @@ -135,8 +127,6 @@ def hc_get_heads(np.ndarray[INTP, ndim=1] parents, copy=True): return parents -@cython.boundscheck(False) -@cython.wraparound(False) def _get_parents(nodes, heads, np.ndarray[INTP, ndim=1] parents, np.ndarray[INT8, ndim=1, mode='c'] not_visited): """Returns the heads of the given nodes, as defined by parents. @@ -176,8 +166,6 @@ def _get_parents(nodes, heads, np.ndarray[INTP, ndim=1] parents, # as keys and edge weights as values. 
-@cython.boundscheck(False) -@cython.wraparound(False) def max_merge(IntFloatDict a, IntFloatDict b, np.ndarray[ITYPE_t, ndim=1] mask, ITYPE_t n_a, ITYPE_t n_b): @@ -231,8 +219,6 @@ def max_merge(IntFloatDict a, IntFloatDict b, return out_obj -@cython.boundscheck(False) -@cython.wraparound(False) def average_merge(IntFloatDict a, IntFloatDict b, np.ndarray[ITYPE_t, ndim=1] mask, ITYPE_t n_a, ITYPE_t n_b): @@ -302,7 +288,6 @@ cdef class WeightedEdge: self.a = a self.b = b - @cython.nonecheck(False) def __richcmp__(self, WeightedEdge other, int op): """Cython-specific comparison method. @@ -348,8 +333,6 @@ cdef class UnionFind(object): self.size = np.hstack((np.ones(N, dtype=ITYPE), np.zeros(N - 1, dtype=ITYPE))) - @cython.boundscheck(False) - @cython.nonecheck(False) cdef void union(self, ITYPE_t m, ITYPE_t n): self.parent[m] = self.next_label self.parent[n] = self.next_label @@ -358,8 +341,7 @@ cdef class UnionFind(object): return - @cython.boundscheck(False) - @cython.nonecheck(False) + @cython.wraparound(True) cdef ITYPE_t fast_find(self, ITYPE_t n): cdef ITYPE_t p p = n @@ -371,8 +353,7 @@ cdef class UnionFind(object): p, self.parent[p] = self.parent[p], n return n -@cython.boundscheck(False) -@cython.nonecheck(False) + cpdef np.ndarray[DTYPE_t, ndim=2] _single_linkage_label( np.ndarray[DTYPE_t, ndim=2] L): """ @@ -423,6 +404,7 @@ cpdef np.ndarray[DTYPE_t, ndim=2] _single_linkage_label( return result_arr +@cython.wraparound(True) def single_linkage_label(L): """ Convert an linkage array or MST to a tree by labelling clusters at merges. @@ -452,8 +434,6 @@ def single_linkage_label(L): # Implements MST-LINKAGE-CORE from https://arxiv.org/abs/1109.2378 -@cython.boundscheck(False) -@cython.nonecheck(False) def mst_linkage_core( const DTYPE_t [:, ::1] raw_data, DistanceMetric dist_metric): diff --git a/sklearn/cluster/_k_means_common.pxd b/sklearn/cluster/_k_means_common.pxd index db70278860097..8eefa10e64e78 100644 --- a/sklearn/cluster/_k_means_common.pxd +++ b/sklearn/cluster/_k_means_common.pxd @@ -1,6 +1,3 @@ -# cython: language_level=3 - - from cython cimport floating cimport numpy as np diff --git a/sklearn/cluster/_k_means_common.pyx b/sklearn/cluster/_k_means_common.pyx index 9e8f81c9f2625..327a7ed60cb84 100644 --- a/sklearn/cluster/_k_means_common.pyx +++ b/sklearn/cluster/_k_means_common.pyx @@ -1,7 +1,3 @@ -# cython: profile=True, boundscheck=False, wraparound=False, cdivision=True -# Profiling is enabled by default as the overhead does not seem to be -# measurable on this specific use case. 
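Editorial aside: the per-file `# cython:` headers and per-function decorators removed in these hunks are superseded by the project-wide `compiler_directives` added to `cythonize_extensions` earlier in this patch. A minimal sketch of that pattern is below; the module name is hypothetical and this is not the project's actual build script.

```python
# Sketch only: global Cython directives applied once at build time. A single
# decorator inside a .pyx file (e.g. @cython.wraparound(True)) can still
# override them for one function, as the patch does for UnionFind.fast_find.
from Cython.Build import cythonize

ext_modules = cythonize(
    ["example_module.pyx"],  # hypothetical module name
    compiler_directives={
        "language_level": 3,
        "boundscheck": False,
        "wraparound": False,
        "initializedcheck": False,
        "nonecheck": False,
        "cdivision": True,
    },
)
```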
- # Author: Peter Prettenhofer # Olivier Grisel # Lars Buitinck diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx index 9459d5e9fc316..4b1ca35c15db2 100644 --- a/sklearn/cluster/_k_means_elkan.pyx +++ b/sklearn/cluster/_k_means_elkan.pyx @@ -1,5 +1,3 @@ -# cython: profile=True, boundscheck=False, wraparound=False, cdivision=True -# # Author: Andreas Mueller # # Licence: BSD 3 clause diff --git a/sklearn/cluster/_k_means_lloyd.pyx b/sklearn/cluster/_k_means_lloyd.pyx index e3526888c82ab..9611614c6239f 100644 --- a/sklearn/cluster/_k_means_lloyd.pyx +++ b/sklearn/cluster/_k_means_lloyd.pyx @@ -1,5 +1,3 @@ -# cython: profile=True, boundscheck=False, wraparound=False, cdivision=True -# # Licence: BSD 3 clause # TODO: We still need to use ndarrays instead of typed memoryviews when using diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx index ffae55e3b3b46..c88c0d3c40828 100644 --- a/sklearn/cluster/_k_means_minibatch.pyx +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -1,5 +1,3 @@ -# cython: profile=True, boundscheck=False, wraparound=False, cdivision=True - # TODO: We still need to use ndarrays instead of typed memoryviews when using # fused types and when the array may be read-only (for instance when it's # provided by the user). This will be fixed in cython >= 0.3. diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 8b80f9999b403..9e755769f6294 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -1,14 +1,18 @@ # -*- coding: utf-8 -*- """Algorithms for spectral clustering""" -# Author: Gael Varoquaux gael.varoquaux@normalesup.org +# Author: Gael Varoquaux # Brian Cheung # Wei LI +# Andrew Knyazev # License: BSD 3 clause import warnings import numpy as np +from scipy.linalg import LinAlgError, qr, svd +from scipy.sparse import csc_matrix + from ..base import BaseEstimator, ClusterMixin from ..utils import check_random_state, as_float_array from ..utils.deprecation import deprecated @@ -18,6 +22,38 @@ from ._kmeans import k_means +def cluster_qr(vectors): + """Find the discrete partition closest to the eigenvector embedding. + + This implementation was proposed in [1]_. + + .. versionadded:: 1.1 + + Parameters + ---------- + vectors : array-like, shape: (n_samples, n_clusters) + The embedding space of the samples. + + Returns + ------- + labels : array of integers, shape: n_samples + The cluster labels of vectors. + + References + ---------- + .. 
[1] `Simple, direct, and efficient multi-way spectral clustering, 2019 + Anil Damle, Victor Minden, Lexing Ying + <:doi:`10.1093/imaiai/iay008`>`_ + + """ + + k = vectors.shape[1] + _, _, piv = qr(vectors.T, pivoting=True) + ut, _, v = svd(vectors[piv[:k], :].T) + vectors = abs(np.dot(vectors, np.dot(ut, v.conj()))) + return vectors.argmax(axis=1) + + def discretize( vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None ): @@ -73,9 +109,6 @@ def discretize( """ - from scipy.sparse import csc_matrix - from scipy.linalg import LinAlgError - random_state = check_random_state(random_state) vectors = as_float_array(vectors, copy=copy) @@ -139,8 +172,8 @@ def discretize( try: U, S, Vh = np.linalg.svd(t_svd) - svd_restarts += 1 except LinAlgError: + svd_restarts += 1 print("SVD did not converge, randomizing and trying again") break @@ -200,10 +233,11 @@ def spectral_clustering( Number of eigenvectors to use for the spectral embedding eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'} - The eigenvalue decomposition strategy to use. AMG requires pyamg - to be installed. It can be faster on very large, sparse problems, - but may also lead to instabilities. If None, then ``'arpack'`` is - used. See [4]_ for more details regarding `'lobpcg'`. + The eigenvalue decomposition method. If None then ``'arpack'`` is used. + See [4]_ for more details regarding ``'lobpcg'``. + Eigensolver ``'amg'`` runs ``'lobpcg'`` with optional + Algebraic MultiGrid preconditioning and requires pyamg to be installed. + It can be faster on very large sparse problems [6]_ and [7]_. random_state : int, RandomState instance, default=None A pseudo random number generator used for the initialization @@ -229,12 +263,19 @@ def spectral_clustering( Stopping criterion for eigendecomposition of the Laplacian matrix when using arpack eigen_solver. - assign_labels : {'kmeans', 'discretize'}, default='kmeans' + assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans' The strategy to use to assign labels in the embedding - space. There are two ways to assign labels after the Laplacian + space. There are three ways to assign labels after the Laplacian embedding. k-means can be applied and is a popular choice. But it can also be sensitive to initialization. Discretization is another approach which is less sensitive to random initialization [3]_. + The cluster_qr method [5]_ directly extracts clusters from eigenvectors + in spectral clustering. In contrast to k-means and discretization, cluster_qr + has no tuning parameters and is not an iterative method, yet may outperform + k-means and discretization in terms of both quality and speed. + + .. versionchanged:: 1.1 + Added new labeling method 'cluster_qr'. verbose : bool, default=False Verbosity mode. @@ -262,23 +303,38 @@ def spectral_clustering( `_ .. [4] `Toward the Optimal Preconditioned Eigensolver: - Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001. + Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001 A. V. Knyazev SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541. - `_ + <:doi:`10.1137/S1064827500366124`>`_ + + .. [5] `Simple, direct, and efficient multi-way spectral clustering, 2019 + Anil Damle, Victor Minden, Lexing Ying + <:doi:`10.1093/imaiai/iay008`>`_ + + .. [6] `Multiscale Spectral Image Segmentation Multiscale preconditioning + for computing eigenvalues of graph Laplacians in image segmentation, 2006 + Andrew Knyazev + <:doi:`10.13140/RG.2.2.35280.02565`>`_ + + .. 
[7] `Preconditioned spectral clustering for stochastic block partition + streaming graph challenge (Preliminary version at arXiv.) + David Zhuzhunashvili, Andrew Knyazev + <:doi:`10.1109/HPEC.2017.8091045`>`_ Notes ----- - The graph should contain only one connect component, elsewhere + The graph should contain only one connected component, elsewhere the results make little sense. This algorithm solves the normalized cut for k=2: it is a normalized spectral clustering. """ - if assign_labels not in ("kmeans", "discretize"): + if assign_labels not in ("kmeans", "discretize", "cluster_qr"): raise ValueError( "The 'assign_labels' parameter should be " - "'kmeans' or 'discretize', but '%s' was given" % assign_labels + "'kmeans' or 'discretize', or 'cluster_qr', " + f"but {assign_labels!r} was given" ) if isinstance(affinity, np.matrix): raise TypeError( @@ -312,6 +368,8 @@ def spectral_clustering( _, labels, _ = k_means( maps, n_clusters, random_state=random_state, n_init=n_init, verbose=verbose ) + elif assign_labels == "cluster_qr": + labels = cluster_qr(maps) else: labels = discretize(maps, random_state=random_state) @@ -407,12 +465,19 @@ class SpectralClustering(ClusterMixin, BaseEstimator): Stopping criterion for eigendecomposition of the Laplacian matrix when ``eigen_solver='arpack'``. - assign_labels : {'kmeans', 'discretize'}, default='kmeans' + assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans' The strategy for assigning labels in the embedding space. There are two ways to assign labels after the Laplacian embedding. k-means is a popular choice, but it can be sensitive to initialization. Discretization is another approach which is less sensitive to random initialization [3]_. + The cluster_qr method [5]_ directly extract clusters from eigenvectors + in spectral clustering. In contrast to k-means and discretization, cluster_qr + has no tuning parameters and runs no iterations, yet may outperform + k-means and discretization in terms of both quality and speed. + + .. versionchanged:: 1.1 + Added new labeling method 'cluster_qr'. degree : float, default=3 Degree of the polynomial kernel. Ignored by other kernels. @@ -502,6 +567,10 @@ class SpectralClustering(ClusterMixin, BaseEstimator): SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541. `_ + .. 
[5] `Simple, direct, and efficient multi-way spectral clustering, 2019 + Anil Damle, Victor Minden, Lexing Ying + <:doi:`10.1093/imaiai/iay008`>`_ + Examples -------- >>> from sklearn.cluster import SpectralClustering diff --git a/sklearn/cluster/tests/test_dbscan.py b/sklearn/cluster/tests/test_dbscan.py index 40949e81a24b1..08ca937d3e25f 100644 --- a/sklearn/cluster/tests/test_dbscan.py +++ b/sklearn/cluster/tests/test_dbscan.py @@ -180,7 +180,7 @@ def test_dbscan_metric_params(): min_samples=min_samples, algorithm="ball_tree", ).fit(X) - assert not warns + assert not warns, warns[0].message core_sample_1, labels_1 = db.core_sample_indices_, db.labels_ # Test that sample labels are the same as passing Minkowski 'p' directly diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 679adf27520e4..702906b3fa0e7 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -3,6 +3,7 @@ import numpy as np from scipy import sparse +from scipy.linalg import LinAlgError import pytest @@ -12,7 +13,7 @@ from sklearn.utils._testing import assert_array_equal from sklearn.cluster import SpectralClustering, spectral_clustering -from sklearn.cluster._spectral import discretize +from sklearn.cluster._spectral import discretize, cluster_qr from sklearn.feature_extraction import img_to_graph from sklearn.metrics import pairwise_distances from sklearn.metrics import adjusted_rand_score @@ -29,7 +30,7 @@ @pytest.mark.parametrize("eigen_solver", ("arpack", "lobpcg")) -@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize")) +@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr")) def test_spectral_clustering(eigen_solver, assign_labels): S = np.array( [ @@ -101,7 +102,8 @@ def test_spectral_unknown_assign_labels(): spectral_clustering(S, n_clusters=2, random_state=0, assign_labels="") -def test_spectral_clustering_sparse(): +@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr")) +def test_spectral_clustering_sparse(assign_labels): X, y = make_blobs( n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01 ) @@ -111,7 +113,12 @@ def test_spectral_clustering_sparse(): S = sparse.coo_matrix(S) labels = ( - SpectralClustering(random_state=0, n_clusters=2, affinity="precomputed") + SpectralClustering( + random_state=0, + n_clusters=2, + affinity="precomputed", + assign_labels=assign_labels, + ) .fit(S) .labels_ ) @@ -191,6 +198,36 @@ def histogram(x, y, **kwargs): sp.fit(X) +def test_cluster_qr(): + # cluster_qr by itself should not be used for clustering generic data + # other than the rows of the eigenvectors within spectral clustering, + # but cluster_qr must still preserve the labels for different dtypes + # of the generic fixed input even if the labels may be meaningless. 
+ random_state = np.random.RandomState(seed=8) + n_samples, n_components = 10, 5 + data = random_state.randn(n_samples, n_components) + labels_float64 = cluster_qr(data.astype(np.float64)) + # Each sample is assigned a cluster identifier + assert labels_float64.shape == (n_samples,) + # All components should be covered by the assignment + assert np.array_equal(np.unique(labels_float64), np.arange(n_components)) + # Single precision data should yield the same cluster assignments + labels_float32 = cluster_qr(data.astype(np.float32)) + assert np.array_equal(labels_float64, labels_float32) + + +def test_cluster_qr_permutation_invariance(): + # cluster_qr must be invariant to sample permutation. + random_state = np.random.RandomState(seed=8) + n_samples, n_components = 100, 5 + data = random_state.randn(n_samples, n_components) + perm = random_state.permutation(n_samples) + assert np.array_equal( + cluster_qr(data)[perm], + cluster_qr(data[perm]), + ) + + @pytest.mark.parametrize("n_samples", [50, 100, 150, 500]) def test_discretize(n_samples): # Test the discretize using a noise assignment matrix @@ -283,7 +320,7 @@ def test_n_components(): assert not np.array_equal(labels, labels_diff_ncomp) -@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize")) +@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr")) def test_verbose(assign_labels, capsys): # Check verbose mode of KMeans for better coverage. X, y = make_blobs( @@ -318,3 +355,19 @@ def test_spectral_clustering_np_matrix_raises(): msg = r"spectral_clustering does not support passing in affinity as an np\.matrix" with pytest.raises(TypeError, match=msg): spectral_clustering(X) + + +def test_spectral_clustering_not_infinite_loop(capsys, monkeypatch): + """Check that discretize raises LinAlgError when svd never converges. + + Non-regression test for #21380 + """ + + def new_svd(*args, **kwargs): + raise LinAlgError() + + monkeypatch.setattr(np.linalg, "svd", new_svd) + vectors = np.ones((10, 4)) + + with pytest.raises(LinAlgError, match="SVD did not converge"): + discretize(vectors) diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index 7814db9aabe75..e96729b2d91d7 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -209,6 +209,7 @@ def fit(self, X, y, **fit_params): """ y = check_array( y, + input_name="y", accept_sparse=False, force_all_finite=True, ensure_2d=False, diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index 81087c17de344..3bdda14de6ad0 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -465,9 +465,7 @@ def fit(self, X, y=None): Returns the instance itself. """ # Covariance does not make sense for a single feature - X = self._validate_data( - X, ensure_min_features=2, ensure_min_samples=2, estimator=self - ) + X = self._validate_data(X, ensure_min_features=2, ensure_min_samples=2) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) @@ -856,7 +854,7 @@ def fit(self, X, y=None): Returns the instance itself. 
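Editorial aside: the new `assign_labels="cluster_qr"` option introduced and tested above can be used as in this sketch. Toy data and the two-blob setup are placeholders; a build containing this change is assumed.

```python
# Sketch: spectral clustering with the QR-based label assignment added by
# this patch (no tuning parameters, no iterations).
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.datasets import make_blobs

X, _ = make_blobs(
    n_samples=200, centers=[[1, 1], [-1, -1]], cluster_std=0.1, random_state=0
)
model = SpectralClustering(
    n_clusters=2, assign_labels="cluster_qr", random_state=0
).fit(X)
print(np.unique(model.labels_))  # two cluster labels, e.g. [0 1]
```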
""" # Covariance does not make sense for a single feature - X = self._validate_data(X, ensure_min_features=2, estimator=self) + X = self._validate_data(X, ensure_min_features=2) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 3d4012e6050ff..202a3c3cca064 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -212,7 +212,9 @@ def fit(self, X, Y): X = self._validate_data( X, dtype=np.float64, copy=self.copy, ensure_min_samples=2 ) - Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False) + Y = check_array( + Y, input_name="Y", dtype=np.float64, copy=self.copy, ensure_2d=False + ) if Y.ndim == 1: Y = Y.reshape(-1, 1) @@ -388,7 +390,9 @@ def transform(self, X, Y=None, copy=True): # Apply rotation x_scores = np.dot(X, self.x_rotations_) if Y is not None: - Y = check_array(Y, ensure_2d=False, copy=copy, dtype=FLOAT_DTYPES) + Y = check_array( + Y, input_name="Y", ensure_2d=False, copy=copy, dtype=FLOAT_DTYPES + ) if Y.ndim == 1: Y = Y.reshape(-1, 1) Y -= self._y_mean @@ -424,7 +428,7 @@ def inverse_transform(self, X, Y=None): This transformation will only be exact if `n_components=n_features`. """ check_is_fitted(self) - X = check_array(X, dtype=FLOAT_DTYPES) + X = check_array(X, input_name="X", dtype=FLOAT_DTYPES) # From pls space to original space X_reconstructed = np.matmul(X, self.x_loadings_.T) # Denormalize @@ -432,7 +436,7 @@ def inverse_transform(self, X, Y=None): X_reconstructed += self._x_mean if Y is not None: - Y = check_array(Y, dtype=FLOAT_DTYPES) + Y = check_array(Y, input_name="Y", dtype=FLOAT_DTYPES) # From pls space to original space Y_reconstructed = np.matmul(Y, self.y_loadings_.T) # Denormalize @@ -1036,7 +1040,9 @@ def fit(self, X, Y): X = self._validate_data( X, dtype=np.float64, copy=self.copy, ensure_min_samples=2 ) - Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False) + Y = check_array( + Y, input_name="Y", dtype=np.float64, copy=self.copy, ensure_2d=False + ) if Y.ndim == 1: Y = Y.reshape(-1, 1) @@ -1151,7 +1157,7 @@ def transform(self, X, Y=None): Xr = (X - self._x_mean) / self._x_std x_scores = np.dot(Xr, self.x_weights_) if Y is not None: - Y = check_array(Y, ensure_2d=False, dtype=np.float64) + Y = check_array(Y, input_name="Y", ensure_2d=False, dtype=np.float64) if Y.ndim == 1: Y = Y.reshape(-1, 1) Yr = (Y - self._y_mean) / self._y_std diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index dab3c92d654bb..55e31ca9fe5a5 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -928,7 +928,7 @@ def load_diabetes(*, return_X_y=False, as_frame=False): Parameters ---------- - return_X_y : bool, default=False. + return_X_y : bool, default=False If True, returns ``(data, target)`` instead of a Bunch object. See below for more information about the `data` and `target` object. @@ -969,7 +969,9 @@ def load_diabetes(*, return_X_y=False, as_frame=False): The path to the location of the target. (data, target) : tuple if ``return_X_y`` is True - + Returns a tuple of two ndarray of shape (n_samples, n_features) + A 2D array with each row representing one sample and each column + representing the features and/or target of a given sample. .. 
versionadded:: 0.18 """ data_filename = "diabetes_data.csv.gz" diff --git a/sklearn/datasets/_samples_generator.py b/sklearn/datasets/_samples_generator.py index a3d51ad54ffcb..9988650410de7 100644 --- a/sklearn/datasets/_samples_generator.py +++ b/sklearn/datasets/_samples_generator.py @@ -1461,7 +1461,7 @@ def make_sparse_spd_matrix( return prec -def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None): +def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None, hole=False): """Generate a swiss roll dataset. Read more in the :ref:`User Guide `. @@ -1469,7 +1469,7 @@ def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None): Parameters ---------- n_samples : int, default=100 - The number of sample points on the S curve. + The number of sample points on the Swiss Roll. noise : float, default=0.0 The standard deviation of the gaussian noise. @@ -1479,6 +1479,9 @@ def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None): for reproducible output across multiple function calls. See :term:`Glossary `. + hole : bool, default=False + If True generates the swiss roll with hole dataset. + Returns ------- X : ndarray of shape (n_samples, 3) @@ -1500,12 +1503,22 @@ def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None): """ generator = check_random_state(random_state) - t = 1.5 * np.pi * (1 + 2 * generator.rand(1, n_samples)) + if not hole: + t = 1.5 * np.pi * (1 + 2 * generator.rand(n_samples)) + y = 21 * generator.rand(n_samples) + else: + corners = np.array( + [[np.pi * (1.5 + i), j * 7] for i in range(3) for j in range(3)] + ) + corners = np.delete(corners, 4, axis=0) + corner_index = generator.choice(8, n_samples) + parameters = generator.rand(2, n_samples) * np.array([[np.pi], [7]]) + t, y = corners[corner_index].T + parameters + x = t * np.cos(t) - y = 21 * generator.rand(1, n_samples) z = t * np.sin(t) - X = np.concatenate((x, y, z)) + X = np.vstack((x, y, z)) X += noise * generator.randn(3, n_samples) X = X.T t = np.squeeze(t) diff --git a/sklearn/datasets/_svmlight_format_fast.pyx b/sklearn/datasets/_svmlight_format_fast.pyx index 9644ecbbd20a5..12d222f8cf581 100644 --- a/sklearn/datasets/_svmlight_format_fast.pyx +++ b/sklearn/datasets/_svmlight_format_fast.pyx @@ -4,8 +4,6 @@ # Lars Buitinck # Olivier Grisel # License: BSD 3 clause -# -# cython: boundscheck=False, wraparound=False import array from cpython cimport array diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index 3e27a6119554e..f4e136742311b 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -333,7 +333,7 @@ def test_load_boston_warning(): @pytest.mark.filterwarnings("ignore:Function load_boston is deprecated") def test_load_boston_alternative(): pd = pytest.importorskip("pandas") - if not os.environ.get("SKLEARN_SKIP_NETWORK_TESTS", "1") == "1": + if os.environ.get("SKLEARN_SKIP_NETWORK_TESTS", "1") == "1": raise SkipTest( "This test requires an internet connection to fetch the dataset." 
) diff --git a/sklearn/datasets/tests/test_samples_generator.py b/sklearn/datasets/tests/test_samples_generator.py index 83cc0252ba993..50a1d11097b2e 100644 --- a/sklearn/datasets/tests/test_samples_generator.py +++ b/sklearn/datasets/tests/test_samples_generator.py @@ -523,11 +523,12 @@ def test_make_spd_matrix(): ) -def test_make_swiss_roll(): - X, t = make_swiss_roll(n_samples=5, noise=0.0, random_state=0) +@pytest.mark.parametrize("hole", [False, True]) +def test_make_swiss_roll(hole): + X, t = make_swiss_roll(n_samples=5, noise=0.0, random_state=0, hole=hole) - assert X.shape == (5, 3), "X shape mismatch" - assert t.shape == (5,), "t shape mismatch" + assert X.shape == (5, 3) + assert t.shape == (5,) assert_array_almost_equal(X[:, 0], t * np.cos(t)) assert_array_almost_equal(X[:, 2], t * np.sin(t)) diff --git a/sklearn/decomposition/_base.py b/sklearn/decomposition/_base.py index e503a52ee1f92..7904ce17f7212 100644 --- a/sklearn/decomposition/_base.py +++ b/sklearn/decomposition/_base.py @@ -11,12 +11,14 @@ import numpy as np from scipy import linalg -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..utils.validation import check_is_fitted from abc import ABCMeta, abstractmethod -class _BasePCA(TransformerMixin, BaseEstimator, metaclass=ABCMeta): +class _BasePCA( + _ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator, metaclass=ABCMeta +): """Base class for PCA methods. Warning: This class should not be used directly. @@ -154,3 +156,8 @@ def inverse_transform(self, X): ) else: return np.dot(X, self.components_) + self.mean_ + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/sklearn/decomposition/_cdnmf_fast.pyx b/sklearn/decomposition/_cdnmf_fast.pyx index 9c6b171096ced..c50e09e1632c7 100644 --- a/sklearn/decomposition/_cdnmf_fast.pyx +++ b/sklearn/decomposition/_cdnmf_fast.pyx @@ -1,7 +1,3 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False - # Author: Mathieu Blondel, Tom Dupre la Tour # License: BSD 3 clause diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index a18adb6f1e3bc..aee341bd88b05 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -14,7 +14,7 @@ from scipy import linalg from joblib import Parallel, effective_n_jobs -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..utils import deprecated from ..utils import check_array, check_random_state, gen_even_slices, gen_batches from ..utils.extmath import randomized_svd, row_norms, svd_flip @@ -480,8 +480,8 @@ def _update_dict( if positive: np.clip(dictionary[k], 0, None, out=dictionary[k]) - # Projection on the constraint set ||V_k|| == 1 - dictionary[k] /= linalg.norm(dictionary[k]) + # Projection on the constraint set ||V_k|| <= 1 + dictionary[k] /= max(linalg.norm(dictionary[k]), 1) if verbose and n_unused > 0: print(f"{n_unused} unused atoms resampled.") @@ -762,8 +762,9 @@ def dict_learning_online( X : ndarray of shape (n_samples, n_features) Data matrix. - n_components : int, default=2 - Number of dictionary atoms to extract. + n_components : int or None, default=2 + Number of dictionary atoms to extract. If None, then ``n_components`` + is set to ``n_features``. 
alpha : float, default=1 Sparsity controlling parameter. @@ -1013,7 +1014,7 @@ def dict_learning_online( return dictionary -class _BaseSparseCoding(TransformerMixin): +class _BaseSparseCoding(_ClassNamePrefixFeaturesOutMixin, TransformerMixin): """Base class from SparseCoder and DictionaryLearning algorithms.""" def __init__( @@ -1314,6 +1315,11 @@ def n_features_in_(self): """Number of features seen during `fit`.""" return self.dictionary.shape[1] + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.n_components_ + class DictionaryLearning(_BaseSparseCoding, BaseEstimator): """Dictionary learning. @@ -1325,7 +1331,7 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator): (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 (U,V) - with || V_k ||_2 = 1 for all 0 <= k < n_components + with || V_k ||_2 <= 1 for all 0 <= k < n_components ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm which is the sum of the absolute values @@ -1335,8 +1341,9 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator): Parameters ---------- - n_components : int, default=n_features - Number of dictionary elements to extract. + n_components : int, default=None + Number of dictionary elements to extract. If None, then ``n_components`` + is set to ``n_features``. alpha : float, default=1.0 Sparsity controlling parameter. @@ -1585,6 +1592,11 @@ def fit(self, X, y=None): self.error_ = E return self + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] + class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): """Mini-batch dictionary learning. @@ -1596,7 +1608,7 @@ class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 (U,V) - with || V_k ||_2 = 1 for all 0 <= k < n_components + with || V_k ||_2 <= 1 for all 0 <= k < n_components ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm which is the sum of the absolute values @@ -1924,3 +1936,8 @@ def partial_fit(self, X, y=None, iter_offset=None): self.inner_stats_ = (A, B) self.iter_offset_ = iter_offset + 1 return self + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index fcf96cb0eb532..8ff5b54d4e839 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -25,14 +25,14 @@ from scipy import linalg -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..utils import check_random_state from ..utils.extmath import fast_logdet, randomized_svd, squared_norm from ..utils.validation import check_is_fitted from ..exceptions import ConvergenceWarning -class FactorAnalysis(TransformerMixin, BaseEstimator): +class FactorAnalysis(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """Factor Analysis (FA). A simple linear generative model with Gaussian latent variables. 
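Editorial aside: the `_ClassNamePrefixFeaturesOutMixin` wired into the decomposition classes in these hunks generates output feature names from the estimator's class name and `_n_features_out`. A rough sketch of the resulting behaviour follows; the exact `pca0`, `pca1` naming format is an assumption based on the lowercased class name, not spelled out in the patch.

```python
# Sketch of the prefixed feature names these mixin changes provide; requires
# a build containing this change. PCA inherits the mixin via _BasePCA.
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

X, _ = load_iris(return_X_y=True)
pca = PCA(n_components=2).fit(X)
print(pca.get_feature_names_out())  # assumed to print e.g. ['pca0' 'pca1']
```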
@@ -426,6 +426,11 @@ def _rotate(self, components, n_components=None, tol=1e-6): else: raise ValueError("'method' must be in %s, not %s" % (implemented, method)) + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] + def _ortho_rotation(components, method="varimax", tol=1e-6, max_iter=100): """Return rotated components.""" diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 9d4bcc9026926..b9e887f5e719f 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -14,7 +14,7 @@ import numpy as np from scipy import linalg -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, as_float_array, check_random_state @@ -152,7 +152,7 @@ def fastica( n_components=None, *, algorithm="parallel", - whiten=True, + whiten="warn", fun="logcosh", fun_args=None, max_iter=200, @@ -182,12 +182,18 @@ def fastica( algorithm : {'parallel', 'deflation'}, default='parallel' Apply a parallel or deflational FASTICA algorithm. - whiten : bool, default=True - If True perform an initial whitening of the data. - If False, the data is assumed to have already been - preprocessed: it should be centered, normed and white. - Otherwise you will get incorrect results. - In this case the parameter n_components will be ignored. + whiten : str or bool, default="warn" + Specify the whitening strategy to use. + If 'arbitrary-variance' (default), a whitening with variance arbitrary is used. + If 'unit-variance', the whitening matrix is rescaled to ensure that each + recovered source has unit variance. + If False, the data is already considered to be whitened, and no + whitening is performed. + + .. deprecated:: 1.1 + From version 1.3, `whiten='unit-variance'` will be used by default. + `whiten=True` is deprecated from 1.1 and will raise ValueError in 1.3. + Use `whiten=arbitrary-variance` instead. fun : {'logcosh', 'exp', 'cube'} or callable, default='logcosh' The functional form of the G function used in the @@ -280,7 +286,6 @@ def my_g(x): Algorithms and Applications, Neural Networks, 13(4-5), 2000, pp. 411-430. 
""" - est = FastICA( n_components=n_components, algorithm=algorithm, @@ -292,34 +297,25 @@ def my_g(x): w_init=w_init, random_state=random_state, ) - sources = est._fit(X, compute_sources=compute_sources) - - if whiten: - if return_X_mean: - if return_n_iter: - return (est.whitening_, est._unmixing, sources, est.mean_, est.n_iter_) - else: - return est.whitening_, est._unmixing, sources, est.mean_ - else: - if return_n_iter: - return est.whitening_, est._unmixing, sources, est.n_iter_ - else: - return est.whitening_, est._unmixing, sources + S = est._fit(X, compute_sources=compute_sources) + if est._whiten in ["unit-variance", "arbitrary-variance"]: + K = est.whitening_ + X_mean = est.mean_ else: - if return_X_mean: - if return_n_iter: - return None, est._unmixing, sources, None, est.n_iter_ - else: - return None, est._unmixing, sources, None - else: - if return_n_iter: - return None, est._unmixing, sources, est.n_iter_ - else: - return None, est._unmixing, sources + K = None + X_mean = None + returned_values = [K, est._unmixing, S] + if return_X_mean: + returned_values.append(X_mean) + if return_n_iter: + returned_values.append(est.n_iter_) -class FastICA(TransformerMixin, BaseEstimator): + return returned_values + + +class FastICA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """FastICA: a fast algorithm for Independent Component Analysis. The implementation is based on [1]_. @@ -334,9 +330,18 @@ class FastICA(TransformerMixin, BaseEstimator): algorithm : {'parallel', 'deflation'}, default='parallel' Apply parallel or deflational algorithm for FastICA. - whiten : bool, default=True - If whiten is false, the data is already considered to be - whitened, and no whitening is performed. + whiten : str or bool, default="warn" + Specify the whitening strategy to use. + If 'arbitrary-variance' (default), a whitening with variance arbitrary is used. + If 'unit-variance', the whitening matrix is rescaled to ensure that each + recovered source has unit variance. + If False, the data is already considered to be whitened, and no + whitening is performed. + + .. deprecated:: 1.1 + From version 1.3 whiten='unit-variance' will be used by default. + `whiten=True` is deprecated from 1.1 and will raise ValueError in 1.3. + Use `whiten=arbitrary-variance` instead. fun : {'logcosh', 'exp', 'cube'} or callable, default='logcosh' The functional form of the G function used in the @@ -424,7 +429,8 @@ def my_g(x): >>> from sklearn.decomposition import FastICA >>> X, _ = load_digits(return_X_y=True) >>> transformer = FastICA(n_components=7, - ... random_state=0) + ... random_state=0, + ... whiten='unit-variance') >>> X_transformed = transformer.fit_transform(X) >>> X_transformed.shape (1797, 7) @@ -435,7 +441,7 @@ def __init__( n_components=None, *, algorithm="parallel", - whiten=True, + whiten="warn", fun="logcosh", fun_args=None, max_iter=200, @@ -455,7 +461,7 @@ def __init__( self.random_state = random_state def _fit(self, X, compute_sources=False): - """Fit the model + """Fit the model. Parameters ---------- @@ -472,8 +478,27 @@ def _fit(self, X, compute_sources=False): S : ndarray of shape (n_samples, n_components) or None Sources matrix. `None` if `compute_sources` is `False`. 
""" + self._whiten = self.whiten + + if self._whiten == "warn": + warnings.warn( + "From version 1.3 whiten='unit-variance' will be used by default.", + FutureWarning, + ) + self._whiten = "arbitrary-variance" + + if self._whiten is True: + warnings.warn( + "From version 1.3 whiten=True should be specified as " + "whiten='arbitrary-variance' (its current behaviour). This " + "behavior is deprecated in 1.1 and will raise ValueError in 1.3.", + FutureWarning, + stacklevel=2, + ) + self._whiten = "arbitrary-variance" + XT = self._validate_data( - X, copy=self.whiten, dtype=FLOAT_DTYPES, ensure_min_samples=2 + X, copy=self._whiten, dtype=FLOAT_DTYPES, ensure_min_samples=2 ).T fun_args = {} if self.fun_args is None else self.fun_args random_state = check_random_state(self.random_state) @@ -504,7 +529,7 @@ def g(x, fun_args): n_features, n_samples = XT.shape n_components = self.n_components - if not self.whiten and n_components is not None: + if not self._whiten and n_components is not None: n_components = None warnings.warn("Ignoring n_components with whiten=False.") @@ -516,7 +541,7 @@ def g(x, fun_args): "n_components is too large: it will be set to %s" % n_components ) - if self.whiten: + if self._whiten: # Centering the features of X X_mean = XT.mean(axis=-1) XT -= X_mean[:, np.newaxis] @@ -575,17 +600,24 @@ def g(x, fun_args): ) del X1 + self.n_iter_ = n_iter + if compute_sources: - if self.whiten: + if self._whiten: S = np.linalg.multi_dot([W, K, XT]).T else: S = np.dot(W, XT).T else: S = None - self.n_iter_ = n_iter + if self._whiten: + if self._whiten == "unit-variance": + if not compute_sources: + S = np.linalg.multi_dot([W, K, XT]).T + S_std = np.std(S, axis=0, keepdims=True) + S /= S_std + W /= S_std.T - if self.whiten: self.components_ = np.dot(W, K) self.mean_ = X_mean self.whitening_ = K @@ -658,9 +690,9 @@ def transform(self, X, copy=True): check_is_fitted(self) X = self._validate_data( - X, copy=(copy and self.whiten), dtype=FLOAT_DTYPES, reset=False + X, copy=(copy and self._whiten), dtype=FLOAT_DTYPES, reset=False ) - if self.whiten: + if self._whiten: X -= self.mean_ return np.dot(X, self.components_.T) @@ -683,9 +715,14 @@ def inverse_transform(self, X, copy=True): """ check_is_fitted(self) - X = check_array(X, copy=(copy and self.whiten), dtype=FLOAT_DTYPES) + X = check_array(X, copy=(copy and self._whiten), dtype=FLOAT_DTYPES) X = np.dot(X, self.mixing_.T) - if self.whiten: + if self._whiten: X += self.mean_ return X + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index aee9b46899cd6..bf08c7e215ccc 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -10,25 +10,28 @@ from ..utils._arpack import _init_arpack_v0 from ..utils.extmath import svd_flip, _randomized_eigsh -from ..utils.validation import check_is_fitted, _check_psd_eigenvalues +from ..utils.validation import ( + check_is_fitted, + _check_psd_eigenvalues, +) from ..utils.deprecation import deprecated from ..exceptions import NotFittedError -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..preprocessing import KernelCenterer from ..metrics.pairwise import pairwise_kernels -class KernelPCA(TransformerMixin, BaseEstimator): - """Kernel Principal component analysis (KPCA). 
+class KernelPCA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """Kernel Principal component analysis (KPCA) [1]_. Non-linear dimensionality reduction through the use of kernels (see :ref:`metrics`). - It uses the `scipy.linalg.eigh` LAPACK implementation of the full SVD or - the `scipy.sparse.linalg.eigsh` ARPACK implementation of the truncated SVD, - depending on the shape of the input data and the number of components to - extract. It can also use a randomized truncated SVD by the method of - Halko et al. 2009, see `eigen_solver`. + It uses the :func:`scipy.linalg.eigh` LAPACK implementation of the full SVD + or the :func:`scipy.sparse.linalg.eigsh` ARPACK implementation of the + truncated SVD, depending on the shape of the input data and the number of + components to extract. It can also use a randomized truncated SVD by the + method proposed in [3]_, see `eigen_solver`. Read more in the :ref:`User Guide `. @@ -63,14 +66,16 @@ class KernelPCA(TransformerMixin, BaseEstimator): fit_inverse_transform : bool, default=False Learn the inverse transform for non-precomputed kernels - (i.e. learn to find the pre-image of a point). + (i.e. learn to find the pre-image of a point). This method is based + on [2]_. eigen_solver : {'auto', 'dense', 'arpack', 'randomized'}, \ - default='auto' + default='auto' Select eigensolver to use. If `n_components` is much less than the number of training samples, randomized (or arpack to a smaller extend) may be more efficient than the dense eigensolver. - Randomized SVD is performed according to the method of Halko et al. + Randomized SVD is performed according to the method of Halko et al + [3]_. auto : the solver is selected by a default policy based on n_samples @@ -89,10 +94,10 @@ class KernelPCA(TransformerMixin, BaseEstimator): `scipy.sparse.linalg.eigsh`. It requires strictly 0 < n_components < n_samples randomized : - run randomized SVD by the method of Halko et al. The current + run randomized SVD by the method of Halko et al. [3]_. The current implementation selects eigenvalues based on their module; therefore using this method can lead to unexpected results if the kernel is - not positive semi-definite. + not positive semi-definite. See also [4]_. .. versionchanged:: 1.0 `'randomized'` was added. @@ -200,20 +205,26 @@ class KernelPCA(TransformerMixin, BaseEstimator): References ---------- - Kernel PCA was introduced in: - Bernhard Schoelkopf, Alexander J. Smola, - and Klaus-Robert Mueller. 1999. Kernel principal - component analysis. In Advances in kernel methods, - MIT Press, Cambridge, MA, USA 327-352. - - For eigen_solver == 'arpack', refer to `scipy.sparse.linalg.eigsh`. - - For eigen_solver == 'randomized', see: - Finding structure with randomness: Stochastic algorithms - for constructing approximate matrix decompositions Halko, et al., 2009 - (arXiv:909) - A randomized algorithm for the decomposition of matrices - Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert + .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller. + "Kernel principal component analysis." + International conference on artificial neural networks. + Springer, Berlin, Heidelberg, 1997. + `_ + + .. [2] `Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf. + "Learning to find pre-images." + Advances in neural information processing systems 16 (2004): 449-456. + `_ + + .. [3] :arxiv:`Halko, Nathan, Per-Gunnar Martinsson, and Joel A. Tropp. 
+ "Finding structure with randomness: Probabilistic algorithms for + constructing approximate matrix decompositions." + SIAM review 53.2 (2011): 217-288. <0909.4061>` + + .. [4] `Martinsson, Per-Gunnar, Vladimir Rokhlin, and Mark Tygert. + "A randomized algorithm for the decomposition of matrices." + Applied and Computational Harmonic Analysis 30.1 (2011): 47-68. + `_ Examples -------- @@ -246,8 +257,6 @@ def __init__( copy_X=True, n_jobs=None, ): - if fit_inverse_transform and kernel == "precomputed": - raise ValueError("Cannot fit_inverse_transform with a precomputed kernel.") self.n_components = n_components self.kernel = kernel self.kernel_params = kernel_params @@ -418,6 +427,8 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ + if self.fit_inverse_transform and self.kernel == "precomputed": + raise ValueError("Cannot fit_inverse_transform with a precomputed kernel.") X = self._validate_data(X, accept_sparse="csr", copy=self.copy_X) self._centerer = KernelCenterer() K = self._get_kernel(X) @@ -529,7 +540,10 @@ def inverse_transform(self, X): References ---------- - "Learning to Find Pre-Images", G BakIr et al, 2004. + `Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf. + "Learning to find pre-images." + Advances in neural information processing systems 16 (2004): 449-456. + `_ """ if not self.fit_inverse_transform: raise NotFittedError( @@ -546,3 +560,8 @@ def _more_tags(self): "preserves_dtype": [np.float64, np.float32], "pairwise": self.kernel == "precomputed", } + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.eigenvalues_.shape[0] diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index a723e3451e24f..6db9d900566eb 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -16,7 +16,7 @@ from scipy.special import gammaln, logsumexp from joblib import Parallel, effective_n_jobs -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..utils import check_random_state, gen_batches, gen_even_slices from ..utils.validation import check_non_negative from ..utils.validation import check_is_fitted @@ -138,7 +138,9 @@ def _update_doc_distribution( return (doc_topic_distr, suff_stats) -class LatentDirichletAllocation(TransformerMixin, BaseEstimator): +class LatentDirichletAllocation( + _ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator +): """Latent Dirichlet Allocation with online variational Bayes algorithm. The implementation is based on [1]_ and [2]_. 
@@ -887,3 +889,8 @@ def perplexity(self, X, sub_sampling=False): X, reset_n_features=True, whom="LatentDirichletAllocation.perplexity" ) return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling) + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index d914bd5b6126d..cc1451be54567 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -15,11 +15,14 @@ from ._cdnmf_fast import _update_cdnmf_fast from .._config import config_context -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..exceptions import ConvergenceWarning from ..utils import check_random_state, check_array from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm -from ..utils.validation import check_is_fitted, check_non_negative +from ..utils.validation import ( + check_is_fitted, + check_non_negative, +) EPSILON = np.finfo(np.float32).eps @@ -226,7 +229,7 @@ def _beta_loss_to_float(beta_loss): return beta_loss -def _initialize_nmf(X, n_components, init="warn", eps=1e-6, random_state=None): +def _initialize_nmf(X, n_components, init=None, eps=1e-6, random_state=None): """Algorithms for NMF initialization. Computes an initial guess for the non-negative @@ -242,10 +245,9 @@ def _initialize_nmf(X, n_components, init="warn", eps=1e-6, random_state=None): init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar'}, default=None Method used to initialize the procedure. - Default: None. Valid options: - - None: 'nndsvd' if n_components <= min(n_samples, n_features), + - None: 'nndsvda' if n_components <= min(n_samples, n_features), otherwise 'random'. - 'random': non-negative random matrices, scaled with: @@ -263,6 +265,10 @@ def _initialize_nmf(X, n_components, init="warn", eps=1e-6, random_state=None): - 'custom': use custom matrices W and H + .. versionchanged:: 1.1 + When `init=None` and n_components is less than n_samples and n_features + defaults to `nndsvda` instead of `nndsvd`. + eps : float, default=1e-6 Truncate all values less then this in output to zero. @@ -285,16 +291,6 @@ def _initialize_nmf(X, n_components, init="warn", eps=1e-6, random_state=None): nonnegative matrix factorization - Pattern Recognition, 2008 http://tinyurl.com/nndsvd """ - if init == "warn": - warnings.warn( - "The 'init' value, when 'init=None' and " - "n_components is less than n_samples and " - "n_features, will be changed from 'nndsvd' to " - "'nndsvda' in 1.1 (renaming of 0.26).", - FutureWarning, - ) - init = None - check_non_negative(X, "NMF initialization") n_samples, n_features = X.shape @@ -310,7 +306,7 @@ def _initialize_nmf(X, n_components, init="warn", eps=1e-6, random_state=None): if init is None: if n_components <= min(n_samples, n_features): - init = "nndsvd" + init = "nndsvda" else: init = "random" @@ -877,7 +873,7 @@ def non_negative_factorization( H=None, n_components=None, *, - init="warn", + init=None, update_H=True, solver="cd", beta_loss="frobenius", @@ -950,7 +946,7 @@ def non_negative_factorization( Valid options: - - None: 'nndsvd' if n_components < n_features, otherwise 'random'. + - None: 'nndsvda' if n_components < n_features, otherwise 'random'. - 'random': non-negative random matrices, scaled with: sqrt(X.mean() / n_components) @@ -971,6 +967,10 @@ def non_negative_factorization( .. 
versionchanged:: 0.23 The default value of `init` changed from 'random' to None in 0.23. + .. versionchanged:: 1.1 + When `init=None` and n_components is less than n_samples and n_features + defaults to `nndsvda` instead of `nndsvd`. + update_H : bool, default=True Set to True, both W and H will be estimated from initial guesses. Set to False, only W will be estimated. @@ -1109,7 +1109,7 @@ def non_negative_factorization( return W, H, n_iter -class NMF(TransformerMixin, BaseEstimator): +class NMF(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """Non-Negative Matrix Factorization (NMF). Find two non-negative matrices (W, H) whose product approximates the non- @@ -1160,7 +1160,7 @@ class NMF(TransformerMixin, BaseEstimator): Default: None. Valid options: - - `None`: 'nndsvd' if n_components <= min(n_samples, n_features), + - `None`: 'nndsvda' if n_components <= min(n_samples, n_features), otherwise random. - `'random'`: non-negative random matrices, scaled with: @@ -1178,6 +1178,10 @@ class NMF(TransformerMixin, BaseEstimator): - `'custom'`: use custom matrices W and H + .. versionchanged:: 1.1 + When `init=None` and n_components is less than n_samples and n_features + defaults to `nndsvda` instead of `nndsvd`. + solver : {'cd', 'mu'}, default='cd' Numerical solver to use: 'cd' is a Coordinate Descent solver. @@ -1331,7 +1335,7 @@ def __init__( self, n_components=None, *, - init="warn", + init=None, solver="cd", beta_loss="frobenius", tol=1e-4, @@ -1708,3 +1712,8 @@ def inverse_transform(self, W): """ check_is_fitted(self) return np.dot(W, self.components_) + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/sklearn/decomposition/_online_lda_fast.pyx b/sklearn/decomposition/_online_lda_fast.pyx index 1c00af02d2375..446232a57f084 100644 --- a/sklearn/decomposition/_online_lda_fast.pyx +++ b/sklearn/decomposition/_online_lda_fast.pyx @@ -1,6 +1,3 @@ -# -# cython: boundscheck=False, wraparound=False - cimport cython cimport numpy as np import numpy as np @@ -91,7 +88,6 @@ def _dirichlet_expectation_2d(np.ndarray[ndim=2, dtype=np.float64_t] arr): # # After: J. Bernardo (1976). Algorithm AS 103: Psi (Digamma) Function. # https://www.uv.es/~bernardo/1976AppStatist.pdf -@cython.cdivision(True) cdef double psi(double x) nogil: if x <= 1e-6: # psi(x) = -EULER - 1/x + O(x) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index c87a0d852617a..30d71299ce01a 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -20,7 +20,7 @@ from scipy.sparse.linalg import svds from ._base import _BasePCA -from ..utils import check_random_state +from ..utils import check_random_state, check_scalar from ..utils._arpack import _init_arpack_v0 from ..utils.extmath import fast_logdet, randomized_svd, svd_flip from ..utils.extmath import stable_cumsum @@ -203,6 +203,14 @@ class PCA(_BasePCA): .. versionadded:: 0.18.0 + n_oversamples : int, default=10 + This parameter is only relevant when `svd_solver="randomized"`. + It corresponds to the additional number of random vectors to sample the + range of `X` so as to ensure proper conditioning. See + :func:`~sklearn.utils.extmath.randomized_svd` for more details. + + .. versionadded:: 1.1 + random_state : int, RandomState instance or None, default=None Used when the 'arpack' or 'randomized' solvers are used. Pass an int for reproducible results across multiple function calls. 
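# --- Illustrative sketch (not part of the patch above) ---------------------
# With `init=None`, NMF and non_negative_factorization now fall back to
# 'nndsvda' instead of 'nndsvd' whenever n_components <= min(n_samples,
# n_features); the FutureWarning that announced this change is removed.
# Passing `init` explicitly restores the previous initialization.
import numpy as np
from sklearn.decomposition import NMF

rng = np.random.RandomState(0)
X = np.abs(rng.randn(10, 4))

W_new = NMF(n_components=3, random_state=0).fit_transform(X)                 # 'nndsvda'
W_old = NMF(n_components=3, init="nndsvd", random_state=0).fit_transform(X)  # old default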
@@ -352,6 +360,7 @@ def __init__( svd_solver="auto", tol=0.0, iterated_power="auto", + n_oversamples=10, random_state=None, ): self.n_components = n_components @@ -360,6 +369,7 @@ def __init__( self.svd_solver = svd_solver self.tol = tol self.iterated_power = iterated_power + self.n_oversamples = n_oversamples self.random_state = random_state def fit(self, X, y=None): @@ -379,6 +389,13 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ + check_scalar( + self.n_oversamples, + "n_oversamples", + min_val=1, + target_type=numbers.Integral, + ) + self._fit(X) return self @@ -580,6 +597,7 @@ def _fit_truncated(self, X, n_components, svd_solver): U, S, Vt = randomized_svd( X, n_components=n_components, + n_oversamples=self.n_oversamples, n_iter=self.iterated_power, flip_sign=True, random_state=random_state, diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index cb05423283174..31c8d2168a3e6 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -7,11 +7,11 @@ from ..utils import check_random_state from ..utils.validation import check_is_fitted from ..linear_model import ridge_regression -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ._dict_learning import dict_learning, dict_learning_online -class SparsePCA(TransformerMixin, BaseEstimator): +class SparsePCA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """Sparse Principal Components Analysis (SparsePCA). Finds the set of sparse components that can optimally reconstruct @@ -24,7 +24,7 @@ class SparsePCA(TransformerMixin, BaseEstimator): ---------- n_components : int, default=None Number of sparse atoms to extract. If None, then ``n_components`` - is set to ``n_features_in_``. + is set to ``n_features``. alpha : float, default=1 Sparsity controlling parameter. Higher values lead to sparser @@ -236,6 +236,11 @@ def transform(self, X): return U + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] + class MiniBatchSparsePCA(SparsePCA): """Mini-batch Sparse Principal Components Analysis. @@ -249,7 +254,8 @@ class MiniBatchSparsePCA(SparsePCA): Parameters ---------- n_components : int, default=None - Number of sparse atoms to extract. + Number of sparse atoms to extract. If None, then ``n_components`` + is set to ``n_features``. alpha : int, default=1 Sparsity controlling parameter. Higher values lead to sparser diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 21ed87eca5fd1..01d79f742302f 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -10,7 +10,7 @@ import scipy.sparse as sp from scipy.sparse.linalg import svds -from ..base import BaseEstimator, TransformerMixin +from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..utils import check_array, check_random_state from ..utils._arpack import _init_arpack_v0 from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip @@ -21,7 +21,7 @@ __all__ = ["TruncatedSVD"] -class TruncatedSVD(TransformerMixin, BaseEstimator): +class TruncatedSVD(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """Dimensionality reduction using truncated SVD (aka LSA). 
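# --- Illustrative sketch (not part of the patch above) ---------------------
# PCA's new `n_oversamples` is forwarded to
# sklearn.utils.extmath.randomized_svd and only matters for
# svd_solver="randomized"; raising it (here to n_features) trades compute
# for accuracy on wide inputs, and invalid values now fail fast via
# check_scalar.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(1_000, 100)

pca = PCA(n_components=1, svd_solver="randomized", n_oversamples=100, random_state=0)
pca.fit(X)

try:
    PCA(n_components=1, n_oversamples=0).fit(X)
except ValueError as exc:
    print(exc)  # n_oversamples == 0, must be >= 1.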
This transformer performs linear dimensionality reduction by means of @@ -273,3 +273,8 @@ def inverse_transform(self, X): def _more_tags(self): return {"preserves_dtype": [np.float64, np.float32]} + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index 1270287ec844a..9ce477fffcd9d 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -664,3 +664,21 @@ def test_warning_default_transform_alpha(Estimator): dl = Estimator(alpha=0.1) with pytest.warns(FutureWarning, match="default transform_alpha"): dl.fit_transform(X) + + +@pytest.mark.parametrize( + "estimator", + [SparseCoder(X.T), DictionaryLearning(), MiniBatchDictionaryLearning()], + ids=lambda x: x.__class__.__name__, +) +def test_get_feature_names_out(estimator): + """Check feature names for dict learning estimators.""" + estimator.fit(X) + n_components = X.shape[1] + + feature_names_out = estimator.get_feature_names_out() + estimator_name = estimator.__class__.__name__.lower() + assert_array_equal( + feature_names_out, + [f"{estimator_name}{i}" for i in range(n_components)], + ) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 8caf56d00b703..0a61e4fe53bff 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -2,7 +2,6 @@ Test the fastica algorithm. """ import itertools -import warnings import pytest import numpy as np @@ -10,6 +9,7 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal +from sklearn.utils._testing import assert_array_equal from sklearn.decomposition import FastICA, fastica, PCA from sklearn.decomposition._fastica import _gs_decorrelation @@ -46,6 +46,10 @@ def test_gs(): assert (tmp[:5] ** 2).sum() < 1.0e-10 +# FIXME remove filter in 1.3 +@pytest.mark.filterwarnings( + "ignore:From version 1.3 whiten='unit-variance' will be used by default." +) @pytest.mark.parametrize("add_noise", [True, False]) @pytest.mark.parametrize("seed", range(1)) def test_fastica_simple(add_noise, seed): @@ -76,12 +80,14 @@ def g_test(x): algos = ["parallel", "deflation"] nls = ["logcosh", "exp", "cube", g_test] - whitening = [True, False] + whitening = ["arbitrary-variance", "unit-variance", False] for algo, nl, whiten in itertools.product(algos, nls, whitening): if whiten: - k_, mixing_, s_ = fastica(m.T, fun=nl, algorithm=algo, random_state=rng) + k_, mixing_, s_ = fastica( + m.T, fun=nl, whiten=whiten, algorithm=algo, random_state=rng + ) with pytest.raises(ValueError): - fastica(m.T, fun=np.tanh, algorithm=algo) + fastica(m.T, fun=np.tanh, whiten=whiten, algorithm=algo) else: pca = PCA(n_components=2, whiten=True, random_state=rng) X = pca.fit_transform(m.T) @@ -197,7 +203,9 @@ def test_non_square_fastica(add_noise): center_and_norm(m) - k_, mixing_, s_ = fastica(m.T, n_components=2, random_state=rng) + k_, mixing_, s_ = fastica( + m.T, n_components=2, whiten="unit-variance", random_state=rng + ) s_ = s_.T # Check that the mixing model described in the docstring holds: @@ -219,10 +227,16 @@ def test_non_square_fastica(add_noise): def test_fit_transform(): - # Test FastICA.fit_transform + """Test unit variance of transformed data using FastICA algorithm. 
+ + Check that `fit_transform` gives the same result as applying + `fit` and then `transform`. + + Bug #13056 + """ rng = np.random.RandomState(0) X = rng.random_sample((100, 10)) - for whiten, n_components in [[True, 5], [False, None]]: + for whiten, n_components in [["unit-variance", 5], [False, None]]: n_components_ = n_components if n_components is not None else X.shape[1] ica = FastICA(n_components=n_components, whiten=whiten, random_state=0) @@ -238,36 +252,39 @@ def test_fit_transform(): assert_array_almost_equal(Xt, Xt2) -def test_inverse_transform(): +@pytest.mark.filterwarnings("ignore:Ignoring n_components with whiten=False.") +@pytest.mark.parametrize( + "whiten, n_components, expected_mixing_shape", + [ + ("arbitrary-variance", 5, (10, 5)), + ("arbitrary-variance", 10, (10, 10)), + ("unit-variance", 5, (10, 5)), + ("unit-variance", 10, (10, 10)), + (False, 5, (10, 10)), + (False, 10, (10, 10)), + ], +) +def test_inverse_transform(whiten, n_components, expected_mixing_shape): # Test FastICA.inverse_transform - n_features = 10 n_samples = 100 - n1, n2 = 5, 10 rng = np.random.RandomState(0) - X = rng.random_sample((n_samples, n_features)) - expected = { - (True, n1): (n_features, n1), - (True, n2): (n_features, n2), - (False, n1): (n_features, n2), - (False, n2): (n_features, n2), - } - for whiten in [True, False]: - for n_components in [n1, n2]: - n_components_ = n_components if n_components is not None else X.shape[1] - ica = FastICA(n_components=n_components, random_state=rng, whiten=whiten) - with warnings.catch_warnings(record=True): - # catch "n_components ignored" warning - Xt = ica.fit_transform(X) - expected_shape = expected[(whiten, n_components_)] - assert ica.mixing_.shape == expected_shape - X2 = ica.inverse_transform(Xt) - assert X.shape == X2.shape - - # reversibility test in non-reduction case - if n_components == X.shape[1]: - assert_array_almost_equal(X, X2) + X = rng.random_sample((n_samples, 10)) + + ica = FastICA(n_components=n_components, random_state=rng, whiten=whiten) + Xt = ica.fit_transform(X) + assert ica.mixing_.shape == expected_mixing_shape + X2 = ica.inverse_transform(Xt) + assert X.shape == X2.shape + # reversibility test in non-reduction case + if n_components == X.shape[1]: + assert_array_almost_equal(X, X2) + +# FIXME remove filter in 1.3 +@pytest.mark.filterwarnings( + "ignore:From version 1.3 whiten='unit-variance' will be used by default." +) def test_fastica_errors(): n_features = 3 n_samples = 10 @@ -289,7 +306,70 @@ def test_fastica_errors(): fastica(X, algorithm="pizza") -@pytest.mark.parametrize("whiten", [True, False]) +def test_fastica_whiten_unit_variance(): + """Test unit variance of transformed data using FastICA algorithm. + + Bug #13056 + """ + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)) + n_components = X.shape[1] + ica = FastICA(n_components=n_components, whiten="unit-variance", random_state=0) + Xt = ica.fit_transform(X) + + assert np.var(Xt) == pytest.approx(1.0) + + +@pytest.mark.parametrize("ica", [FastICA(), FastICA(whiten=True)]) +def test_fastica_whiten_default_value_deprecation(ica): + """Test FastICA whiten default value deprecation. 
+ + Regression test for #19490 + """ + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)) + with pytest.warns(FutureWarning, match=r"From version 1.3 whiten="): + ica.fit(X) + assert ica._whiten == "arbitrary-variance" + + +def test_fastica_whiten_backwards_compatibility(): + """Test previous behavior for FastICA whitening (whiten=True) + + Regression test for #19490 + """ + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)) + n_components = X.shape[1] + + default_ica = FastICA(n_components=n_components, random_state=0) + with pytest.warns(FutureWarning): + Xt_on_default = default_ica.fit_transform(X) + + ica = FastICA(n_components=n_components, whiten=True, random_state=0) + with pytest.warns(FutureWarning): + Xt = ica.fit_transform(X) + + # No warning must be raised in this case. + av_ica = FastICA( + n_components=n_components, whiten="arbitrary-variance", random_state=0 + ) + with pytest.warns(None) as warn_record: + Xt_av = av_ica.fit_transform(X) + assert len(warn_record) == 0 + + # The whitening strategy must be "arbitrary-variance" in all the cases. + assert default_ica._whiten == "arbitrary-variance" + assert ica._whiten == "arbitrary-variance" + assert av_ica._whiten == "arbitrary-variance" + + assert_array_equal(Xt, Xt_on_default) + assert_array_equal(Xt, Xt_av) + + assert np.var(Xt) == pytest.approx(1.0 / 100) + + +@pytest.mark.parametrize("whiten", ["arbitrary-variance", "unit-variance", False]) @pytest.mark.parametrize("return_X_mean", [True, False]) @pytest.mark.parametrize("return_n_iter", [True, False]) def test_fastica_output_shape(whiten, return_X_mean, return_n_iter): diff --git a/sklearn/decomposition/tests/test_incremental_pca.py b/sklearn/decomposition/tests/test_incremental_pca.py index 756300d970072..2ae2187452eee 100644 --- a/sklearn/decomposition/tests/test_incremental_pca.py +++ b/sklearn/decomposition/tests/test_incremental_pca.py @@ -5,6 +5,7 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose_dense_sparse +from numpy.testing import assert_array_equal from sklearn import datasets from sklearn.decomposition import PCA, IncrementalPCA @@ -427,3 +428,11 @@ def test_incremental_pca_fit_overflow_error(): pca.fit(A) np.testing.assert_allclose(ipca.singular_values_, pca.singular_values_) + + +def test_incremental_pca_feature_names_out(): + """Check feature names out for IncrementalPCA.""" + ipca = IncrementalPCA(n_components=2).fit(iris.data) + + names = ipca.get_feature_names_out() + assert_array_equal([f"incrementalpca{i}" for i in range(2)], names) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index e7ae53fd5188b..848b9bb1b9bbf 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -78,8 +78,12 @@ def test_kernel_pca_invalid_parameters(): Tests fitting inverse transform with a precomputed kernel raises a ValueError. 
""" - with pytest.raises(ValueError): - KernelPCA(10, fit_inverse_transform=True, kernel="precomputed") + estimator = KernelPCA( + n_components=10, fit_inverse_transform=True, kernel="precomputed" + ) + err_ms = "Cannot fit_inverse_transform with a precomputed kernel" + with pytest.raises(ValueError, match=err_ms): + estimator.fit(np.random.randn(10, 10)) def test_kernel_pca_consistent_transform(): @@ -559,3 +563,12 @@ def test_kernel_pca_alphas_deprecated(): msg = r"Attribute `alphas_` was deprecated in version 1\.0" with pytest.warns(FutureWarning, match=msg): kp.alphas_ + + +def test_kernel_pca_feature_names_out(): + """Check feature names out for KernelPCA.""" + X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0) + kpca = KernelPCA(n_components=2).fit(X) + + names = kpca.get_feature_names_out() + assert_array_equal([f"kernelpca{i}" for i in range(2)], names) diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index 3b056bf9ee0b1..c95b7ceb737db 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -42,11 +42,9 @@ def test_initialize_nn_output(): def test_parameter_checking(): A = np.ones((2, 2)) name = "spam" - # FIXME : should be removed in 1.1 - init = "nndsvda" msg = "Invalid solver parameter: got 'spam' instead of one of" with pytest.raises(ValueError, match=msg): - NMF(solver=name, init=init).fit(A) + NMF(solver=name).fit(A) msg = "Invalid init parameter: got 'spam' instead of one of" with pytest.raises(ValueError, match=msg): NMF(init=name).fit(A) @@ -55,21 +53,21 @@ def test_parameter_checking(): # TODO remove in 1.2 msg = "Invalid regularization parameter: got 'spam' instead of one of" with pytest.raises(ValueError, match=msg): - NMF(regularization=name, init=init).fit(A) + NMF(regularization=name).fit(A) msg = "Invalid beta_loss parameter: got 'spam' instead of one" with pytest.raises(ValueError, match=msg): - NMF(solver="mu", init=init, beta_loss=name).fit(A) + NMF(solver="mu", beta_loss=name).fit(A) msg = "Invalid beta_loss parameter: solver 'cd' does not handle beta_loss = 1.0" with pytest.raises(ValueError, match=msg): - NMF(solver="cd", init=init, beta_loss=1.0).fit(A) + NMF(solver="cd", beta_loss=1.0).fit(A) msg = "Negative values in data passed to" with pytest.raises(ValueError, match=msg): - NMF(init=init).fit(-A) + NMF().fit(-A) with pytest.raises(ValueError, match=msg): nmf._initialize_nmf(-A, 2, "nndsvd") - clf = NMF(2, tol=0.1, init=init).fit(A) + clf = NMF(2, tol=0.1).fit(A) with pytest.raises(ValueError, match=msg): clf.transform(-A) @@ -198,9 +196,7 @@ def test_n_components_greater_n_features(): # Smoke test for the case of more components than features. 
rng = np.random.mtrand.RandomState(42) A = np.abs(rng.randn(30, 10)) - # FIXME : should be removed in 1.1 - init = "random" - NMF(n_components=15, random_state=0, tol=1e-2, init=init).fit(A) + NMF(n_components=15, random_state=0, tol=1e-2).fit(A) @pytest.mark.parametrize("solver", ["cd", "mu"]) @@ -533,8 +529,6 @@ def test_nmf_regularization(solver): rng = np.random.mtrand.RandomState(42) X = np.abs(rng.randn(n_samples, n_features)) - # FIXME : should be removed in 1.1 - init = "nndsvda" # L1 regularization should increase the number of zeros l1_ratio = 1.0 @@ -544,7 +538,6 @@ def test_nmf_regularization(solver): alpha_W=0.5, l1_ratio=l1_ratio, random_state=42, - init=init, ) model = nmf.NMF( n_components=n_components, @@ -552,7 +545,6 @@ def test_nmf_regularization(solver): alpha_W=0.0, l1_ratio=l1_ratio, random_state=42, - init=init, ) W_regul = regul.fit_transform(X) @@ -578,7 +570,6 @@ def test_nmf_regularization(solver): alpha_W=0.5, l1_ratio=l1_ratio, random_state=42, - init=init, ) model = nmf.NMF( n_components=n_components, @@ -586,7 +577,6 @@ def test_nmf_regularization(solver): alpha_W=0.0, l1_ratio=l1_ratio, random_state=42, - init=init, ) W_regul = regul.fit_transform(X) @@ -685,9 +675,7 @@ def test_nmf_dtype_match(dtype_in, dtype_out, solver, alpha_W, alpha_H): # Check that NMF preserves dtype (float32 and float64) X = np.random.RandomState(0).randn(20, 15).astype(dtype_in, copy=False) np.abs(X, out=X) - # FIXME : should be removed in 1.1 - init = "nndsvda" - nmf = NMF(solver=solver, alpha_W=alpha_W, alpha_H=alpha_H, init=init) + nmf = NMF(solver=solver, alpha_W=alpha_W, alpha_H=alpha_H) assert nmf.fit(X).transform(X).dtype == dtype_out assert nmf.fit_transform(X).dtype == dtype_out @@ -699,11 +687,10 @@ def test_nmf_float32_float64_consistency(solver): # Check that the result of NMF is the same between float32 and float64 X = np.random.RandomState(0).randn(50, 7) np.abs(X, out=X) - # FIXME : should be removed in 1.1 - init = "nndsvda" - nmf32 = NMF(solver=solver, random_state=0, init=init) + + nmf32 = NMF(solver=solver, random_state=0) W32 = nmf32.fit_transform(X.astype(np.float32)) - nmf64 = NMF(solver=solver, random_state=0, init=init) + nmf64 = NMF(solver=solver, random_state=0) W64 = nmf64.fit_transform(X) assert_allclose(W32, W64, rtol=1e-6, atol=1e-5) @@ -724,20 +711,11 @@ def test_nmf_custom_init_dtype_error(): non_negative_factorization(X, H=H, update_H=False) -# FIXME : should be removed in 1.1 -def test_init_default_deprecation(): - # Test FutureWarning on init default - msg = ( - r"The 'init' value, when 'init=None' and " - r"n_components is less than n_samples and " - r"n_features, will be changed from 'nndsvd' to " - r"'nndsvda' in 1.1 \(renaming of 0.26\)." 
- ) - rng = np.random.mtrand.RandomState(42) - A = np.abs(rng.randn(6, 5)) - with pytest.warns(FutureWarning, match=msg): - nmf._initialize_nmf(A, 3) - with pytest.warns(FutureWarning, match=msg): - NMF().fit(A) - with pytest.warns(FutureWarning, match=msg): - non_negative_factorization(A) +def test_feature_names_out(): + """Check feature names out for NMF.""" + random_state = np.random.RandomState(0) + X = np.abs(random_state.randn(10, 4)) + nmf = NMF(n_components=3).fit(X) + + names = nmf.get_feature_names_out() + assert_array_equal([f"nmf{i}" for i in range(3)], names) diff --git a/sklearn/decomposition/tests/test_online_lda.py b/sklearn/decomposition/tests/test_online_lda.py index 811f3186ce503..e3ce951f7b6da 100644 --- a/sklearn/decomposition/tests/test_online_lda.py +++ b/sklearn/decomposition/tests/test_online_lda.py @@ -4,6 +4,7 @@ from scipy.linalg import block_diag from scipy.sparse import csr_matrix from scipy.special import psi +from numpy.testing import assert_array_equal import pytest @@ -427,3 +428,14 @@ def check_verbosity(verbose, evaluate_every, expected_lines, expected_perplexiti ) def test_verbosity(verbose, evaluate_every, expected_lines, expected_perplexities): check_verbosity(verbose, evaluate_every, expected_lines, expected_perplexities) + + +def test_lda_feature_names_out(): + """Check feature names out for LatentDirichletAllocation.""" + n_components, X = _build_sparse_mtx() + lda = LatentDirichletAllocation(n_components=n_components).fit(X) + + names = lda.get_feature_names_out() + assert_array_equal( + [f"latentdirichletallocation{i}" for i in range(n_components)], names + ) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index e7973fd8aa3af..145a76cb9551a 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -1,5 +1,6 @@ import numpy as np import scipy as sp +from numpy.testing import assert_array_equal import pytest @@ -656,3 +657,60 @@ def test_assess_dimesion_rank_one(): assert np.isfinite(_assess_dimension(s, rank=1, n_samples=n_samples)) for rank in range(2, n_features): assert _assess_dimension(s, rank, n_samples) == -np.inf + + +def test_pca_randomized_svd_n_oversamples(): + """Check that exposing and setting `n_oversamples` will provide accurate results + even when `X` as a large number of features. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/20589 + """ + rng = np.random.RandomState(0) + n_features = 100 + X = rng.randn(1_000, n_features) + + # The default value of `n_oversamples` will lead to inaccurate results + # We force it to the number of features. 
+ pca_randomized = PCA( + n_components=1, + svd_solver="randomized", + n_oversamples=n_features, + random_state=0, + ).fit(X) + pca_full = PCA(n_components=1, svd_solver="full").fit(X) + pca_arpack = PCA(n_components=1, svd_solver="arpack", random_state=0).fit(X) + + assert_allclose(np.abs(pca_full.components_), np.abs(pca_arpack.components_)) + assert_allclose(np.abs(pca_randomized.components_), np.abs(pca_arpack.components_)) + + +@pytest.mark.parametrize( + "params, err_type, err_msg", + [ + ( + {"n_oversamples": 0}, + ValueError, + "n_oversamples == 0, must be >= 1.", + ), + ( + {"n_oversamples": 1.5}, + TypeError, + "n_oversamples must be an instance of ", + ), + ], +) +def test_pca_params_validation(params, err_type, err_msg): + """Check the parameters validation in `PCA`.""" + rng = np.random.RandomState(0) + X = rng.randn(100, 20) + with pytest.raises(err_type, match=err_msg): + PCA(**params).fit(X) + + +def test_feature_names_out(): + """Check feature names out for PCA.""" + pca = PCA(n_components=2).fit(iris.data) + + names = pca.get_feature_names_out() + assert_array_equal([f"pca{i}" for i in range(2)], names) diff --git a/sklearn/decomposition/tests/test_sparse_pca.py b/sklearn/decomposition/tests/test_sparse_pca.py index 79ad3d0e6006f..c77aabf9c182c 100644 --- a/sklearn/decomposition/tests/test_sparse_pca.py +++ b/sklearn/decomposition/tests/test_sparse_pca.py @@ -5,6 +5,7 @@ import pytest import numpy as np +from numpy.testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose @@ -203,3 +204,17 @@ def test_spca_n_components_(SPCA, n_components): assert model.n_components_ == n_components else: assert model.n_components_ == n_features + + +@pytest.mark.parametrize("SPCA", [SparsePCA, MiniBatchSparsePCA]) +def test_spca_feature_names_out(SPCA): + """Check feature names out for *SparsePCA.""" + rng = np.random.RandomState(0) + n_samples, n_features = 12, 10 + X = rng.randn(n_samples, n_features) + + model = SPCA(n_components=4).fit(X) + names = model.get_feature_names_out() + + estimator_name = SPCA.__name__.lower() + assert_array_equal([f"{estimator_name}{i}" for i in range(4)], names) diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 001b4a23d0686..79faa8694a535 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -542,7 +542,7 @@ def fit(self, X, y): Fitted estimator. """ X, y = self._validate_data( - X, y, ensure_min_samples=2, estimator=self, dtype=[np.float64, np.float32] + X, y, ensure_min_samples=2, dtype=[np.float64, np.float32] ) self.classes_ = unique_labels(y) n_samples, _ = X.shape diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 6b2133defce6f..c475e3ed9acb0 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -58,7 +58,7 @@ class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator): Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. - constant : int or str or array-like of shape (n_outputs,) + constant : int or str or array-like of shape (n_outputs,), default=None The explicit constant as predicted by the "constant" strategy. This parameter is useful only for the "constant" strategy. 
@@ -530,7 +530,7 @@ def fit(self, X, y, sample_weight=None): % (self.strategy, allowed_strategies) ) - y = check_array(y, ensure_2d=False) + y = check_array(y, ensure_2d=False, input_name="y") if len(y) == 0: raise ValueError("y must not be empty.") diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 6b6342301769d..d213ecf5c2090 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -52,6 +52,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..base import is_classifier from ..base import ClassifierMixin, MultiOutputMixin, RegressorMixin +from ..base import _ClassNamePrefixFeaturesOutMixin from ..metrics import accuracy_score, r2_score from ..preprocessing import OneHotEncoder from ..tree import ( @@ -2344,7 +2345,7 @@ def __init__( self.ccp_alpha = ccp_alpha -class RandomTreesEmbedding(BaseForest): +class RandomTreesEmbedding(BaseForest, _ClassNamePrefixFeaturesOutMixin): """ An ensemble of totally random trees. @@ -2633,7 +2634,12 @@ def fit_transform(self, X, y=None, sample_weight=None): super().fit(X, y, sample_weight=sample_weight) self.one_hot_encoder_ = OneHotEncoder(sparse=self.sparse_output) - return self.one_hot_encoder_.fit_transform(self.apply(X)) + X_leaf = self.apply(X) + self.one_hot_encoder_.fit(X_leaf) + self._n_features_out = np.product( + np.asarray(self.one_hot_encoder_.categories_).shape + ) + return self.one_hot_encoder_.transform(X_leaf) def transform(self, X): """ diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index 2e335fec62705..5942e30c701ce 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -1,7 +1,3 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False -# # Author: Peter Prettenhofer # # License: BSD 3 clause diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 5f5dd68935fd4..33cf0dadae011 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -1,9 +1,3 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False -# cython: nonecheck=False -# cython: language_level=3 - # Author: Nicolas Hug cimport cython diff --git a/sklearn/ensemble/_hist_gradient_boosting/_bitset.pxd b/sklearn/ensemble/_hist_gradient_boosting/_bitset.pxd index 0ea642df3ddcf..4aea8276c4398 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_bitset.pxd +++ b/sklearn/ensemble/_hist_gradient_boosting/_bitset.pxd @@ -1,4 +1,3 @@ -# cython: language_level=3 from .common cimport X_BINNED_DTYPE_C from .common cimport BITSET_DTYPE_C from .common cimport BITSET_INNER_DTYPE_C diff --git a/sklearn/ensemble/_hist_gradient_boosting/_bitset.pyx b/sklearn/ensemble/_hist_gradient_boosting/_bitset.pyx index 2df03b047aad1..0d3b630f3314f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_bitset.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_bitset.pyx @@ -1,7 +1,3 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False -# cython: language_level=3 from .common cimport BITSET_INNER_DTYPE_C from .common cimport BITSET_DTYPE_C from .common cimport X_DTYPE_C diff --git a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx index f684ca57e560d..8170c8dc462e9 100644 --- 
a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx @@ -1,8 +1,3 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False -# cython: language_level=3 - # Author: Nicolas Hug cimport cython diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx index da900e28c6457..23e7d2841443b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx @@ -1,8 +1,3 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False -# cython: language_level=3 - # Author: Nicolas Hug cimport cython diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index a6b2f8b90de8e..5aee8620e34d1 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -1,8 +1,3 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False -# cython: language_level=3 - # Author: Nicolas Hug cimport cython diff --git a/sklearn/ensemble/_hist_gradient_boosting/common.pxd b/sklearn/ensemble/_hist_gradient_boosting/common.pxd index 6122b961fb91c..16ff7645aa740 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/common.pxd +++ b/sklearn/ensemble/_hist_gradient_boosting/common.pxd @@ -1,4 +1,3 @@ -# cython: language_level=3 import numpy as np cimport numpy as np diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index 54cfdcc077dc7..e6cdafd2d46bf 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -1,7 +1,3 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False -# cython: language_level=3 """This module contains routines for building histograms.""" # Author: Nicolas Hug diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 232cf094876cb..3ba6d7a0ce9df 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -1,8 +1,3 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False -# cython: language_level=3 - """This module contains routines and data structures to: - Find the best possible split of a node. 
For a given node, a split is @@ -791,7 +786,6 @@ cdef class Splitter: split_info.sum_gradient_right, split_info.sum_hessian_right, lower_bound, upper_bound, self.l2_regularization) - @cython.initializedcheck(False) cdef void _find_best_bin_to_split_category( self, unsigned int feature_idx, diff --git a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx index 53aaa450c90ce..b2de7614fe499 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx @@ -1,7 +1,3 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False -# cython: language_level=3 """This module contains utility routines.""" # Author: Nicolas Hug diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 507794bd4e092..cee62807c8b01 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -13,6 +13,8 @@ from ..base import clone from ..base import ClassifierMixin, RegressorMixin, TransformerMixin from ..base import is_classifier, is_regressor +from ..base import _ClassNamePrefixFeaturesOutMixin + from ..exceptions import NotFittedError from ..utils._estimator_html_repr import _VisualBlock @@ -35,7 +37,12 @@ from ..utils.fixes import delayed -class _BaseStacking(TransformerMixin, _BaseHeterogeneousEnsemble, metaclass=ABCMeta): +class _BaseStacking( + TransformerMixin, + _BaseHeterogeneousEnsemble, + _ClassNamePrefixFeaturesOutMixin, + metaclass=ABCMeta, +): """Base class for stacking method.""" @abstractmethod @@ -160,6 +167,7 @@ def fit(self, X, y, sample_weight=None): for est in all_estimators if est != "drop" ) + self._n_features_out = len(self.estimators_) self.named_estimators_ = Bunch() est_fitted_idx = 0 diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 99591cedead3d..bec7fe27c4b73 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -23,6 +23,8 @@ from ..base import RegressorMixin from ..base import TransformerMixin from ..base import clone +from ..base import _ClassNamePrefixFeaturesOutMixin + from ._base import _fit_single_estimator from ._base import _BaseHeterogeneousEnsemble from ..preprocessing import LabelEncoder @@ -36,7 +38,9 @@ from ..utils.fixes import delayed -class _BaseVoting(TransformerMixin, _BaseHeterogeneousEnsemble): +class _BaseVoting( + TransformerMixin, _BaseHeterogeneousEnsemble, _ClassNamePrefixFeaturesOutMixin +): """Base class for voting. Warning: This class should not be used directly. Use derived classes @@ -83,6 +87,7 @@ def fit(self, X, y, sample_weight=None): for idx, clf in enumerate(clfs) if clf != "drop" ) + self._n_features_out = len(self.estimators_) self.named_estimators_ = Bunch() diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 319cf367cb112..ed0202a9a97ee 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -25,6 +25,7 @@ from abc import ABCMeta, abstractmethod +import numbers import numpy as np import warnings @@ -36,6 +37,7 @@ from ..tree import DecisionTreeClassifier, DecisionTreeRegressor from ..utils import check_random_state, _safe_indexing +from ..utils import check_scalar from ..utils.extmath import softmax from ..utils.extmath import stable_cumsum from ..metrics import accuracy_score, r2_score @@ -478,9 +480,28 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. 
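# --- Illustrative sketch (not part of the patch above) ---------------------
# Stacking and voting ensembles record `_n_features_out = len(self.estimators_)`
# during fit, so get_feature_names_out yields one class-prefixed name per
# (non-dropped) base estimator:
from sklearn.datasets import load_diabetes
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

X, y = load_diabetes(return_X_y=True)
voter = VotingRegressor(
    [("lr", LinearRegression()), ("dt", DecisionTreeRegressor(random_state=0))]
).fit(X, y)
print(voter.get_feature_names_out())  # ['votingregressor0' 'votingregressor1']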
""" + check_scalar( + self.n_estimators, + "n_estimators", + target_type=numbers.Integral, + min_val=1, + include_boundaries="left", + ) + + check_scalar( + self.learning_rate, + "learning_rate", + target_type=numbers.Real, + min_val=0, + include_boundaries="neither", + ) + # Check that algorithm is supported if self.algorithm not in ("SAMME", "SAMME.R"): - raise ValueError("algorithm %s is not supported" % self.algorithm) + raise ValueError( + "Algorithm must be 'SAMME' or 'SAMME.R'." + f" Got {self.algorithm!r} instead." + ) # Fit return super().fit(X, y, sample_weight) diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index acd4913df9665..e050b34dd648f 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -28,6 +28,7 @@ from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2 from sklearn.utils import check_random_state from sklearn.preprocessing import FunctionTransformer, scale +from itertools import cycle from scipy.sparse import csc_matrix, csr_matrix @@ -57,24 +58,28 @@ def test_classification(): grid = ParameterGrid( { "max_samples": [0.5, 1.0], - "max_features": [1, 2, 4], + "max_features": [1, 4], "bootstrap": [True, False], "bootstrap_features": [True, False], } ) - - for base_estimator in [ + estimators = [ None, DummyClassifier(), - Perceptron(), - DecisionTreeClassifier(), + Perceptron(max_iter=20), + DecisionTreeClassifier(max_depth=2), KNeighborsClassifier(), SVC(), - ]: - for params in grid: - BaggingClassifier( - base_estimator=base_estimator, random_state=rng, **params - ).fit(X_train, y_train).predict(X_test) + ] + # Try different parameter settings with different base classifiers without + # doing the full cartesian product to keep the test durations low. 
+ for params, base_estimator in zip(grid, cycle(estimators)): + BaggingClassifier( + base_estimator=base_estimator, + random_state=rng, + n_estimators=2, + **params, + ).fit(X_train, y_train).predict(X_test) @pytest.mark.parametrize( diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index e9b74f817311b..7c2386b6c4e1a 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -25,7 +25,7 @@ import pytest import joblib -from numpy.testing import assert_allclose +from numpy.testing import assert_allclose, assert_array_equal from sklearn.dummy import DummyRegressor from sklearn.metrics import mean_poisson_deviance @@ -1795,3 +1795,15 @@ def test_mse_criterion_object_segfault_smoke_test(Forest): est = FOREST_REGRESSORS[Forest](n_estimators=2, n_jobs=2, criterion=mse_criterion) est.fit(X_reg, y) + + +def test_random_trees_embedding_feature_names_out(): + """Check feature names out for NMF.""" + random_state = np.random.RandomState(0) + X = np.abs(random_state.randn(100, 4)) + + hasher = RandomTreesEmbedding(n_estimators=10, sparse_output=False).fit(X) + names = hasher.get_feature_names_out() + assert_array_equal( + [f"randomtreesembedding{i}" for i in range(hasher._n_features_out)], names + ) diff --git a/sklearn/ensemble/tests/test_stacking.py b/sklearn/ensemble/tests/test_stacking.py index da18158070b23..7561516a9948c 100644 --- a/sklearn/ensemble/tests/test_stacking.py +++ b/sklearn/ensemble/tests/test_stacking.py @@ -41,7 +41,8 @@ from sklearn.utils._mocking import CheckingClassifier from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_allclose_dense_sparse -from sklearn.utils._testing import ignore_warnings +from sklearn.utils._testing import ignore_warnings, assert_array_equal + X_diabetes, y_diabetes = load_diabetes(return_X_y=True) X_iris, y_iris = load_iris(return_X_y=True) @@ -561,3 +562,36 @@ def fit(self, X, y): msg = "'MyEstimator' object has no attribute 'n_features_in_'" with pytest.raises(AttributeError, match=msg): stacker.n_features_in_ + + +@pytest.mark.parametrize( + "estimator, X, y", + [ + ( + StackingClassifier( + estimators=[ + ("lr", LogisticRegression(random_state=0)), + ("svm", LinearSVC(random_state=0)), + ] + ), + X_iris[:100], + y_iris[:100], + ), # keep only classes 0 and 1 + ( + StackingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("svm", LinearSVR(random_state=0)), + ] + ), + X_diabetes, + y_diabetes, + ), + ], + ids=["StackingClassifier", "StackingRegressor"], +) +def test_stacking_get_feature_names_out(estimator, X, y): + eclf = estimator.fit(X, y) + names = eclf.get_feature_names_out() + cls_name = estimator.__class__.__name__.lower() + assert_array_equal([f"{cls_name}{i}" for i in range(eclf._n_features_out)], names) diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py index 4bebfaca53709..e55375b68f296 100644 --- a/sklearn/ensemble/tests/test_voting.py +++ b/sklearn/ensemble/tests/test_voting.py @@ -602,3 +602,29 @@ def test_voting_verbose(estimator, capsys): estimator.fit(X, y) assert re.match(pattern, capsys.readouterr()[0]) + + +@pytest.mark.parametrize( + "estimator", + [ + VotingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("rf", RandomForestRegressor(random_state=123)), + ], + verbose=True, + ), + VotingClassifier( + estimators=[ + ("lr", LogisticRegression(random_state=123)), + ("rf", RandomForestClassifier(random_state=123)), + ], + verbose=True, + ), + ], +) +def 
test_voting_get_feature_names_out(estimator): + eclf = estimator.fit(X, y) + names = eclf.get_feature_names_out() + cls_name = estimator.__class__.__name__.lower() + assert_array_equal([f"{cls_name}{i}" for i in range(eclf._n_features_out)], names) diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py index 159f83abf24c4..290f4b64dcc71 100755 --- a/sklearn/ensemble/tests/test_weight_boosting.py +++ b/sklearn/ensemble/tests/test_weight_boosting.py @@ -273,12 +273,6 @@ def test_importances(): def test_error(): # Test that it gives proper exception on deficient input. - with pytest.raises(ValueError): - AdaBoostClassifier(learning_rate=-1).fit(X, y_class) - - with pytest.raises(ValueError): - AdaBoostClassifier(algorithm="foo").fit(X, y_class) - with pytest.raises(ValueError): AdaBoostClassifier().fit(X, y_class, sample_weight=np.asarray([-1])) @@ -549,6 +543,32 @@ def test_adaboostregressor_sample_weight(): assert score_no_outlier == pytest.approx(score_with_weight) +@pytest.mark.parametrize( + "params, err_type, err_msg", + [ + ({"n_estimators": -1}, ValueError, "n_estimators == -1, must be >= 1"), + ({"n_estimators": 0}, ValueError, "n_estimators == 0, must be >= 1"), + ( + {"n_estimators": 1.5}, + TypeError, + "n_estimators must be an instance of ," + " not ", + ), + ({"learning_rate": -1}, ValueError, "learning_rate == -1, must be > 0."), + ({"learning_rate": 0}, ValueError, "learning_rate == 0, must be > 0."), + ( + {"algorithm": "unknown"}, + ValueError, + "Algorithm must be 'SAMME' or 'SAMME.R'.", + ), + ], +) +def test_adaboost_classifier_params_validation(params, err_type, err_msg): + """Check the parameters validation in `AdaBoostClassifier`.""" + with pytest.raises(err_type, match=err_msg): + AdaBoostClassifier(**params).fit(X, y_class) + + @pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) def test_adaboost_consistent_predict(algorithm): # check that predict_proba and predict give consistent results diff --git a/sklearn/feature_extraction/_hashing_fast.pyx b/sklearn/feature_extraction/_hashing_fast.pyx index 3a3102444af98..722538fe166d3 100644 --- a/sklearn/feature_extraction/_hashing_fast.pyx +++ b/sklearn/feature_extraction/_hashing_fast.pyx @@ -1,7 +1,5 @@ # Author: Lars Buitinck # License: BSD 3 clause -# -# cython: boundscheck=False, cdivision=True import sys import array @@ -92,7 +90,7 @@ def transform(raw_X, Py_ssize_t n_features, dtype, indices_a = np.frombuffer(indices, dtype=np.int32) indptr_a = np.frombuffer(indptr, dtype=indices_np_dtype) - if indptr[-1] > np.iinfo(np.int32).max: # = 2**31 - 1 + if indptr[len(indptr) - 1] > np.iinfo(np.int32).max: # = 2**31 - 1 # both indices and indptr have the same dtype in CSR arrays indices_a = indices_a.astype(np.int64, copy=False) else: diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index da32e855fabb6..5e016d348703c 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1093,9 +1093,9 @@ def test_pickling_vectorizer(): if IS_PYPY and isinstance(orig, HashingVectorizer): continue else: - assert_array_equal( - copy.fit_transform(JUNK_FOOD_DOCS).toarray(), - orig.fit_transform(JUNK_FOOD_DOCS).toarray(), + assert_allclose_dense_sparse( + copy.fit_transform(JUNK_FOOD_DOCS), + orig.fit_transform(JUNK_FOOD_DOCS), ) diff --git a/sklearn/feature_selection/tests/test_sequential.py b/sklearn/feature_selection/tests/test_sequential.py index 
d4e6af5c667c2..486cc6d90f09c 100644 --- a/sklearn/feature_selection/tests/test_sequential.py +++ b/sklearn/feature_selection/tests/test_sequential.py @@ -123,7 +123,7 @@ def test_nan_support(): sfs.fit(X, y) sfs.transform(X) - with pytest.raises(ValueError, match="Input contains NaN"): + with pytest.raises(ValueError, match="Input X contains NaN"): # LinearRegression does not support nans SequentialFeatureSelector(LinearRegression(), cv=2).fit(X, y) diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index 715cd2d0b16bd..e0ae8a0b2ff5b 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -94,7 +94,7 @@ def optimizer(obj_func, initial_theta, bounds): run is performed. normalize_y : bool, default=False - Whether or not to normalized the target values `y` by removing the mean + Whether or not to normalize the target values `y` by removing the mean and scaling to unit-variance. This is recommended for cases where zero-mean, unit-variance priors are used. Note that, in this implementation, the normalisation is reversed before the GP predictions diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index c97a8d24d4578..919be70607fdf 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -17,10 +17,14 @@ from ..utils.validation import FLOAT_DTYPES from ..utils.validation import _check_feature_names_in from ..utils._mask import _get_mask +from ..utils import _is_pandas_na from ..utils import is_scalar_nan def _check_inputs_dtype(X, missing_values): + if _is_pandas_na(missing_values): + # Allow using `pd.NA` as missing values to impute numerical arrays. + return if X.dtype.kind in ("f", "i", "u") and not isinstance(missing_values, numbers.Real): raise ValueError( "'X' and 'missing_values' types are expected to be" @@ -136,11 +140,11 @@ class SimpleImputer(_BaseImputer): Parameters ---------- - missing_values : int, float, str, np.nan or None, default=np.nan + missing_values : int, float, str, np.nan, None or pandas.NA, default=np.nan The placeholder for the missing values. All occurrences of `missing_values` will be imputed. For pandas' dataframes with nullable integer dtypes with missing values, `missing_values` - should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. + can be set to either `np.nan` or `pd.NA`. strategy : str, default='mean' The imputation strategy. @@ -167,8 +171,13 @@ class SimpleImputer(_BaseImputer): verbose : int, default=0 Controls the verbosity of the imputer. + .. deprecated:: 1.1 + The 'verbose' parameter was deprecated in version 1.1 and will be + removed in 1.3. A warning will always be raised upon the removal of + empty columns in the future version. + copy : bool, default=True - If True, a copy of `X` will be created. If False, imputation will + If True, a copy of X will be created. If False, imputation will be done in-place whenever possible. 
Note that, in the following cases, a new copy will always be made, even if `copy=False`: @@ -236,7 +245,7 @@ def __init__( missing_values=np.nan, strategy="mean", fill_value=None, - verbose=0, + verbose="deprecated", copy=True, add_indicator=False, ): @@ -269,10 +278,10 @@ def _validate_input(self, X, in_fit): else: dtype = FLOAT_DTYPES - if not is_scalar_nan(self.missing_values): - force_all_finite = True - else: + if _is_pandas_na(self.missing_values) or is_scalar_nan(self.missing_values): force_all_finite = "allow-nan" + else: + force_all_finite = True try: X = self._validate_data( @@ -324,6 +333,15 @@ def fit(self, X, y=None): self : object Fitted estimator. """ + if self.verbose != "deprecated": + warnings.warn( + "The 'verbose' parameter was deprecated in version " + "1.1 and will be removed in 1.3. A warning will " + "always be raised upon the removal of empty columns " + "in the future version.", + FutureWarning, + ) + X = self._validate_input(X, in_fit=True) # default fill_value is 0 for numerical input and "missing_value" @@ -500,9 +518,9 @@ def transform(self, X): if invalid_mask.any(): missing = np.arange(X.shape[1])[invalid_mask] - if self.verbose: + if self.verbose != "deprecated" and self.verbose: warnings.warn( - "Deleting features without observed values: %s" % missing + "Skipping features without observed values: %s" % missing ) X = X[:, valid_statistics_indexes] @@ -604,6 +622,13 @@ def inverse_transform(self, X): X_original[full_mask] = self.missing_values return X_original + def _more_tags(self): + return { + "allow_nan": ( + _is_pandas_na(self.missing_values) or is_scalar_nan(self.missing_values) + ) + } + def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. diff --git a/sklearn/impute/tests/test_impute.py b/sklearn/impute/tests/test_impute.py index 9a4da4a9230a0..8b79041fd4b6e 100644 --- a/sklearn/impute/tests/test_impute.py +++ b/sklearn/impute/tests/test_impute.py @@ -28,6 +28,16 @@ from sklearn.impute._base import _most_frequent +def _assert_array_equal_and_same_dtype(x, y): + assert_array_equal(x, y) + assert x.dtype == y.dtype + + +def _assert_allclose_and_same_dtype(x, y): + assert_allclose(x, y) + assert x.dtype == y.dtype + + def _check_statistics(X, X_true, strategy, statistics, missing_values): """Utility function for testing imputation for a given strategy. @@ -96,10 +106,14 @@ def test_imputation_error_invalid_strategy(strategy): def test_imputation_deletion_warning(strategy): X = np.ones((3, 5)) X[:, 0] = np.nan + imputer = SimpleImputer(strategy=strategy, verbose=1) - with pytest.warns(UserWarning, match="Deleting"): - imputer = SimpleImputer(strategy=strategy, verbose=True) - imputer.fit_transform(X) + # TODO: Remove in 1.3 + with pytest.warns(FutureWarning, match="The 'verbose' parameter"): + imputer.fit(X) + + with pytest.warns(UserWarning, match="Skipping"): + imputer.transform(X) @pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent", "constant"]) @@ -1277,7 +1291,7 @@ def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp): @pytest.mark.parametrize( "imputer_missing_values, missing_value, err_msg", [ - ("NaN", np.nan, "Input contains NaN"), + ("NaN", np.nan, "Input X contains NaN"), ("-1", -1, "types are expected to be both numerical."), ], ) @@ -1495,6 +1509,66 @@ def test_most_frequent(expected, array, dtype, extra_value, n_repeat): ) +def test_simple_impute_pd_na(): + pd = pytest.importorskip("pandas", minversion="1.0") + + # Impute pandas array of string types. 
+ df = pd.DataFrame({"feature": pd.Series(["abc", None, "de"], dtype="string")}) + imputer = SimpleImputer(missing_values=pd.NA, strategy="constant", fill_value="na") + _assert_array_equal_and_same_dtype( + imputer.fit_transform(df), np.array([["abc"], ["na"], ["de"]], dtype=object) + ) + + # Impute pandas array of string types without any missing values. + df = pd.DataFrame({"feature": pd.Series(["abc", "de", "fgh"], dtype="string")}) + imputer = SimpleImputer(fill_value="ok", strategy="constant") + _assert_array_equal_and_same_dtype( + imputer.fit_transform(df), np.array([["abc"], ["de"], ["fgh"]], dtype=object) + ) + + # Impute pandas array of integer types. + df = pd.DataFrame({"feature": pd.Series([1, None, 3], dtype="Int64")}) + imputer = SimpleImputer(missing_values=pd.NA, strategy="constant", fill_value=-1) + _assert_allclose_and_same_dtype( + imputer.fit_transform(df), np.array([[1], [-1], [3]], dtype="float64") + ) + + # Use `np.nan` also works. + imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1) + _assert_allclose_and_same_dtype( + imputer.fit_transform(df), np.array([[1], [-1], [3]], dtype="float64") + ) + + # Impute pandas array of integer types with 'median' strategy. + df = pd.DataFrame({"feature": pd.Series([1, None, 2, 3], dtype="Int64")}) + imputer = SimpleImputer(missing_values=pd.NA, strategy="median") + _assert_allclose_and_same_dtype( + imputer.fit_transform(df), np.array([[1], [2], [2], [3]], dtype="float64") + ) + + # Impute pandas array of integer types with 'mean' strategy. + df = pd.DataFrame({"feature": pd.Series([1, None, 2], dtype="Int64")}) + imputer = SimpleImputer(missing_values=pd.NA, strategy="mean") + _assert_allclose_and_same_dtype( + imputer.fit_transform(df), np.array([[1], [1.5], [2]], dtype="float64") + ) + + # Impute pandas array of float types. + df = pd.DataFrame({"feature": pd.Series([1.0, None, 3.0], dtype="float64")}) + imputer = SimpleImputer(missing_values=pd.NA, strategy="constant", fill_value=-2.0) + _assert_allclose_and_same_dtype( + imputer.fit_transform(df), np.array([[1.0], [-2.0], [3.0]], dtype="float64") + ) + + # Impute pandas array of float types with 'median' strategy. 
+ df = pd.DataFrame({"feature": pd.Series([1.0, None, 2.0, 3.0], dtype="float64")}) + imputer = SimpleImputer(missing_values=pd.NA, strategy="median") + _assert_allclose_and_same_dtype( + imputer.fit_transform(df), + np.array([[1.0], [2.0], [2.0], [3.0]], dtype="float64"), + ) + + def test_missing_indicator_feature_names_out(): """Check that missing indicator return the feature names with a prefix.""" pd = pytest.importorskip("pandas") diff --git a/sklearn/impute/tests/test_knn.py b/sklearn/impute/tests/test_knn.py index b153f3a458161..098899bc1a0f1 100644 --- a/sklearn/impute/tests/test_knn.py +++ b/sklearn/impute/tests/test_knn.py @@ -39,7 +39,7 @@ def test_knn_imputer_default_with_invalid_input(na): [6, 6, 2, 5, 7], ] ) - with pytest.raises(ValueError, match="Input contains (infinity|NaN)"): + with pytest.raises(ValueError, match="Input X contains (infinity|NaN)"): KNNImputer(missing_values=na).fit(X) # Test with inf present in matrix passed in transform() @@ -65,7 +65,7 @@ def test_knn_imputer_default_with_invalid_input(na): ] ) imputer = KNNImputer(missing_values=na).fit(X_fit) - with pytest.raises(ValueError, match="Input contains (infinity|NaN)"): + with pytest.raises(ValueError, match="Input X contains (infinity|NaN)"): imputer.transform(X) # negative n_neighbors @@ -82,9 +82,7 @@ def test_knn_imputer_default_with_invalid_input(na): [np.nan, 6, 0, 5, 13], ] ) - msg = ( - r"Input contains NaN, infinity or a value too large for " r"dtype\('float64'\)" - ) + msg = "Input X contains NaN" with pytest.raises(ValueError, match=msg): imputer.fit(X) diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 215cb4b21c179..cca120d12e0bb 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -13,7 +13,6 @@ from scipy.stats.mstats import mquantiles from ..base import is_classifier, is_regressor -from ..pipeline import Pipeline from ..utils.extmath import cartesian from ..utils import check_array from ..utils import check_matplotlib_support # noqa @@ -383,20 +382,11 @@ def partial_dependence( ... grid_resolution=2) # doctest: +SKIP (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) """ + check_is_fitted(estimator) + if not (is_classifier(estimator) or is_regressor(estimator)): raise ValueError("'estimator' must be a fitted regressor or classifier.") - if isinstance(estimator, Pipeline): - # TODO: to be removed if/when pipeline get a `steps_` attributes - # assuming Pipeline is the only estimator that does not store a new - # attribute - for est in estimator: - # FIXME: remove the None option when it will be deprecated - if est not in (None, "drop"): - check_is_fitted(est) - else: - check_is_fitted(estimator) - if is_classifier(estimator) and isinstance(estimator.classes_[0], np.ndarray): raise ValueError("Multiclass-multioutput estimators are not supported") diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index b61efc29ebdda..4b1687dea9605 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -116,7 +116,7 @@ def isotonic_regression( by Michael J. Best and Nilotpal Chakravarti, section 3. 
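The `partial_dependence` hunk above drops the `Pipeline` special case in favour of a plain `check_is_fitted(estimator)` call; this relies on `Pipeline` reporting its own fitted state (via `__sklearn_is_fitted__` in recent releases). A sketch of the behaviour that change assumes:

```python
import numpy as np
from sklearn.exceptions import NotFittedError
from sklearn.inspection import partial_dependence
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = rng.randn(30, 2)
y = X[:, 0] - 2 * X[:, 1]

pipe = make_pipeline(StandardScaler(), LinearRegression())
try:
    partial_dependence(pipe, X, [0])   # unfitted pipeline: fails fast
except NotFittedError:
    pass

pipe.fit(X, y)
partial_dependence(pipe, X, [0])       # a fitted pipeline passes the check
```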
""" order = np.s_[:] if increasing else np.s_[::-1] - y = check_array(y, ensure_2d=False, dtype=[np.float64, np.float32]) + y = check_array(y, ensure_2d=False, input_name="y", dtype=[np.float64, np.float32]) y = np.array(y[order], dtype=y.dtype) sample_weight = _check_sample_weight(sample_weight, y, dtype=y.dtype, copy=True) sample_weight = np.ascontiguousarray(sample_weight[order]) @@ -337,8 +337,10 @@ def fit(self, X, y, sample_weight=None): new input data. """ check_params = dict(accept_sparse=False, ensure_2d=False) - X = check_array(X, dtype=[np.float64, np.float32], **check_params) - y = check_array(y, dtype=X.dtype, **check_params) + X = check_array( + X, input_name="X", dtype=[np.float64, np.float32], **check_params + ) + y = check_array(y, input_name="y", dtype=X.dtype, **check_params) check_consistent_length(X, y, sample_weight) # Transform y by running the isotonic regression algorithm and diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 03f63797afe93..652556cf1e702 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -392,15 +392,15 @@ def decision_function(self, X): Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features) - Samples. + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data matrix for which we want to get the confidence scores. Returns ------- - array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes) - Confidence scores per (sample, class) combination. In the binary - case, confidence score for self.classes_[1] where >0 means this - class would be predicted. + scores : ndarray of shape (n_samples,) or (n_samples, n_classes) + Confidence scores per `(n_samples, n_classes)` combination. In the + binary case, confidence score for `self.classes_[1]` where >0 means + this class would be predicted. """ check_is_fitted(self) @@ -414,13 +414,13 @@ def predict(self, X): Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features) - Samples. + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data matrix for which we want to get the predictions. Returns ------- - C : array, shape [n_samples] - Predicted class label per sample. + y_pred : ndarray of shape (n_samples,) + Vector containing the class labels for each sample. """ scores = self.decision_function(X) if len(scores.shape) == 1: diff --git a/sklearn/linear_model/_cd_fast.pyx b/sklearn/linear_model/_cd_fast.pyx index 4841809ac7aa7..338ca0f0cae7e 100644 --- a/sklearn/linear_model/_cd_fast.pyx +++ b/sklearn/linear_model/_cd_fast.pyx @@ -5,8 +5,6 @@ # Manoj Kumar # # License: BSD 3 clause -# -# cython: boundscheck=False, wraparound=False, cdivision=True from libc.math cimport fabs cimport numpy as np diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 851e85a90f194..7e4906962bffc 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -1144,7 +1144,6 @@ class Lasso(ElasticNet): ``normalize`` was deprecated in version 1.0 and will be removed in 1.2. 
- precompute : 'auto', bool or array-like of shape (n_features, n_features),\ precompute : bool or array-like of shape (n_features, n_features),\ default=False Whether to use a precomputed Gram matrix to speed up diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index 8b94e4a32087f..7790171042d0f 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -1022,9 +1022,7 @@ def fit(self, X, y): self.normalize, default=True, estimator_name=self.__class__.__name__ ) - X, y = self._validate_data( - X, y, y_numeric=True, ensure_min_features=2, estimator=self - ) + X, y = self._validate_data(X, y, y_numeric=True, ensure_min_features=2) X = as_float_array(X, copy=False, force_all_finite=False) cv = check_cv(self.cv, classifier=False) max_iter = ( diff --git a/sklearn/linear_model/_perceptron.py b/sklearn/linear_model/_perceptron.py index f4d4ccae2ee0f..3d15d073a6f9b 100644 --- a/sklearn/linear_model/_perceptron.py +++ b/sklearn/linear_model/_perceptron.py @@ -49,7 +49,7 @@ class Perceptron(BaseSGDClassifier): verbose : int, default=0 The verbosity level. - eta0 : double, default=1 + eta0 : float, default=1 Constant by which the updates are multiplied. n_jobs : int, default=None diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index ef7c27dd82ad6..1a501e8404f62 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -28,6 +28,7 @@ from ..utils import check_consistent_length from ..utils import compute_sample_weight from ..utils import column_or_1d +from ..utils.validation import check_is_fitted from ..utils.validation import _check_sample_weight from ..preprocessing import LabelBinarizer from ..model_selection import GridSearchCV @@ -1010,7 +1011,93 @@ def fit(self, X, y, sample_weight=None): return super().fit(X, y, sample_weight=sample_weight) -class RidgeClassifier(LinearClassifierMixin, _BaseRidge): +class _RidgeClassifierMixin(LinearClassifierMixin): + def _prepare_data(self, X, y, sample_weight, solver): + """Validate `X` and `y` and binarize `y`. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : ndarray of shape (n_samples,) + Target values. + + sample_weight : float or ndarray of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. + + solver : str + The solver used in `Ridge` to know which sparse format to support. + + Returns + ------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Validated training data. + + y : ndarray of shape (n_samples,) + Validated target values. + + sample_weight : ndarray of shape (n_samples,) + Validated sample weights. + + Y : ndarray of shape (n_samples, n_classes) + The binarized version of `y`. + """ + accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), solver) + X, y = self._validate_data( + X, + y, + accept_sparse=accept_sparse, + multi_output=True, + y_numeric=False, + ) + + self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) + Y = self._label_binarizer.fit_transform(y) + if not self._label_binarizer.y_type_.startswith("multilabel"): + y = column_or_1d(y, warn=True) + + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + if self.class_weight: + sample_weight = sample_weight * compute_sample_weight(self.class_weight, y) + return X, y, sample_weight, Y + + def predict(self, X): + """Predict class labels for samples in `X`. 
+ + Parameters + ---------- + X : {array-like, spare matrix} of shape (n_samples, n_features) + The data matrix for which we want to predict the targets. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs) + Vector or matrix containing the predictions. In binary and + multiclass problems, this is a vector containing `n_samples`. In + a multilabel problem, it returns a matrix of shape + `(n_samples, n_outputs)`. + """ + check_is_fitted(self, attributes=["_label_binarizer"]) + if self._label_binarizer.y_type_.startswith("multilabel"): + # Threshold such that the negative label is -1 and positive label + # is 1 to use the inverse transform of the label binarizer fitted + # during fit. + scores = 2 * (self.decision_function(X) > 0) - 1 + return self._label_binarizer.inverse_transform(scores) + return super().predict(X) + + @property + def classes_(self): + """Classes labels.""" + return self._label_binarizer.classes_ + + def _more_tags(self): + return {"multilabel": True} + + +class RidgeClassifier(_RidgeClassifierMixin, _BaseRidge): """Classifier using Ridge regression. This classifier first converts the target values into ``{-1, 1}`` and @@ -1096,7 +1183,7 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): .. versionadded:: 0.17 Stochastic Average Gradient descent solver. .. versionadded:: 0.19 - SAGA solver. + SAGA solver. - 'lbfgs' uses L-BFGS-B algorithm implemented in `scipy.optimize.minimize`. It can be used only when `positive` @@ -1203,42 +1290,18 @@ def fit(self, X, y, sample_weight=None): will have the same weight. .. versionadded:: 0.17 - *sample_weight* support to Classifier. + *sample_weight* support to RidgeClassifier. Returns ------- self : object Instance of the estimator. """ - _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) - X, y = self._validate_data( - X, y, accept_sparse=_accept_sparse, multi_output=True, y_numeric=False - ) - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - - self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) - Y = self._label_binarizer.fit_transform(y) - if not self._label_binarizer.y_type_.startswith("multilabel"): - y = column_or_1d(y, warn=True) - else: - # we don't (yet) support multi-label classification in Ridge - raise ValueError( - "%s doesn't support multi-label classification" - % (self.__class__.__name__) - ) - - if self.class_weight: - # modify the sample weights with the corresponding class weight - sample_weight = sample_weight * compute_sample_weight(self.class_weight, y) + X, y, sample_weight, Y = self._prepare_data(X, y, sample_weight, self.solver) super().fit(X, Y, sample_weight=sample_weight) return self - @property - def classes_(self): - """Classes labels.""" - return self._label_binarizer.classes_ - def _check_gcv_mode(X, gcv_mode): possible_gcv_modes = [None, "auto", "svd", "eigen"] @@ -1420,7 +1483,7 @@ def __init__( is_clf=False, alpha_per_target=False, ): - self.alphas = np.asarray(alphas) + self.alphas = alphas self.fit_intercept = fit_intercept self.normalize = normalize self.scoring = scoring @@ -1779,6 +1842,8 @@ def fit(self, X, y, sample_weight=None): if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + self.alphas = np.asarray(self.alphas) + if np.any(self.alphas <= 0): raise ValueError( "alphas must be strictly positive. 
Got {} containing some " @@ -1914,7 +1979,7 @@ def __init__( store_cv_values=False, alpha_per_target=False, ): - self.alphas = np.asarray(alphas) + self.alphas = alphas self.fit_intercept = fit_intercept self.normalize = normalize self.scoring = scoring @@ -2063,7 +2128,7 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. - gcv_mode : {'auto', 'svd', eigen'}, default='auto' + gcv_mode : {'auto', 'svd', 'eigen'}, default='auto' Flag indicating which strategy to use when performing Leave-One-Out Cross-Validation. Options are:: @@ -2145,7 +2210,7 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): """ -class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): +class RidgeClassifierCV(_RidgeClassifierMixin, _BaseRidgeCV): """Ridge classifier with built-in cross-validation. See glossary entry for :term:`cross-validation estimator`. @@ -2318,33 +2383,20 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - X, y = self._validate_data( - X, - y, - accept_sparse=["csr", "csc", "coo"], - multi_output=True, - y_numeric=False, - ) - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - - self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) - Y = self._label_binarizer.fit_transform(y) - if not self._label_binarizer.y_type_.startswith("multilabel"): - y = column_or_1d(y, warn=True) - - if self.class_weight: - # modify the sample weights with the corresponding class weight - sample_weight = sample_weight * compute_sample_weight(self.class_weight, y) - + # `RidgeClassifier` does not accept "sag" or "saga" solver and thus support + # csr, csc, and coo sparse matrices. By using solver="eigen" we force to accept + # all sparse format. + X, y, sample_weight, Y = self._prepare_data(X, y, sample_weight, solver="eigen") + + # If cv is None, gcv mode will be used and we used the binarized Y + # since y will not be binarized in _RidgeGCV estimator. + # If cv is not None, a GridSearchCV with some RidgeClassifier + # estimators are used where y will be binarized. Thus, we pass y + # instead of the binarized Y. target = Y if self.cv is None else y - _BaseRidgeCV.fit(self, X, target, sample_weight=sample_weight) + super().fit(X, target, sample_weight=sample_weight) return self - @property - def classes_(self): - """Classes labels.""" - return self._label_binarizer.classes_ - def _more_tags(self): return { "multilabel": True, @@ -2352,12 +2404,5 @@ def _more_tags(self): "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), - # FIXME: see - # https://github.com/scikit-learn/scikit-learn/issues/19858 - # to track progress to resolve this issue - "check_classifiers_multilabel_output_format_predict": ( - "RidgeClassifierCV.predict outputs an array of shape (25,) " - "instead of (25, 5)" - ), }, } diff --git a/sklearn/linear_model/_sag.py b/sklearn/linear_model/_sag.py index 5d87166df816b..48dcd7aef8ad3 100644 --- a/sklearn/linear_model/_sag.py +++ b/sklearn/linear_model/_sag.py @@ -156,7 +156,7 @@ def sag_solver( The max number of passes over the training data if the stopping criteria is not reached. - tol : double, default=0.001 + tol : float, default=0.001 The stopping criteria for the weights. The iterations will stop when max(change in weights) / max(weights) < tol. 
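Both ridge classifiers now funnel their targets through the shared `_prepare_data`, which binarizes labels to {-1, 1} before fitting the underlying ridge regressor. A standalone sketch of that binarization step:

```python
import numpy as np
from sklearn.preprocessing import LabelBinarizer

y = np.array(["bird", "cat", "dog", "cat"])
lb = LabelBinarizer(pos_label=1, neg_label=-1)
Y = lb.fit_transform(y)      # shape (4, 3) with entries in {-1, 1}
print(lb.classes_)           # ['bird' 'cat' 'dog']
print(Y[1])                  # [-1  1 -1]  -> the "cat" column is +1
```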
diff --git a/sklearn/linear_model/_sag_fast.pyx.tp b/sklearn/linear_model/_sag_fast.pyx.tp index b6493f5f32f96..756a048eea999 100644 --- a/sklearn/linear_model/_sag_fast.pyx.tp +++ b/sklearn/linear_model/_sag_fast.pyx.tp @@ -27,10 +27,6 @@ dtypes = [('64', 'double', 'np.float64'), #------------------------------------------------------------------------------ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False -# # Authors: Danny Sullivan # Tom Dupre la Tour # Arthur Mensch # Mathieu Blondel (partial_fit support) # Rob Zinkov (passive-aggressive) diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index ce03fc3697566..c07232b6c23fe 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -1022,12 +1022,12 @@ class SGDClassifier(BaseSGDClassifier): .. versionadded:: 0.20 Added 'adaptive' option - eta0 : double, default=0.0 + eta0 : float, default=0.0 The initial learning rate for the 'constant', 'invscaling' or 'adaptive' schedules. The default value is 0.0 as eta0 is not used by the default schedule 'optimal'. - power_t : double, default=0.5 + power_t : float, default=0.5 The exponent for inverse scaling learning rate [default 0.5]. early_stopping : bool, default=False @@ -1776,11 +1776,11 @@ class SGDRegressor(BaseSGDRegressor): .. versionadded:: 0.20 Added 'adaptive' option - eta0 : double, default=0.01 + eta0 : float, default=0.01 The initial learning rate for the 'constant', 'invscaling' or 'adaptive' schedules. The default value is 0.01. - power_t : double, default=0.25 + power_t : float, default=0.25 The exponent for inverse scaling learning rate. early_stopping : bool, default=False @@ -1954,64 +1954,60 @@ class SGDOneClassSVM(BaseSGD, OutlierMixin): Parameters ---------- - nu : float, optional + nu : float, default=0.5 The nu parameter of the One Class SVM: an upper bound on the fraction of training errors and a lower bound of the fraction of support vectors. Should be in the interval (0, 1]. By default 0.5 will be taken. - fit_intercept : bool + fit_intercept : bool, default=True Whether the intercept should be estimated or not. Defaults to True. - max_iter : int, optional + max_iter : int, default=1000 The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the ``fit`` method, and not the `partial_fit`. Defaults to 1000. - tol : float or None, optional + tol : float or None, default=1e-3 The stopping criterion. If it is not None, the iterations will stop when (loss > previous_loss - tol). Defaults to 1e-3. - shuffle : bool, optional + shuffle : bool, default=True Whether or not the training data should be shuffled after each epoch. Defaults to True. - verbose : int, optional + verbose : int, default=0 The verbosity level. - random_state : int, RandomState instance or None, optional (default=None) + random_state : int, RandomState instance or None, default=None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - learning_rate : str, optional + learning_rate : {'constant', 'optimal', 'invscaling', 'adaptive'}, default='optimal' The learning rate schedule to use with `fit`. (If using `partial_fit`, learning rate must be controlled directly). 
- 'constant': - eta = eta0 - 'optimal': [default] - eta = 1.0 / (alpha * (t + t0)) - where t0 is chosen by a heuristic proposed by Leon Bottou. - 'invscaling': - eta = eta0 / pow(t, power_t) - 'adaptive': - eta = eta0, as long as the training keeps decreasing. - Each time n_iter_no_change consecutive epochs fail to decrease the - training loss by tol or fail to increase validation score by tol if - early_stopping is True, the current learning rate is divided by 5. - - eta0 : double + - 'constant': `eta = eta0` + - 'optimal': `eta = 1.0 / (alpha * (t + t0))` + where t0 is chosen by a heuristic proposed by Leon Bottou. + - 'invscaling': `eta = eta0 / pow(t, power_t)` + - 'adaptive': eta = eta0, as long as the training keeps decreasing. + Each time n_iter_no_change consecutive epochs fail to decrease the + training loss by tol or fail to increase validation score by tol if + early_stopping is True, the current learning rate is divided by 5. + + eta0 : float, default=0.0 The initial learning rate for the 'constant', 'invscaling' or 'adaptive' schedules. The default value is 0.0 as eta0 is not used by the default schedule 'optimal'. - power_t : double + power_t : float, default=0.5 The exponent for inverse scaling learning rate [default 0.5]. - warm_start : bool, optional + warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `. @@ -2024,7 +2020,7 @@ class SGDOneClassSVM(BaseSGD, OutlierMixin): this counter, while ``partial_fit`` will result in increasing the existing counter. - average : bool or int, optional + average : bool or int, default=False When set to True, computes the averaged SGD weights and stores the result in the ``coef_`` attribute. If set to an int greater than 1, averaging will begin once the total number of samples seen reaches @@ -2033,10 +2029,10 @@ class SGDOneClassSVM(BaseSGD, OutlierMixin): Attributes ---------- - coef_ : array, shape (1, n_features) + coef_ : ndarray of shape (1, n_features) Weights assigned to the features. - offset_ : array, shape (1,) + offset_ : ndarray of shape (1,) Offset used to define the decision function from the raw scores. We have the relation: decision_function = score_samples - offset. diff --git a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index 579f0afa3997c..bccfd22286d02 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -248,7 +248,7 @@ class TheilSenRegressor(RegressorMixin, LinearModel): max_iter : int, default=300 Maximum number of iterations for the calculation of spatial median. - tol : float, default=1.e-3 + tol : float, default=1e-3 Tolerance when calculating spatial median. random_state : int, RandomState instance or None, default=None diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index dd67c49585bad..5d0b212c10e0e 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -351,7 +351,7 @@ def test_lasso_cv_positive_constraint(): def _scale_alpha_inplace(estimator, n_samples): """Rescale the parameter alpha from when the estimator is evoked with - normalize set to True to when it is evoked in a Pipeline with normalize set + normalize set to True as if it were evoked in a Pipeline with normalize set to False and with a StandardScaler. 
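The `np.asarray(estimator.alphas)` workaround above exists because `RidgeCV`/`RidgeClassifierCV` no longer convert `alphas` in `__init__`; the conversion now happens in `fit`, as the new `test_ridgecv_alphas_conversion` below asserts. A short sketch of that contract:

```python
import numpy as np
from sklearn.linear_model import RidgeCV

rng = np.random.RandomState(0)
X, y = rng.randn(5, 5), rng.randn(5)

alphas = (0.1, 1.0, 10.0)          # a plain tuple, not an ndarray
ridge = RidgeCV(alphas=alphas)
assert ridge.alphas is alphas      # __init__ leaves the parameter untouched
ridge.fit(X, y)
np.testing.assert_array_equal(ridge.alphas, np.asarray(alphas))
```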
""" if ("alpha" not in estimator.get_params()) and ( @@ -360,7 +360,10 @@ def _scale_alpha_inplace(estimator, n_samples): return if isinstance(estimator, (RidgeCV, RidgeClassifierCV)): - alphas = estimator.alphas * n_samples + # alphas is not validated at this point and can be a list. + # We convert it to a np.ndarray to make sure broadcasting + # is used. + alphas = np.asarray(estimator.alphas) * n_samples return estimator.set_params(alphas=alphas) if isinstance(estimator, (Lasso, LassoLars, MultiTaskLasso)): alpha = estimator.alpha * np.sqrt(n_samples) diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index bfc6722737bd8..58d2804d89aca 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -1110,6 +1110,27 @@ def test_ridge_classifier_cv_store_cv_values(scoring): assert r.cv_values_.shape == (n_samples, n_targets, n_alphas) +@pytest.mark.parametrize("Estimator", [RidgeCV, RidgeClassifierCV]) +def test_ridgecv_alphas_conversion(Estimator): + rng = np.random.RandomState(0) + alphas = (0.1, 1.0, 10.0) + + n_samples, n_features = 5, 5 + if Estimator is RidgeCV: + y = rng.randn(n_samples) + else: + y = rng.randint(0, 2, n_samples) + X = rng.randn(n_samples, n_features) + + ridge_est = Estimator(alphas=alphas) + assert ( + ridge_est.alphas is alphas + ), f"`alphas` was mutated in `{Estimator.__name__}.__init__`" + + ridge_est.fit(X, y) + assert_array_equal(ridge_est.alphas, np.asarray(alphas)) + + def test_ridgecv_sample_weight(): rng = np.random.RandomState(0) alphas = (0.1, 1.0, 10.0) @@ -1396,12 +1417,6 @@ def test_ridge_regression_check_arguments_validity( assert_allclose(out, true_coefs, rtol=0, atol=atol) -def test_ridge_classifier_no_support_multilabel(): - X, y = make_multilabel_classification(n_samples=10, random_state=0) - with pytest.raises(ValueError): - RidgeClassifier().fit(X, y) - - @pytest.mark.parametrize( "solver", ["svd", "sparse_cg", "cholesky", "lsqr", "sag", "saga", "lbfgs"] ) @@ -1515,6 +1530,28 @@ def test_ridge_sag_with_X_fortran(): Ridge(solver="sag").fit(X, y) +@pytest.mark.parametrize( + "Classifier, params", + [ + (RidgeClassifier, {}), + (RidgeClassifierCV, {"cv": None}), + (RidgeClassifierCV, {"cv": 3}), + ], +) +def test_ridgeclassifier_multilabel(Classifier, params): + """Check that multilabel classification is supported and give meaningful + results.""" + X, y = make_multilabel_classification(n_classes=1, random_state=0) + y = y.reshape(-1, 1) + Y = np.concatenate([y, y], axis=1) + clf = Classifier(**params).fit(X, Y) + Y_pred = clf.predict(X) + + assert Y_pred.shape == Y.shape + assert_array_equal(Y_pred[:, 0], Y_pred[:, 1]) + Ridge(solver="sag").fit(X, y) + + @pytest.mark.parametrize("solver", ["auto", "lbfgs"]) @pytest.mark.parametrize("fit_intercept", [True, False]) @pytest.mark.parametrize("alpha", [1e-3, 1e-2, 0.1, 1.0]) diff --git a/sklearn/manifold/_barnes_hut_tsne.pyx b/sklearn/manifold/_barnes_hut_tsne.pyx index 936a74373e735..2d314c0ccf3a5 100644 --- a/sklearn/manifold/_barnes_hut_tsne.pyx +++ b/sklearn/manifold/_barnes_hut_tsne.pyx @@ -1,7 +1,3 @@ -# cython: boundscheck=False -# cython: wraparound=False -# cython: cdivision=True -# # Author: Christopher Moody # Author: Nick Travers # Implementation by Chris Moody & Nick Travers diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 18e647307754e..793f3958c64d2 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py 
@@ -315,8 +315,9 @@ def spectral_embedding( # problem. if not sparse.issparse(laplacian): warnings.warn("AMG works better for sparse matrices") - # lobpcg needs double precision floats - laplacian = check_array(laplacian, dtype=np.float64, accept_sparse=True) + laplacian = check_array( + laplacian, dtype=[np.float64, np.float32], accept_sparse=True + ) laplacian = _set_diag(laplacian, 1, norm_laplacian) # The Laplacian matrix is always singular, having at least one zero @@ -337,6 +338,7 @@ def spectral_embedding( # Create initial approximation X to eigenvectors X = random_state.rand(laplacian.shape[0], n_components + 1) X[:, 0] = dd.ravel() + X = X.astype(laplacian.dtype) _, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.0e-5, largest=False) embedding = diffusion_map.T if norm_laplacian: @@ -346,8 +348,9 @@ def spectral_embedding( raise ValueError if eigen_solver == "lobpcg": - # lobpcg needs double precision floats - laplacian = check_array(laplacian, dtype=np.float64, accept_sparse=True) + laplacian = check_array( + laplacian, dtype=[np.float64, np.float32], accept_sparse=True + ) if n_nodes < 5 * n_components + 1: # see note above under arpack why lobpcg has problems with small # number of nodes @@ -366,6 +369,7 @@ def spectral_embedding( # approximation X to eigenvectors X = random_state.rand(laplacian.shape[0], n_components + 1) X[:, 0] = dd.ravel() + X = X.astype(laplacian.dtype) _, diffusion_map = lobpcg( laplacian, X, tol=1e-5, largest=False, maxiter=2000 ) @@ -619,9 +623,7 @@ def fit(self, X, y=None): Returns the instance itself. """ - X = self._validate_data( - X, accept_sparse="csr", ensure_min_samples=2, estimator=self - ) + X = self._validate_data(X, accept_sparse="csr", ensure_min_samples=2) random_state = check_random_state(self.random_state) if isinstance(self.affinity, str): diff --git a/sklearn/manifold/_utils.pyx b/sklearn/manifold/_utils.pyx index cd6ade795ae91..985aa3388d34c 100644 --- a/sklearn/manifold/_utils.pyx +++ b/sklearn/manifold/_utils.pyx @@ -1,5 +1,3 @@ -# cython: boundscheck=False - from libc cimport math cimport cython import numpy as np diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index 8454accb7c59b..6a5f51972504e 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -19,6 +19,15 @@ from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal +try: + from pyamg import smoothed_aggregation_solver # noqa + + pyamg_available = True +except ImportError: + pyamg_available = False +skip_if_no_pyamg = pytest.mark.skipif( + not pyamg_available, reason="PyAMG is required for the tests in this function." 
+) # non centered, sparse centers to check the centers = np.array( @@ -85,7 +94,16 @@ def test_sparse_graph_connected_component(): assert_array_equal(component_1, component_2) -def test_spectral_embedding_two_components(seed=36): +@pytest.mark.parametrize( + "eigen_solver", + [ + "arpack", + "lobpcg", + pytest.param("amg", marks=skip_if_no_pyamg), + ], +) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_spectral_embedding_two_components(eigen_solver, dtype, seed=36): # Test spectral embedding with two components random_state = np.random.RandomState(seed) n_sample = 100 @@ -117,31 +135,46 @@ def test_spectral_embedding_two_components(seed=36): true_label[0:n_sample] = 1 se_precomp = SpectralEmbedding( - n_components=1, affinity="precomputed", random_state=np.random.RandomState(seed) + n_components=1, + affinity="precomputed", + random_state=np.random.RandomState(seed), + eigen_solver=eigen_solver, ) - embedded_coordinate = se_precomp.fit_transform(affinity) - # Some numpy versions are touchy with types - embedded_coordinate = se_precomp.fit_transform(affinity.astype(np.float32)) + + embedded_coordinate = se_precomp.fit_transform(affinity.astype(dtype)) # thresholding on the first components using 0. - label_ = np.array(embedded_coordinate.ravel() < 0, dtype="float") + label_ = np.array(embedded_coordinate.ravel() < 0, dtype=np.int64) assert normalized_mutual_info_score(true_label, label_) == pytest.approx(1.0) @pytest.mark.parametrize("X", [S, sparse.csr_matrix(S)], ids=["dense", "sparse"]) -def test_spectral_embedding_precomputed_affinity(X, seed=36): +@pytest.mark.parametrize( + "eigen_solver", + [ + "arpack", + "lobpcg", + pytest.param("amg", marks=skip_if_no_pyamg), + ], +) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +def test_spectral_embedding_precomputed_affinity(X, eigen_solver, dtype, seed=36): # Test spectral embedding with precomputed kernel gamma = 1.0 se_precomp = SpectralEmbedding( - n_components=2, affinity="precomputed", random_state=np.random.RandomState(seed) + n_components=2, + affinity="precomputed", + random_state=np.random.RandomState(seed), + eigen_solver=eigen_solver, ) se_rbf = SpectralEmbedding( n_components=2, affinity="rbf", gamma=gamma, random_state=np.random.RandomState(seed), + eigen_solver=eigen_solver, ) - embed_precomp = se_precomp.fit_transform(rbf_kernel(X, gamma=gamma)) - embed_rbf = se_rbf.fit_transform(X) + embed_precomp = se_precomp.fit_transform(rbf_kernel(X.astype(dtype), gamma=gamma)) + embed_rbf = se_rbf.fit_transform(X.astype(dtype)) assert_array_almost_equal(se_precomp.affinity_matrix_, se_rbf.affinity_matrix_) _assert_equal_with_sign_flipping(embed_precomp, embed_rbf, 0.05) @@ -205,10 +238,11 @@ def test_spectral_embedding_callable_affinity(X, seed=36): @pytest.mark.filterwarnings( "ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*" ) -def test_spectral_embedding_amg_solver(seed=36): - # Test spectral embedding with amg solver - pytest.importorskip("pyamg") - +@pytest.mark.skipif( + not pyamg_available, reason="PyAMG is required for the tests in this function." 
+) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +def test_spectral_embedding_amg_solver(dtype, seed=36): se_amg = SpectralEmbedding( n_components=2, affinity="nearest_neighbors", @@ -223,8 +257,8 @@ def test_spectral_embedding_amg_solver(seed=36): n_neighbors=5, random_state=np.random.RandomState(seed), ) - embed_amg = se_amg.fit_transform(S) - embed_arpack = se_arpack.fit_transform(S) + embed_amg = se_amg.fit_transform(S.astype(dtype)) + embed_arpack = se_arpack.fit_transform(S.astype(dtype)) _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5) # same with special case in which amg is not actually used @@ -239,8 +273,8 @@ def test_spectral_embedding_amg_solver(seed=36): ).toarray() se_amg.affinity = "precomputed" se_arpack.affinity = "precomputed" - embed_amg = se_amg.fit_transform(affinity) - embed_arpack = se_arpack.fit_transform(affinity) + embed_amg = se_amg.fit_transform(affinity.astype(dtype)) + embed_arpack = se_arpack.fit_transform(affinity.astype(dtype)) _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5) @@ -258,12 +292,15 @@ def test_spectral_embedding_amg_solver(seed=36): @pytest.mark.filterwarnings( "ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*" ) -def test_spectral_embedding_amg_solver_failure(): +@pytest.mark.skipif( + not pyamg_available, reason="PyAMG is required for the tests in this function." +) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +def test_spectral_embedding_amg_solver_failure(dtype, seed=36): # Non-regression test for amg solver failure (issue #13393 on github) - pytest.importorskip("pyamg") - seed = 36 num_nodes = 100 X = sparse.rand(num_nodes, num_nodes, density=0.1, random_state=seed) + X = X.astype(dtype) upper = sparse.triu(X) - sparse.diags(X.diagonal()) sym_matrix = upper + upper.T embedding = spectral_embedding( @@ -314,7 +351,9 @@ def test_spectral_embedding_unknown_eigensolver(seed=36): def test_spectral_embedding_unknown_affinity(seed=36): # Test that SpectralClustering fails with an unknown affinity type se = SpectralEmbedding( - n_components=1, affinity="", random_state=np.random.RandomState(seed) + n_components=1, + affinity="", + random_state=np.random.RandomState(seed), ) with pytest.raises(ValueError): se.fit(S) @@ -399,6 +438,50 @@ def test_spectral_embedding_first_eigen_vector(): assert np.std(embedding[:, 1]) > 1e-3 +@pytest.mark.parametrize( + "eigen_solver", + [ + "arpack", + "lobpcg", + pytest.param("amg", marks=skip_if_no_pyamg), + ], +) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_spectral_embedding_preserves_dtype(eigen_solver, dtype): + """Check that `SpectralEmbedding is preserving the dtype of the fitted + attribute and transformed data. + + Ideally, this test should be covered by the common test + `check_transformer_preserve_dtypes`. However, this test only run + with transformers implementing `transform` while `SpectralEmbedding` + implements only `fit_transform`. 
+ """ + X = S.astype(dtype) + se = SpectralEmbedding( + n_components=2, affinity="rbf", eigen_solver=eigen_solver, random_state=0 + ) + X_trans = se.fit_transform(X) + + assert X_trans.dtype == dtype + assert se.embedding_.dtype == dtype + assert se.affinity_matrix_.dtype == dtype + + +@pytest.mark.skipif( + pyamg_available, + reason="PyAMG is installed and we should not test for an error.", +) +def test_error_pyamg_not_available(): + se_precomp = SpectralEmbedding( + n_components=2, + affinity="rbf", + eigen_solver="amg", + ) + err_msg = "The eigen_solver was set to 'amg', but pyamg is not available." + with pytest.raises(ValueError, match=err_msg): + se_precomp.fit_transform(S) + + # TODO: Remove in 1.1 @pytest.mark.parametrize("affinity", ["precomputed", "precomputed_nearest_neighbors"]) def test_spectral_embedding_pairwise_deprecated(affinity): diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 81b35f1cf6f9e..ac726dd816a13 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -82,8 +82,8 @@ def _check_targets(y_true, y_pred): y_pred : array or indicator matrix """ check_consistent_length(y_true, y_pred) - type_true = type_of_target(y_true) - type_pred = type_of_target(y_pred) + type_true = type_of_target(y_true, input_name="y_true") + type_pred = type_of_target(y_pred, input_name="y_pred") y_type = {type_true, type_pred} if y_type == {"binary", "multiclass"}: @@ -303,7 +303,6 @@ def confusion_matrix( >>> tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0]).ravel() >>> (tn, fp, fn, tp) (0, 2, 1, 1) - """ y_type, y_true, y_pred = _check_targets(y_true, y_pred) if y_type not in ("binary", "multiclass"): @@ -1714,7 +1713,16 @@ def precision_score( See Also -------- - precision_recall_fscore_support, multilabel_confusion_matrix + precision_recall_fscore_support : Compute precision, recall, F-measure and + support for each class. + recall_score : Compute the ratio ``tp / (tp + fn)`` where ``tp`` is the + number of true positives and ``fn`` the number of false negatives. + PrecisionRecallDisplay.from_estimator : Plot precision-recall curve given + an estimator and some data. + PrecisionRecallDisplay.from_predictions : Plot precision-recall curve given + binary class predictions. + multilabel_confusion_matrix : Compute a confusion matrix for each class or + sample. Notes ----- @@ -1847,8 +1855,18 @@ def recall_score( See Also -------- - precision_recall_fscore_support, balanced_accuracy_score, - multilabel_confusion_matrix + precision_recall_fscore_support : Compute precision, recall, F-measure and + support for each class. + precision_score : Compute the ratio ``tp / (tp + fp)`` where ``tp`` is the + number of true positives and ``fp`` the number of false positives. + balanced_accuracy_score : Compute balanced accuracy to deal with imbalanced + datasets. + multilabel_confusion_matrix : Compute a confusion matrix for each class or + sample. + PrecisionRecallDisplay.from_estimator : Plot precision-recall curve given + an estimator and some data. + PrecisionRecallDisplay.from_predictions : Plot precision-recall curve given + binary class predictions. Notes ----- @@ -1925,10 +1943,16 @@ def balanced_accuracy_score(y_true, y_pred, *, sample_weight=None, adjusted=Fals Returns ------- balanced_accuracy : float + Balanced accuracy score. See Also -------- - recall_score, roc_auc_score + average_precision_score : Compute average precision (AP) from prediction + scores. 
+ precision_score : Compute the precision score. + recall_score : Compute the recall score. + roc_auc_score : Compute Area Under the Receiver Operating Characteristic + Curve (ROC AUC) from prediction scores. Notes ----- @@ -1955,7 +1979,6 @@ def balanced_accuracy_score(y_true, y_pred, *, sample_weight=None, adjusted=Fals >>> y_pred = [0, 1, 0, 0, 0, 1] >>> balanced_accuracy_score(y_true, y_pred) 0.625 - """ C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) with np.errstate(divide="ignore", invalid="ignore"): @@ -2641,7 +2664,7 @@ def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): assert_all_finite(y_prob) check_consistent_length(y_true, y_prob, sample_weight) - y_type = type_of_target(y_true) + y_type = type_of_target(y_true, input_name="y_true") if y_type != "binary": raise ValueError( "Only binary classification is supported. The type of the target " diff --git a/sklearn/metrics/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd index 61bb4fb2fe011..611f6759e2c8b 100644 --- a/sklearn/metrics/_dist_metrics.pxd +++ b/sklearn/metrics/_dist_metrics.pxd @@ -1,8 +1,3 @@ -# cython: boundscheck=False -# cython: cdivision=True -# cython: initializedcheck=False -# cython: wraparound=False - cimport numpy as np from libc.math cimport sqrt, exp diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index a8fb4c45ddd0c..6bb279012e518 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1,8 +1,3 @@ -# cython: boundscheck=False -# cython: cdivision=True -# cython: initializedcheck=False -# cython: wraparound=False - # By Jake Vanderplas (2013) # written for the scikit-learn project # License: BSD diff --git a/sklearn/metrics/_pairwise_fast.pyx b/sklearn/metrics/_pairwise_fast.pyx index f122972a15f89..76973529de818 100644 --- a/sklearn/metrics/_pairwise_fast.pyx +++ b/sklearn/metrics/_pairwise_fast.pyx @@ -1,7 +1,3 @@ -#cython: boundscheck=False -#cython: cdivision=True -#cython: wraparound=False -# # Author: Andreas Mueller # Lars Buitinck # Paolo Toccaceli diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index badfff094f7fa..0707c8d2a951d 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -210,7 +210,7 @@ def _binary_uninterpolated_average_precision( # guaranteed to be 1, as returned by precision_recall_curve return -np.sum(np.diff(recall) * np.array(precision)[:-1]) - y_type = type_of_target(y_true) + y_type = type_of_target(y_true, input_name="y_true") if y_type == "multilabel-indicator" and pos_label != 1: raise ValueError( "Parameter pos_label is fixed to 1 for " @@ -541,7 +541,7 @@ class scores must correspond to the order of ``labels``, array([0.81..., 0.84... , 0.93..., 0.87..., 0.94...]) """ - y_type = type_of_target(y_true) + y_type = type_of_target(y_true, input_name="y_true") y_true = check_array(y_true, ensure_2d=False, dtype=None) y_score = check_array(y_score, ensure_2d=False) @@ -726,7 +726,7 @@ def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): Decreasing score values. 
""" # Check to make sure y_true is valid - y_type = type_of_target(y_true) + y_type = type_of_target(y_true, input_name="y_true") if not (y_type == "binary" or (y_type == "multiclass" and pos_label is not None)): raise ValueError("{0} format is not supported".format(y_type)) @@ -1059,7 +1059,7 @@ def label_ranking_average_precision_score(y_true, y_score, *, sample_weight=None raise ValueError("y_true and y_score have different shape") # Handle badly formatted array and the degenerate case with one label - y_type = type_of_target(y_true) + y_type = type_of_target(y_true, input_name="y_true") if y_type != "multilabel-indicator" and not ( y_type == "binary" and y_true.ndim == 2 ): @@ -1140,7 +1140,7 @@ def coverage_error(y_true, y_score, *, sample_weight=None): y_score = check_array(y_score, ensure_2d=False) check_consistent_length(y_true, y_score, sample_weight) - y_type = type_of_target(y_true) + y_type = type_of_target(y_true, input_name="y_true") if y_type != "multilabel-indicator": raise ValueError("{0} format is not supported".format(y_type)) @@ -1198,7 +1198,7 @@ def label_ranking_loss(y_true, y_score, *, sample_weight=None): y_score = check_array(y_score, ensure_2d=False) check_consistent_length(y_true, y_score, sample_weight) - y_type = type_of_target(y_true) + y_type = type_of_target(y_true, input_name="y_true") if y_type not in ("multilabel-indicator",): raise ValueError("{0} format is not supported".format(y_type)) @@ -1345,7 +1345,7 @@ def _tie_averaged_dcg(y_true, y_score, discount_cumsum): def _check_dcg_target_type(y_true): - y_type = type_of_target(y_true) + y_type = type_of_target(y_true, input_name="y_true") supported_fmt = ( "multilabel-indicator", "continuous-multioutput", @@ -1697,7 +1697,7 @@ def top_k_accuracy_score( """ y_true = check_array(y_true, ensure_2d=False, dtype=None) y_true = column_or_1d(y_true) - y_type = type_of_target(y_true) + y_type = type_of_target(y_true, input_name="y_true") if y_type == "binary" and labels is not None and len(labels) > 2: y_type = "multiclass" y_score = check_array(y_score, ensure_2d=False) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 0c0c15f33280d..ffa2b0b8218aa 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -30,9 +30,14 @@ from .._loss.glm_distribution import TweedieDistribution from ..exceptions import UndefinedMetricWarning -from ..utils.validation import check_array, check_consistent_length, _num_samples -from ..utils.validation import column_or_1d -from ..utils.validation import _check_sample_weight +from ..utils.validation import ( + check_array, + check_consistent_length, + _num_samples, + column_or_1d, + _check_sample_weight, + _deprecate_positional_args, +) from ..utils.stats import _weighted_percentile @@ -216,7 +221,7 @@ def mean_pinball_loss( sample_weight : array-like of shape (n_samples,), default=None Sample weights. - alpha: double, slope of the pinball loss, default=0.5, + alpha: float, slope of the pinball loss, default=0.5, this loss is equivalent to :ref:`mean_absolute_error` when `alpha=0.5`, `alpha=0.95` is minimized by estimators of the 95th percentile. @@ -283,13 +288,17 @@ def mean_pinball_loss( return np.average(output_errors, weights=multioutput) +@_deprecate_positional_args(version="1.1") def mean_absolute_percentage_error( - y_true, y_pred, sample_weight=None, multioutput="uniform_average" + y_true, y_pred, *, sample_weight=None, multioutput="uniform_average" ): - """Mean absolute percentage error regression loss. 
+ """Mean absolute percentage error (MAPE) regression loss. - Note here that we do not represent the output as a percentage in range - [0, 100]. Instead, we represent it in range [0, 1/eps]. Read more in the + Note here that the output is not a percentage in the range [0, 100] + and a value of 100 does not mean 100% but 1e2. Furthermore, the output + can be arbitrarily high when `y_true` is small (which is specific to the + metric) or when `abs(y_true - y_pred)` is large (which is common for most + regression metrics). Read more in the :ref:`User Guide `. .. versionadded:: 0.24 @@ -318,16 +327,16 @@ def mean_absolute_percentage_error( Returns ------- - loss : float or ndarray of floats in the range [0, 1/eps] + loss : float or ndarray of floats If multioutput is 'raw_values', then mean absolute percentage error is returned for each output separately. If multioutput is 'uniform_average' or an ndarray of weights, then the weighted average of all output errors is returned. MAPE output is non-negative floating point. The best value is 0.0. - But note the fact that bad predictions can lead to arbitrarily large - MAPE values, especially if some y_true values are very close to zero. - Note that we return a large value instead of `inf` when y_true is zero. + But note that bad predictions can lead to arbitrarily large + MAPE values, especially if some `y_true` values are very close to zero. + Note that we return a large value instead of `inf` when `y_true` is zero. Examples -------- @@ -342,6 +351,12 @@ def mean_absolute_percentage_error( 0.5515... >>> mean_absolute_percentage_error(y_true, y_pred, multioutput=[0.3, 0.7]) 0.6198... + >>> # the value when some element of the y_true is zero is arbitrarily high because + >>> # of the division by epsilon + >>> y_true = [1., 0., 2.4, 7.] + >>> y_pred = [1.2, 0.1, 2.4, 8.] 
+ >>> mean_absolute_percentage_error(y_true, y_pred) + 112589990684262.48 """ y_type, y_true, y_pred, multioutput = _check_reg_targets( y_true, y_pred, multioutput diff --git a/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx b/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx index d2f9cd8578b12..fbc910cb23b8c 100644 --- a/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx +++ b/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx @@ -1,7 +1,3 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False -# # Authors: Robert Layton # Corey Lynch # License: BSD 3 clause diff --git a/sklearn/metrics/cluster/tests/test_common.py b/sklearn/metrics/cluster/tests/test_common.py index 49fd0f06c51f7..98c9a0155a6d3 100644 --- a/sklearn/metrics/cluster/tests/test_common.py +++ b/sklearn/metrics/cluster/tests/test_common.py @@ -214,6 +214,6 @@ def test_inf_nan_input(metric_name, metric_func): else: X = np.random.randint(10, size=(2, 10)) invalids = [(X, [np.inf, np.inf]), (X, [np.nan, np.nan]), (X, [np.nan, np.inf])] - with pytest.raises(ValueError, match="contains NaN, infinity"): + with pytest.raises(ValueError, match=r"contains (NaN|infinity)"): for args in invalids: metric_func(*args) diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index 3f4fadadb1988..e8f4ab422ae52 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -456,8 +456,8 @@ def test_adjusted_rand_score_overflow(): https://github.com/scikit-learn/scikit-learn/issues/20305 """ rng = np.random.RandomState(0) - y_true = rng.randint(0, 2, 10_000_000, dtype=np.int8) - y_pred = rng.randint(0, 2, 10_000_000, dtype=np.int8) + y_true = rng.randint(0, 2, 100_000, dtype=np.int8) + y_pred = rng.randint(0, 2, 100_000, dtype=np.int8) with pytest.warns(None) as record: adjusted_rand_score(y_true, y_pred) assert len(record) == 0 diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 5d5b807e77faa..a48aebf0415b5 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1053,9 +1053,10 @@ def linear_kernel(X, Y=None, dense_output=True): Parameters ---------- X : ndarray of shape (n_samples_X, n_features) + A feature array. Y : ndarray of shape (n_samples_Y, n_features), default=None - If `None`, uses `Y=X`. + An optional second feature array. If `None`, uses `Y=X`. dense_output : bool, default=True Whether to return dense output even when the input is sparse. If @@ -1066,6 +1067,7 @@ def linear_kernel(X, Y=None, dense_output=True): Returns ------- Gram matrix : ndarray of shape (n_samples_X, n_samples_Y) + The Gram matrix of the linear kernel, i.e. `X @ Y.T`. 
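The expanded `linear_kernel` docstring above states that the returned Gram matrix is simply `X @ Y.T`; a tiny sketch confirming that equivalence:

```python
import numpy as np
from sklearn.metrics.pairwise import linear_kernel

X = np.array([[0.0, 1.0], [1.0, 1.0], [2.0, 0.0]])
K = linear_kernel(X)                      # Y defaults to X
np.testing.assert_allclose(K, X @ X.T)    # Gram matrix of the linear kernel
```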
""" X, Y = check_pairwise_arrays(X, Y) return safe_sparse_dot(X, Y.T, dense_output=dense_output) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index d5a4fa7adfa17..dfd43ef34096f 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -902,7 +902,7 @@ def test_thresholded_invariance_string_vs_numbers_labels(name): ) @pytest.mark.parametrize("y_true, y_score", invalids_nan_inf) def test_regression_thresholded_inf_nan_input(metric, y_true, y_score): - with pytest.raises(ValueError, match="contains NaN, infinity"): + with pytest.raises(ValueError, match=r"contains (NaN|infinity)"): metric(y_true, y_score) @@ -913,12 +913,29 @@ def test_regression_thresholded_inf_nan_input(metric, y_true, y_score): # Add an additional case for classification only # non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/6809 - [([np.nan, 1, 2], [1, 2, 3])], # type: ignore + [ + ([np.nan, 1, 2], [1, 2, 3]), + ([np.inf, 1, 2], [1, 2, 3]), + ], # type: ignore ) def test_classification_inf_nan_input(metric, y_true, y_score): """check that classification metrics raise a message mentioning the occurrence of non-finite values in the target vectors.""" - err_msg = "Input contains NaN, infinity or a value too large" + if not np.isfinite(y_true).all(): + input_name = "y_true" + if np.isnan(y_true).any(): + unexpected_value = "NaN" + else: + unexpected_value = "infinity or a value too large" + else: + input_name = "y_pred" + if np.isnan(y_score).any(): + unexpected_value = "NaN" + else: + unexpected_value = "infinity or a value too large" + + err_msg = f"Input {input_name} contains {unexpected_value}" + with pytest.raises(ValueError, match=err_msg): metric(y_true, y_score) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index b66ce18ec8da4..1a64be4e5ab53 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -362,6 +362,19 @@ def test_regression_single_sample(metric): assert np.isnan(score) +def test_deprecation_positional_arguments_mape(): + y_true = [1, 1, 1] + y_pred = [1, 0, 1] + sample_weights = [0.5, 0.1, 0.2] + multioutput = "raw_values" + + warning_msg = "passing these as positional arguments will result in an error" + + # Trigger the warning + with pytest.warns(FutureWarning, match=warning_msg): + mean_absolute_percentage_error(y_true, y_pred, sample_weights, multioutput) + + def test_tweedie_deviance_continuity(): n_samples = 100 diff --git a/sklearn/mixture/_gaussian_mixture.py b/sklearn/mixture/_gaussian_mixture.py index f4bb194e1e33d..995366b247778 100644 --- a/sklearn/mixture/_gaussian_mixture.py +++ b/sklearn/mixture/_gaussian_mixture.py @@ -415,7 +415,10 @@ def _estimate_log_gaussian_prob(X, means, precisions_chol, covariance_type): """ n_samples, n_features = X.shape n_components, _ = means.shape - # det(precision_chol) is half of det(precision) + # The determinant of the precision matrix from the Cholesky decomposition + # corresponds to the negative half of the determinant of the full precision + # matrix. 
+ # In short: det(precision_chol) = - det(precision) / 2 log_det = _compute_log_det_cholesky(precisions_chol, covariance_type, n_features) if covariance_type == "full": @@ -445,6 +448,8 @@ def _estimate_log_gaussian_prob(X, means, precisions_chol, covariance_type): - 2 * np.dot(X, means.T * precisions) + np.outer(row_norms(X, squared=True), precisions) ) + # Since we are using the precision of the Cholesky decomposition, + # `- 0.5 * log_det_precision` becomes `+ log_det_precision_chol` return -0.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index fc38f3355d04e..e2939f6d96096 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -508,7 +508,7 @@ def __init__(self, n_splits=5): def _iter_test_indices(self, X, y, groups): if groups is None: raise ValueError("The 'groups' parameter should not be None.") - groups = check_array(groups, ensure_2d=False, dtype=None) + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) unique_groups, groups = np.unique(groups, return_inverse=True) n_groups = len(unique_groups) @@ -744,7 +744,7 @@ def split(self, X, y, groups=None): split. You can make the results identical by setting `random_state` to an integer. """ - y = check_array(y, ensure_2d=False, dtype=None) + y = check_array(y, input_name="y", ensure_2d=False, dtype=None) return super().split(X, y, groups) @@ -1144,7 +1144,9 @@ def _iter_test_masks(self, X, y, groups): if groups is None: raise ValueError("The 'groups' parameter should not be None.") # We make a copy of groups to avoid side-effects during iteration - groups = check_array(groups, copy=True, ensure_2d=False, dtype=None) + groups = check_array( + groups, input_name="groups", copy=True, ensure_2d=False, dtype=None + ) unique_groups = np.unique(groups) if len(unique_groups) <= 1: raise ValueError( @@ -1178,7 +1180,7 @@ def get_n_splits(self, X=None, y=None, groups=None): """ if groups is None: raise ValueError("The 'groups' parameter should not be None.") - groups = check_array(groups, ensure_2d=False, dtype=None) + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) return len(np.unique(groups)) def split(self, X, y=None, groups=None): @@ -1270,7 +1272,9 @@ def __init__(self, n_groups): def _iter_test_masks(self, X, y, groups): if groups is None: raise ValueError("The 'groups' parameter should not be None.") - groups = check_array(groups, copy=True, ensure_2d=False, dtype=None) + groups = check_array( + groups, input_name="groups", copy=True, ensure_2d=False, dtype=None + ) unique_groups = np.unique(groups) if self.n_groups >= len(unique_groups): raise ValueError( @@ -1310,7 +1314,7 @@ def get_n_splits(self, X=None, y=None, groups=None): """ if groups is None: raise ValueError("The 'groups' parameter should not be None.") - groups = check_array(groups, ensure_2d=False, dtype=None) + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) return int(comb(len(np.unique(groups)), self.n_groups, exact=True)) def split(self, X, y=None, groups=None): @@ -1802,7 +1806,7 @@ def __init__( def _iter_indices(self, X, y, groups): if groups is None: raise ValueError("The 'groups' parameter should not be None.") - groups = check_array(groups, ensure_2d=False, dtype=None) + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) classes, group_indices = np.unique(groups, return_inverse=True) for group_train, group_test in 
super()._iter_indices(X=classes): # these are the indices of classes in the partition @@ -1919,7 +1923,7 @@ def __init__( def _iter_indices(self, X, y, groups=None): n_samples = _num_samples(X) - y = check_array(y, ensure_2d=False, dtype=None) + y = check_array(y, input_name="y", ensure_2d=False, dtype=None) n_train, n_test = _validate_shuffle_split( n_samples, self.test_size, @@ -2019,7 +2023,7 @@ def split(self, X, y, groups=None): split. You can make the results identical by setting `random_state` to an integer. """ - y = check_array(y, ensure_2d=False, dtype=None) + y = check_array(y, input_name="y", ensure_2d=False, dtype=None) return super().split(X, y, groups) @@ -2300,7 +2304,7 @@ def check_cv(cv=5, y=None, *, classifier=False): if ( classifier and (y is not None) - and (type_of_target(y) in ("binary", "multiclass")) + and (type_of_target(y, input_name="y") in ("binary", "multiclass")) ): return StratifiedKFold(cv) else: diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 0b221de60d395..90cda443466f1 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -393,7 +393,7 @@ def cross_val_score( pre_dispatch="2*n_jobs", error_score=np.nan, ): - """Evaluate a score by cross-validation + """Evaluate a score by cross-validation. Read more in the :ref:`User Guide `. @@ -433,7 +433,7 @@ def cross_val_score( - `None`, to use the default 5-fold cross validation, - int, to specify the number of folds in a `(Stratified)KFold`, - :term:`CV splitter`, - - An iterable yielding (train, test) splits as arrays of indices. + - An iterable that generates (train, test) splits as arrays of indices. For `int`/`None` inputs, if the estimator is a classifier and `y` is either binary or multiclass, :class:`StratifiedKFold` is used. In all @@ -509,7 +509,6 @@ def cross_val_score( sklearn.metrics.make_scorer : Make a scorer from a performance metric or loss function. 
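A minimal sketch of what the `input_name` plumbing in the `check_array` calls above changes, assuming the `check_array(..., input_name=...)` support introduced in this changeset (scikit-learn 1.1+); the array values are arbitrary:

    import numpy as np
    from sklearn.utils import check_array

    y = np.array([1.0, np.nan, 3.0])
    try:
        check_array(y, input_name="y", ensure_2d=False)
    except ValueError as exc:
        # With this changeset the message names the input, e.g.
        # "Input y contains NaN."
        print(exc)
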
- """ # To ensure multimetric format is not supported scorer = check_scoring(estimator, scoring=scoring) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 765cf4eefa7de..b0e8ee126a7c4 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -37,7 +37,7 @@ from sklearn.model_selection import RepeatedStratifiedKFold from sklearn.model_selection import StratifiedGroupKFold -from sklearn.linear_model import Ridge +from sklearn.dummy import DummyClassifier from sklearn.model_selection._split import _validate_shuffle_split from sklearn.model_selection._split import _build_repr @@ -1771,16 +1771,16 @@ def test_nested_cv(): cvs = [ LeaveOneGroupOut(), - LeaveOneOut(), + StratifiedKFold(n_splits=2), GroupKFold(n_splits=3), - StratifiedKFold(), - StratifiedGroupKFold(n_splits=3), - StratifiedShuffleSplit(n_splits=3, random_state=0), ] for inner_cv, outer_cv in combinations_with_replacement(cvs, 2): gs = GridSearchCV( - Ridge(), param_grid={"alpha": [1, 0.1]}, cv=inner_cv, error_score="raise" + DummyClassifier(), + param_grid={"strategy": ["stratified", "most_frequent"]}, + cv=inner_cv, + error_score="raise", ) cross_val_score( gs, X=X, y=y, groups=groups, cv=outer_cv, fit_params={"groups": groups} diff --git a/sklearn/neighbors/_ball_tree.pyx b/sklearn/neighbors/_ball_tree.pyx index f8f1bd9e95f96..b5ac18365631a 100644 --- a/sklearn/neighbors/_ball_tree.pyx +++ b/sklearn/neighbors/_ball_tree.pyx @@ -1,9 +1,3 @@ -#!python -#cython: boundscheck=False -#cython: wraparound=False -#cython: cdivision=True -#cython: initializedcheck=False - # Author: Jake Vanderplas # License: BSD 3 clause diff --git a/sklearn/neighbors/_kd_tree.pyx b/sklearn/neighbors/_kd_tree.pyx index 5cdc071c38250..59199c41f2e85 100644 --- a/sklearn/neighbors/_kd_tree.pyx +++ b/sklearn/neighbors/_kd_tree.pyx @@ -1,9 +1,3 @@ -#!python -#cython: boundscheck=False -#cython: wraparound=False -#cython: cdivision=True -#cython: initializedcheck=False - # By Jake Vanderplas (2013) # written for the scikit-learn project # License: BSD diff --git a/sklearn/neighbors/_quad_tree.pxd b/sklearn/neighbors/_quad_tree.pxd index 6f61b60cc0ab3..7287d5c420ca7 100644 --- a/sklearn/neighbors/_quad_tree.pxd +++ b/sklearn/neighbors/_quad_tree.pxd @@ -1,7 +1,3 @@ -# cython: boundscheck=False -# cython: wraparound=False -# cython: cdivision=True -# # Author: Thomas Moreau # Author: Olivier Grisel diff --git a/sklearn/neighbors/_quad_tree.pyx b/sklearn/neighbors/_quad_tree.pyx index 619467e69dd0c..6af7d1f547303 100644 --- a/sklearn/neighbors/_quad_tree.pyx +++ b/sklearn/neighbors/_quad_tree.pyx @@ -1,7 +1,3 @@ -# cython: boundscheck=False -# cython: wraparound=False -# cython: cdivision=True -# # Author: Thomas Moreau # Author: Olivier Grisel diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index ca515c5aa3f71..8c4dcdafbec83 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -851,11 +851,11 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): Only used when ``solver='sgd'``. - learning_rate_init : double, default=0.001 + learning_rate_init : float, default=0.001 The initial learning rate used. It controls the step-size in updating the weights. Only used when solver='sgd' or 'adam'. - power_t : double, default=0.5 + power_t : float, default=0.5 The exponent for inverse scaling learning rate. 
It is used in updating effective learning rate when the learning_rate is set to 'invscaling'. Only used when solver='sgd'. @@ -1325,11 +1325,11 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): Only used when solver='sgd'. - learning_rate_init : double, default=0.001 + learning_rate_init : float, default=0.001 The initial learning rate used. It controls the step-size in updating the weights. Only used when solver='sgd' or 'adam'. - power_t : double, default=0.5 + power_t : float, default=0.5 The exponent for inverse scaling learning rate. It is used in updating effective learning rate when the learning_rate is set to 'invscaling'. Only used when solver='sgd'. diff --git a/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/sklearn/preprocessing/_csr_polynomial_expansion.pyx index 84fef3f042dc7..ef958b12266e1 100644 --- a/sklearn/preprocessing/_csr_polynomial_expansion.pyx +++ b/sklearn/preprocessing/_csr_polynomial_expansion.pyx @@ -1,7 +1,3 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False - # Author: Andrew nystrom from scipy.sparse import csr_matrix diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index db865456db7e0..88fdc7ea85c48 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -453,7 +453,6 @@ def partial_fit(self, X, y=None): X = self._validate_data( X, reset=first_pass, - estimator=self, dtype=FLOAT_DTYPES, force_all_finite="allow-nan", ) @@ -841,7 +840,6 @@ def partial_fit(self, X, y=None, sample_weight=None): X = self._validate_data( X, accept_sparse=("csr", "csc"), - estimator=self, dtype=FLOAT_DTYPES, force_all_finite="allow-nan", reset=first_call, @@ -975,7 +973,6 @@ def transform(self, X, copy=None): reset=False, accept_sparse="csr", copy=copy, - estimator=self, dtype=FLOAT_DTYPES, force_all_finite="allow-nan", ) @@ -1017,7 +1014,6 @@ def inverse_transform(self, X, copy=None): X, accept_sparse="csr", copy=copy, - estimator=self, dtype=FLOAT_DTYPES, force_all_finite="allow-nan", ) @@ -1175,7 +1171,6 @@ def partial_fit(self, X, y=None): X, reset=first_pass, accept_sparse=("csr", "csc"), - estimator=self, dtype=FLOAT_DTYPES, force_all_finite="allow-nan", ) @@ -1215,7 +1210,6 @@ def transform(self, X): accept_sparse=("csr", "csc"), copy=self.copy, reset=False, - estimator=self, dtype=FLOAT_DTYPES, force_all_finite="allow-nan", ) @@ -1244,7 +1238,6 @@ def inverse_transform(self, X): X, accept_sparse=("csr", "csc"), copy=self.copy, - estimator=self, dtype=FLOAT_DTYPES, force_all_finite="allow-nan", ) @@ -1488,7 +1481,6 @@ def fit(self, X, y=None): X = self._validate_data( X, accept_sparse="csc", - estimator=self, dtype=FLOAT_DTYPES, force_all_finite="allow-nan", ) @@ -1551,7 +1543,6 @@ def transform(self, X): X, accept_sparse=("csr", "csc"), copy=self.copy, - estimator=self, dtype=FLOAT_DTYPES, reset=False, force_all_finite="allow-nan", @@ -1585,7 +1576,6 @@ def inverse_transform(self, X): X, accept_sparse=("csr", "csc"), copy=self.copy, - estimator=self, dtype=FLOAT_DTYPES, force_all_finite="allow-nan", ) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 79a667911d354..9fd9ff9409092 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -15,7 +15,10 @@ from ..base import BaseEstimator, TransformerMixin from ..utils.validation import check_array from ..utils.validation import check_is_fitted +from ..utils.validation import check_random_state from ..utils.validation 
import _check_feature_names_in +from ..utils.validation import check_scalar +from ..utils import _safe_indexing class KBinsDiscretizer(TransformerMixin, BaseEstimator): @@ -63,6 +66,27 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): .. versionadded:: 0.24 + subsample : int or None (default='warn') + Maximum number of samples, used to fit the model, for computational + efficiency. Used when `strategy="quantile"`. + `subsample=None` means that all the training samples are used when + computing the quantiles that determine the binning thresholds. + Since quantile computation relies on sorting each column of `X` and + that sorting has an `n log(n)` time complexity, + it is recommended to use subsampling on datasets with a + very large number of samples. + + .. deprecated:: 1.1 + In version 1.3 and onwards, `subsample=2e5` will be the default. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for subsampling. + Pass an int for reproducible results across multiple function calls. + See the `subsample` parameter for more details. + See :term:`Glossary `. + + .. versionadded:: 1.1 + Attributes ---------- bin_edges_ : ndarray of ndarray of shape (n_features,) @@ -136,11 +160,22 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): [ 0.5, 3.5, -1.5, 1.5]]) """ - def __init__(self, n_bins=5, *, encode="onehot", strategy="quantile", dtype=None): + def __init__( + self, + n_bins=5, + *, + encode="onehot", + strategy="quantile", + dtype=None, + subsample="warn", + random_state=None, + ): self.n_bins = n_bins self.encode = encode self.strategy = strategy self.dtype = dtype + self.subsample = subsample + self.random_state = random_state def fit(self, X, y=None): """ @@ -174,6 +209,36 @@ def fit(self, X, y=None): " instead." ) + n_samples, n_features = X.shape + + if self.strategy == "quantile" and self.subsample is not None: + if self.subsample == "warn": + if n_samples > 2e5: + warnings.warn( + "In version 1.3 onwards, subsample=2e5 " + "will be used by default. Set subsample explicitly to " + "silence this warning in the mean time. Set " + "subsample=None to disable subsampling explicitly.", + FutureWarning, + ) + else: + self.subsample = check_scalar( + self.subsample, "subsample", numbers.Integral, min_val=1 + ) + rng = check_random_state(self.random_state) + if n_samples > self.subsample: + subsample_idx = rng.choice( + n_samples, size=self.subsample, replace=False + ) + X = _safe_indexing(X, subsample_idx) + elif self.strategy != "quantile" and isinstance( + self.subsample, numbers.Integral + ): + raise ValueError( + f"Invalid parameter for `strategy`: {self.strategy}. " + '`subsample` must be used with `strategy="quantile"`.' 
+ ) + valid_encode = ("onehot", "onehot-dense", "ordinal") if self.encode not in valid_encode: raise ValueError( diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index d89d4e1e3d19d..e596460389f67 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -544,7 +544,7 @@ def transform(self, X): indptr = np.empty(n_samples + 1, dtype=int) indptr[0] = 0 - np.sum(X_mask, axis=1, out=indptr[1:]) + np.sum(X_mask, axis=1, out=indptr[1:], dtype=indptr.dtype) np.cumsum(indptr[1:], out=indptr[1:]) data = np.ones(indptr[-1]) diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 4410988513f39..e7f4a5e337208 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -256,20 +256,6 @@ class LabelBinarizer(TransformerMixin, BaseEstimator): """ def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False): - if neg_label >= pos_label: - raise ValueError( - "neg_label={0} must be strictly less than pos_label={1}.".format( - neg_label, pos_label - ) - ) - - if sparse_output and (pos_label == 0 or neg_label != 0): - raise ValueError( - "Sparse binarization is only supported with non " - "zero pos_label and zero neg_label, got " - "pos_label={0} and neg_label={1}" - "".format(pos_label, neg_label) - ) self.neg_label = neg_label self.pos_label = pos_label @@ -289,7 +275,22 @@ def fit(self, y): self : object Returns the instance itself. """ - self.y_type_ = type_of_target(y) + + if self.neg_label >= self.pos_label: + raise ValueError( + f"neg_label={self.neg_label} must be strictly less than " + f"pos_label={self.pos_label}." + ) + + if self.sparse_output and (self.pos_label == 0 or self.neg_label != 0): + raise ValueError( + "Sparse binarization is only supported with non " + "zero pos_label and zero neg_label, got " + f"pos_label={self.pos_label} and neg_label={self.neg_label}" + ) + + self.y_type_ = type_of_target(y, input_name="y") + if "multioutput" in self.y_type_: raise ValueError( "Multioutput target data is not supported with label binarization" @@ -475,7 +476,9 @@ def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False) if not isinstance(y, list): # XXX Workaround that will be removed when list of list format is # dropped - y = check_array(y, accept_sparse="csr", ensure_2d=False, dtype=None) + y = check_array( + y, input_name="y", accept_sparse="csr", ensure_2d=False, dtype=None + ) else: if _num_samples(y) == 0: raise ValueError("y has 0 samples: %r" % y) diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index a053332619e39..fa8240893f7c3 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -3,6 +3,7 @@ import scipy.sparse as sp import warnings +from sklearn import clone from sklearn.preprocessing import KBinsDiscretizer from sklearn.preprocessing import OneHotEncoder from sklearn.utils._testing import ( @@ -37,16 +38,16 @@ def test_valid_n_bins(): def test_invalid_n_bins(): est = KBinsDiscretizer(n_bins=1) err_msg = ( - "KBinsDiscretizer received an invalid " - "number of bins. Received 1, expected at least 2." + "KBinsDiscretizer received an invalid number of bins. Received 1, expected at" + " least 2." ) with pytest.raises(ValueError, match=err_msg): est.fit_transform(X) est = KBinsDiscretizer(n_bins=1.1) err_msg = ( - "KBinsDiscretizer received an invalid " - "n_bins type. Received float, expected int." 
+ "KBinsDiscretizer received an invalid n_bins type. Received float, expected" + " int." ) with pytest.raises(ValueError, match=err_msg): est.fit_transform(X) @@ -357,3 +358,80 @@ def test_32_equal_64(input_dtype, encode): Xt_64 = kbd_64.transform(X_input) assert_allclose_dense_sparse(Xt_32, Xt_64) + + +# FIXME: remove the `filterwarnings` in 1.3 +@pytest.mark.filterwarnings("ignore:In version 1.3 onwards, subsample=2e5") +@pytest.mark.parametrize("subsample", [None, "warn"]) +def test_kbinsdiscretizer_subsample_default(subsample): + # Since the size of X is small (< 2e5), subsampling will not take place. + X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1) + kbd_default = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile") + kbd_default.fit(X) + + kbd_with_subsampling = clone(kbd_default) + kbd_with_subsampling.set_params(subsample=subsample) + kbd_with_subsampling.fit(X) + + for bin_kbd_default, bin_kbd_with_subsampling in zip( + kbd_default.bin_edges_[0], kbd_with_subsampling.bin_edges_[0] + ): + np.testing.assert_allclose(bin_kbd_default, bin_kbd_with_subsampling) + assert kbd_default.bin_edges_.shape == kbd_with_subsampling.bin_edges_.shape + + +def test_kbinsdiscretizer_subsample_invalid_strategy(): + X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1) + kbd = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="uniform", subsample=3) + + err_msg = '`subsample` must be used with `strategy="quantile"`.' + with pytest.raises(ValueError, match=err_msg): + kbd.fit(X) + + +def test_kbinsdiscretizer_subsample_invalid_type(): + X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1) + kbd = KBinsDiscretizer( + n_bins=10, encode="ordinal", strategy="quantile", subsample="full" + ) + + msg = ( + "subsample must be an instance of , not " + "." + ) + with pytest.raises(TypeError, match=msg): + kbd.fit(X) + + +# TODO: Remove in 1.3 +def test_kbinsdiscretizer_subsample_warn(): + X = np.random.rand(200001, 1).reshape(-1, 1) + kbd = KBinsDiscretizer(n_bins=100, encode="ordinal", strategy="quantile") + + msg = "In version 1.3 onwards, subsample=2e5 will be used by default." + with pytest.warns(FutureWarning, match=msg): + kbd.fit(X) + + +@pytest.mark.parametrize("subsample", [0, int(2e5)]) +def test_kbinsdiscretizer_subsample_values(subsample): + X = np.random.rand(220000, 1).reshape(-1, 1) + kbd_default = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile") + + kbd_with_subsampling = clone(kbd_default) + kbd_with_subsampling.set_params(subsample=subsample) + + if subsample == 0: + with pytest.raises(ValueError, match="subsample == 0, must be >= 1."): + kbd_with_subsampling.fit(X) + else: + # TODO: Remove in 1.3 + msg = "In version 1.3 onwards, subsample=2e5 will be used by default." + with pytest.warns(FutureWarning, match=msg): + kbd_default.fit(X) + + kbd_with_subsampling.fit(X) + assert not np.all( + kbd_default.bin_edges_[0] == kbd_with_subsampling.bin_edges_[0] + ) + assert kbd_default.bin_edges_.shape == kbd_with_subsampling.bin_edges_.shape diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 5142144bcb881..a59cd9b152d27 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -124,25 +124,37 @@ def test_label_binarizer_errors(): lb = LabelBinarizer().fit(one_class) multi_label = [(2, 3), (0,), (0, 2)] - with pytest.raises(ValueError): + err_msg = "You appear to be using a legacy multi-label data representation." 
+ with pytest.raises(ValueError, match=err_msg): lb.transform(multi_label) lb = LabelBinarizer() - with pytest.raises(ValueError): + err_msg = "This LabelBinarizer instance is not fitted yet" + with pytest.raises(ValueError, match=err_msg): lb.transform([]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=err_msg): lb.inverse_transform([]) - with pytest.raises(ValueError): - LabelBinarizer(neg_label=2, pos_label=1) - with pytest.raises(ValueError): - LabelBinarizer(neg_label=2, pos_label=2) - - with pytest.raises(ValueError): - LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True) + input_labels = [0, 1, 0, 1] + err_msg = "neg_label=2 must be strictly less than pos_label=1." + lb = LabelBinarizer(neg_label=2, pos_label=1) + with pytest.raises(ValueError, match=err_msg): + lb.fit(input_labels) + err_msg = "neg_label=2 must be strictly less than pos_label=2." + lb = LabelBinarizer(neg_label=2, pos_label=2) + with pytest.raises(ValueError, match=err_msg): + lb.fit(input_labels) + err_msg = ( + "Sparse binarization is only supported with non zero pos_label and zero " + "neg_label, got pos_label=2 and neg_label=1" + ) + lb = LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True) + with pytest.raises(ValueError, match=err_msg): + lb.fit(input_labels) # Fail on y_type - with pytest.raises(ValueError): + err_msg = "foo format is not supported" + with pytest.raises(ValueError, match=err_msg): _inverse_binarize_thresholding( y=csr_matrix([[1, 2], [2, 1]]), output_type="foo", @@ -152,11 +164,13 @@ def test_label_binarizer_errors(): # Sequence of seq type should raise ValueError y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]] - with pytest.raises(ValueError): + err_msg = "You appear to be using a legacy multi-label data representation" + with pytest.raises(ValueError, match=err_msg): LabelBinarizer().fit_transform(y_seq_of_seqs) # Fail on the number of classes - with pytest.raises(ValueError): + err_msg = "The number of class is not equal to the number of dimension of y." 
+ with pytest.raises(ValueError, match=err_msg): _inverse_binarize_thresholding( y=csr_matrix([[1, 2], [2, 1]]), output_type="foo", @@ -165,7 +179,8 @@ def test_label_binarizer_errors(): ) # Fail on the dimension of 'binary' - with pytest.raises(ValueError): + err_msg = "output_type='binary', but y.shape" + with pytest.raises(ValueError, match=err_msg): _inverse_binarize_thresholding( y=np.array([[1, 2, 3], [2, 1, 3]]), output_type="binary", @@ -174,9 +189,10 @@ def test_label_binarizer_errors(): ) # Fail on multioutput data - with pytest.raises(ValueError): + err_msg = "Multioutput target data is not supported with label binarization" + with pytest.raises(ValueError, match=err_msg): LabelBinarizer().fit(np.array([[1, 3], [2, 1]])) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=err_msg): label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3]) diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 6b2c9217713e0..79ebe46042379 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -34,6 +34,7 @@ import scipy.sparse as sp from .base import BaseEstimator, TransformerMixin +from .base import _ClassNamePrefixFeaturesOutMixin from .utils import check_random_state from .utils.extmath import safe_sparse_dot @@ -290,7 +291,9 @@ def _sparse_random_matrix(n_components, n_features, density="auto", random_state return np.sqrt(1 / density) / np.sqrt(n_components) * components -class BaseRandomProjection(TransformerMixin, BaseEstimator, metaclass=ABCMeta): +class BaseRandomProjection( + TransformerMixin, BaseEstimator, _ClassNamePrefixFeaturesOutMixin, metaclass=ABCMeta +): """Base class for random projections. Warning: This class should not be used directly. @@ -420,6 +423,14 @@ def transform(self, X): X_new = safe_sparse_dot(X, self.components_.T, dense_output=self.dense_output) return X_new + @property + def _n_features_out(self): + """Number of transformed output features. + + Used by _ClassNamePrefixFeaturesOutMixin.get_feature_names_out. + """ + return self.n_components + class GaussianRandomProjection(BaseRandomProjection): """Reduce dimensionality through Gaussian random projection. @@ -487,11 +498,11 @@ class GaussianRandomProjection(BaseRandomProjection): >>> import numpy as np >>> from sklearn.random_projection import GaussianRandomProjection >>> rng = np.random.RandomState(42) - >>> X = rng.rand(100, 10000) + >>> X = rng.rand(25, 3000) >>> transformer = GaussianRandomProjection(random_state=rng) >>> X_new = transformer.fit_transform(X) >>> X_new.shape - (100, 3947) + (25, 2759) """ def __init__(self, n_components="auto", *, eps=0.1, random_state=None): @@ -637,14 +648,14 @@ class SparseRandomProjection(BaseRandomProjection): >>> import numpy as np >>> from sklearn.random_projection import SparseRandomProjection >>> rng = np.random.RandomState(42) - >>> X = rng.rand(100, 10000) + >>> X = rng.rand(25, 3000) >>> transformer = SparseRandomProjection(random_state=rng) >>> X_new = transformer.fit_transform(X) >>> X_new.shape - (100, 3947) + (25, 2759) >>> # very few components are non-zero >>> np.mean(transformer.components_ != 0) - 0.0100... + 0.0182... 
""" def __init__( diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 7f0e628aab70c..259b1b28c29be 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -1,8 +1,10 @@ -import numpy as np -import scipy.sparse as sp import warnings +import numbers from abc import ABCMeta, abstractmethod +import numpy as np +import scipy.sparse as sp + # mypy error: error: Module 'sklearn.svm' has no attribute '_libsvm' # (and same for other imports) from . import _libsvm as libsvm # type: ignore @@ -98,13 +100,6 @@ def __init__( "impl should be one of %s, %s was given" % (LIBSVM_IMPL, self._impl) ) - if gamma == 0: - msg = ( - "The gamma value of 0.0 is invalid. Use 'auto' to set" - " gamma to a value of 1 / n_features." - ) - raise ValueError(msg) - self.kernel = kernel self.degree = degree self.gamma = gamma @@ -242,10 +237,23 @@ def fit(self, X, y, sample_weight=None): else: raise ValueError( "When 'gamma' is a string, it should be either 'scale' or " - "'auto'. Got '{}' instead.".format(self.gamma) + f"'auto'. Got '{self.gamma!r}' instead." ) - else: + elif isinstance(self.gamma, numbers.Real): + if self.gamma <= 0: + msg = ( + f"gamma value must be > 0; {self.gamma!r} is invalid. Use" + " a positive number or use 'auto' to set gamma to a" + " value of 1 / n_features." + ) + raise ValueError(msg) self._gamma = self.gamma + else: + msg = ( + "The gamma value should be set to 'scale', 'auto' or a" + f" positive float value. {self.gamma!r} is not a valid option" + ) + raise ValueError(msg) fit = self._sparse_fit if self._sparse else self._dense_fit if self.verbose: diff --git a/sklearn/svm/_libsvm.pyx b/sklearn/svm/_libsvm.pyx index 9488bda4ccf58..9186f0fcf7e29 100644 --- a/sklearn/svm/_libsvm.pyx +++ b/sklearn/svm/_libsvm.pyx @@ -18,10 +18,7 @@ where no sort of memory checks are done. Notes ----- -Maybe we could speed it a bit further by decorating functions with -@cython.boundscheck(False), but probably it is not worth since all -work is done in lisvm_helper.c -Also, the signature mode='c' is somewhat superficial, since we already +The signature mode='c' is somewhat superficial, since we already check that arrays are C-contiguous in svm.py Authors diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index eb15aca0096b8..67eabf9fc8d2c 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -695,11 +695,47 @@ def test_bad_input(): (svm.OneClassSVM, datasets.load_iris(return_X_y=True)), ], ) -def test_svm_gamma_error(Estimator, data): +@pytest.mark.parametrize( + "gamma, err_msg", + [ + ( + "auto_deprecated", + "When 'gamma' is a string, it should be either 'scale' or 'auto'", + ), + ( + -1, + "gamma value must be > 0; -1 is invalid. Use" + " a positive number or use 'auto' to set gamma to a" + " value of 1 / n_features.", + ), + ( + 0.0, + "gamma value must be > 0; 0.0 is invalid. Use" + " a positive number or use 'auto' to set gamma to a" + " value of 1 / n_features.", + ), + ( + np.array([1.0, 4.0]), + "The gamma value should be set to 'scale'," + f" 'auto' or a positive float value. {np.array([1.0, 4.0])!r}" + " is not a valid option", + ), + ( + [], + "The gamma value should be set to 'scale', 'auto' or a positive" + f" float value. {[]} is not a valid option", + ), + ( + {}, + "The gamma value should be set to 'scale', 'auto' or a positive" + " float value. 
{} is not a valid option", + ), + ], +) +def test_svm_gamma_error(Estimator, data, gamma, err_msg): X, y = data - est = Estimator(gamma="auto_deprecated") - err_msg = "When 'gamma' is a string, it should be either 'scale' or 'auto'" - with pytest.raises(ValueError, match=err_msg): + est = Estimator(gamma=gamma) + with pytest.raises(ValueError, match=(re.escape(err_msg))): est.fit(X, y) @@ -1204,24 +1240,26 @@ def test_svc_ovr_tie_breaking(SVCClass): """Test if predict breaks ties in OVR mode. Related issue: https://github.com/scikit-learn/scikit-learn/issues/8277 """ - X, y = make_blobs(random_state=27) + X, y = make_blobs(random_state=0, n_samples=20, n_features=2) - xs = np.linspace(X[:, 0].min(), X[:, 0].max(), 1000) - ys = np.linspace(X[:, 1].min(), X[:, 1].max(), 1000) + xs = np.linspace(X[:, 0].min(), X[:, 0].max(), 100) + ys = np.linspace(X[:, 1].min(), X[:, 1].max(), 100) xx, yy = np.meshgrid(xs, ys) + common_params = dict( + kernel="rbf", gamma=1e6, random_state=42, decision_function_shape="ovr" + ) svm = SVCClass( - kernel="linear", - decision_function_shape="ovr", break_ties=False, - random_state=42, + **common_params, ).fit(X, y) pred = svm.predict(np.c_[xx.ravel(), yy.ravel()]) dv = svm.decision_function(np.c_[xx.ravel(), yy.ravel()]) assert not np.all(pred == np.argmax(dv, axis=1)) svm = SVCClass( - kernel="linear", decision_function_shape="ovr", break_ties=True, random_state=42 + break_ties=True, + **common_params, ).fit(X, y) pred = svm.predict(np.c_[xx.ravel(), yy.ravel()]) dv = svm.decision_function(np.c_[xx.ravel(), yy.ravel()]) diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index 4ad983d72e007..ee7214bf224e8 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -591,22 +591,6 @@ def test_calibration_inconsistent_prefit_n_features_in(): calib_clf.fit(X[:, :3], y) -# FIXME: remove in 1.1 -def test_calibrated_classifier_cv_deprecation(data): - # Check that we raise the proper deprecation warning if accessing - # `calibrators_` from the `_CalibratedClassifier`. - X, y = data - calib_clf = CalibratedClassifierCV(cv=2).fit(X, y) - - with pytest.warns(FutureWarning): - calibrators = calib_clf.calibrated_classifiers_[0].calibrators_ - - for clf1, clf2 in zip( - calibrators, calib_clf.calibrated_classifiers_[0].calibrators - ): - assert clf1 is clf2 - - def test_calibration_votingclassifier(): # Check that `CalibratedClassifier` works with `VotingClassifier`. 
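A short sketch of how the stricter `gamma` validation above surfaces to users, assuming this changeset; the dataset is only there to trigger `fit`:

    from sklearn.datasets import load_iris
    from sklearn.svm import SVC

    X, y = load_iris(return_X_y=True)
    try:
        # Validation now happens in fit rather than __init__.
        SVC(gamma=0.0).fit(X, y)
    except ValueError as exc:
        # Expected: "gamma value must be > 0; 0.0 is invalid. Use a positive
        # number or use 'auto' to set gamma to a value of 1 / n_features."
        print(exc)
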
# The method `predict_proba` from `VotingClassifier` is dynamically @@ -703,8 +687,8 @@ def test_calibration_display_compute(pyplot, iris_data_binary, n_bins, strategy) assert isinstance(viz.ax_, mpl.axes.Axes) assert isinstance(viz.figure_, mpl.figure.Figure) - assert viz.ax_.get_xlabel() == "Mean predicted probability" - assert viz.ax_.get_ylabel() == "Fraction of positives" + assert viz.ax_.get_xlabel() == "Mean predicted probability (Positive class: 1)" + assert viz.ax_.get_ylabel() == "Fraction of positives (Positive class: 1)" assert viz.line_.get_label() == "LogisticRegression" @@ -823,6 +807,34 @@ def test_calibration_curve_pos_label(dtype_y_str): assert_allclose(prob_true, [0, 0, 0.5, 1]) +@pytest.mark.parametrize("pos_label, expected_pos_label", [(None, 1), (0, 0), (1, 1)]) +def test_calibration_display_pos_label( + pyplot, iris_data_binary, pos_label, expected_pos_label +): + """Check the behaviour of `pos_label` in the `CalibrationDisplay`.""" + X, y = iris_data_binary + + lr = LogisticRegression().fit(X, y) + viz = CalibrationDisplay.from_estimator(lr, X, y, pos_label=pos_label) + + y_prob = lr.predict_proba(X)[:, expected_pos_label] + prob_true, prob_pred = calibration_curve(y, y_prob, pos_label=pos_label) + + assert_allclose(viz.prob_true, prob_true) + assert_allclose(viz.prob_pred, prob_pred) + assert_allclose(viz.y_prob, y_prob) + + assert ( + viz.ax_.get_xlabel() + == f"Mean predicted probability (Positive class: {expected_pos_label})" + ) + assert ( + viz.ax_.get_ylabel() + == f"Fraction of positives (Positive class: {expected_pos_label})" + ) + assert viz.line_.get_label() == "LogisticRegression" + + @pytest.mark.parametrize("method", ["sigmoid", "isotonic"]) @pytest.mark.parametrize("ensemble", [True, False]) def test_calibrated_classifier_cv_double_sample_weights_equivalence(method, ensemble): diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index e06a060546713..2ac46ea6ca003 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -11,7 +11,7 @@ import sys import re import pkgutil -from inspect import isgenerator, signature +from inspect import isgenerator, signature, Parameter from itertools import product, chain from functools import partial @@ -363,16 +363,13 @@ def test_pandas_column_name_consistency(estimator): GET_FEATURES_OUT_MODULES_TO_IGNORE = [ "cluster", "cross_decomposition", - "decomposition", "discriminant_analysis", - "ensemble", "isotonic", "kernel_approximation", "preprocessing", "manifold", "neighbors", "neural_network", - "random_projection", ] @@ -412,16 +409,8 @@ def test_transformers_get_feature_names_out(transformer): "FeatureUnion", "GridSearchCV", "HalvingGridSearchCV", - "KernelPCA", - "LabelBinarizer", - "NuSVC", - "NuSVR", - "OneClassSVM", "Pipeline", - "RadiusNeighborsClassifier", "SGDOneClassSVM", - "SVC", - "SVR", "TheilSenRegressor", "TweedieRegressor", ] @@ -434,7 +423,14 @@ def test_transformers_get_feature_names_out(transformer): ) def test_estimators_do_not_raise_errors_in_init_or_set_params(Estimator): """Check that init or set_param does not raise errors.""" - params = signature(Estimator).parameters + + # Remove parameters with **kwargs by filtering out Parameter.VAR_KEYWORD + # TODO: Remove in 1.2 when **kwargs is removed in RadiusNeighborsClassifier + params = [ + name + for name, param in signature(Estimator).parameters.items() + if param.kind != Parameter.VAR_KEYWORD + ] smoke_test_values = [-1, 3.0, "helloworld", np.array([1.0, 4.0]), {}, []] for value in smoke_test_values: 
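The `Parameter.VAR_KEYWORD` filter added above is a standard `inspect` idiom; a self-contained sketch (the `Example` class is hypothetical):

    from inspect import Parameter, signature

    class Example:
        def __init__(self, alpha=1.0, **kwargs):
            self.alpha = alpha

    # Keep only named constructor parameters; the **kwargs catch-all is dropped.
    params = [
        name
        for name, param in signature(Example).parameters.items()
        if param.kind != Parameter.VAR_KEYWORD
    ]
    print(params)  # ['alpha']
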
diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index de1ac9b1c04dd..69617d5fc4d68 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -246,10 +246,6 @@ def test_fit_docstring_attributes(name, Estimator): ): est.set_params(normalize=False) - # FIXME: TO BE REMOVED for 1.1 (avoid FutureWarning) - if Estimator.__name__ == "NMF": - est.set_params(init="nndsvda") - # FIXME: TO BE REMOVED for 1.2 (avoid FutureWarning) if Estimator.__name__ == "TSNE": est.set_params(learning_rate=200.0, init="random") @@ -257,6 +253,10 @@ def test_fit_docstring_attributes(name, Estimator): # For PLS, TODO remove in 1.1 skipped_attributes = {"x_scores_", "y_scores_"} + # FIXME: TO BE REMOVED for 1.3 (avoid FutureWarning) + if Estimator.__name__ == "FastICA": + est.set_params(whiten="unit-variance") + if Estimator.__name__.endswith("Vectorizer"): # Vectorizer require some specific input data if Estimator.__name__ in ( diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 54e21508fdb72..77201628957e0 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -216,7 +216,10 @@ def test_pipeline_init(): repr(pipe) # Check that params are not set when naming them wrong - msg = "Invalid parameter C for estimator SelectKBest" + msg = re.escape( + "Invalid parameter 'C' for estimator SelectKBest(). Valid parameters are: ['k'," + " 'score_func']." + ) with pytest.raises(ValueError, match=msg): pipe.set_params(anova__C=0.1) @@ -316,18 +319,26 @@ def test_pipeline_raise_set_params_error(): # expected error message error_msg = re.escape( - f"Invalid parameter fake for estimator {pipe}. " - "Check the list of available parameters " - "with `estimator.get_params().keys()`." + "Invalid parameter 'fake' for estimator Pipeline(steps=[('cls'," + " LinearRegression())]). Valid parameters are: ['memory', 'steps', 'verbose']." ) - with pytest.raises(ValueError, match=error_msg): pipe.set_params(fake="nope") - # nested model check + # invalid outer parameter name for compound parameter: the expected error message + # is the same as above. with pytest.raises(ValueError, match=error_msg): pipe.set_params(fake__estimator="nope") + # expected error message for invalid inner parameter + error_msg = re.escape( + "Invalid parameter 'invalid_param' for estimator LinearRegression(). Valid" + " parameters are: ['copy_X', 'fit_intercept', 'n_jobs', 'normalize'," + " 'positive']." + ) + with pytest.raises(ValueError, match=error_msg): + pipe.set_params(cls__invalid_param="nope") + def test_pipeline_methods_pca_svm(): # Test the various methods of the pipeline (pca + svm). diff --git a/sklearn/tests/test_random_projection.py b/sklearn/tests/test_random_projection.py index 5866fde29d73b..1e894d906a3ad 100644 --- a/sklearn/tests/test_random_projection.py +++ b/sklearn/tests/test_random_projection.py @@ -24,7 +24,7 @@ all_SparseRandomProjection: List[Any] = [SparseRandomProjection] all_DenseRandomProjection: List[Any] = [GaussianRandomProjection] -all_RandomProjection = set(all_SparseRandomProjection + all_DenseRandomProjection) +all_RandomProjection = all_SparseRandomProjection + all_DenseRandomProjection # Make some random data with uniformly located non zero entries with @@ -359,3 +359,17 @@ def test_johnson_lindenstrauss_min_dim(): Regression test for #17111: before #19374, 32-bit systems would fail. 
""" assert johnson_lindenstrauss_min_dim(100, eps=1e-5) == 368416070986 + + +@pytest.mark.parametrize("random_projection_cls", all_RandomProjection) +def test_random_projection_feature_names_out(random_projection_cls): + random_projection = random_projection_cls(n_components=2) + random_projection.fit(data) + names_out = random_projection.get_feature_names_out() + class_name_lower = random_projection_cls.__name__.lower() + expected_names_out = np.array( + [f"{class_name_lower}{i}" for i in range(random_projection.n_components_)], + dtype=object, + ) + + assert_array_equal(names_out, expected_names_out) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index c59fcda975a9d..3cd0e000bd4dd 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -147,9 +147,7 @@ def get_n_leaves(self): check_is_fitted(self) return self.tree_.n_leaves - def fit( - self, X, y, sample_weight=None, check_input=True, X_idx_sorted="deprecated" - ): + def fit(self, X, y, sample_weight=None, check_input=True): random_state = check_random_state(self.random_state) @@ -335,16 +333,6 @@ def fit( if self.min_impurity_decrease < 0.0: raise ValueError("min_impurity_decrease must be greater than or equal to 0") - # TODO: Remove in 1.1 - if X_idx_sorted != "deprecated": - warnings.warn( - "The parameter 'X_idx_sorted' is deprecated and has no " - "effect. It will be removed in 1.1 (renaming of 0.26). You " - "can suppress this warning by not passing any value to the " - "'X_idx_sorted' parameter.", - FutureWarning, - ) - # Build tree criterion = self.criterion if not isinstance(criterion, Criterion): @@ -896,9 +884,7 @@ def __init__( ccp_alpha=ccp_alpha, ) - def fit( - self, X, y, sample_weight=None, check_input=True, X_idx_sorted="deprecated" - ): + def fit(self, X, y, sample_weight=None, check_input=True): """Build a decision tree classifier from the training set (X, y). Parameters @@ -922,12 +908,6 @@ def fit( Allow to bypass several input checking. Don't use this parameter unless you know what you do. - X_idx_sorted : deprecated, default="deprecated" - This parameter is deprecated and has no effect. - It will be removed in 1.1 (renaming of 0.26). - - .. deprecated:: 0.24 - Returns ------- self : DecisionTreeClassifier @@ -939,7 +919,6 @@ def fit( y, sample_weight=sample_weight, check_input=check_input, - X_idx_sorted=X_idx_sorted, ) return self @@ -1275,9 +1254,7 @@ def __init__( ccp_alpha=ccp_alpha, ) - def fit( - self, X, y, sample_weight=None, check_input=True, X_idx_sorted="deprecated" - ): + def fit(self, X, y, sample_weight=None, check_input=True): """Build a decision tree regressor from the training set (X, y). Parameters @@ -1300,12 +1277,6 @@ def fit( Allow to bypass several input checking. Don't use this parameter unless you know what you do. - X_idx_sorted : deprecated, default="deprecated" - This parameter is deprecated and has no effect. - It will be removed in 1.1 (renaming of 0.26). - - .. 
deprecated:: 0.24 - Returns ------- self : DecisionTreeRegressor @@ -1317,7 +1288,6 @@ def fit( y, sample_weight=sample_weight, check_input=check_input, - X_idx_sorted=X_idx_sorted, ) return self diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index db8a3cb821df3..2c115d0bd6ea1 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1,7 +1,3 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False - # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index d7b5e81c7b8f7..35ce58dce26ac 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -1,7 +1,3 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False - # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index c8c58f12ffd3a..84fb808318a49 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1,7 +1,3 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False - # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index e6552debd3149..b80e7825ee6ab 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -1,7 +1,3 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False - # Authors: Gilles Louppe # Peter Prettenhofer # Arnaud Joly diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index cee55d2c40d8d..47dd4475e2feb 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -5,6 +5,8 @@ import pickle from itertools import product import struct +import io +import copyreg import pytest import numpy as np @@ -13,6 +15,9 @@ from scipy.sparse import csr_matrix from scipy.sparse import coo_matrix +import joblib +from joblib.numpy_pickle import NumpyPickler + from sklearn.random_projection import _sparse_random_matrix from sklearn.dummy import DummyRegressor @@ -32,6 +37,7 @@ from sklearn.utils.estimator_checks import check_sample_weights_invariance from sklearn.utils.validation import check_random_state +from sklearn.utils import parse_version from sklearn.exceptions import NotFittedError @@ -2132,21 +2138,6 @@ def test_decision_tree_regressor_sample_weight_consistentcy(criterion): assert_allclose(tree1.predict(X), tree2.predict(X)) -# TODO: Remove in v1.1 -@pytest.mark.parametrize( - "TreeEstimator", [DecisionTreeClassifier, DecisionTreeRegressor] -) -def test_X_idx_sorted_deprecated(TreeEstimator): - X_idx_sorted = np.argsort(X, axis=0) - - tree = TreeEstimator() - - with pytest.warns( - FutureWarning, match="The parameter 'X_idx_sorted' is deprecated" - ): - tree.fit(X, y, X_idx_sorted=X_idx_sorted) - - # TODO: Remove in v1.2 @pytest.mark.parametrize("Tree", REG_TREES.values()) @pytest.mark.parametrize( @@ -2179,3 +2170,58 @@ def test_n_features_deprecated(Tree): with pytest.warns(FutureWarning, match=depr_msg): Tree().fit(X, y).n_features_ + + +def test_different_endianness_pickle(): + X, y = datasets.make_classification(random_state=0) + + clf = DecisionTreeClassifier(random_state=0, max_depth=3) + clf.fit(X, y) + score = clf.score(X, y) + + def reduce_ndarray(arr): + return arr.byteswap().newbyteorder().__reduce__() + + def get_pickle_non_native_endianness(): + f = io.BytesIO() + p = pickle.Pickler(f) + p.dispatch_table = copyreg.dispatch_table.copy() + 
p.dispatch_table[np.ndarray] = reduce_ndarray + + p.dump(clf) + f.seek(0) + return f + + new_clf = pickle.load(get_pickle_non_native_endianness()) + new_score = new_clf.score(X, y) + assert np.isclose(score, new_score) + + +@pytest.mark.skipif( + parse_version(joblib.__version__) < parse_version("1.1"), + reason="joblib >= 1.1 is needed to load numpy arrays in native endianness", +) +def test_different_endianness_joblib_pickle(): + X, y = datasets.make_classification(random_state=0) + + clf = DecisionTreeClassifier(random_state=0, max_depth=3) + clf.fit(X, y) + score = clf.score(X, y) + + class NonNativeEndiannessNumpyPickler(NumpyPickler): + def save(self, obj): + if isinstance(obj, np.ndarray): + obj = obj.byteswap().newbyteorder() + super().save(obj) + + def get_joblib_pickle_non_native_endianness(): + f = io.BytesIO() + p = NonNativeEndiannessNumpyPickler(f) + + p.dump(clf) + f.seek(0) + return f + + new_clf = joblib.load(get_joblib_pickle_non_native_endianness()) + new_score = new_clf.score(X, y) + assert np.isclose(score, new_score) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 8290318d35deb..3d8a1ca87d210 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -15,6 +15,7 @@ import struct import timeit from pathlib import Path +from contextlib import suppress import warnings import numpy as np @@ -986,6 +987,30 @@ def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None): return chunk_n_rows +def _is_pandas_na(x): + """Test if x is pandas.NA. + + We intentionally do not use this function to return `True` for `pd.NA` in + `is_scalar_nan`, because estimators that support `pd.NA` are the exception + rather than the rule at the moment. When `pd.NA` is more universally + supported, we may reconsider this decision. + + Parameters + ---------- + x : any type + + Returns + ------- + boolean + """ + with suppress(ImportError): + from pandas import NA + + return x is NA + + return False + + def is_scalar_nan(x): """Tests if x is NaN. @@ -1108,8 +1133,7 @@ def check_matplotlib_support(caller_name): def check_pandas_support(caller_name): - """Raise ImportError with detailed error message if pandas is not - installed. + """Raise ImportError with detailed error message if pandas is not installed. Plot utilities like :func:`fetch_openml` should lazily import pandas and call this helper before any computation. @@ -1118,6 +1142,11 @@ def check_pandas_support(caller_name): ---------- caller_name : str The name of the caller that requires pandas. + + Returns + ------- + pandas + The pandas package. """ try: import pandas # noqa diff --git a/sklearn/utils/_estimator_html_repr.py b/sklearn/utils/_estimator_html_repr.py index 1f5c339c12771..c75545daf7da9 100644 --- a/sklearn/utils/_estimator_html_repr.py +++ b/sklearn/utils/_estimator_html_repr.py @@ -69,6 +69,7 @@ def _write_label_html( name = html.escape(name) if name_details is not None: + name_details = html.escape(str(name_details)) checked_str = "checked" if checked else "" est_id = uuid.uuid4() out.write( @@ -354,7 +355,7 @@ def estimator_html_repr(estimator): ) out.write( f"" - f'
' + f'
' '
' f"
{html.escape(estimator_str)}
{fallback_msg}" "
" diff --git a/sklearn/utils/_fast_dict.pyx b/sklearn/utils/_fast_dict.pyx index 719cafc3cc8c1..2bbf2dcfa667c 100644 --- a/sklearn/utils/_fast_dict.pyx +++ b/sklearn/utils/_fast_dict.pyx @@ -38,8 +38,6 @@ np.import_array() cdef class IntFloatDict: - @cython.boundscheck(False) - @cython.wraparound(False) def __init__(self, np.ndarray[ITYPE_t, ndim=1] keys, np.ndarray[DTYPE_t, ndim=1] values): cdef int i diff --git a/sklearn/utils/_logistic_sigmoid.pyx b/sklearn/utils/_logistic_sigmoid.pyx index 3531d99bc4f44..c2ba685dbfcbd 100644 --- a/sklearn/utils/_logistic_sigmoid.pyx +++ b/sklearn/utils/_logistic_sigmoid.pyx @@ -1,7 +1,3 @@ -#cython: boundscheck=False -#cython: cdivision=True -#cython: wraparound=False - from libc.math cimport log, exp import numpy as np diff --git a/sklearn/utils/_mask.py b/sklearn/utils/_mask.py index 699a2c1cc1725..d57cf839d962f 100644 --- a/sklearn/utils/_mask.py +++ b/sklearn/utils/_mask.py @@ -1,11 +1,20 @@ import numpy as np from scipy import sparse as sp +from contextlib import suppress from . import is_scalar_nan from .fixes import _object_dtype_isnan def _get_dense_mask(X, value_to_mask): + with suppress(ImportError, AttributeError): + # We also suppress `AttributeError` because older versions of pandas do + # not have `NA`. + import pandas + + if value_to_mask is pandas.NA: + return pandas.isna(X) + if is_scalar_nan(value_to_mask): if X.dtype.kind == "f": Xt = np.isnan(X) diff --git a/sklearn/utils/_random.pyx b/sklearn/utils/_random.pyx index 3caf062079211..be8ef0752ddd3 100644 --- a/sklearn/utils/_random.pyx +++ b/sklearn/utils/_random.pyx @@ -1,6 +1,3 @@ -# cython: boundscheck=False -# cython: wraparound=False -# # Author: Arnaud Joly # # License: BSD 3 clause @@ -278,7 +275,7 @@ cpdef sample_without_replacement(np.int_t n_population, all_methods = ("auto", "tracking_selection", "reservoir_sampling", "pool") - ratio = n_samples / n_population if n_population != 0.0 else 1.0 + ratio = n_samples / n_population if n_population != 0.0 else 1.0 # Check ratio and use permutation unless ratio < 0.01 or ratio > 0.99 if method == "auto" and ratio > 0.01 and ratio < 0.99: diff --git a/sklearn/utils/_seq_dataset.pyx.tp b/sklearn/utils/_seq_dataset.pyx.tp index 8bc901194a24e..9115f80c5265d 100644 --- a/sklearn/utils/_seq_dataset.pyx.tp +++ b/sklearn/utils/_seq_dataset.pyx.tp @@ -1,6 +1,3 @@ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False {{py: """ diff --git a/sklearn/utils/_show_versions.py b/sklearn/utils/_show_versions.py index 1f443ff765bd8..ca01fd1909f57 100644 --- a/sklearn/utils/_show_versions.py +++ b/sklearn/utils/_show_versions.py @@ -8,6 +8,8 @@ import platform import sys import importlib +from ..utils.fixes import threadpool_info + from ._openmp_helpers import _openmp_parallelism_enabled @@ -95,3 +97,15 @@ def show_versions(): k="Built with OpenMP", stat=_openmp_parallelism_enabled() ) ) + + # show threadpoolctl results + threadpool_results = threadpool_info() + if threadpool_results: + print() + print("threadpoolctl info:") + + for i, result in enumerate(threadpool_results): + for key, val in result.items(): + print(f"{key:>15}: {val}") + if i != len(threadpool_results) - 1: + print() diff --git a/sklearn/utils/_weight_vector.pyx.tp b/sklearn/utils/_weight_vector.pyx.tp index 186f391f8a955..ca552a1dff29e 100644 --- a/sklearn/utils/_weight_vector.pyx.tp +++ b/sklearn/utils/_weight_vector.pyx.tp @@ -18,11 +18,6 @@ dtypes = [('64', 'double', 1e-9), }} -# cython: language_level=3 -# cython: cdivision=True -# cython: 
boundscheck=False -# cython: wraparound=False -# cython: initializedcheck=False # cython: binding=False # # Author: Peter Prettenhofer diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index ccc6ff23ed8fc..5c32b7b8fc36e 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -168,17 +168,33 @@ def check_supervised_y_no_nan(name, estimator_orig): estimator = clone(estimator_orig) rng = np.random.RandomState(888) X = rng.randn(10, 5) - y = np.full(10, np.inf) - y = _enforce_estimator_tags_y(estimator, y) - match = ( - "Input contains NaN, infinity or a value too large for " r"dtype\('float64'\)." - ) - err_msg = ( - f"Estimator {name} should have raised error on fitting array y with NaN value." - ) - with raises(ValueError, match=match, err_msg=err_msg): - estimator.fit(X, y) + for value in [np.nan, np.inf]: + y = np.full(10, value) + y = _enforce_estimator_tags_y(estimator, y) + + module_name = estimator.__module__ + if module_name.startswith("sklearn.") and not ( + "test_" in module_name or module_name.endswith("_testing") + ): + # In scikit-learn we want the error message to mention the input + # name and be specific about the kind of unexpected value. + if np.isinf(value): + match = ( + r"Input (y|Y) contains infinity or a value too large for" + r" dtype\('float64'\)." + ) + else: + match = r"Input (y|Y) contains NaN." + else: + # Do not impose a particular error message to third-party libraries. + match = None + err_msg = ( + f"Estimator {name} should have raised error on fitting array y with inf" + " value." + ) + with raises(ValueError, match=match, err_msg=err_msg): + estimator.fit(X, y) def _yield_regressor_checks(regressor): @@ -616,8 +632,7 @@ def _set_checking_parameters(estimator): estimator.set_params(max_iter=20) # NMF if estimator.__class__.__name__ == "NMF": - # FIXME : init should be removed in 1.1 - estimator.set_params(max_iter=500, init="nndsvda") + estimator.set_params(max_iter=500) # MLP if estimator.__class__.__name__ in ["MLPClassifier", "MLPRegressor"]: estimator.set_params(max_iter=100) @@ -632,6 +647,11 @@ def _set_checking_parameters(estimator): if "n_init" in params: # K-Means estimator.set_params(n_init=2) + if name == "MeanShift": + # In the case of check_fit2d_1sample, bandwidth is set to None and + # is thus estimated. De facto it is 0.0 as a single sample is provided + # and this makes the test fails. Hence we give it a placeholder value. + estimator.set_params(bandwidth=1.0) if name == "TruncatedSVD": # TruncatedSVD doesn't run with n_components = n_features @@ -771,7 +791,7 @@ def _generate_sparse_matrix(X_csr): def check_estimator_sparse_data(name, estimator_orig): rng = np.random.RandomState(0) - X = rng.rand(40, 10) + X = rng.rand(40, 3) X[X < 0.8] = 0 X = _pairwise_estimator_convert_X(X, estimator_orig) X_csr = sparse.csr_matrix(X) @@ -1725,9 +1745,11 @@ def check_estimators_nan_inf(name, estimator_orig): y = np.ones(10) y[:5] = 0 y = _enforce_estimator_tags_y(estimator_orig, y) - error_string_fit = "Estimator doesn't check for NaN and inf in fit." - error_string_predict = "Estimator doesn't check for NaN and inf in predict." - error_string_transform = "Estimator doesn't check for NaN and inf in transform." + error_string_fit = f"Estimator {name} doesn't check for NaN and inf in fit." + error_string_predict = f"Estimator {name} doesn't check for NaN and inf in predict." + error_string_transform = ( + f"Estimator {name} doesn't check for NaN and inf in transform." 
+ ) for X_train in [X_train_nan, X_train_inf]: # catch deprecation warnings with ignore_warnings(category=FutureWarning): diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index b0499f5bf0e24..7ff8d7a8bead9 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -659,14 +659,18 @@ def cartesian(arrays, out=None): ---------- arrays : list of array-like 1-D arrays to form the cartesian product of. - out : ndarray, default=None + out : ndarray of shape (M, len(arrays)), default=None Array to place the cartesian product in. Returns ------- - out : ndarray - 2-D array of shape (M, len(arrays)) containing cartesian products - formed of input arrays. + out : ndarray of shape (M, len(arrays)) + Array containing the cartesian products formed of input arrays. + + Notes + ----- + This function may not be used on more than 32 arrays + because the underlying numpy functions do not support it. Examples -------- @@ -684,11 +688,6 @@ def cartesian(arrays, out=None): [3, 4, 7], [3, 5, 6], [3, 5, 7]]) - - Notes - ----- - This function may not be used on more than 32 arrays - because the underlying numpy functions do not support it. """ arrays = [np.asarray(x) for x in arrays] shape = (len(x) for x in arrays) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index a10c57679ab62..b17e13caf71bd 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -36,6 +36,11 @@ # mypy error: Name 'lobpcg' already defined (possibly by an import) from ..externals._lobpcg import lobpcg # type: ignore # noqa +try: + from scipy.optimize._linesearch import line_search_wolfe2, line_search_wolfe1 +except ImportError: # SciPy < 1.8 + from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1 # type: ignore # noqa + def _object_dtype_isnan(X): return X != X diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index d734ab591ce2a..4e5981042f277 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -11,7 +11,6 @@ import warnings from scipy.sparse import issparse -from scipy.sparse.base import spmatrix from scipy.sparse import dok_matrix from scipy.sparse import lil_matrix @@ -28,7 +27,9 @@ def _unique_multiclass(y): def _unique_indicator(y): - return np.arange(check_array(y, accept_sparse=["csr", "csc", "coo"]).shape[1]) + return np.arange( + check_array(y, input_name="y", accept_sparse=["csr", "csc", "coo"]).shape[1] + ) _FN_UNIQUE_LABELS = { @@ -187,7 +188,7 @@ def check_classification_targets(y): ---------- y : array-like """ - y_type = type_of_target(y) + y_type = type_of_target(y, input_name="y") if y_type not in [ "binary", "multiclass", @@ -198,7 +199,7 @@ def check_classification_targets(y): raise ValueError("Unknown label type: %r" % y_type) -def type_of_target(y): +def type_of_target(y, input_name=""): """Determine the type of data indicated by the target. Note that this type is the most specific type that can be inferred. @@ -214,6 +215,11 @@ def type_of_target(y): ---------- y : array-like + input_name : str, default="" + The data name used to construct the error message. + + .. 
versionadded:: 1.1.0 + Returns ------- target_type : str @@ -264,7 +270,7 @@ def type_of_target(y): 'multilabel-indicator' """ valid = ( - isinstance(y, (Sequence, spmatrix)) or hasattr(y, "__array__") + isinstance(y, Sequence) or issparse(y) or hasattr(y, "__array__") ) and not isinstance(y, str) if not valid: @@ -322,7 +328,7 @@ def type_of_target(y): # check float and contains non-integer float values if y.dtype.kind == "f" and np.any(y != y.astype(int)): # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] - _assert_all_finite(y) + _assert_all_finite(y, input_name=input_name) return "continuous" + suffix if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1): diff --git a/sklearn/utils/murmurhash.pyx b/sklearn/utils/murmurhash.pyx index 0bce17737a090..dc9c3da08906f 100644 --- a/sklearn/utils/murmurhash.pyx +++ b/sklearn/utils/murmurhash.pyx @@ -55,7 +55,6 @@ cpdef np.int32_t murmurhash3_bytes_s32(bytes key, unsigned int seed): return out -@cython.boundscheck(False) cpdef np.ndarray[np.uint32_t, ndim=1] murmurhash3_bytes_array_u32( np.ndarray[np.int32_t] key, unsigned int seed): """Compute 32bit murmurhash3 hashes of a key int array at seed.""" @@ -67,7 +66,6 @@ cpdef np.ndarray[np.uint32_t, ndim=1] murmurhash3_bytes_array_u32( return out -@cython.boundscheck(False) cpdef np.ndarray[np.int32_t, ndim=1] murmurhash3_bytes_array_s32( np.ndarray[np.int32_t] key, unsigned int seed): """Compute 32bit murmurhash3 hashes of a key int array at seed.""" diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index bd2ac8bdfd27d..7e9b864afe043 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -15,8 +15,8 @@ import numpy as np import warnings -from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1 +from .fixes import line_search_wolfe1, line_search_wolfe2 from ..exceptions import ConvergenceWarning diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index 09677600cbbe4..ee12730d02b2d 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -7,7 +7,6 @@ # License: BSD 3 clause #!python -# cython: boundscheck=False, wraparound=False, cdivision=True from libc.math cimport fabs, sqrt, pow cimport numpy as np diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index c4f954790cd26..691f531a07e6d 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -23,11 +23,11 @@ ) from sklearn.utils.validation import check_is_fitted from sklearn.utils.fixes import np_version, parse_version -from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import ExtraTreesClassifier from sklearn.linear_model import LinearRegression, SGDClassifier from sklearn.mixture import GaussianMixture from sklearn.cluster import MiniBatchKMeans -from sklearn.decomposition import NMF +from sklearn.decomposition import PCA from sklearn.linear_model import MultiTaskElasticNet, LogisticRegression from sklearn.svm import SVC, NuSVC from sklearn.neighbors import KNeighborsRegressor @@ -496,7 +496,7 @@ def test_check_estimator(): except ImportError: pass # check that predict does input validation (doesn't accept dicts in input) - msg = "Estimator doesn't check for NaN and inf in predict" + msg = "Estimator NoCheckinPredict doesn't check for NaN and inf in predict" with raises(AssertionError, match=msg): check_estimator(NoCheckinPredict()) # check that estimator state does not 
change @@ -603,9 +603,9 @@ def test_check_estimator_clones(): for Estimator in [ GaussianMixture, LinearRegression, - RandomForestClassifier, - NMF, SGDClassifier, + PCA, + ExtraTreesClassifier, MiniBatchKMeans, ]: with ignore_warnings(category=FutureWarning): @@ -802,7 +802,7 @@ def predict_proba(self, X): # 1. unknown output type clf = MultiLabelClassifierPredictProba(response_output=sp.csr_matrix(y_test)) err_msg = ( - r"Unknown returned type by " + "Unknown returned type .*csr_matrix.* by " r"MultiLabelClassifierPredictProba.predict_proba. A list or a Numpy " r"array is expected." ) diff --git a/sklearn/utils/tests/test_estimator_html_repr.py b/sklearn/utils/tests/test_estimator_html_repr.py index 9d474ad10fe10..90300a9bef948 100644 --- a/sklearn/utils/tests/test_estimator_html_repr.py +++ b/sklearn/utils/tests/test_estimator_html_repr.py @@ -167,36 +167,38 @@ def test_estimator_html_repr_pipeline(): html_output = estimator_html_repr(pipe) # top level estimators show estimator with changes - assert str(pipe) in html_output + assert html.escape(str(pipe)) in html_output for _, est in pipe.steps: - assert f'
{str(est)}' in html_output
+        assert (
+            '
' + html.escape(str(est))
+        ) in html_output
 
     # low level estimators do not show changes
     with config_context(print_changed_only=True):
-        assert str(num_trans["pass"]) in html_output
+        assert html.escape(str(num_trans["pass"])) in html_output
         assert "passthrough" in html_output
-        assert str(num_trans["imputer"]) in html_output
+        assert html.escape(str(num_trans["imputer"])) in html_output
 
         for _, _, cols in preprocess.transformers:
-            assert f"
{cols}
" in html_output + assert f"
{html.escape(str(cols))}
" in html_output # feature union for name, _ in feat_u.transformer_list: - assert f"" in html_output + assert f"" in html_output pca = feat_u.transformer_list[0][1] - assert f"
{str(pca)}
" in html_output + assert f"
{html.escape(str(pca))}
" in html_output tsvd = feat_u.transformer_list[1][1] first = tsvd["first"] select = tsvd["select"] - assert f"
{str(first)}
" in html_output - assert f"
{str(select)}
" in html_output + assert f"
{html.escape(str(first))}
" in html_output + assert f"
{html.escape(str(select))}
" in html_output # voting classifier for name, est in clf.estimators: - assert f"" in html_output - assert f"
{str(est)}
" in html_output + assert f"" in html_output + assert f"
{html.escape(str(est))}
" in html_output @pytest.mark.parametrize("final_estimator", [None, LinearSVC()]) @@ -209,7 +211,7 @@ def test_stacking_classsifer(final_estimator): html_output = estimator_html_repr(clf) - assert str(clf) in html_output + assert html.escape(str(clf)) in html_output # If final_estimator's default changes from LogisticRegression # this should be updated if final_estimator is None: @@ -225,12 +227,12 @@ def test_stacking_regressor(final_estimator): ) html_output = estimator_html_repr(reg) - assert str(reg.estimators[0][0]) in html_output + assert html.escape(str(reg.estimators[0][0])) in html_output assert "LinearSVR" in html_output if final_estimator is None: assert "RidgeCV" in html_output else: - assert final_estimator.__class__.__name__ in html_output + assert html.escape(final_estimator.__class__.__name__) in html_output def test_birch_duck_typing_meta(): @@ -240,11 +242,11 @@ def test_birch_duck_typing_meta(): # inner estimators do not show changes with config_context(print_changed_only=True): - assert f"
{str(birch.n_clusters)}" in html_output
+        assert f"
{html.escape(str(birch.n_clusters))}" in html_output
         assert "AgglomerativeClustering" in html_output
 
     # outer estimator contains all changes
-    assert f"
{str(birch)}" in html_output
+    assert f"
{html.escape(str(birch))}" in html_output
 
 
 def test_ovo_classifier_duck_typing_meta():
@@ -254,11 +256,11 @@ def test_ovo_classifier_duck_typing_meta():
 
     # inner estimators do not show changes
     with config_context(print_changed_only=True):
-        assert f"
{str(ovo.estimator)}" in html_output
+        assert f"
{html.escape(str(ovo.estimator))}" in html_output
         assert "LinearSVC" in html_output
 
     # outer estimator
-    assert f"
{str(ovo)}" in html_output
+    assert f"
{html.escape(str(ovo))}" in html_output
 
 
 def test_duck_typing_nested_estimator():
@@ -267,8 +269,8 @@ def test_duck_typing_nested_estimator():
     gp = GaussianProcessRegressor(kernel=kernel)
     html_output = estimator_html_repr(gp)
 
-    assert f"
{str(kernel)}" in html_output
-    assert f"
{str(gp)}" in html_output
+    assert f"
{html.escape(str(kernel))}" in html_output
+    assert f"
{html.escape(str(gp))}" in html_output
 
 
 @pytest.mark.parametrize("print_changed_only", [True, False])
@@ -276,7 +278,7 @@ def test_one_estimator_print_change_only(print_changed_only):
     pca = PCA(n_components=10)
 
     with config_context(print_changed_only=print_changed_only):
-        pca_repr = str(pca)
+        pca_repr = html.escape(str(pca))
         html_output = estimator_html_repr(pca)
         assert pca_repr in html_output
 
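The escaping added throughout this test reflects that estimator_html_repr emits HTML, so
markup-sensitive characters in a repr only ever appear in escaped form in the output. A
minimal illustration of the idea (the repr string below is hypothetical, not taken from
the test):

    import html

    # A repr containing "<" or ">" must be escaped before it can be found
    # verbatim in the HTML produced by estimator_html_repr.
    raw = "SVC(kernel=<function my_kernel at 0x7f...>)"
    print(html.escape(raw))
    # SVC(kernel=&lt;function my_kernel at 0x7f...&gt;)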
diff --git a/sklearn/utils/tests/test_show_versions.py b/sklearn/utils/tests/test_show_versions.py
index a2c54379540ca..e6590bfde15f5 100644
--- a/sklearn/utils/tests/test_show_versions.py
+++ b/sklearn/utils/tests/test_show_versions.py
@@ -1,3 +1,4 @@
+from sklearn.utils.fixes import threadpool_info
 from sklearn.utils._show_versions import _get_sys_info
 from sklearn.utils._show_versions import _get_deps_info
 from sklearn.utils._show_versions import show_versions
@@ -34,3 +35,7 @@ def test_show_versions(capsys):
 
     assert "python" in out
     assert "numpy" in out
+
+    info = threadpool_info()
+    if info:
+        assert "threadpoolctl info:" in out
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 167118fb4ff8f..18f88373b02f3 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -175,23 +175,75 @@ def test_check_array_force_all_finite_valid(value, force_all_finite, retype):
 
 
 @pytest.mark.parametrize(
-    "value, force_all_finite, match_msg",
+    "value, input_name, force_all_finite, match_msg",
     [
-        (np.inf, True, "Input contains NaN, infinity"),
-        (np.inf, "allow-nan", "Input contains infinity"),
-        (np.nan, True, "Input contains NaN, infinity"),
-        (np.nan, "allow-inf", 'force_all_finite should be a bool or "allow-nan"'),
-        (np.nan, 1, "Input contains NaN, infinity"),
+        (np.inf, "", True, "Input contains infinity"),
+        (np.inf, "X", True, "Input X contains infinity"),
+        (np.inf, "sample_weight", True, "Input sample_weight contains infinity"),
+        (np.inf, "X", "allow-nan", "Input X contains infinity"),
+        (np.nan, "", True, "Input contains NaN"),
+        (np.nan, "X", True, "Input X contains NaN"),
+        (np.nan, "y", True, "Input y contains NaN"),
+        (
+            np.nan,
+            "",
+            "allow-inf",
+            'force_all_finite should be a bool or "allow-nan"',
+        ),
+        (np.nan, "", 1, "Input contains NaN"),
     ],
 )
 @pytest.mark.parametrize("retype", [np.asarray, sp.csr_matrix])
 def test_check_array_force_all_finiteinvalid(
-    value, force_all_finite, match_msg, retype
+    value, input_name, force_all_finite, match_msg, retype
 ):
-    X = retype(np.arange(4).reshape(2, 2).astype(float))
+    X = retype(np.arange(4).reshape(2, 2).astype(np.float64))
     X[0, 0] = value
     with pytest.raises(ValueError, match=match_msg):
-        check_array(X, force_all_finite=force_all_finite, accept_sparse=True)
+        check_array(
+            X,
+            input_name=input_name,
+            force_all_finite=force_all_finite,
+            accept_sparse=True,
+        )
+
+
+@pytest.mark.parametrize("input_name", ["X", "y", "sample_weight"])
+@pytest.mark.parametrize("retype", [np.asarray, sp.csr_matrix])
+def test_check_array_links_to_imputer_doc_only_for_X(input_name, retype):
+    data = retype(np.arange(4).reshape(2, 2).astype(np.float64))
+    data[0, 0] = np.nan
+    estimator = SVR()
+    extended_msg = (
+        f"\n{estimator.__class__.__name__} does not accept missing values"
+        " encoded as NaN natively. For supervised learning, you might want"
+        " to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor"
+        " which accept missing values encoded as NaNs natively."
+        " Alternatively, it is possible to preprocess the"
+        " data, for instance by using an imputer transformer in a pipeline"
+        " or drop samples with missing values. See"
+        " https://scikit-learn.org/stable/modules/impute.html"
+    )
+
+    with pytest.raises(ValueError, match=f"Input {input_name} contains NaN") as ctx:
+        check_array(
+            data,
+            estimator=estimator,
+            input_name=input_name,
+            accept_sparse=True,
+        )
+
+    if input_name == "X":
+        assert extended_msg in ctx.value.args[0]
+    else:
+        assert extended_msg not in ctx.value.args[0]
+
+    if input_name == "X":
+        # Verify that _validate_data is automatically called with the right argument
+        # to generate the same exception:
+        with pytest.raises(ValueError, match=f"Input {input_name} contains NaN") as ctx:
+            SVR().fit(data, np.ones(data.shape[0]))
+        assert extended_msg in ctx.value.args[0]
 
 
 def test_check_array_force_all_finite_object():
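The extended message checked above points users at estimators that accept missing values
natively. For contrast, a quick illustration (not part of the test suite) that
HistGradientBoostingRegressor fits NaN-containing data without raising:

    import numpy as np
    from sklearn.ensemble import HistGradientBoostingRegressor

    X = np.array([[1.0], [np.nan], [3.0], [4.0], [np.nan], [6.0]])
    y = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
    # No "Input X contains NaN" error: missing values are handled natively.
    HistGradientBoostingRegressor().fit(X, y)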
@@ -212,15 +264,15 @@ def test_check_array_force_all_finite_object():
     [
         (
             np.array([[1, np.nan]]),
-            "Input contains NaN, infinity or a value too large for.*int",
+            "Input contains NaN.",
         ),
         (
             np.array([[1, np.nan]]),
-            "Input contains NaN, infinity or a value too large for.*int",
+            "Input contains NaN.",
         ),
         (
             np.array([[1, np.inf]]),
-            "Input contains NaN, infinity or a value too large for.*int",
+            "Input contains infinity or a value too large for.*int",
         ),
         (np.array([[1, np.nan]], dtype=object), "cannot convert float NaN to integer"),
     ],
@@ -390,7 +442,9 @@ def test_check_array_dtype_numeric_errors(X):
         check_array(X, dtype="numeric")
 
 
-@pytest.mark.parametrize("pd_dtype", ["Int8", "Int16", "UInt8", "UInt16"])
+@pytest.mark.parametrize(
+    "pd_dtype", ["Int8", "Int16", "UInt8", "UInt16", "Float32", "Float64"]
+)
 @pytest.mark.parametrize(
     "dtype, expected_dtype",
     [
@@ -400,14 +454,18 @@ def test_check_array_dtype_numeric_errors(X):
     ],
 )
 def test_check_array_pandas_na_support(pd_dtype, dtype, expected_dtype):
-    # Test pandas IntegerArray with pd.NA
+    # Test pandas numerical extension arrays with pd.NA
     pd = pytest.importorskip("pandas", minversion="1.0")
 
+    if pd_dtype in {"Float32", "Float64"}:
+        # Float extension dtypes were added in pandas 1.2
+        pd = pytest.importorskip("pandas", minversion="1.2")
+
     X_np = np.array(
         [[1, 2, 3, np.nan, np.nan], [np.nan, np.nan, 8, 4, 6], [1, 2, 3, 4, 5]]
     ).T
 
-    # Creates dataframe with IntegerArrays with pd.NA
+    # Creates dataframe with numerical extension arrays with pd.NA
     X = pd.DataFrame(X_np, dtype=pd_dtype, columns=["a", "b", "c"])
     # column c has no nans
     X["c"] = X["c"].astype("float")
@@ -419,7 +477,7 @@ def test_check_array_pandas_na_support(pd_dtype, dtype, expected_dtype):
     assert_allclose(X_checked, X_np)
     assert X_checked.dtype == expected_dtype
 
-    msg = "Input contains NaN, infinity"
+    msg = "Input contains NaN"
     with pytest.raises(ValueError, match=msg):
         check_array(X, force_all_finite=True)
 
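Taken together, these tests cover two behaviours of check_array after this patch: pd.NA in
numerical extension arrays is mapped to np.nan during conversion, and error messages name
the offending input. A hedged sketch (assumes pandas >= 1.2 for the nullable Float64 dtype):

    import numpy as np
    import pandas as pd
    from sklearn.utils import check_array

    X = pd.DataFrame({"a": pd.array([1.0, pd.NA], dtype="Float64")})
    # pd.NA becomes np.nan once NaNs are allowed through.
    print(check_array(X, force_all_finite=False))

    try:
        # With the default force_all_finite=True the message now names the input.
        check_array(np.array([[np.nan]]), input_name="X")
    except ValueError as exc:
        print(exc)  # Input X contains NaN. ...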
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 0380af76f5140..9d6035d52ed9b 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -87,7 +87,9 @@ def inner_f(*args, **kwargs):
     return _inner_deprecate_positional_args
 
 
-def _assert_all_finite(X, allow_nan=False, msg_dtype=None):
+def _assert_all_finite(
+    X, allow_nan=False, msg_dtype=None, estimator_name=None, input_name=""
+):
     """Like assert_all_finite, but only for ndarray."""
     # validation is also imported in extmath
     from .extmath import _safe_accumulator_op
@@ -103,26 +105,52 @@ def _assert_all_finite(X, allow_nan=False, msg_dtype=None):
     if is_float and (np.isfinite(_safe_accumulator_op(np.sum, X))):
         pass
     elif is_float:
-        msg_err = "Input contains {} or a value too large for {!r}."
         if (
             allow_nan
             and np.isinf(X).any()
             or not allow_nan
             and not np.isfinite(X).all()
         ):
-            type_err = "infinity" if allow_nan else "NaN, infinity"
-            raise ValueError(
-                msg_err.format(
-                    type_err, msg_dtype if msg_dtype is not None else X.dtype
+            if not allow_nan and np.isnan(X).any():
+                type_err = "NaN"
+            else:
+                msg_dtype = msg_dtype if msg_dtype is not None else X.dtype
+                type_err = f"infinity or a value too large for {msg_dtype!r}"
+            padded_input_name = input_name + " " if input_name else ""
+            msg_err = f"Input {padded_input_name}contains {type_err}."
+            if (
+                not allow_nan
+                and estimator_name
+                and input_name == "X"
+                and np.isnan(X).any()
+            ):
+                # Improve the error message on how to handle missing values in
+                # scikit-learn.
+                msg_err += (
+                    f"\n{estimator_name} does not accept missing values"
+                    " encoded as NaN natively. For supervised learning, you might want"
+                    " to consider sklearn.ensemble.HistGradientBoostingClassifier and"
+                    " Regressor which accept missing values encoded as NaNs natively."
+                    " Alternatively, it is possible to preprocess the data, for"
+                    " instance by using an imputer transformer in a pipeline or drop"
+                    " samples with missing values. See"
+                    " https://scikit-learn.org/stable/modules/impute.html"
                 )
-            )
+            raise ValueError(msg_err)
+
     # for object dtype data, we only check for NaNs (GH-13254)
     elif X.dtype == np.dtype("object") and not allow_nan:
         if _object_dtype_isnan(X).any():
             raise ValueError("Input contains NaN")
 
 
-def assert_all_finite(X, *, allow_nan=False):
+def assert_all_finite(
+    X,
+    *,
+    allow_nan=False,
+    estimator_name=None,
+    input_name="",
+):
     """Throw a ValueError if X contains NaN or infinity.
 
     Parameters
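The branch above selects a NaN-specific or an infinity-specific message and prefixes the
input name when one is given. Roughly, after this patch:

    import numpy as np
    from sklearn.utils import assert_all_finite

    try:
        assert_all_finite(np.array([0.0, np.inf]), input_name="sample_weight")
    except ValueError as exc:
        print(exc)
    # Input sample_weight contains infinity or a value too large for dtype('float64').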
@@ -130,12 +158,26 @@ def assert_all_finite(X, *, allow_nan=False):
     X : {ndarray, sparse matrix}
 
     allow_nan : bool, default=False
+
+    estimator_name : str, default=None
+        The estimator name, used to construct the error message.
+
+    input_name : str, default=""
+        The data name used to construct the error message. In particular
+        if `input_name` is "X" and the data has NaN values and
+        allow_nan is False, the error message will link to the imputer
+        documentation.
     """
-    _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan)
+    _assert_all_finite(
+        X.data if sp.issparse(X) else X,
+        allow_nan=allow_nan,
+        estimator_name=estimator_name,
+        input_name=input_name,
+    )
 
 
 def as_float_array(X, *, copy=True, force_all_finite=True):
-    """Converts an array-like to an array of floats.
+    """Convert an array-like to an array of floats.
 
     The new dtype will be np.float32 or np.float64, depending on the original
     type. The function can create a copy or modify the argument depending
@@ -144,6 +186,7 @@ def as_float_array(X, *, copy=True, force_all_finite=True):
     Parameters
     ----------
     X : {array-like, sparse matrix}
+        The input data.
 
     copy : bool, default=True
         If True, a copy of X will be created. If False, a copy may still be
@@ -379,7 +422,14 @@ def indexable(*iterables):
 
 
 def _ensure_sparse_format(
-    spmatrix, accept_sparse, dtype, copy, force_all_finite, accept_large_sparse
+    spmatrix,
+    accept_sparse,
+    dtype,
+    copy,
+    force_all_finite,
+    accept_large_sparse,
+    estimator_name=None,
+    input_name="",
 ):
     """Convert a sparse matrix to a given format.
 
@@ -419,6 +469,16 @@ def _ensure_sparse_format(
         .. versionchanged:: 0.23
            Accepts `pd.NA` and converts it into `np.nan`
 
+
+    estimator_name : str, default=None
+        The estimator name, used to construct the error message.
+
+    input_name : str, default=""
+        The data name used to construct the error message. In particular
+        if `input_name` is "X" and the data has NaN values and
+        allow_nan is False, the error message will link to the imputer
+        documentation.
+
     Returns
     -------
     spmatrix_converted : sparse matrix.
@@ -475,7 +535,12 @@ def _ensure_sparse_format(
                 stacklevel=2,
             )
         else:
-            _assert_all_finite(spmatrix.data, allow_nan=force_all_finite == "allow-nan")
+            _assert_all_finite(
+                spmatrix.data,
+                allow_nan=force_all_finite == "allow-nan",
+                estimator_name=estimator_name,
+                input_name=input_name,
+            )
 
     return spmatrix
 
@@ -490,6 +555,42 @@ def _ensure_no_complex_data(array):
         raise ValueError("Complex data not supported\n{}\n".format(array))
 
 
+def _check_estimator_name(estimator):
+    if estimator is not None:
+        if isinstance(estimator, str):
+            return estimator
+        else:
+            return estimator.__class__.__name__
+    return None
+
+
+def _pandas_dtype_needs_early_conversion(pd_dtype):
+    """Return True if pandas extension pd_dtype need to be converted early."""
+    try:
+        from pandas.api.types import (
+            is_extension_array_dtype,
+            is_float_dtype,
+            is_integer_dtype,
+            is_sparse,
+        )
+    except ImportError:
+        return False
+
+    if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
+        # Sparse arrays will be converted later in `check_array`
+        # Only handle extension arrays for integers and floats
+        return False
+    elif is_float_dtype(pd_dtype):
+        # Float ndarrays can normally support nans. They need to be converted
+        # first to map pd.NA to np.nan
+        return True
+    elif is_integer_dtype(pd_dtype):
+        # XXX: Warn when converting from a high integer to a float
+        return True
+
+    return False
+
+
 def check_array(
     array,
     accept_sparse=False,
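A small illustration of the helper above (hypothetical usage of a private function added by
this patch; the nullable Float64 dtype requires pandas >= 1.2):

    import numpy as np
    import pandas as pd
    from sklearn.utils.validation import _pandas_dtype_needs_early_conversion

    print(_pandas_dtype_needs_early_conversion(pd.Int16Dtype()))      # True, nullable integers
    print(_pandas_dtype_needs_early_conversion(pd.Float64Dtype()))    # True, nullable floats
    print(_pandas_dtype_needs_early_conversion(np.dtype("float64")))  # False, plain numpy dtype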
@@ -504,6 +605,7 @@ def check_array(
     ensure_min_samples=1,
     ensure_min_features=1,
     estimator=None,
+    input_name="",
 ):
 
     """Input validation on an array, list, sparse matrix or similar.
@@ -583,6 +685,14 @@ def check_array(
     estimator : str or estimator instance, default=None
         If passed, include the name of the estimator in warning messages.
 
+    input_name : str, default=""
+        The data name used to construct the error message. In particular
+        if `input_name` is "X" and the data has NaN values and
+        allow_nan is False, the error message will link to the imputer
+        documentation.
+
+        .. versionadded:: 1.1.0
+
     Returns
     -------
     array_converted : object
@@ -612,7 +722,7 @@ def check_array(
     # check if the object contains several dtypes (typically a pandas
     # DataFrame), and store them. If not, store None.
     dtypes_orig = None
-    has_pd_integer_array = False
+    pandas_requires_conversion = False
     if hasattr(array, "dtypes") and hasattr(array.dtypes, "__array__"):
         # throw warning if columns are sparse. If all columns are sparse, then
         # array.sparse exists and sparsity will be preserved (later).
@@ -625,42 +735,17 @@ def check_array(
                     "It will be converted to a dense numpy array."
                 )
 
-        dtypes_orig = list(array.dtypes)
-        # pandas boolean dtype __array__ interface coerces bools to objects
-        for i, dtype_iter in enumerate(dtypes_orig):
+        dtypes_orig = []
+        for dtype_iter in array.dtypes:
             if dtype_iter.kind == "b":
-                dtypes_orig[i] = np.dtype(object)
-            elif dtype_iter.name.startswith(("Int", "UInt")):
-                # name looks like an Integer Extension Array, now check for
-                # the dtype
-                with suppress(ImportError):
-                    from pandas import (
-                        Int8Dtype,
-                        Int16Dtype,
-                        Int32Dtype,
-                        Int64Dtype,
-                        UInt8Dtype,
-                        UInt16Dtype,
-                        UInt32Dtype,
-                        UInt64Dtype,
-                    )
-
-                    if isinstance(
-                        dtype_iter,
-                        (
-                            Int8Dtype,
-                            Int16Dtype,
-                            Int32Dtype,
-                            Int64Dtype,
-                            UInt8Dtype,
-                            UInt16Dtype,
-                            UInt32Dtype,
-                            UInt64Dtype,
-                        ),
-                    ):
-                        has_pd_integer_array = True
-
-        if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig):
+                # pandas boolean dtype __array__ interface coerces bools to objects
+                dtype_iter = np.dtype(object)
+            elif _pandas_dtype_needs_early_conversion(dtype_iter):
+                pandas_requires_conversion = True
+
+            dtypes_orig.append(dtype_iter)
+
+        if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig):
             dtype_orig = np.result_type(*dtypes_orig)
 
     if dtype_numeric:
@@ -679,9 +764,12 @@ def check_array(
             # list of accepted types.
             dtype = dtype[0]
 
-    if has_pd_integer_array:
-        # If there are any pandas integer extension arrays,
+    if pandas_requires_conversion:
+        # pandas dataframes require early conversion to handle extension dtypes with
+        # NaNs
         array = array.astype(dtype)
+        # Since we converted here, we do not need to convert again later
+        dtype = None
 
     if force_all_finite not in (True, False, "allow-nan"):
         raise ValueError(
@@ -690,13 +778,7 @@ def check_array(
             )
         )
 
-    if estimator is not None:
-        if isinstance(estimator, str):
-            estimator_name = estimator
-        else:
-            estimator_name = estimator.__class__.__name__
-    else:
-        estimator_name = "Estimator"
+    estimator_name = _check_estimator_name(estimator)
     context = " by %s" % estimator_name if estimator is not None else ""
 
     # When all dataframe columns are sparse, convert to a sparse array
@@ -723,6 +805,8 @@ def check_array(
             copy=copy,
             force_all_finite=force_all_finite,
             accept_large_sparse=accept_large_sparse,
+            estimator_name=estimator_name,
+            input_name=input_name,
         )
     else:
         # If np.array(..) gives ComplexWarning, then we convert the warning
@@ -739,7 +823,13 @@ def check_array(
                     # then conversion float -> int would be disallowed.
                     array = np.asarray(array, order=order)
                     if array.dtype.kind == "f":
-                        _assert_all_finite(array, allow_nan=False, msg_dtype=dtype)
+                        _assert_all_finite(
+                            array,
+                            allow_nan=False,
+                            msg_dtype=dtype,
+                            estimator_name=estimator_name,
+                            input_name=input_name,
+                        )
                     array = array.astype(dtype, casting="unsafe", copy=False)
                 else:
                     array = np.asarray(array, order=order, dtype=dtype)
@@ -796,7 +886,12 @@ def check_array(
             )
 
         if force_all_finite:
-            _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan")
+            _assert_all_finite(
+                array,
+                input_name=input_name,
+                estimator_name=estimator_name,
+                allow_nan=force_all_finite == "allow-nan",
+            )
 
     if ensure_min_samples > 0:
         n_samples = _num_samples(array)
@@ -973,24 +1068,32 @@ def check_X_y(
         ensure_min_samples=ensure_min_samples,
         ensure_min_features=ensure_min_features,
         estimator=estimator,
+        input_name="X",
     )
 
-    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric)
+    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
 
     check_consistent_length(X, y)
 
     return X, y
 
 
-def _check_y(y, multi_output=False, y_numeric=False):
+def _check_y(y, multi_output=False, y_numeric=False, estimator=None):
     """Isolated part of check_X_y dedicated to y validation"""
     if multi_output:
         y = check_array(
-            y, accept_sparse="csr", force_all_finite=True, ensure_2d=False, dtype=None
+            y,
+            accept_sparse="csr",
+            force_all_finite=True,
+            ensure_2d=False,
+            dtype=None,
+            input_name="y",
+            estimator=estimator,
         )
     else:
+        estimator_name = _check_estimator_name(estimator)
         y = column_or_1d(y, warn=True)
-        _assert_all_finite(y)
+        _assert_all_finite(y, input_name="y", estimator_name=estimator_name)
         _ensure_no_complex_data(y)
     if y_numeric and y.dtype.kind == "O":
         y = y.astype(np.float64)
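With estimator and input_name now threaded through _check_y, fitting on an invalid target
surfaces a y-specific message. A loose sketch of the end-to-end effect:

    import numpy as np
    from sklearn.linear_model import LinearRegression

    try:
        LinearRegression().fit(np.random.rand(10, 2), np.full(10, np.nan))
    except ValueError as exc:
        print(exc)  # Input y contains NaN.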
@@ -1004,6 +1107,7 @@ def column_or_1d(y, *, warn=False):
     Parameters
     ----------
     y : array-like
+       Input data.
 
     warn : bool, default=False
        To control display of warnings.
@@ -1011,7 +1115,12 @@ def column_or_1d(y, *, warn=False):
     Returns
     -------
     y : ndarray
+       Output data.
 
+    Raises
+    ------
+    ValueError
+        If `y` is not a 1D array or a 2D array with a single row or column.
     """
     y = np.asarray(y)
     shape = np.shape(y)
@@ -1056,7 +1165,7 @@ def check_random_state(seed):
 
 
 def has_fit_parameter(estimator, parameter):
-    """Checks whether the estimator's fit method supports the given parameter.
+    """Check whether the estimator's fit method supports the given parameter.
 
     Parameters
     ----------
@@ -1068,7 +1177,7 @@ def has_fit_parameter(estimator, parameter):
 
     Returns
     -------
-    is_parameter: bool
+    is_parameter : bool
         Whether the parameter was found to be a named parameter of the
         estimator's fit method.
 
@@ -1078,7 +1187,6 @@ def has_fit_parameter(estimator, parameter):
     >>> from sklearn.utils.validation import has_fit_parameter
     >>> has_fit_parameter(SVC(), "sample_weight")
     True
-
     """
     return parameter in signature(estimator.fit).parameters
 
@@ -1558,6 +1666,7 @@ def _check_sample_weight(
             dtype=dtype,
             order="C",
             copy=copy,
+            input_name="sample_weight",
         )
         if sample_weight.ndim != 1:
             raise ValueError("Sample weights must be 1D array or scalar")
@@ -1702,7 +1811,7 @@ def _get_feature_names(X):
         return feature_names
 
 
-def _check_feature_names_in(estimator, input_features=None):
+def _check_feature_names_in(estimator, input_features=None, *, generate_names=True):
     """Get output feature names for transformation.
 
     Parameters
@@ -1716,9 +1825,13 @@ def _check_feature_names_in(estimator, input_features=None):
         - If `input_features` is an array-like, then `input_features` must
             match `feature_names_in_` if `feature_names_in_` is defined.
 
+    generate_names : bool, default=True
+        Whether to generate names when `input_features` is `None` and
+        `estimator.feature_names_in_` is not defined.
+
     Returns
     -------
-    feature_names_in : ndarray of str
+    feature_names_in : ndarray of str or `None`
         Feature names in.
     """
 
@@ -1742,8 +1855,40 @@ def _check_feature_names_in(estimator, input_features=None):
     if feature_names_in_ is not None:
         return feature_names_in_
 
+    if not generate_names:
+        return
+
     # Generates feature names if `n_features_in_` is defined
     if n_features_in_ is None:
         raise ValueError("Unable to generate feature names without n_features_in_")
 
     return np.asarray([f"x{i}" for i in range(n_features_in_)], dtype=object)
+
+
+def _generate_get_feature_names_out(estimator, n_features_out, input_features=None):
+    """Generate feature names out for estimator using the estimator name as the prefix.
+
+    The `input_features` names are validated but not used. This function is useful
+    for estimators that generate their own names based on `n_features_out`, e.g. PCA.
+
+    Parameters
+    ----------
+    estimator : estimator instance
+        Estimator producing output feature names.
+
+    n_features_out : int
+        Number of feature names out.
+
+    input_features : array-like of str or None, default=None
+        Only used to validate feature names with `estimator.feature_names_in_`.
+
+    Returns
+    -------
+    feature_names_out : ndarray of str
+        Feature names out.
+    """
+    _check_feature_names_in(estimator, input_features, generate_names=False)
+    estimator_name = estimator.__class__.__name__.lower()
+    return np.asarray(
+        [f"{estimator_name}{i}" for i in range(n_features_out)], dtype=object
+    )
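A short sketch of the naming scheme implemented by this helper (hypothetical call; the
function is private to sklearn.utils.validation):

    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.utils.validation import _generate_get_feature_names_out

    pca = PCA(n_components=2).fit(np.random.RandomState(0).rand(10, 5))
    # Names are "<lowercased estimator class name><index>".
    print(_generate_get_feature_names_out(pca, 2))  # ['pca0' 'pca1']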