diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 11fcb5cd25d78..1f133d701ca53 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -84,6 +84,8 @@ jobs: - name: Setup Python uses: actions/setup-python@v2 + with: + python-version: '3.9' # update once build dependencies are available - name: Build and test wheels env: diff --git a/.travis.yml b/.travis.yml index 456d94301d4c1..7b37bc7c91f58 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,7 +22,7 @@ env: # Custom environment variables for the ARM wheel builder - CIBW_BUILD_VERBOSITY=1 - CIBW_TEST_COMMAND="bash {project}/build_tools/travis/test_wheels.sh" - - CIBW_ENVIRONMENT="CPU_COUNT=6 + - CIBW_ENVIRONMENT="CPU_COUNT=2 OMP_NUM_THREADS=2 OPENBLAS_NUM_THREADS=2 SKLEARN_BUILD_PARALLEL=10 @@ -39,7 +39,10 @@ jobs: # fast. - python: 3.7 os: linux - arch: arm64 + arch: arm64-graviton2 + dist: focal + virt: lxd + group: edge if: type = cron or commit_message =~ /\[cd build\]/ env: - BUILD_WHEEL=true @@ -47,7 +50,10 @@ jobs: - python: 3.8 os: linux - arch: arm64 + arch: arm64-graviton2 + dist: focal + virt: lxd + group: edge if: type = cron or commit_message =~ /\[cd build\]/ env: - BUILD_WHEEL=true @@ -55,7 +61,10 @@ jobs: - python: 3.9 os: linux - arch: arm64 + arch: arm64-graviton2 + dist: focal + virt: lxd + group: edge if: type = cron or commit_message =~ /\[cd build\]/ env: - BUILD_WHEEL=true diff --git a/README.rst b/README.rst index b41bb9c98daba..d8357c246543c 100644 --- a/README.rst +++ b/README.rst @@ -5,15 +5,15 @@ .. |Azure| image:: https://dev.azure.com/scikit-learn/scikit-learn/_apis/build/status/scikit-learn.scikit-learn?branchName=main .. _Azure: https://dev.azure.com/scikit-learn/scikit-learn/_build/latest?definitionId=1&branchName=main +.. |CircleCI| image:: https://circleci.com/gh/scikit-learn/scikit-learn/tree/main.svg?style=shield&circle-token=:circle-token +.. _CircleCI: https://circleci.com/gh/scikit-learn/scikit-learn + .. |Travis| image:: https://api.travis-ci.com/scikit-learn/scikit-learn.svg?branch=main -.. _Travis: https://travis-ci.com/scikit-learn/scikit-learn +.. _Travis: https://app.travis-ci.com/github/scikit-learn/scikit-learn .. |Codecov| image:: https://codecov.io/gh/scikit-learn/scikit-learn/branch/main/graph/badge.svg?token=Pk8G9gg3y9 .. _Codecov: https://codecov.io/gh/scikit-learn/scikit-learn -.. |CircleCI| image:: https://circleci.com/gh/scikit-learn/scikit-learn/tree/main.svg?style=shield&circle-token=:circle-token -.. _CircleCI: https://circleci.com/gh/scikit-learn/scikit-learn - .. |Nightly wheels| image:: https://github.com/scikit-learn/scikit-learn/workflows/Wheel%20builder/badge.svg?event=schedule .. _`Nightly wheels`: https://github.com/scikit-learn/scikit-learn/actions?query=workflow%3A%22Wheel+builder%22+event%3Aschedule @@ -186,6 +186,7 @@ Communication - Stack Overflow: https://stackoverflow.com/questions/tagged/scikit-learn - Github Discussions: https://github.com/scikit-learn/scikit-learn/discussions - Website: https://scikit-learn.org +- LinkedIn: https://www.linkedin.com/company/scikit-learn Citation ~~~~~~~~ diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000000000..251dbb054df52 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,17 @@ +# Security Policy + +## Supported Versions + +| Version | Supported | +| --------- | ------------------ | +| 1.0.1 | :white_check_mark: | +| < 1.0.1 | :x: | + +## Reporting a Vulnerability + +Please report security vulnerabilities by email to `security@scikit-learn.org`. 
+This email is an alias to a subset of the scikit-learn maintainers' team. + +If the security vulnerability is accepted, a patch will be crafted privately +in order to prepare a dedicated bugfix release as timely as possible (depending +on the complexity of the fix). diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 19bc8b4efe4df..60b1f2811f88d 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -48,7 +48,7 @@ jobs: pip install pytest flake8 mypy==0.782 black==21.6b0 displayName: Install linters - bash: | - black --check . + black --check --diff . displayName: Run black - bash: | ./build_tools/circle/linting.sh @@ -81,6 +81,7 @@ jobs: # Tests that require large downloads over the networks are skipped in CI. # Here we make sure, that they are still run on a regular basis. SKLEARN_SKIP_NETWORK_TESTS: '0' + CREATE_ISSUE_ON_TRACKER: 'true' # Check compilation with intel C++ compiler (ICC) - template: build_tools/azure/posix.yml @@ -105,6 +106,28 @@ jobs: COVERAGE: 'false' BUILD_WITH_ICC: 'true' +- template: build_tools/azure/posix-docker.yml + parameters: + name: Linux_Nightly_PyPy + vmImage: ubuntu-20.04 + dependsOn: [linting, git_commit] + condition: | + and( + succeeded(), + not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), + or( + eq(variables['Build.Reason'], 'Schedule'), + contains(dependencies['git_commit']['outputs']['commit.message'], '[pypy]') + ) + ) + matrix: + pypy3: + DISTRIB: 'conda-mamba-pypy3' + DOCKER_CONTAINER: 'condaforge/mambaforge-pypy3:4.10.3-5' + PILLOW_VERSION: 'none' + PANDAS_VERSION: 'none' + CREATE_ISSUE_ON_TRACKER: 'true' + # Will run all the time regardless of linting outcome. - template: build_tools/azure/posix.yml parameters: @@ -123,6 +146,7 @@ jobs: PYTHON_VERSION: '*' BLAS: 'mkl' COVERAGE: 'true' + SHOW_SHORT_SUMMARY: 'true' # Check compilation with Ubuntu bionic 18.04 LTS and scipy from conda-forge - template: build_tools/azure/posix.yml @@ -198,11 +222,6 @@ jobs: ne(variables['Build.Reason'], 'Schedule') ) matrix: - pypy3: - DISTRIB: 'conda-mamba-pypy3' - DOCKER_CONTAINER: 'condaforge/mambaforge-pypy3:4.10.3-5' - PILLOW_VERSION: 'none' - PANDAS_VERSION: 'none' debian_atlas_32bit: DISTRIB: 'debian-32' DOCKER_CONTAINER: 'i386/debian:10.9' @@ -237,7 +256,7 @@ jobs: - template: build_tools/azure/windows.yml parameters: name: Windows - vmImage: vs2017-win2016 + vmImage: windows-latest dependsOn: [linting, git_commit] condition: | and( diff --git a/build_tools/azure/posix-docker.yml b/build_tools/azure/posix-docker.yml index 18642f1f39b7b..2578453578f61 100644 --- a/build_tools/azure/posix-docker.yml +++ b/build_tools/azure/posix-docker.yml @@ -34,6 +34,7 @@ jobs: # Set in azure-pipelines.yml DISTRIB: '' DOCKER_CONTAINER: '' + SHOW_SHORT_SUMMARY: 'false' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} diff --git a/build_tools/azure/posix.yml b/build_tools/azure/posix.yml index 79101c8eef227..fb4e7f85f9527 100644 --- a/build_tools/azure/posix.yml +++ b/build_tools/azure/posix.yml @@ -33,6 +33,8 @@ jobs: THREADPOOLCTL_VERSION: 'latest' COVERAGE: 'true' TEST_DOCSTRINGS: 'false' + CREATE_ISSUE_ON_TRACKER: 'false' + SHOW_SHORT_SUMMARY: 'false' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} @@ -72,6 +74,30 @@ jobs: testRunTitle: ${{ format('{0}-$(Agent.JobName)', parameters.name) }} displayName: 'Publish Test Results' condition: succeededOrFailed() + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.9' + displayName: Place Python into path to update issue tracker + condition: 
and(succeededOrFailed(), eq(variables['CREATE_ISSUE_ON_TRACKER'], 'true'), + eq(variables['Build.Reason'], 'Schedule')) + - bash: | + set -ex + if [[ $(BOT_GITHUB_TOKEN) == "" ]]; then + echo "GitHub Token is not set. Issue tracker will not be updated." + exit + fi + + LINK_TO_RUN="https://dev.azure.com/$BUILD_REPOSITORY_NAME/_build/results?buildId=$BUILD_BUILDID&view=logs&j=$SYSTEM_JOBID" + CI_NAME="$SYSTEM_JOBIDENTIFIER" + ISSUE_REPO="$BUILD_REPOSITORY_NAME" + + pip install defusedxml PyGithub + python maint_tools/create_issue_from_juint.py $(BOT_GITHUB_TOKEN) $CI_NAME $ISSUE_REPO $LINK_TO_RUN $JUNIT_FILE + displayName: 'Update issue tracker' + env: + JUNIT_FILE: $(TEST_DIR)/$(JUNITXML) + condition: and(succeededOrFailed(), eq(variables['CREATE_ISSUE_ON_TRACKER'], 'true'), + eq(variables['Build.Reason'], 'Schedule')) - script: | build_tools/azure/upload_codecov.sh condition: and(succeeded(), eq(variables['COVERAGE'], 'true')) diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index 8e8110bcaef04..721cfc665a14a 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -17,7 +17,6 @@ cp setup.cfg $TEST_DIR cd $TEST_DIR python -c "import sklearn; sklearn.show_versions()" -python -m threadpoolctl -i sklearn if ! command -v conda &> /dev/null then @@ -36,18 +35,28 @@ if [[ "$COVERAGE" == "true" ]]; then # report that otherwise hides the test failures and forces long scrolls in # the CI logs. export COVERAGE_PROCESS_START="$BUILD_SOURCESDIRECTORY/.coveragerc" - TEST_CMD="$TEST_CMD --cov-config=$COVERAGE_PROCESS_START --cov sklearn --cov-report=" + TEST_CMD="$TEST_CMD --cov-config='$COVERAGE_PROCESS_START' --cov sklearn --cov-report=" fi if [[ -n "$CHECK_WARNINGS" ]]; then # numpy's 1.19.0's tostring() deprecation is ignored until scipy and joblib removes its usage TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning -Wignore:tostring:DeprecationWarning" + + # Python 3.10 deprecates distutils, which is imported internally by numpy at import time + TEST_CMD="$TEST_CMD -Wignore:The\ distutils:DeprecationWarning" + + # Workaround for https://github.com/pypa/setuptools/issues/2885 + TEST_CMD="$TEST_CMD -Wignore:Creating\ a\ LegacyVersion:DeprecationWarning" fi if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then TEST_CMD="$TEST_CMD -n2" fi +if [[ "$SHOW_SHORT_SUMMARY" == "true" ]]; then + TEST_CMD="$TEST_CMD -ra" +fi + set -x -$TEST_CMD --pyargs sklearn +eval "$TEST_CMD --pyargs sklearn" set +x diff --git a/build_tools/azure/windows.yml b/build_tools/azure/windows.yml index f835fd02cf58e..6b4789e87627b 100644 --- a/build_tools/azure/windows.yml +++ b/build_tools/azure/windows.yml @@ -20,6 +20,7 @@ jobs: PYTEST_XDIST: 'true' PYTEST_XDIST_VERSION: 'latest' TEST_DIR: '$(Agent.WorkFolder)/tmp_folder' + SHOW_SHORT_SUMMARY: 'false' strategy: matrix: ${{ insert }}: ${{ parameters.matrix }} diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 8facdc59a6c56..ddb54c840093a 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -145,8 +145,8 @@ fi MINICONDA_PATH=$HOME/miniconda # Install dependencies with miniconda -wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ - -O miniconda.sh +wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh \ - -O miniconda.sh chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH export PATH="/usr/lib/ccache:$MINICONDA_PATH/bin:$PATH" @@ -165,7 +165,7 @@ fi source 
build_tools/shared.sh # packaging won't be needed once setuptools starts shipping packaging>=17.0 -conda create -n $CONDA_ENV_NAME --yes --quiet \ +mamba create -n $CONDA_ENV_NAME --yes --quiet \ python="${PYTHON_VERSION:-*}" \ "$(get_dep numpy $NUMPY_VERSION)" \ "$(get_dep scipy $SCIPY_VERSION)" \ @@ -176,6 +176,11 @@ conda create -n $CONDA_ENV_NAME --yes --quiet \ joblib memory_profiler packaging seaborn pillow pytest coverage source activate testenv +# Pin PyWavelets to 1.1.1, the latest version that supports our minimum +# required NumPy version. If PyWavelets 1.2+ is installed, it would require +# NumPy 1.17+, which triggers a bug with Pandas 0.25: +# https://github.com/numpy/numpy/issues/18355#issuecomment-774610226 +pip install PyWavelets==1.1.1 pip install "$(get_dep scikit-image $SCIKIT_IMAGE_VERSION)" pip install "$(get_dep sphinx-gallery $SPHINX_GALLERY_VERSION)" pip install "$(get_dep numpydoc $NUMPYDOC_VERSION)" diff --git a/build_tools/github/test_wheels.sh b/build_tools/github/test_wheels.sh index 58a05b6182006..1a984bc91dba8 100755 --- a/build_tools/github/test_wheels.sh +++ b/build_tools/github/test_wheels.sh @@ -9,7 +9,7 @@ if [[ "$OSTYPE" != "linux-gnu" ]]; then cp $CONFTEST_PATH $CONFTEST_NAME fi +# Test that there are no links to system libraries in the +# threadpoolctl output section of the show_versions output: +python -c "import sklearn; sklearn.show_versions()" pytest --pyargs sklearn - -# Test that there are no links to system libraries -python -m threadpoolctl -i sklearn diff --git a/build_tools/github/test_windows_wheels.sh b/build_tools/github/test_windows_wheels.sh index cf33252d551ba..a04e390b5cdc4 100755 --- a/build_tools/github/test_windows_wheels.sh +++ b/build_tools/github/test_windows_wheels.sh @@ -10,15 +10,17 @@ if [[ "$BITNESS" == "32" ]]; then # 32-bit architectures use the regular # test command (outside of the minimal Docker container) cp $CONFTEST_PATH $CONFTEST_NAME + python -c "import sklearn; sklearn.show_versions()" pytest --pyargs sklearn - python -m threadpoolctl -i sklearn else - docker container run -e SKLEARN_SKIP_NETWORK_TESTS=1 \ - -e OMP_NUM_THREADS=2 \ - -e OPENBLAS_NUM_THREADS=2 \ - --rm scikit-learn/minimal-windows \ - powershell -Command "pytest --pyargs sklearn" + docker container run \ + --rm scikit-learn/minimal-windows \ + powershell -Command "python -c 'import sklearn; sklearn.show_versions()'" - docker container run --rm scikit-learn/minimal-windows \ - powershell -Command "python -m threadpoolctl -i sklearn" + docker container run \ + -e SKLEARN_SKIP_NETWORK_TESTS=1 \ + -e OMP_NUM_THREADS=2 \ + -e OPENBLAS_NUM_THREADS=2 \ + --rm scikit-learn/minimal-windows \ + powershell -Command "pytest --pyargs sklearn" fi diff --git a/build_tools/travis/test_wheels.sh b/build_tools/travis/test_wheels.sh index aa3d0d8c0ef1b..11d4bd73cedd7 100755 --- a/build_tools/travis/test_wheels.sh +++ b/build_tools/travis/test_wheels.sh @@ -3,7 +3,7 @@ pip install --upgrade pip || travis_terminate $? pip install pytest pytest-xdist || travis_terminate $? +# Test that there are no links to system libraries in the threadpoolctl +# section of the show_versions output. +python -c "import sklearn; sklearn.show_versions()" || travis_terminate $? python -m pytest -n $CPU_COUNT --pyargs sklearn || travis_terminate $? - -# Test that there are no links to system libraries -python -m threadpoolctl -i sklearn || travis_terminate $? 
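The wheel-test scripts above drop the separate `python -m threadpoolctl -i sklearn` call because `sklearn.show_versions()` now prints a threadpoolctl section (see the `test_show_versions.py` change further down). A minimal sketch, assuming only that the `threadpoolctl` package is installed, of how the same information can be read programmatically:

from threadpoolctl import threadpool_info

# threadpool_info() lists the BLAS/OpenMP libraries currently loaded in the
# process; this is roughly what the new "threadpoolctl info" section of
# sklearn.show_versions() reports and what the wheel tests inspect for
# unwanted links to system libraries.
for lib in threadpool_info():
    print(lib["user_api"], lib["filepath"], lib["num_threads"])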
diff --git a/doc/authors.rst b/doc/authors.rst index 73631f2be0121..9a2b1f89d5d06 100644 --- a/doc/authors.rst +++ b/doc/authors.rst @@ -42,6 +42,10 @@
Adrin Jalali
+ diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 985a246eb73b4..54e7f66ffc3c9 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -254,33 +254,35 @@ how to set up your git repository: .. prompt:: bash $ - git clone git@github.com:YourLogin/scikit-learn.git # add --depth 1 if your connection is slow - cd scikit-learn + git clone git@github.com:YourLogin/scikit-learn.git # add --depth 1 if your connection is slow + cd scikit-learn -4. Install the development dependencies: - - .. prompt:: bash $ - - pip install cython pytest pytest-cov flake8 mypy black==21.6b0 - -5. Install scikit-learn in editable mode: +3. Follow steps 2-7 in :ref:`install_bleeding_edge` to build scikit-learn in + development mode and return to this document. - .. prompt:: bash $ +4. Install the development dependencies: - pip install --no-build-isolation --editable . + .. prompt:: bash $ - If you receive errors in building scikit-learn, see the - :ref:`install_bleeding_edge` section. + pip install pytest pytest-cov flake8 mypy black==21.6b0 .. _upstream: -6. Add the ``upstream`` remote. This saves a reference to the main +5. Add the ``upstream`` remote. This saves a reference to the main scikit-learn repository, which you can use to keep your repository synchronized with the latest changes: .. prompt:: bash $ - git remote add upstream https://github.com/scikit-learn/scikit-learn.git + git remote add upstream git@github.com:scikit-learn/scikit-learn.git + +6. Check that the `upstream` and `origin` remote aliases are configured correctly + by running `git remote -v` which should display:: + + origin git@github.com:YourLogin/scikit-learn.git (fetch) + origin git@github.com:YourLogin/scikit-learn.git (push) + upstream git@github.com:scikit-learn/scikit-learn.git (fetch) + upstream git@github.com:scikit-learn/scikit-learn.git (push) You should now have a working installation of scikit-learn, and your git repository properly configured. The next steps now describe the process of @@ -536,7 +538,7 @@ profiling and Cython optimizations. For two very well documented and more detailed guides on development workflow, please pay a visit to the `Scipy Development Workflow -{html.escape(estimator_str)}{fallback_msg}" "
{str(est)}' in html_output + assert ( + '' + html.escape(str(est)) + ) in html_output # low level estimators do not show changes with config_context(print_changed_only=True): - assert str(num_trans["pass"]) in html_output + assert html.escape(str(num_trans["pass"])) in html_output assert "passthrough" in html_output - assert str(num_trans["imputer"]) in html_output + assert html.escape(str(num_trans["imputer"])) in html_output for _, _, cols in preprocess.transformers: - assert f"{cols}" in html_output + assert f"{html.escape(str(cols))}" in html_output # feature union for name, _ in feat_u.transformer_list: - assert f"" in html_output + assert f"" in html_output pca = feat_u.transformer_list[0][1] - assert f"{str(pca)}" in html_output + assert f"{html.escape(str(pca))}" in html_output tsvd = feat_u.transformer_list[1][1] first = tsvd["first"] select = tsvd["select"] - assert f"{str(first)}" in html_output - assert f"{str(select)}" in html_output + assert f"{html.escape(str(first))}" in html_output + assert f"{html.escape(str(select))}" in html_output # voting classifier for name, est in clf.estimators: - assert f"" in html_output - assert f"{str(est)}" in html_output + assert f"" in html_output + assert f"{html.escape(str(est))}" in html_output @pytest.mark.parametrize("final_estimator", [None, LinearSVC()]) @@ -209,7 +211,7 @@ def test_stacking_classsifer(final_estimator): html_output = estimator_html_repr(clf) - assert str(clf) in html_output + assert html.escape(str(clf)) in html_output # If final_estimator's default changes from LogisticRegression # this should be updated if final_estimator is None: @@ -225,12 +227,12 @@ def test_stacking_regressor(final_estimator): ) html_output = estimator_html_repr(reg) - assert str(reg.estimators[0][0]) in html_output + assert html.escape(str(reg.estimators[0][0])) in html_output assert "LinearSVR" in html_output if final_estimator is None: assert "RidgeCV" in html_output else: - assert final_estimator.__class__.__name__ in html_output + assert html.escape(final_estimator.__class__.__name__) in html_output def test_birch_duck_typing_meta(): @@ -240,11 +242,11 @@ def test_birch_duck_typing_meta(): # inner estimators do not show changes with config_context(print_changed_only=True): - assert f"{str(birch.n_clusters)}" in html_output + assert f"{html.escape(str(birch.n_clusters))}" in html_output assert "AgglomerativeClustering" in html_output # outer estimator contains all changes - assert f"{str(birch)}" in html_output + assert f"{html.escape(str(birch))}" in html_output def test_ovo_classifier_duck_typing_meta(): @@ -254,11 +256,11 @@ def test_ovo_classifier_duck_typing_meta(): # inner estimators do not show changes with config_context(print_changed_only=True): - assert f"{str(ovo.estimator)}" in html_output + assert f"{html.escape(str(ovo.estimator))}" in html_output assert "LinearSVC" in html_output # outer estimator - assert f"{str(ovo)}" in html_output + assert f"{html.escape(str(ovo))}" in html_output def test_duck_typing_nested_estimator(): @@ -267,8 +269,8 @@ def test_duck_typing_nested_estimator(): gp = GaussianProcessRegressor(kernel=kernel) html_output = estimator_html_repr(gp) - assert f"{str(kernel)}" in html_output - assert f"{str(gp)}" in html_output + assert f"{html.escape(str(kernel))}" in html_output + assert f"{html.escape(str(gp))}" in html_output @pytest.mark.parametrize("print_changed_only", [True, False]) @@ -276,7 +278,7 @@ def test_one_estimator_print_change_only(print_changed_only): pca = PCA(n_components=10) with 
config_context(print_changed_only=print_changed_only): - pca_repr = str(pca) + pca_repr = html.escape(str(pca)) html_output = estimator_html_repr(pca) assert pca_repr in html_output diff --git a/sklearn/utils/tests/test_show_versions.py b/sklearn/utils/tests/test_show_versions.py index a2c54379540ca..e6590bfde15f5 100644 --- a/sklearn/utils/tests/test_show_versions.py +++ b/sklearn/utils/tests/test_show_versions.py @@ -1,3 +1,4 @@ +from sklearn.utils.fixes import threadpool_info from sklearn.utils._show_versions import _get_sys_info from sklearn.utils._show_versions import _get_deps_info from sklearn.utils._show_versions import show_versions @@ -34,3 +35,7 @@ def test_show_versions(capsys): assert "python" in out assert "numpy" in out + + info = threadpool_info() + if info: + assert "threadpoolctl info:" in out diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 167118fb4ff8f..18f88373b02f3 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -175,23 +175,75 @@ def test_check_array_force_all_finite_valid(value, force_all_finite, retype): @pytest.mark.parametrize( - "value, force_all_finite, match_msg", + "value, input_name, force_all_finite, match_msg", [ - (np.inf, True, "Input contains NaN, infinity"), - (np.inf, "allow-nan", "Input contains infinity"), - (np.nan, True, "Input contains NaN, infinity"), - (np.nan, "allow-inf", 'force_all_finite should be a bool or "allow-nan"'), - (np.nan, 1, "Input contains NaN, infinity"), + (np.inf, "", True, "Input contains infinity"), + (np.inf, "X", True, "Input X contains infinity"), + (np.inf, "sample_weight", True, "Input sample_weight contains infinity"), + (np.inf, "X", "allow-nan", "Input X contains infinity"), + (np.nan, "", True, "Input contains NaN"), + (np.nan, "X", True, "Input X contains NaN"), + (np.nan, "y", True, "Input y contains NaN"), + ( + np.nan, + "", + "allow-inf", + 'force_all_finite should be a bool or "allow-nan"', + ), + (np.nan, "", 1, "Input contains NaN"), ], ) @pytest.mark.parametrize("retype", [np.asarray, sp.csr_matrix]) def test_check_array_force_all_finiteinvalid( - value, force_all_finite, match_msg, retype + value, input_name, force_all_finite, match_msg, retype ): - X = retype(np.arange(4).reshape(2, 2).astype(float)) + X = retype(np.arange(4).reshape(2, 2).astype(np.float64)) X[0, 0] = value with pytest.raises(ValueError, match=match_msg): - check_array(X, force_all_finite=force_all_finite, accept_sparse=True) + check_array( + X, + input_name=input_name, + force_all_finite=force_all_finite, + accept_sparse=True, + ) + + +@pytest.mark.parametrize("input_name", ["X", "y", "sample_weight"]) +@pytest.mark.parametrize("retype", [np.asarray, sp.csr_matrix]) +def test_check_array_links_to_imputer_doc_only_for_X(input_name, retype): + data = retype(np.arange(4).reshape(2, 2).astype(np.float64)) + data[0, 0] = np.nan + estimator = SVR() + extended_msg = ( + f"\n{estimator.__class__.__name__} does not accept missing values" + " encoded as NaN natively. For supervised learning, you might want" + " to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor" + " which accept missing values encoded as NaNs natively." + " Alternatively, it is possible to preprocess the" + " data, for instance by using an imputer transformer in a pipeline" + " or drop samples with missing values. 
See" + " https://scikit-learn.org/stable/modules/impute.html" + ) + + with pytest.raises(ValueError, match=f"Input {input_name} contains NaN") as ctx: + check_array( + data, + estimator=estimator, + input_name=input_name, + accept_sparse=True, + ) + + if input_name == "X": + assert extended_msg in ctx.value.args[0] + else: + assert extended_msg not in ctx.value.args[0] + + if input_name == "X": + # Veriy that _validate_data is automatically called with the right argument + # to generate the same exception: + with pytest.raises(ValueError, match=f"Input {input_name} contains NaN") as ctx: + SVR().fit(data, np.ones(data.shape[0])) + assert extended_msg in ctx.value.args[0] def test_check_array_force_all_finite_object(): @@ -212,15 +264,15 @@ def test_check_array_force_all_finite_object(): [ ( np.array([[1, np.nan]]), - "Input contains NaN, infinity or a value too large for.*int", + "Input contains NaN.", ), ( np.array([[1, np.nan]]), - "Input contains NaN, infinity or a value too large for.*int", + "Input contains NaN.", ), ( np.array([[1, np.inf]]), - "Input contains NaN, infinity or a value too large for.*int", + "Input contains infinity or a value too large for.*int", ), (np.array([[1, np.nan]], dtype=object), "cannot convert float NaN to integer"), ], @@ -390,7 +442,9 @@ def test_check_array_dtype_numeric_errors(X): check_array(X, dtype="numeric") -@pytest.mark.parametrize("pd_dtype", ["Int8", "Int16", "UInt8", "UInt16"]) +@pytest.mark.parametrize( + "pd_dtype", ["Int8", "Int16", "UInt8", "UInt16", "Float32", "Float64"] +) @pytest.mark.parametrize( "dtype, expected_dtype", [ @@ -400,14 +454,18 @@ def test_check_array_dtype_numeric_errors(X): ], ) def test_check_array_pandas_na_support(pd_dtype, dtype, expected_dtype): - # Test pandas IntegerArray with pd.NA + # Test pandas numerical extension arrays with pd.NA pd = pytest.importorskip("pandas", minversion="1.0") + if pd_dtype in {"Float32", "Float64"}: + # Extension dtypes with Floats was added in 1.2 + pd = pytest.importorskip("pandas", minversion="1.2") + X_np = np.array( [[1, 2, 3, np.nan, np.nan], [np.nan, np.nan, 8, 4, 6], [1, 2, 3, 4, 5]] ).T - # Creates dataframe with IntegerArrays with pd.NA + # Creates dataframe with numerical extension arrays with pd.NA X = pd.DataFrame(X_np, dtype=pd_dtype, columns=["a", "b", "c"]) # column c has no nans X["c"] = X["c"].astype("float") @@ -419,7 +477,7 @@ def test_check_array_pandas_na_support(pd_dtype, dtype, expected_dtype): assert_allclose(X_checked, X_np) assert X_checked.dtype == expected_dtype - msg = "Input contains NaN, infinity" + msg = "Input contains NaN" with pytest.raises(ValueError, match=msg): check_array(X, force_all_finite=True) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 0380af76f5140..9d6035d52ed9b 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -87,7 +87,9 @@ def inner_f(*args, **kwargs): return _inner_deprecate_positional_args -def _assert_all_finite(X, allow_nan=False, msg_dtype=None): +def _assert_all_finite( + X, allow_nan=False, msg_dtype=None, estimator_name=None, input_name="" +): """Like assert_all_finite, but only for ndarray.""" # validation is also imported in extmath from .extmath import _safe_accumulator_op @@ -103,26 +105,52 @@ def _assert_all_finite(X, allow_nan=False, msg_dtype=None): if is_float and (np.isfinite(_safe_accumulator_op(np.sum, X))): pass elif is_float: - msg_err = "Input contains {} or a value too large for {!r}." 
if ( allow_nan and np.isinf(X).any() or not allow_nan and not np.isfinite(X).all() ): - type_err = "infinity" if allow_nan else "NaN, infinity" - raise ValueError( - msg_err.format( - type_err, msg_dtype if msg_dtype is not None else X.dtype + if not allow_nan and np.isnan(X).any(): + type_err = "NaN" + else: + msg_dtype = msg_dtype if msg_dtype is not None else X.dtype + type_err = f"infinity or a value too large for {msg_dtype!r}" + padded_input_name = input_name + " " if input_name else "" + msg_err = f"Input {padded_input_name}contains {type_err}." + if ( + not allow_nan + and estimator_name + and input_name == "X" + and np.isnan(X).any() + ): + # Improve the error message on how to handle missing values in + # scikit-learn. + msg_err += ( + f"\n{estimator_name} does not accept missing values" + " encoded as NaN natively. For supervised learning, you might want" + " to consider sklearn.ensemble.HistGradientBoostingClassifier and" + " Regressor which accept missing values encoded as NaNs natively." + " Alternatively, it is possible to preprocess the data, for" + " instance by using an imputer transformer in a pipeline or drop" + " samples with missing values. See" + " https://scikit-learn.org/stable/modules/impute.html" ) - ) + raise ValueError(msg_err) + # for object dtype data, we only check for NaNs (GH-13254) elif X.dtype == np.dtype("object") and not allow_nan: if _object_dtype_isnan(X).any(): raise ValueError("Input contains NaN") -def assert_all_finite(X, *, allow_nan=False): +def assert_all_finite( + X, + *, + allow_nan=False, + estimator_name=None, + input_name="", +): """Throw a ValueError if X contains NaN or infinity. Parameters @@ -130,12 +158,26 @@ def assert_all_finite(X, *, allow_nan=False): X : {ndarray, sparse matrix} allow_nan : bool, default=False + + estimator_name : str, default=None + The estimator name, used to construct the error message. + + input_name : str, default="" + The data name used to construct the error message. In particular + if `input_name` is "X" and the data has NaN values and + allow_nan is False, the error message will link to the imputer + documentation. """ - _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan) + _assert_all_finite( + X.data if sp.issparse(X) else X, + allow_nan=allow_nan, + estimator_name=estimator_name, + input_name=input_name, + ) def as_float_array(X, *, copy=True, force_all_finite=True): - """Converts an array-like to an array of floats. + """Convert an array-like to an array of floats. The new dtype will be np.float32 or np.float64, depending on the original type. The function can create a copy or modify the argument depending @@ -144,6 +186,7 @@ def as_float_array(X, *, copy=True, force_all_finite=True): Parameters ---------- X : {array-like, sparse matrix} + The input data. copy : bool, default=True If True, a copy of X will be created. If False, a copy may still be @@ -379,7 +422,14 @@ def indexable(*iterables): def _ensure_sparse_format( - spmatrix, accept_sparse, dtype, copy, force_all_finite, accept_large_sparse + spmatrix, + accept_sparse, + dtype, + copy, + force_all_finite, + accept_large_sparse, + estimator_name=None, + input_name="", ): """Convert a sparse matrix to a given format. @@ -419,6 +469,16 @@ def _ensure_sparse_format( .. versionchanged:: 0.23 Accepts `pd.NA` and converts it into `np.nan` + + estimator_name : str, default=None + The estimator name, used to construct the error message. + + input_name : str, default="" + The data name used to construct the error message. 
In particular + if `input_name` is "X" and the data has NaN values and + allow_nan is False, the error message will link to the imputer + documentation. + Returns ------- spmatrix_converted : sparse matrix. @@ -475,7 +535,12 @@ def _ensure_sparse_format( stacklevel=2, ) else: - _assert_all_finite(spmatrix.data, allow_nan=force_all_finite == "allow-nan") + _assert_all_finite( + spmatrix.data, + allow_nan=force_all_finite == "allow-nan", + estimator_name=estimator_name, + input_name=input_name, + ) return spmatrix @@ -490,6 +555,42 @@ def _ensure_no_complex_data(array): raise ValueError("Complex data not supported\n{}\n".format(array)) +def _check_estimator_name(estimator): + if estimator is not None: + if isinstance(estimator, str): + return estimator + else: + return estimator.__class__.__name__ + return None + + +def _pandas_dtype_needs_early_conversion(pd_dtype): + """Return True if pandas extension pd_dtype need to be converted early.""" + try: + from pandas.api.types import ( + is_extension_array_dtype, + is_float_dtype, + is_integer_dtype, + is_sparse, + ) + except ImportError: + return False + + if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype): + # Sparse arrays will be converted later in `check_array` + # Only handle extension arrays for interger and floats + return False + elif is_float_dtype(pd_dtype): + # Float ndarrays can normally support nans. They need to be converted + # first to map pd.NA to np.nan + return True + elif is_integer_dtype(pd_dtype): + # XXX: Warn when converting from a high integer to a float + return True + + return False + + def check_array( array, accept_sparse=False, @@ -504,6 +605,7 @@ def check_array( ensure_min_samples=1, ensure_min_features=1, estimator=None, + input_name="", ): """Input validation on an array, list, sparse matrix or similar. @@ -583,6 +685,14 @@ def check_array( estimator : str or estimator instance, default=None If passed, include the name of the estimator in warning messages. + input_name : str, default="" + The data name used to construct the error message. In particular + if `input_name` is "X" and the data has NaN values and + allow_nan is False, the error message will link to the imputer + documentation. + + .. versionadded:: 1.1.0 + Returns ------- array_converted : object @@ -612,7 +722,7 @@ def check_array( # check if the object contains several dtypes (typically a pandas # DataFrame), and store them. If not, store None. dtypes_orig = None - has_pd_integer_array = False + pandas_requires_conversion = False if hasattr(array, "dtypes") and hasattr(array.dtypes, "__array__"): # throw warning if columns are sparse. If all columns are sparse, then # array.sparse exists and sparsity will be preserved (later). @@ -625,42 +735,17 @@ def check_array( "It will be converted to a dense numpy array." 
) - dtypes_orig = list(array.dtypes) - # pandas boolean dtype __array__ interface coerces bools to objects - for i, dtype_iter in enumerate(dtypes_orig): + dtypes_orig = [] + for dtype_iter in array.dtypes: if dtype_iter.kind == "b": - dtypes_orig[i] = np.dtype(object) - elif dtype_iter.name.startswith(("Int", "UInt")): - # name looks like an Integer Extension Array, now check for - # the dtype - with suppress(ImportError): - from pandas import ( - Int8Dtype, - Int16Dtype, - Int32Dtype, - Int64Dtype, - UInt8Dtype, - UInt16Dtype, - UInt32Dtype, - UInt64Dtype, - ) - - if isinstance( - dtype_iter, - ( - Int8Dtype, - Int16Dtype, - Int32Dtype, - Int64Dtype, - UInt8Dtype, - UInt16Dtype, - UInt32Dtype, - UInt64Dtype, - ), - ): - has_pd_integer_array = True - - if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig): + # pandas boolean dtype __array__ interface coerces bools to objects + dtype_iter = np.dtype(object) + elif _pandas_dtype_needs_early_conversion(dtype_iter): + pandas_requires_conversion = True + + dtypes_orig.append(dtype_iter) + + if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig): dtype_orig = np.result_type(*dtypes_orig) if dtype_numeric: @@ -679,9 +764,12 @@ def check_array( # list of accepted types. dtype = dtype[0] - if has_pd_integer_array: - # If there are any pandas integer extension arrays, + if pandas_requires_conversion: + # pandas dataframe requires conversion earlier to handle extension dtypes with + # nans array = array.astype(dtype) + # Since we converted here, we do not need to convert again later + dtype = None if force_all_finite not in (True, False, "allow-nan"): raise ValueError( @@ -690,13 +778,7 @@ def check_array( ) ) - if estimator is not None: - if isinstance(estimator, str): - estimator_name = estimator - else: - estimator_name = estimator.__class__.__name__ - else: - estimator_name = "Estimator" + estimator_name = _check_estimator_name(estimator) context = " by %s" % estimator_name if estimator is not None else "" # When all dataframe columns are sparse, convert to a sparse array @@ -723,6 +805,8 @@ def check_array( copy=copy, force_all_finite=force_all_finite, accept_large_sparse=accept_large_sparse, + estimator_name=estimator_name, + input_name=input_name, ) else: # If np.array(..) gives ComplexWarning, then we convert the warning @@ -739,7 +823,13 @@ def check_array( # then conversion float -> int would be disallowed. 
array = np.asarray(array, order=order) if array.dtype.kind == "f": - _assert_all_finite(array, allow_nan=False, msg_dtype=dtype) + _assert_all_finite( + array, + allow_nan=False, + msg_dtype=dtype, + estimator_name=estimator_name, + input_name=input_name, + ) array = array.astype(dtype, casting="unsafe", copy=False) else: array = np.asarray(array, order=order, dtype=dtype) @@ -796,7 +886,12 @@ def check_array( ) if force_all_finite: - _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan") + _assert_all_finite( + array, + input_name=input_name, + estimator_name=estimator_name, + allow_nan=force_all_finite == "allow-nan", + ) if ensure_min_samples > 0: n_samples = _num_samples(array) @@ -973,24 +1068,32 @@ def check_X_y( ensure_min_samples=ensure_min_samples, ensure_min_features=ensure_min_features, estimator=estimator, + input_name="X", ) - y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric) + y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator) check_consistent_length(X, y) return X, y -def _check_y(y, multi_output=False, y_numeric=False): +def _check_y(y, multi_output=False, y_numeric=False, estimator=None): """Isolated part of check_X_y dedicated to y validation""" if multi_output: y = check_array( - y, accept_sparse="csr", force_all_finite=True, ensure_2d=False, dtype=None + y, + accept_sparse="csr", + force_all_finite=True, + ensure_2d=False, + dtype=None, + input_name="y", + estimator=estimator, ) else: + estimator_name = _check_estimator_name(estimator) y = column_or_1d(y, warn=True) - _assert_all_finite(y) + _assert_all_finite(y, input_name="y", estimator_name=estimator_name) _ensure_no_complex_data(y) if y_numeric and y.dtype.kind == "O": y = y.astype(np.float64) @@ -1004,6 +1107,7 @@ def column_or_1d(y, *, warn=False): Parameters ---------- y : array-like + Input data. warn : bool, default=False To control display of warnings. @@ -1011,7 +1115,12 @@ def column_or_1d(y, *, warn=False): Returns ------- y : ndarray + Output data. + Raises + ------- + ValueError + If `y` is not a 1D array or a 2D array with a single row or column. """ y = np.asarray(y) shape = np.shape(y) @@ -1056,7 +1165,7 @@ def check_random_state(seed): def has_fit_parameter(estimator, parameter): - """Checks whether the estimator's fit method supports the given parameter. + """Check whether the estimator's fit method supports the given parameter. Parameters ---------- @@ -1068,7 +1177,7 @@ def has_fit_parameter(estimator, parameter): Returns ------- - is_parameter: bool + is_parameter : bool Whether the parameter was found to be a named parameter of the estimator's fit method. @@ -1078,7 +1187,6 @@ def has_fit_parameter(estimator, parameter): >>> from sklearn.utils.validation import has_fit_parameter >>> has_fit_parameter(SVC(), "sample_weight") True - """ return parameter in signature(estimator.fit).parameters @@ -1558,6 +1666,7 @@ def _check_sample_weight( dtype=dtype, order="C", copy=copy, + input_name="sample_weight", ) if sample_weight.ndim != 1: raise ValueError("Sample weights must be 1D array or scalar") @@ -1702,7 +1811,7 @@ def _get_feature_names(X): return feature_names -def _check_feature_names_in(estimator, input_features=None): +def _check_feature_names_in(estimator, input_features=None, *, generate_names=True): """Get output feature names for transformation. 
Parameters @@ -1716,9 +1825,13 @@ - If `input_features` is an array-like, then `input_features` must match `feature_names_in_` if `feature_names_in_` is defined. + generate_names : bool, default=True + Whether to generate names when `input_features` is `None` and + `estimator.feature_names_in_` is not defined. + Returns ------- - feature_names_in : ndarray of str + feature_names_in : ndarray of str or `None` Feature names in. """ @@ -1742,8 +1855,40 @@ if feature_names_in_ is not None: return feature_names_in_ + if not generate_names: + return + # Generates feature names if `n_features_in_` is defined if n_features_in_ is None: raise ValueError("Unable to generate feature names without n_features_in_") return np.asarray([f"x{i}" for i in range(n_features_in_)], dtype=object) + + +def _generate_get_feature_names_out(estimator, n_features_out, input_features=None): + """Generate feature names out for an estimator using the estimator name as the prefix. + + The `input_features` names are validated but not used. This function is useful + for estimators that generate their own names based on `n_features_out`, e.g. PCA. + + Parameters + ---------- + estimator : estimator instance + Estimator producing output feature names. + + n_features_out : int + Number of feature names out. + + input_features : array-like of str or None, default=None + Only used to validate feature names with `estimator.feature_names_in_`. + + Returns + ------- + feature_names_out : ndarray of str + Feature names out. + """ + _check_feature_names_in(estimator, input_features, generate_names=False) + estimator_name = estimator.__class__.__name__.lower() + return np.asarray( + [f"{estimator_name}{i}" for i in range(n_features_out)], dtype=object + )
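Taken together, the `input_name` plumbing above changes what callers of `check_array` see when validation fails. A minimal sketch, assuming a scikit-learn build that includes these changes:

import numpy as np
from sklearn.utils.validation import check_array

X = np.array([[1.0, np.nan], [3.0, 4.0]])
try:
    # force_all_finite=True is the default; input_name is the new parameter.
    check_array(X, input_name="X")
except ValueError as exc:
    # The message now reads "Input X contains NaN." instead of the old
    # "Input contains NaN, infinity or a value too large ..." text; when an
    # estimator is also passed, the X case additionally links to the imputer docs.
    print(exc)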