From 57b0f1872ca45f0b8e542be0f1c01c0bb0669a48 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 22 Dec 2020 15:24:50 +0100 Subject: [PATCH 01/38] DOC 0.24.0 release highlights formatting (#19059) From 319bc3911039f899a1d52acfd89a431593fc9597 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 22 Dec 2020 16:02:21 +0100 Subject: [PATCH 02/38] MNT update the number of wheels generated to upload to PyPI --- build_tools/github/check_wheels.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build_tools/github/check_wheels.py b/build_tools/github/check_wheels.py index c1c183f279b54..05c70085b4081 100644 --- a/build_tools/github/check_wheels.py +++ b/build_tools/github/check_wheels.py @@ -11,11 +11,11 @@ build_matrix = wheel_config['jobs']['build_wheels']['strategy']['matrix'] n_python_versions = len(build_matrix['python']) -# For each python version we have: 5 wheels +# For each python version we have: 6 wheels # 1 osx wheel (x86_64) -# 2 linux wheel (i686 + x86_64) +# 3 linux wheel (i686 + x86_64 + arm64) # 2 windows wheel (win32 + wind_amd64) -n_wheels = 5 * n_python_versions +n_wheels = 6 * n_python_versions # plus one more for the sdist n_wheels += 1 From 203fb7c9cf70c6213c9f1f96e167b5b8f4e894f4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 22 Dec 2020 16:22:02 +0100 Subject: [PATCH 03/38] MNT fix publish to pypi conditions --- .github/workflows/publish_pypi.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/publish_pypi.yml b/.github/workflows/publish_pypi.yml index 0e2b9ffd0f258..677188a3567b3 100644 --- a/.github/workflows/publish_pypi.yml +++ b/.github/workflows/publish_pypi.yml @@ -40,10 +40,10 @@ jobs: user: __token__ password: ${{ secrets.TEST_PYPI_TOKEN }} repository_url: https://test.pypi.org/legacy/ - if: ${{ github.event.inputs.pypi_repo }} == 'testpypi' + if: ${{ github.event.inputs.pypi_repo == 'testpypi' }} - name: Publish package to PyPI uses: pypa/gh-action-pypi-publish@v1.4.1 with: user: __token__ password: ${{ secrets.PYPI_TOKEN }} - if: ${{ github.event.inputs.pypi_repo }} == 'pypi' + if: ${{ github.event.inputs.pypi_repo == 'pypi' }} From e467ba8c879f384535875d68673001759862fea6 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Tue, 22 Dec 2020 11:48:46 -0500 Subject: [PATCH 04/38] CI Publish to Pypi workflow for aarch64 wheels (#19060) --- build_tools/github/check_wheels.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/build_tools/github/check_wheels.py b/build_tools/github/check_wheels.py index 05c70085b4081..64cebe3b6b0c4 100644 --- a/build_tools/github/check_wheels.py +++ b/build_tools/github/check_wheels.py @@ -11,15 +11,25 @@ build_matrix = wheel_config['jobs']['build_wheels']['strategy']['matrix'] n_python_versions = len(build_matrix['python']) -# For each python version we have: 6 wheels +# For each python version we have: 5 wheels # 1 osx wheel (x86_64) -# 3 linux wheel (i686 + x86_64 + arm64) +# 2 linux wheel (i686 + x86_64) # 2 windows wheel (win32 + wind_amd64) -n_wheels = 6 * n_python_versions +n_wheels = 5 * n_python_versions # plus one more for the sdist n_wheels += 1 +# aarch64 builds from travis +travis_config_path = Path.cwd() / ".travis.yml" +with travis_config_path.open('r') as f: + travis_config = yaml.safe_load(f) + +jobs = travis_config['jobs']['include'] +travis_builds = [j for j in jobs + if any("CIBW_BUILD" in env for env in j["env"])] +n_wheels += len(travis_builds) + dist_files = list(Path("dist").glob('**/*')) n_dist_files = len(dist_files) From d9bdc06b0e7c5a15f475d54f8d6b9eb3d50801a6 Mon Sep 17 00:00:00 2001 From: Harry Wei Date: Mon, 4 Jan 2021 18:00:06 +0800 Subject: [PATCH 05/38] DOC typo correction in neighbors.rst (#19099) --- doc/modules/neighbors.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 36a9c86d98e24..bb84b79e8570a 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -433,7 +433,7 @@ based on the following assumptions: training points * ``leaf_size`` is close to its default value of ``30`` * when :math:`D > 15`, the intrinsic dimensionality of the data is generally - to high for tree-based methods + too high for tree-based methods Effect of ``leaf_size`` ----------------------- From 665a389c3d877181b0ef474b57d721e3eba7afac Mon Sep 17 00:00:00 2001 From: yzhenman <65328572+yzhenman@users.noreply.github.com> Date: Mon, 4 Jan 2021 02:04:55 -0800 Subject: [PATCH 06/38] DOC fix dataset used for visualization in digits classification example (#19095) --- examples/classification/plot_digits_classification.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/classification/plot_digits_classification.py b/examples/classification/plot_digits_classification.py index 35843883df1b2..0d1a79f609f7d 100644 --- a/examples/classification/plot_digits_classification.py +++ b/examples/classification/plot_digits_classification.py @@ -78,8 +78,9 @@ # digit value in the title. _, axes = plt.subplots(nrows=1, ncols=4, figsize=(10, 3)) -for ax, image, prediction in zip(axes, digits.images, predicted): +for ax, image, prediction in zip(axes, X_test, predicted): ax.set_axis_off() + image = image.reshape(8, 8) ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') ax.set_title(f'Prediction: {prediction}') From 5be5808b27defdbe28d697aeed5d795eace8e812 Mon Sep 17 00:00:00 2001 From: "Paulo S. 
Costa" Date: Tue, 5 Jan 2021 19:19:15 -0800 Subject: [PATCH 07/38] DOC Fix cross-validation wording in RidgeCV (#19121) --- sklearn/linear_model/_ridge.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 199a1cd760660..7e8d5d273d1df 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1670,8 +1670,7 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): See glossary entry for :term:`cross-validation estimator`. - By default, it performs Leave-One-Out Cross-Validation, which is a form of - efficient Leave-One-Out cross-validation. + By default, it performs efficient Leave-One-Out Cross-Validation. Read more in the :ref:`User Guide `. From 98bcc24a77140b9ed9bb464e6863943f5e619398 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 6 Jan 2021 04:11:53 -0500 Subject: [PATCH 08/38] TST Skips test_compare_to_ELKI for arm (#19115) * TST Skips test for arm [cd build] * CI Skip for 32bit linux [cd build] --- sklearn/cluster/tests/test_optics.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 03ca4995c0446..4428b6c00d7eb 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -1,6 +1,8 @@ # Authors: Shane Grigsby # Adrin Jalali # License: BSD 3 clause +import platform +import sys import numpy as np import pytest @@ -15,8 +17,10 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import assert_allclose +from sklearn.utils.fixes import sp_version, parse_version from sklearn.cluster.tests.common import generate_clustered_data +from sklearn.utils import _IS_32BIT rng = np.random.RandomState(0) @@ -314,6 +318,11 @@ def test_processing_order(): assert_array_equal(clust.ordering_, [0, 1, 2, 3]) +@pytest.mark.skipif(sp_version >= parse_version("1.6.0") + and (platform.machine() == "aarch64" or + (sys.platform == "linux" and _IS_32BIT)), + reason=("Test fails for SciPy 1.6.0 on ARM and on 32-bit " + "linux. See #19111")) def test_compare_to_ELKI(): # Expected values, computed with (future) ELKI 0.7.5 using: # java -jar elki.jar cli -dbc.in csv -dbc.filter FixedDBIDsFilter From 110333fdf98e41ca3f2f669ea0f9e8f07d17566b Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 6 Jan 2021 15:48:57 +0100 Subject: [PATCH 09/38] CI Reduce travis nightly load (#19113) Co-authored-by: Thomas J. Fan --- .travis.yml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3f631d9f8bc90..3c995f35253ae 100644 --- a/.travis.yml +++ b/.travis.yml @@ -36,15 +36,21 @@ jobs: - BUILD_WITH_ICC=true if: type = cron OR commit_message =~ /\[icc-build\]/ - - python: 3.7 + # Manual trigger of linux/arm64 tests in PR without triggering the full + # wheel building process for all the Python versions. + - python: 3.9 os: linux arch: arm64 - if: type = cron OR commit_message =~ /\[arm64\]/ + if: commit_message =~ /\[arm64\]/ env: - CPU_COUNT=8 - # Linux environments to build the scikit-learn wheels - # for the ARM64 arquitecture and Python 3.6 and newer + # Linux environments to build the scikit-learn wheels for the ARM64 + # architecture and Python 3.6 and newer. 
This is used both at release time + # with the manual trigger in the commit message in the release branch and as + # a scheduled task to build the weekly dev build on the master branch. The + # weekly frequency is meant to avoid depleting the Travis CI credits too + # fast. - python: 3.6 os: linux arch: arm64 From 867cf5f6c7f47156dda79de114f80717e75eee5f Mon Sep 17 00:00:00 2001 From: shinnar Date: Fri, 8 Jan 2021 03:12:02 -0500 Subject: [PATCH 10/38] DOC Fix docstring of HalvingSearch estimators (#19133) --- sklearn/model_selection/_search_successive_halving.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 512595b1943ce..b522ce7fbda41 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -448,7 +448,7 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): The refitted estimator is made available at the ``best_estimator_`` attribute and permits using ``predict`` directly on this - ``GridSearchCV`` instance. + ``HalvingGridSearchCV`` instance. error_score : 'raise' or numeric Value to assign to the score if an error occurs in estimator fitting. @@ -735,7 +735,7 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): The refitted estimator is made available at the ``best_estimator_`` attribute and permits using ``predict`` directly on this - ``GridSearchCV`` instance. + ``HalvingRandomSearchCV`` instance. error_score : 'raise' or numeric Value to assign to the score if an error occurs in estimator fitting. From 59f6ec959d4574503a450567b7f58fa0a386c6c6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 8 Jan 2021 12:39:00 +0100 Subject: [PATCH 11/38] FIX accept meta-estimator in SelfTrainingClassifier (#19126) Co-authored-by: Thomas J. Fan Co-authored-by: Olivier Grisel --- doc/whats_new/v0.24.rst | 17 ++++++++++++ sklearn/semi_supervised/_self_training.py | 8 +++--- .../tests/test_self_training.py | 27 ++++++++++++++++++- 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index b2c6db64969f0..ca96e875d342a 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -2,6 +2,23 @@ .. currentmodule:: sklearn +.. _changes_0_24_1: + +Version 0.24.1 +============== + +Changelog +--------- + +:mod:`sklearn.semi_supervised` +.............................. + +- |Fix| :class:`semi_supervised.SelfTrainingClassifier` is now accepting + meta-estimator (e.g. :class:`ensemble.StackingClassifier`). The validation + of this estimator is done on the fitted estimator, once we know the existence + of the method `predict_proba`. + :pr:`19126` by :user:`Guillaume Lemaitre `. + .. _changes_0_24: Version 0.24.0 diff --git a/sklearn/semi_supervised/_self_training.py b/sklearn/semi_supervised/_self_training.py index c3ed0baeaae68..8c79065c830d1 100644 --- a/sklearn/semi_supervised/_self_training.py +++ b/sklearn/semi_supervised/_self_training.py @@ -205,10 +205,10 @@ def fit(self, X, y): X[safe_mask(X, has_label)], self.transduction_[has_label]) - if self.n_iter_ == 1: - # Only validate in the first iteration so that n_iter=0 is - # equivalent to the base_estimator itself. - _validate_estimator(self.base_estimator) + # Validate the fitted estimator since `predict_proba` can be + # delegated to an underlying "final" fitted estimator as + # generally done in meta-estimator or pipeline. 
+ _validate_estimator(self.base_estimator_) # Predict on the unlabeled samples prob = self.base_estimator_.predict_proba( diff --git a/sklearn/semi_supervised/tests/test_self_training.py b/sklearn/semi_supervised/tests/test_self_training.py index b5c44996d5e52..7c5287be9974c 100644 --- a/sklearn/semi_supervised/tests/test_self_training.py +++ b/sklearn/semi_supervised/tests/test_self_training.py @@ -4,14 +4,16 @@ from numpy.testing import assert_array_equal import pytest +from sklearn.ensemble import StackingClassifier from sklearn.exceptions import NotFittedError -from sklearn.semi_supervised import SelfTrainingClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.model_selection import train_test_split from sklearn.datasets import load_iris, make_blobs from sklearn.metrics import accuracy_score +from sklearn.semi_supervised import SelfTrainingClassifier + # Author: Oliver Rausch # License: BSD 3 clause @@ -318,3 +320,26 @@ def test_k_best_selects_best(): for row in most_confident_svc.tolist(): assert row in added_by_st + + +def test_base_estimator_meta_estimator(): + # Check that a meta-estimator relying on an estimator implementing + # `predict_proba` will work even if it does expose this method before being + # fitted. + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/19119 + + base_estimator = StackingClassifier( + estimators=[ + ("svc_1", SVC(probability=True)), ("svc_2", SVC(probability=True)), + ], + final_estimator=SVC(probability=True), cv=2 + ) + + # make sure that the `base_estimator` does not expose `predict_proba` + # without being fitted + assert not hasattr(base_estimator, "predict_proba") + + clf = SelfTrainingClassifier(base_estimator=base_estimator) + clf.fit(X_train, y_train_missing_labels) + clf.predict_proba(X_test) From f317eadb6bdefd6c0e456ab49290afdefd2cb09f Mon Sep 17 00:00:00 2001 From: Connor Tann Date: Fri, 8 Jan 2021 13:34:35 +0000 Subject: [PATCH 12/38] DOC Fix typo in datasets.rst (#19136) Fix typo in dataset loading docs --- doc/datasets.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/datasets.rst b/doc/datasets.rst index 30efdae06b1e3..b9484a02ce84c 100644 --- a/doc/datasets.rst +++ b/doc/datasets.rst @@ -39,8 +39,8 @@ an array of shape ``n_samples`` * ``n_features`` with key ``data`` (except for 20newsgroups) and a numpy array of length ``n_samples``, containing the target values, with key ``target``. -The Bunch object is a dictionary that exposes its keys are attributes. -For more information about Bunch object, see :class:`~sklearn.utils.Bunch`: +The Bunch object is a dictionary that exposes its keys as attributes. +For more information about Bunch object, see :class:`~sklearn.utils.Bunch`. 
It's also possible for almost all of these function to constrain the output to be a tuple containing only the data and the target, by setting the From e4a63dba4c2f77954b1228cd7a1d3a472210c776 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 8 Jan 2021 18:55:53 +0100 Subject: [PATCH 13/38] DOC Update docs guideline regarding docstring formatting (#18243) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Olivier Grisel Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- doc/developers/contributing.rst | 14 ++++++- doc/glossary.rst | 7 ++++ .../model_selection/plot_learning_curve.py | 2 +- sklearn/dummy.py | 8 ++-- sklearn/linear_model/_least_angle.py | 38 ++++++++++--------- sklearn/preprocessing/_discretization.py | 6 +-- 6 files changed, 48 insertions(+), 27 deletions(-) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 78c1175620c4f..8a3c460c615a8 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -779,6 +779,8 @@ Finally, follow the formatting rules below to make it consistently good: sample_weight : array-like of shape (n_samples,), default=None + multioutput_array : ndarray of shape (n_samples, n_classes) or list of such arrays + In general have the following in mind: 1. Use Python basic types. (``bool`` instead of ``boolean``) @@ -792,10 +794,18 @@ Finally, follow the formatting rules below to make it consistently good: 5. Specify ``dataframe`` when "frame-like" features are being used, such as the column names. 6. When specifying the data type of a list, use ``of`` as a delimiter: - ``list of int``. + ``list of int``. When the parameter supports arrays giving details + about the shape and/or data type and a list of such arrays, you can + use one of ``array-like of shape (n_samples,) or list of such arrays``. 7. When specifying the dtype of an ndarray, use e.g. ``dtype=np.int32`` after defining the shape: - ``ndarray of shape (n_samples,), dtype=np.int32``. + ``ndarray of shape (n_samples,), dtype=np.int32``. You can specify + multiple dtype as a set: + ``array-like of shape (n_samples,), dtype={np.float64, np.float32}``. + If one wants to mention arbitrary precision, use `integral` and + `floating` rather than the Python dtype `int` and `float`. When both + `int` and `floating` are supported, there is no need to specify the + dtype. 8. When the default is ``None``, ``None`` only needs to be specified at the end with ``default=None``. Be sure to include in the docstring, what it means for the parameter or attribute to be ``None``. diff --git a/doc/glossary.rst b/doc/glossary.rst index 30e647be1c0f4..a43eda4a79b67 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -255,6 +255,13 @@ General Concepts or vectorizing. Our estimators do not work with struct arrays, for instance. + Our documentation can sometimes give information about the dtype + precision, e.g. `np.int32`, `np.int64`, etc. When the precision is + provided, it refers to the NumPy dtype. If an arbitrary precision is + used, the documentation will refer to dtype `integer` or `floating`. + Note that in this case, the precision can be platform dependent. + The `numeric` dtype refers to accepting both `integer` and `floating`. + TODO: Mention efficiency and precision issues; casting policy. 
duck typing diff --git a/examples/model_selection/plot_learning_curve.py b/examples/model_selection/plot_learning_curve.py index ee9809f27e44f..71cc565c3528c 100644 --- a/examples/model_selection/plot_learning_curve.py +++ b/examples/model_selection/plot_learning_curve.py @@ -77,7 +77,7 @@ def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None, ``-1`` means using all processors. See :term:`Glossary ` for more details. - train_sizes : array-like of shape (n_ticks,), dtype={int, float} + train_sizes : array-like of shape (n_ticks,) Relative or absolute numbers of training examples that will be used to generate the learning curve. If the ``dtype`` is float, it is regarded as a fraction of the maximum size of the training set (that is diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 66992d83f83f4..ad5ab3f24731d 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -64,13 +64,13 @@ class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator): Attributes ---------- - classes_ : ndarray of shape (n_classes,) or list thereof + classes_ : ndarray of shape (n_classes,) or list of such arrays Class labels for each output. n_classes_ : int or list of int Number of label for each output. - class_prior_ : ndarray of shape (n_classes,) or list thereof + class_prior_ : ndarray of shape (n_classes,) or list of such arrays Probability of each class for each output. n_outputs_ : int @@ -272,7 +272,7 @@ def predict_proba(self, X): Returns ------- - P : ndarray of shape (n_samples, n_classes) or list thereof + P : ndarray of shape (n_samples, n_classes) or list of such arrays Returns the probability of the sample for each class in the model, where classes are ordered arithmetically, for each output. @@ -335,7 +335,7 @@ def predict_log_proba(self, X): Returns ------- - P : ndarray of shape (n_samples, n_classes) or list thereof + P : ndarray of shape (n_samples, n_classes) or list of such arrays Returns the log probability of the sample for each class in the model, where classes are ordered arithmetically for each output. diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index e1d9146b5f2ea..55e37ff51fc6a 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -864,21 +864,22 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): Attributes ---------- - alphas_ : array-like of shape (n_alphas + 1,) or list of thereof of \ - shape (n_targets,) + alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays Maximum of covariances (in absolute value) at each iteration. ``n_alphas`` is either ``max_iter``, ``n_features`` or the number of nodes in the path with ``alpha >= alpha_min``, whichever - is smaller. + is smaller. If this is a list of array-like, the length of the outer + list is `n_targets`. - active_ : list of shape (n_alphas,) or list of thereof of shape \ - (n_targets,) + active_ : list of shape (n_alphas,) or list of such lists Indices of active variables at the end of the path. + If this is a list of list, the length of the outer list is `n_targets`. - coef_path_ : array-like of shape (n_features, n_alphas + 1) or list of \ - thereof of shape (n_targets,) + coef_path_ : array-like of shape (n_features, n_alphas + 1) or list \ + of such arrays The varying values of the coefficients along the path. It is not - present if the ``fit_path`` parameter is ``False``. + present if the ``fit_path`` parameter is ``False``. 
If this is a list + of array-like, the length of the outer list is `n_targets`. coef_ : array-like of shape (n_features,) or (n_targets, n_features) Parameter vector (w in the formulation formula). @@ -1121,21 +1122,23 @@ class LassoLars(Lars): Attributes ---------- - alphas_ : array-like of shape (n_alphas + 1,) or list of thereof of shape \ - (n_targets,) + alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays Maximum of covariances (in absolute value) at each iteration. ``n_alphas`` is either ``max_iter``, ``n_features`` or the number of nodes in the path with ``alpha >= alpha_min``, whichever - is smaller. + is smaller. If this is a list of array-like, the length of the outer + list is `n_targets`. - active_ : list of length n_alphas or list of thereof of shape (n_targets,) + active_ : list of length n_alphas or list of such lists Indices of active variables at the end of the path. + If this is a list of list, the length of the outer list is `n_targets`. - coef_path_ : array-like of shape (n_features, n_alphas + 1) or list of \ - thereof of shape (n_targets,) + coef_path_ : array-like of shape (n_features, n_alphas + 1) or list \ + of such arrays If a list is passed it's expected to be one of n_targets such arrays. The varying values of the coefficients along the path. It is not - present if the ``fit_path`` parameter is ``False``. + present if the ``fit_path`` parameter is ``False``. If this is a list + of array-like, the length of the outer list is `n_targets`. coef_ : array-like of shape (n_features,) or (n_targets, n_features) Parameter vector (w in the formulation formula). @@ -1382,8 +1385,9 @@ class LarsCV(Lars): Attributes ---------- - active_ : list of length n_alphas or list of thereof of shape (n_targets,) + active_ : list of length n_alphas or list of such lists Indices of active variables at the end of the path. + If this is a list of lists, the outer list length is `n_targets`. coef_ : array-like of shape (n_features,) parameter vector (w in the formulation formula) @@ -1775,7 +1779,7 @@ class LassoLarsIC(LassoLars): alpha_ : float the alpha parameter chosen by the information criterion - alphas_ : array-like of shape (n_alphas + 1,) or list thereof + alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays Maximum of covariances (in absolute value) at each iteration. ``n_alphas`` is either ``max_iter``, ``n_features`` or the number of nodes in the path with ``alpha >= alpha_min``, whichever diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index a628533ac13d0..22fa236f3314e 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -139,7 +139,7 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like of shape (n_samples, n_features), dtype={int, float} + X : array-like of shape (n_samples, n_features) Data to be discretized. y : None @@ -276,7 +276,7 @@ def transform(self, X): Parameters ---------- - X : array-like of shape (n_samples, n_features), dtype={int, float} + X : array-like of shape (n_samples, n_features) Data to be discretized. Returns @@ -326,7 +326,7 @@ def inverse_transform(self, Xt): Parameters ---------- - Xt : array-like of shape (n_samples, n_features), dtype={int, float} + Xt : array-like of shape (n_samples, n_features) Transformed data in the binned space. 
Returns From 3ebe1a5f9f974d06560e3a2e02a7980e27b24b92 Mon Sep 17 00:00:00 2001 From: Kunj Date: Sat, 9 Jan 2021 09:47:13 -0800 Subject: [PATCH 14/38] DOC Update docs for StandardScaler.scale_ to include 0 variance (#19124) Co-authored-by: Thomas J. Fan --- sklearn/preprocessing/_data.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 478d41ecc768a..3921b898c072d 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -617,8 +617,11 @@ class StandardScaler(TransformerMixin, BaseEstimator): Attributes ---------- scale_ : ndarray of shape (n_features,) or None - Per feature relative scaling of the data. This is calculated using - `np.sqrt(var_)`. Equal to ``None`` when ``with_std=False``. + Per feature relative scaling of the data to achieve zero mean and unit + variance. Generally this is calculated using `np.sqrt(var_)`. If a + variance is zero, we can't achieve unit variance, and the data is left + as-is, giving a scaling factor of 1. `scale_` is equal to `None` + when `with_std=False`. .. versionadded:: 0.17 *scale_* From 890caa4d6345d1db2893f784f259050cdca2ada3 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 11 Jan 2021 14:12:16 +0100 Subject: [PATCH 15/38] CI Use macos-10.13 compatible libomp when building the wheels (#19064) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> --- .github/workflows/wheels.yml | 1 + build_tools/github/build_wheels.sh | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index ac1d495642049..17726ec9a112b 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -86,6 +86,7 @@ jobs: OPENBLAS_NUM_THREADS=2 SKLEARN_SKIP_NETWORK_TESTS=1 SKLEARN_BUILD_PARALLEL=3 + MACOSX_DEPLOYMENT_TARGET=10.13 CIBW_BUILD: cp${{ matrix.python }}-${{ matrix.platform_id }} CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: bash build_tools/github/repair_windows_wheels.sh {wheel} {dest_dir} ${{ matrix.bitness }} CIBW_BEFORE_TEST_WINDOWS: bash build_tools/github/build_minimal_windows_image.sh ${{ matrix.python }} ${{ matrix.bitness }} diff --git a/build_tools/github/build_wheels.sh b/build_tools/github/build_wheels.sh index 917fc14fdb651..9b45481cbb978 100644 --- a/build_tools/github/build_wheels.sh +++ b/build_tools/github/build_wheels.sh @@ -5,7 +5,15 @@ set -x # OpenMP is not present on macOS by default if [[ "$RUNNER_OS" == "macOS" ]]; then - brew install libomp + # Make sure to use a libomp version binary compatible with the oldest + # supported version of the macos SDK as libomp will be vendored into the + # scikit-learn wheels for macos. The list of bottles can be found at: + # https://formulae.brew.sh/api/formula/libomp.json. Currently, the oldest + # supported macos version is: High Sierra / 10.13. When upgrading this, be + # sure to update the MACOSX_DEPLOYMENT_TARGET environment variable in + # wheels.yml accordingly. 
+ wget https://homebrew.bintray.com/bottles/libomp-11.0.0.high_sierra.bottle.tar.gz + brew install libomp-11.0.0.high_sierra.bottle.tar.gz export CC=/usr/bin/clang export CXX=/usr/bin/clang++ export CPPFLAGS="$CPPFLAGS -Xpreprocessor -fopenmp" From 7168e14cbb3e40627fa71111e70292b951dd16f2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 11 Jan 2021 17:37:31 +0000 Subject: [PATCH 16/38] DOC minor broken links fix in parallelism docs (#19151) Co-authored-by: Thomas J. Fan --- doc/computing/parallelism.rst | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/doc/computing/parallelism.rst b/doc/computing/parallelism.rst index c30d0790c1f01..3dce5ef66bb1d 100644 --- a/doc/computing/parallelism.rst +++ b/doc/computing/parallelism.rst @@ -114,9 +114,11 @@ threads than the number of CPUs on a machine. Over-subscription happens when a program is running too many threads at the same time. Suppose you have a machine with 8 CPUs. Consider a case where you're running -a :class:`~GridSearchCV` (parallelized with joblib) with ``n_jobs=8`` over -a :class:`~HistGradientBoostingClassifier` (parallelized with OpenMP). Each -instance of :class:`~HistGradientBoostingClassifier` will spawn 8 threads +a :class:`~sklearn.model_selection.GridSearchCV` (parallelized with joblib) +with ``n_jobs=8`` over a +:class:`~sklearn.ensemble.HistGradientBoostingClassifier` (parallelized with +OpenMP). Each instance of +:class:`~sklearn.ensemble.HistGradientBoostingClassifier` will spawn 8 threads (since you have 8 CPUs). That's a total of ``8 * 8 = 64`` threads, which leads to oversubscription of physical CPU resources and to scheduling overhead. @@ -129,9 +131,10 @@ is the default), joblib will tell its child **processes** to limit the number of threads they can use, so as to avoid oversubscription. In practice the heuristic that joblib uses is to tell the processes to use ``max_threads = n_cpus // n_jobs``, via their corresponding environment variable. Back to -our example from above, since the joblib backend of :class:`~GridSearchCV` -is ``loky``, each process will only be able to use 1 thread instead of 8, -thus mitigating the oversubscription issue. +our example from above, since the joblib backend of +:class:`~sklearn.model_selection.GridSearchCV` is ``loky``, each process will +only be able to use 1 thread instead of 8, thus mitigating the +oversubscription issue. Note that: From bbc46f3cd12a5302c95a41533127b30c6d265c2d Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 11 Jan 2021 12:44:04 -0500 Subject: [PATCH 17/38] DOC Adds default to SpectralClustering (#19149) --- sklearn/cluster/_spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 79a0b77954028..b86d5870025c3 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -356,7 +356,7 @@ class SpectralClustering(ClusterMixin, BaseEstimator): increase with similarity) should be used. This property is not checked by the clustering algorithm. - n_neighbors : int + n_neighbors : int, default=10 Number of neighbors to use when constructing the affinity matrix using the nearest neighbors method. Ignored for ``affinity='rbf'``. 
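Context for the change above: the ``n_neighbors`` parameter of :class:`~sklearn.cluster.SpectralClustering` is only consulted when the affinity matrix is built from nearest neighbors; for the default ``affinity='rbf'`` it is ignored. A minimal sketch of the documented default, assuming only the public API touched by this patch (the toy data and variable names below are illustrative, not part of the patch series):

    # Illustrative sketch, not part of the patch series. Passing
    # n_neighbors=10 is equivalent to omitting it, since 10 is the
    # (now documented) default; the parameter only takes effect with
    # affinity='nearest_neighbors'.
    from sklearn.cluster import SpectralClustering
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=100, centers=3, random_state=0)
    model = SpectralClustering(n_clusters=3, affinity="nearest_neighbors",
                               n_neighbors=10, random_state=0)
    labels = model.fit_predict(X)  # may warn if the kNN graph is disconnected
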
From 463894431c1b14cbf7350276c766e7ed57529f8f Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 11 Jan 2021 18:46:18 +0100 Subject: [PATCH 18/38] DOC Update installation instructions for macos/arm64 (#19146) --- doc/developers/advanced_installation.rst | 7 +++- doc/install.rst | 45 +++++++++++++++++------- 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/doc/developers/advanced_installation.rst b/doc/developers/advanced_installation.rst index fdda0076428af..7b305c13200a7 100644 --- a/doc/developers/advanced_installation.rst +++ b/doc/developers/advanced_installation.rst @@ -238,6 +238,11 @@ to enable OpenMP support: - or install `libomp` with Homebrew to extend the default Apple clang compiler. +For Apple Silicon M1 hardware, only the conda-forge method below is known to +work at the time of writing (January 2021). You can install the `macos/arm64` +distribution of conda using the `miniforge installer +`_ + macOS compilers from conda-forge ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -257,7 +262,7 @@ scikit-learn from source: .. prompt:: bash $ conda create -n sklearn-dev -c conda-forge python numpy scipy cython \ - joblib threadpoolctl pytest "compilers>=1.0.4,!=1.1.0" llvm-openmp + joblib threadpoolctl pytest compilers llvm-openmp conda activate sklearn-dev make clean pip install --verbose --no-build-isolation --editable . diff --git a/doc/install.rst b/doc/install.rst index 57cb489a11262..7912cc4dc4df6 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -59,7 +59,10 @@ Installing the latest release Install the 64bit version of Python 3, for instance from https://www.python.org.Install Python 3 using homebrew (brew install python) or by manually installing the package from https://www.python.org.Install python3 and python3-pip using the package manager of the Linux Distribution.Install conda (no administrator permission required). + >Install conda using the Anaconda or miniconda + installers or the miniforge installers + (no administrator permission required for any of those). Then run: @@ -106,17 +109,15 @@ In order to check your installation you can use Note that in order to avoid potential conflicts with other packages it is -strongly recommended to use a virtual environment, e.g. python3 ``virtualenv`` -(see `python3 virtualenv documentation -`_) or `conda environments +strongly recommended to use a `virtual environment (venv) +`_ or a `conda environment `_. -Using an isolated environment makes possible to install a specific version of -scikit-learn and its dependencies independently of any previously installed -Python packages. -In particular under Linux is it discouraged to install pip packages alongside -the packages managed by the package manager of the distribution -(apt, dnf, pacman...). +Using such an isolated environment makes it possible to install a specific +version of scikit-learn with pip or conda and its dependencies independently of +any previously installed Python packages. In particular under Linux is it +discouraged to install pip packages alongside the packages managed by the +package manager of the distribution (apt, dnf, pacman...). Note that you should always remember to activate the environment of your choice prior to running any Python command whenever you start a new terminal session. @@ -127,8 +128,6 @@ and NumPy and SciPy are not recompiled from source, which can happen when using particular configurations of operating system and hardware (such as Linux on a Raspberry Pi). 
-If you must install scikit-learn and its dependencies with pip, you can install -it as ``scikit-learn[alldeps]``. Scikit-learn plotting capabilities (i.e., functions start with "plot\_" and classes end with "Display") require Matplotlib. The examples require @@ -151,6 +150,28 @@ purpose. For installing on PyPy, PyPy3-v5.10+, Numpy 1.14.0+, and scipy 1.1.0+ are required. +.. _install_on_apple_silicon_m1: + +Installing on Apple Silicon M1 hardware +======================================= + +The recently introduced `macos/arm64` platform (sometimes also known as +`macos/aarch64`) requires the open source community to upgrade the build +configuation and automation to properly support it. + +At the time of writing (January 2021), the only way to get a working +installation of scikit-learn on this hardware is to install scikit-learn and its +dependencies from the conda-forge distribution, for instance using the miniforge +installers: + +https://github.com/conda-forge/miniforge + +The following issue tracks progress on making it possible to install +scikit-learn from PyPI with pip: + +https://github.com/scikit-learn/scikit-learn/issues/19137 + + .. _install_by_distribution: Third party distributions of scikit-learn From e3caae66014d8f74aa52bc61120ca39ccca08df0 Mon Sep 17 00:00:00 2001 From: Miao Cai Date: Tue, 12 Jan 2021 01:50:13 +0800 Subject: [PATCH 19/38] DOC Mention to use a command prompt in Windows install (#19125) Co-authored-by: Guillaume Lemaitre --- doc/developers/advanced_installation.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/developers/advanced_installation.rst b/doc/developers/advanced_installation.rst index 7b305c13200a7..7fbceeeab4c47 100644 --- a/doc/developers/advanced_installation.rst +++ b/doc/developers/advanced_installation.rst @@ -206,7 +206,8 @@ console: python -c "import struct; print(struct.calcsize('P') * 8)" -For 64-bit Python, configure the build environment with: +For 64-bit Python, configure the build environment by running the following +commands in ``cmd`` or an Anaconda Prompt (if you use Anaconda): :: From 3ae054f5d62456dc342413de4b7aee80b30412b3 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 12 Jan 2021 06:20:12 -0500 Subject: [PATCH 20/38] DOC Uses float instead of real in cross_decomposition (#19156) --- sklearn/cross_decomposition/_pls.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 66adacb64b1f3..817d4edbd9e88 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -499,7 +499,7 @@ class PLSRegression(_PLS): The maximum number of iterations of the power method when `algorithm='nipals'`. Ignored otherwise. - tol : real, default 1e-06 + tol : float, default=1e-06 The tolerance used as convergence criteria in the power method: the algorithm stops whenever the squared norm of `u_i - u_{i-1}` is less than `tol`, where `u` corresponds to the left singular vector. @@ -597,7 +597,7 @@ class PLSCanonical(_PLS): the maximum number of iterations of the power method when `algorithm='nipals'`. Ignored otherwise. - tol : real, default 1e-06 + tol : float, default=1e-06 The tolerance used as convergence criteria in the power method: the algorithm stops whenever the squared norm of `u_i - u_{i-1}` is less than `tol`, where `u` corresponds to the left singular vector. @@ -703,7 +703,7 @@ class CCA(_PLS): max_iter : int, default=500 the maximum number of iterations of the power method. 
- tol : real, default 1e-06 + tol : float, default=1e-06 The tolerance used as convergence criteria in the power method: the algorithm stops whenever the squared norm of `u_i - u_{i-1}` is less than `tol`, where `u` corresponds to the left singular vector. From fcc49e0129f5a1ef451967c09dad2e75551d3450 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?= Date: Tue, 12 Jan 2021 15:30:36 +0100 Subject: [PATCH 21/38] MNT Replace PDF build by ZIP of the HTML (#17564) --- build_tools/circle/build_doc.sh | 11 ++++------ build_tools/circle/list_versions.py | 31 +++++++++++++++++++++-------- doc/Makefile | 19 +++++++++++++++--- 3 files changed, 43 insertions(+), 18 deletions(-) diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 96ae64df1c44d..691006bd2dab0 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -116,8 +116,8 @@ fi if [[ "$CIRCLE_BRANCH" =~ ^master$|^[0-9]+\.[0-9]+\.X$ && -z "$CI_PULL_REQUEST" ]] then - # PDF linked into HTML - make_args="dist LATEXMKOPTS=-halt-on-error" + # ZIP linked into HTML + make_args=dist elif [[ "$build_type" =~ ^QUICK ]] then make_args=html-noplot @@ -133,13 +133,10 @@ fi make_args="SPHINXOPTS=-T $make_args" # show full traceback on exception # Installing required system packages to support the rendering of math -# notation in the HTML documentation +# notation in the HTML documentation and to optimize the image files sudo -E apt-get -yq update -sudo -E apt-get -yq remove texlive-binaries --purge sudo -E apt-get -yq --no-install-suggests --no-install-recommends \ - install dvipng texlive-latex-base texlive-latex-extra \ - texlive-latex-recommended texlive-fonts-recommended \ - latexmk gsfonts ccache + install dvipng gsfonts ccache zip optipng # deactivate circleci virtualenv and setup a miniconda env instead if [[ `type -t deactivate` ]]; then diff --git a/build_tools/circle/list_versions.py b/build_tools/circle/list_versions.py index 19fa8aa2dc991..9d64497012445 100755 --- a/build_tools/circle/list_versions.py +++ b/build_tools/circle/list_versions.py @@ -8,6 +8,7 @@ from distutils.version import LooseVersion from urllib.request import urlopen + def json_urlread(url): try: return json.loads(urlopen(url).read().decode('utf8')) @@ -32,10 +33,23 @@ def human_readable_data_quantity(quantity, multiple=1024): quantity /= multiple -def get_pdf_size(version): +def get_file_extension(version): + if version == 'dev': + # The 'dev' branch should be explictly handled + return 'zip' + + current_version = LooseVersion(version) + min_zip_version = LooseVersion('1.0.0') + + return 'zip' if current_version >= min_zip_version else 'pdf' + + +def get_file_size(version): api_url = ROOT_URL + '%s/_downloads' % version for path_details in json_urlread(api_url): - if path_details['name'] == 'scikit-learn-docs.pdf': + file_extension = get_file_extension(version) + file_path = f'scikit-learn-docs.{file_extension}' + if path_details['name'] == file_path: return human_readable_data_quantity(path_details['size'], 1000) @@ -64,8 +78,8 @@ def get_pdf_size(version): if path_details['type'] == 'dir': html = urlopen(RAW_FMT % name).read().decode('utf8') version_num = VERSION_RE.search(html).group(1) - pdf_size = get_pdf_size(name) - dirs[name] = (version_num, pdf_size) + file_size = get_file_size(name) + dirs[name] = (version_num, file_size) if path_details['type'] == 'symlink': symlinks[name] = json_urlread(path_details['_links']['self'])['target'] @@ -81,7 +95,7 @@ def get_pdf_size(version): for 
name in (NAMED_DIRS + sorted((k for k in dirs if k[:1].isdigit()), key=LooseVersion, reverse=True)): - version_num, pdf_size = dirs[name] + version_num, file_size = dirs[name] if version_num in seen: # symlink came first continue @@ -91,7 +105,8 @@ def get_pdf_size(version): path = 'https://scikit-learn.org/%s/' % name out = ('* `Scikit-learn %s%s documentation <%s>`_' % (version_num, name_display, path)) - if pdf_size is not None: - out += (' (`PDF %s <%s/_downloads/scikit-learn-docs.pdf>`_)' - % (pdf_size, path)) + if file_size is not None: + file_extension = get_file_extension(version_num) + out += (f' (`{file_extension.upper()} {file_size} <{path}/' + f'_downloads/scikit-learn-docs.{file_extension}>`_)') print(out) diff --git a/doc/Makefile b/doc/Makefile index 1cbce7dba9662..6146d11123017 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -17,7 +17,7 @@ ALLSPHINXOPTS = -T -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS)\ $(EXAMPLES_PATTERN_OPTS) . -.PHONY: help clean html dirhtml pickle json latex latexpdf changes linkcheck doctest optipng +.PHONY: help clean html dirhtml ziphtml pickle json latex latexpdf changes linkcheck doctest optipng all: html-noplot @@ -25,6 +25,7 @@ help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" + @echo " ziphtml to make a ZIP of the HTML" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @@ -58,6 +59,19 @@ dirhtml: @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." +ziphtml: + @if [ ! -d "$(BUILDDIR)/html/stable/" ]; then \ + make html; \ + fi + # Optimize the images to reduce the size of the ZIP + optipng $(BUILDDIR)/html/stable/_images/*.png + # Exclude the output directory to avoid infinity recursion + cd $(BUILDDIR)/html/stable; \ + zip -q -x _downloads \ + -r _downloads/scikit-learn-docs.zip . + @echo + @echo "Build finished. The ZIP of the HTML is in $(BUILDDIR)/html/stable/_downloads." + pickle: $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @@ -106,5 +120,4 @@ optipng: find _build auto_examples */generated -name '*.png' -print0 \ | xargs -0 -n 1 -P 4 optipng -o10 -dist: html latexpdf - cp _build/latex/user_guide.pdf _build/html/stable/_downloads/scikit-learn-docs.pdf +dist: html ziphtml From 7baf7581646d9891cbb933bcce688c3d17c58829 Mon Sep 17 00:00:00 2001 From: Sina Tootoonian Date: Wed, 13 Jan 2021 07:18:06 +0000 Subject: [PATCH 22/38] DOC Normalization of linear_model decision_function (#19142) --- sklearn/linear_model/_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 2399e1216238f..0cd9263cb5618 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -264,8 +264,8 @@ def decision_function(self, X): """ Predict confidence scores for samples. - The confidence score for a sample is the signed distance of that - sample to the hyperplane. + The confidence score for a sample is proportional to the signed + distance of that sample to the hyperplane. Parameters ---------- From 1e4d7567e5cf455849a1b72512937d183e93886f Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 13 Jan 2021 03:23:27 -0500 Subject: [PATCH 23/38] CI Adds skipping to azure pipelines with commit message (#19134) --- azure-pipelines.yml | 117 +++++++++++++++++++++++++++----------------- 1 file changed, 71 insertions(+), 46 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1a42c533fb2ee..870c5f0e1d313 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -8,17 +8,11 @@ schedules: always: true jobs: -- job: linting - displayName: Linting +- job: git_commit + displayName: Get Git Commit pool: vmImage: ubuntu-18.04 steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: '3.9' - - bash: | - pip install flake8 mypy==0.782 - displayName: Install linters - bash: | set -ex if [[ $BUILD_REASON == "PullRequest" ]]; then @@ -26,48 +20,53 @@ jobs: # which has a "Merge ID into ID" as a commit message. The latest commit # message is the second to last commit COMMIT_ID=$(echo $BUILD_SOURCEVERSIONMESSAGE | awk '{print $2}') - COMMIT_MESSAGE=$(git log $COMMIT_ID -1 --pretty=%B) + message=$(git log $COMMIT_ID -1 --pretty=%B) else - COMMIT_MESSAGE=$BUILD_SOURCEVERSIONMESSAGE + message=$BUILD_SOURCEVERSIONMESSAGE fi - echo "##vso[task.setvariable variable=COMMIT_MESSAGE]$COMMIT_MESSAGE" + echo "##vso[task.setvariable variable=message;isOutput=true]$message" + name: commit displayName: Get source version message + +- job: linting + dependsOn: [git_commit] + condition: | + and( + succeeded(), + not(contains(dependencies['git_commit']['outputs']['commit.message'], '[lint skip]')), + not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')) + ) + displayName: Linting + pool: + vmImage: ubuntu-18.04 + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.9' - bash: | - set -ex - if [[ "$COMMIT_MESSAGE" =~ "[lint skip]" ]]; then - # skip linting - echo "Skipping flake8 linting" - exit 0 - else - ./build_tools/circle/linting.sh - fi + pip install flake8 mypy==0.782 + displayName: Install linters + - bash: | + ./build_tools/circle/linting.sh displayName: Run linting - bash: | - set -ex - if [[ "$COMMIT_MESSAGE" =~ "[lint skip]" ]]; then - # skip linting - echo "Skipping mypy linting" - exit 0 - else - mypy sklearn/ - fi + mypy sklearn/ displayName: Run mypy - - bash: | - if [[ "$COMMIT_MESSAGE" =~ "[scipy-dev]" ]] || [[ $BUILD_REASON == "Schedule" ]]; then - echo "Running scipy-dev" - echo "##vso[task.setvariable variable=runScipyDev;isOutput=true]true" - else - echo "##vso[task.setvariable variable=runScipyDev;isOutput=true]false" - fi - name: gitCommitMessage - displayName: Determine to run scipy-dev - template: build_tools/azure/posix.yml parameters: name: Linux_Nightly vmImage: ubuntu-18.04 - dependsOn: [linting] - condition: eq(dependencies['linting']['outputs']['gitCommitMessage.runScipyDev'], 'true') + dependsOn: [git_commit, linting] + condition: | + and( + succeeded(), + not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), + or(eq(variables['Build.Reason'], 'Schedule'), + contains(dependencies['git_commit']['outputs']['commit.message'], '[scipy-dev]' + ) + ) + ) matrix: pylatest_pip_scipy_dev: DISTRIB: 'conda-pip-scipy-dev' @@ -84,6 +83,12 @@ jobs: parameters: name: Linux_Runs vmImage: ubuntu-18.04 + dependsOn: [git_commit] + condition: | + and( + succeeded(), + not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')) + ) matrix: pylatest_conda_mkl: DISTRIB: 'conda' @@ -95,8 +100,13 @@ jobs: parameters: name: Linux vmImage: ubuntu-18.04 - dependsOn: [linting] 
- condition: and(ne(variables['Build.Reason'], 'Schedule'), succeeded('linting')) + dependsOn: [linting, git_commit] + condition: | + and( + succeeded(), + not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), + ne(variables['Build.Reason'], 'Schedule') + ) matrix: # Linux environment to test that scikit-learn can be built against # versions of numpy, scipy with ATLAS that comes with Ubuntu Bionic 18.04 @@ -139,8 +149,13 @@ jobs: parameters: name: Linux32 vmImage: ubuntu-18.04 - dependsOn: [linting] - condition: and(ne(variables['Build.Reason'], 'Schedule'), succeeded('linting')) + dependsOn: [linting, git_commit] + condition: | + and( + succeeded(), + not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), + ne(variables['Build.Reason'], 'Schedule') + ) matrix: py36_ubuntu_atlas_32bit: DISTRIB: 'ubuntu-32' @@ -157,8 +172,13 @@ jobs: parameters: name: macOS vmImage: macOS-10.14 - dependsOn: [linting] - condition: and(ne(variables['Build.Reason'], 'Schedule'), succeeded('linting')) + dependsOn: [linting, git_commit] + condition: | + and( + succeeded(), + not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), + ne(variables['Build.Reason'], 'Schedule') + ) matrix: pylatest_conda_forge_mkl: DISTRIB: 'conda' @@ -174,8 +194,13 @@ jobs: parameters: name: Windows vmImage: vs2017-win2016 - dependsOn: [linting] - condition: and(ne(variables['Build.Reason'], 'Schedule'), succeeded('linting')) + dependsOn: [linting, git_commit] + condition: | + and( + succeeded(), + not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), + ne(variables['Build.Reason'], 'Schedule') + ) matrix: py37_conda_mkl: PYTHON_VERSION: '3.7' From cd62f8beb1fe4aa1e09ec6929b27bad2ce1b4d03 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 13 Jan 2021 04:02:12 -0500 Subject: [PATCH 24/38] DOC Clarifies docstrings in decomposition (#19161) --- sklearn/decomposition/_dict_learning.py | 14 ++++++------- sklearn/decomposition/_lda.py | 2 +- sklearn/decomposition/_nmf.py | 26 ++++++++++++------------- sklearn/decomposition/_pca.py | 2 +- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 781f288b70351..046738aa9700d 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1156,10 +1156,10 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator): fit_algorithm : {'lars', 'cd'}, default='lars' * `'lars'`: uses the least angle regression method to solve the lasso - problem (`linear_model.lars_path`); + problem (:func:`~sklearn.linear_model.lars_path`); * `'cd'`: uses the coordinate descent method to compute the - Lasso solution (`linear_model.Lasso`). Lars will be faster if - the estimated components are sparse. + Lasso solution (:class:`~sklearn.linear_model.Lasso`). Lars will be + faster if the estimated components are sparse. .. versionadded:: 0.17 *cd* coordinate descent method to improve speed. @@ -1169,11 +1169,11 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator): Algorithm used to transform the data: - `'lars'`: uses the least angle regression method - (`linear_model.lars_path`); + (:func:`~sklearn.linear_model.lars_path`); - `'lasso_lars'`: uses Lars to compute the Lasso solution. - `'lasso_cd'`: uses the coordinate descent method to compute the - Lasso solution (`linear_model.Lasso`). 
`'lasso_lars'` will be faster - if the estimated components are sparse. + Lasso solution (:class:`~sklearn.linear_model.Lasso`). `'lasso_lars'` + will be faster if the estimated components are sparse. - `'omp'`: uses orthogonal matching pursuit to estimate the sparse solution. - `'threshold'`: squashes to zero all coefficients less than alpha from @@ -1404,7 +1404,7 @@ class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): shuffle : bool, default=True Whether to shuffle the samples before forming batches. - dict_init : nbarray of shape (n_components, n_features), default=None + dict_init : ndarray of shape (n_components, n_features), default=None initial value of the dictionary for warm restart scenarios transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', \ diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 6e6a5627ff7c5..e554d299fe478 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -194,7 +194,7 @@ class LatentDirichletAllocation(TransformerMixin, BaseEstimator): Number of documents to use in each EM iteration. Only used in online learning. - evaluate_every : int, default=0 + evaluate_every : int, default=-1 How often to evaluate perplexity. Only used in `fit` method. set it to 0 or negative number to not evaluate perplexity in training at all. Evaluating perplexity can help you check convergence diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 5d01060951ae1..7bedc60998388 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -1138,23 +1138,23 @@ class NMF(TransformerMixin, BaseEstimator): Default: None. Valid options: - - None: 'nndsvd' if n_components <= min(n_samples, n_features), - otherwise random. + - `None`: 'nndsvd' if n_components <= min(n_samples, n_features), + otherwise random. - - 'random': non-negative random matrices, scaled with: - sqrt(X.mean() / n_components) + - `'random'`: non-negative random matrices, scaled with: + sqrt(X.mean() / n_components) - - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) - initialization (better for sparseness) + - `'nndsvd'`: Nonnegative Double Singular Value Decomposition (NNDSVD) + initialization (better for sparseness) - - 'nndsvda': NNDSVD with zeros filled with the average of X - (better when sparsity is not desired) + - `'nndsvda'`: NNDSVD with zeros filled with the average of X + (better when sparsity is not desired) - - 'nndsvdar': NNDSVD with zeros filled with small random values - (generally faster, less accurate alternative to NNDSVDa - for when sparsity is not desired) + - `'nndsvdar'` NNDSVD with zeros filled with small random values + (generally faster, less accurate alternative to NNDSVDa + for when sparsity is not desired) - - 'custom': use custom matrices W and H + - `'custom'`: use custom matrices W and H solver : {'cd', 'mu'}, default='cd' Numerical solver to use: @@ -1207,7 +1207,7 @@ class NMF(TransformerMixin, BaseEstimator): Regularization parameter *l1_ratio* used in the Coordinate Descent solver. - verbose : bool, default=False + verbose : int, default=0 Whether to be verbose. 
shuffle : bool, default=False diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index c69dc959b851a..80ac7e856dfd0 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -130,7 +130,7 @@ class PCA(_BasePCA): Parameters ---------- - n_components : int, float or str, default=None + n_components : int, float or 'mle', default=None Number of components to keep. if n_components is not set all components are kept:: From 957781f04c6dc7265ab31414907a28df0a897b94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?= Date: Wed, 13 Jan 2021 10:21:02 +0100 Subject: [PATCH 25/38] MNT fix strict comparison in version listing (#19163) --- build_tools/circle/list_versions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/circle/list_versions.py b/build_tools/circle/list_versions.py index 9d64497012445..399c77b723d3c 100755 --- a/build_tools/circle/list_versions.py +++ b/build_tools/circle/list_versions.py @@ -34,7 +34,7 @@ def human_readable_data_quantity(quantity, multiple=1024): def get_file_extension(version): - if version == 'dev': + if 'dev' in version: # The 'dev' branch should be explictly handled return 'zip' From 300782fdbb7722bcfc595c031f3f2b703f5df11d Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 13 Jan 2021 11:02:53 -0500 Subject: [PATCH 26/38] TST Download datasets before running pytest-xdist (#19118) Co-authored-by: Olivier Grisel --- conftest.py | 16 ---- doc/computing/parallelism.rst | 3 +- sklearn/conftest.py | 85 +++++++++++++++++++ sklearn/datasets/tests/conftest.py | 60 ------------- .../ensemble/tests/test_gradient_boosting.py | 7 +- 5 files changed, 90 insertions(+), 81 deletions(-) diff --git a/conftest.py b/conftest.py index 5c48de4ac36a3..aec49c03ae13d 100644 --- a/conftest.py +++ b/conftest.py @@ -5,7 +5,6 @@ # doc/modules/clustering.rst and use sklearn from the local folder rather than # the one from site-packages. -import os import platform import sys @@ -17,18 +16,12 @@ from sklearn._min_dependencies import PYTEST_MIN_VERSION from sklearn.utils.fixes import np_version, parse_version - if parse_version(pytest.__version__) < parse_version(PYTEST_MIN_VERSION): raise ImportError('Your version of pytest is too old, you should have ' 'at least pytest >= {} installed.' .format(PYTEST_MIN_VERSION)) -def pytest_addoption(parser): - parser.addoption("--skip-network", action="store_true", default=False, - help="skip network tests") - - def pytest_collection_modifyitems(config, items): for item in items: # FeatureHasher is not compatible with PyPy @@ -50,15 +43,6 @@ def pytest_collection_modifyitems(config, items): ) item.add_marker(marker) - # Skip tests which require internet if the flag is provided - if (config.getoption("--skip-network") - or int(os.environ.get("SKLEARN_SKIP_NETWORK_TESTS", "0"))): - skip_network = pytest.mark.skip( - reason="test requires internet connectivity") - for item in items: - if "network" in item.keywords: - item.add_marker(skip_network) - # numpy changed the str/repr formatting of numpy arrays in 1.14. We want to # run doctests only for numpy >= 1.14. skip_doctests = False diff --git a/doc/computing/parallelism.rst b/doc/computing/parallelism.rst index 3dce5ef66bb1d..8605650e8eec5 100644 --- a/doc/computing/parallelism.rst +++ b/doc/computing/parallelism.rst @@ -212,4 +212,5 @@ These environment variables should be set before importing scikit-learn. 
 :SKLEARN_SKIP_NETWORK_TESTS: When this environment variable is set to a non
     zero value, the tests
-    that need network access are skipped.
+    that need network access are skipped. When this environment variable is
+    not set, the network tests are skipped as well.
diff --git a/sklearn/conftest.py b/sklearn/conftest.py
index 8a98921342efa..2978115e3091c 100644
--- a/sklearn/conftest.py
+++ b/sklearn/conftest.py
@@ -1,9 +1,94 @@
 import os
+from os import environ
+from functools import wraps

 import pytest
 from threadpoolctl import threadpool_limits

 from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
+from sklearn.datasets import fetch_20newsgroups
+from sklearn.datasets import fetch_20newsgroups_vectorized
+from sklearn.datasets import fetch_california_housing
+from sklearn.datasets import fetch_covtype
+from sklearn.datasets import fetch_kddcup99
+from sklearn.datasets import fetch_olivetti_faces
+from sklearn.datasets import fetch_rcv1
+
+
+dataset_fetchers = {
+    'fetch_20newsgroups_fxt': fetch_20newsgroups,
+    'fetch_20newsgroups_vectorized_fxt': fetch_20newsgroups_vectorized,
+    'fetch_california_housing_fxt': fetch_california_housing,
+    'fetch_covtype_fxt': fetch_covtype,
+    'fetch_kddcup99_fxt': fetch_kddcup99,
+    'fetch_olivetti_faces_fxt': fetch_olivetti_faces,
+    'fetch_rcv1_fxt': fetch_rcv1,
+}
+
+
+def _fetch_fixture(f):
+    """Fetch dataset (download if missing and requested by environment)."""
+    download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0'
+
+    @wraps(f)
+    def wrapped(*args, **kwargs):
+        kwargs['download_if_missing'] = download_if_missing
+        try:
+            return f(*args, **kwargs)
+        except IOError:
+            pytest.skip("test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0")
+    return pytest.fixture(lambda: wrapped)
+
+
+# Adds fixtures for fetching data
+fetch_20newsgroups_fxt = _fetch_fixture(fetch_20newsgroups)
+fetch_20newsgroups_vectorized_fxt = \
+    _fetch_fixture(fetch_20newsgroups_vectorized)
+fetch_california_housing_fxt = _fetch_fixture(fetch_california_housing)
+fetch_covtype_fxt = _fetch_fixture(fetch_covtype)
+fetch_kddcup99_fxt = _fetch_fixture(fetch_kddcup99)
+fetch_olivetti_faces_fxt = _fetch_fixture(fetch_olivetti_faces)
+fetch_rcv1_fxt = _fetch_fixture(fetch_rcv1)
+
+
+def pytest_collection_modifyitems(config, items):
+    """Called after collect is completed.
+
+    Parameters
+    ----------
+    config : pytest config
+    items : list of collected items
+    """
+    run_network_tests = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0'
+    skip_network = pytest.mark.skip(
+        reason="test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0")
+
+    # download datasets during collection to avoid thread-unsafe behavior
+    # when running pytest in parallel with pytest-xdist
+    dataset_features_set = set(dataset_fetchers)
+    datasets_to_download = set()
+
+    for item in items:
+        if not hasattr(item, "fixturenames"):
+            continue
+        item_fixtures = set(item.fixturenames)
+        dataset_to_fetch = item_fixtures & dataset_features_set
+        if not dataset_to_fetch:
+            continue
+
+        if run_network_tests:
+            datasets_to_download |= dataset_to_fetch
+        else:
+            # network tests are skipped
+            item.add_marker(skip_network)
+
+    # Only download datasets on the first worker spawned by pytest-xdist
+    # to avoid thread-unsafe behavior. If pytest-xdist is not used, we still
+    # download before tests run.
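+    # pytest-xdist exposes each worker's id through the PYTEST_XDIST_WORKER
+    # environment variable ("gw0", "gw1", ...); defaulting to "gw0" both
+    # selects a single downloading worker and covers non-xdist runs, where
+    # the variable is not set.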
+ worker_id = environ.get("PYTEST_XDIST_WORKER", "gw0") + if worker_id == "gw0" and run_network_tests: + for name in datasets_to_download: + dataset_fetchers[name]() @pytest.fixture(scope='function') diff --git a/sklearn/datasets/tests/conftest.py b/sklearn/datasets/tests/conftest.py index 4612cd5deb4bc..cf356d6ca3b10 100644 --- a/sklearn/datasets/tests/conftest.py +++ b/sklearn/datasets/tests/conftest.py @@ -1,67 +1,7 @@ """ Network tests are only run, if data is already locally available, or if download is specifically requested by environment variable.""" import builtins -from functools import wraps -from os import environ import pytest -from sklearn.datasets import fetch_20newsgroups -from sklearn.datasets import fetch_20newsgroups_vectorized -from sklearn.datasets import fetch_california_housing -from sklearn.datasets import fetch_covtype -from sklearn.datasets import fetch_kddcup99 -from sklearn.datasets import fetch_olivetti_faces -from sklearn.datasets import fetch_rcv1 - - -def _wrapped_fetch(f, dataset_name): - """ Fetch dataset (download if missing and requested by environment) """ - download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0' - - @wraps(f) - def wrapped(*args, **kwargs): - kwargs['download_if_missing'] = download_if_missing - try: - return f(*args, **kwargs) - except IOError: - pytest.skip("Download {} to run this test".format(dataset_name)) - return wrapped - - -@pytest.fixture -def fetch_20newsgroups_fxt(): - return _wrapped_fetch(fetch_20newsgroups, dataset_name='20newsgroups') - - -@pytest.fixture -def fetch_20newsgroups_vectorized_fxt(): - return _wrapped_fetch(fetch_20newsgroups_vectorized, - dataset_name='20newsgroups_vectorized') - - -@pytest.fixture -def fetch_california_housing_fxt(): - return _wrapped_fetch(fetch_california_housing, - dataset_name='california_housing') - - -@pytest.fixture -def fetch_covtype_fxt(): - return _wrapped_fetch(fetch_covtype, dataset_name='covtype') - - -@pytest.fixture -def fetch_kddcup99_fxt(): - return _wrapped_fetch(fetch_kddcup99, dataset_name='kddcup99') - - -@pytest.fixture -def fetch_olivetti_faces_fxt(): - return _wrapped_fetch(fetch_olivetti_faces, dataset_name='olivetti_faces') - - -@pytest.fixture -def fetch_rcv1_fxt(): - return _wrapped_fetch(fetch_rcv1, dataset_name='rcv1') @pytest.fixture diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 256b79db4865c..498e5bf38a675 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -13,7 +13,7 @@ from sklearn import datasets from sklearn.base import clone -from sklearn.datasets import (make_classification, fetch_california_housing, +from sklearn.datasets import (make_classification, make_regression) from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor @@ -345,8 +345,7 @@ def test_max_feature_regression(): assert deviance < 0.5, "GB failed with deviance %.4f" % deviance -@pytest.mark.network -def test_feature_importance_regression(): +def test_feature_importance_regression(fetch_california_housing_fxt): """Test that Gini importance is calculated correctly. This test follows the example from [1]_ (pg. 373). @@ -354,7 +353,7 @@ def test_feature_importance_regression(): .. [1] Friedman, J., Hastie, T., & Tibshirani, R. (2001). The elements of statistical learning. New York: Springer series in statistics. 
""" - california = fetch_california_housing() + california = fetch_california_housing_fxt() X, y = california.data, california.target X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) From ff778443331e30025dd25a02c10a69c29a17a6c3 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 14 Jan 2021 17:28:12 +0100 Subject: [PATCH 27/38] CI Use stable numpy scipy release for [icc-build] and [arm64] on travis (#19176) The scipy-dev builds have moved to Azure Pipelines and there is no reason to not use stable versions of numpy and scipy to run the ICC and ARM64 tests on travis. This should fix the invalid wheel metadata failure observed on travis which was already resolved on Azure Pipelines by using the legacy pip dependency resolver for the scipy-dev build. --- build_tools/travis/install_master.sh | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/build_tools/travis/install_master.sh b/build_tools/travis/install_master.sh index 042ce53b41d2c..e2e0534216c7c 100755 --- a/build_tools/travis/install_master.sh +++ b/build_tools/travis/install_master.sh @@ -50,23 +50,8 @@ conda update --yes conda conda create -n testenv --yes python=3.7 source activate testenv - -if [[ $TRAVIS_CPU_ARCH == amd64 ]]; then - echo "Upgrading pip and setuptools." - pip install --upgrade pip setuptools - echo "Installing numpy, scipy and pandas master wheels." - dev_anaconda_url=https://pypi.anaconda.org/scipy-wheels-nightly/simple - pip install --pre --upgrade --timeout=60 --extra-index $dev_anaconda_url numpy scipy pandas - echo "Installing cython pre-release wheels." - pip install --pre cython - echo "Installing joblib master." - pip install https://github.com/joblib/joblib/archive/master.zip - echo "Installing pillow master." - pip install https://github.com/python-pillow/Pillow/archive/master.zip -else - conda install -y scipy numpy pandas cython - pip install joblib threadpoolctl -fi +conda install -y scipy numpy pandas cython +pip install joblib threadpoolctl pip install $(get_dep pytest $PYTEST_VERSION) pytest-xdist From 0f93f76be746f91f859075f51e3cbcf11301c525 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sat, 16 Jan 2021 13:19:07 -0500 Subject: [PATCH 28/38] FIX Fixes issue with exatly_zero_info_score (#19179) * ENH Fixes issue with exatly_zero_info_score [scipy-dev] * ENH Remove unneeded line [scipy-dev] * WIP Keep types [scipy-dev] * REV Smaller diff [scipy-dev] * WIP Expand mutual_info_score [scipy-dev] * WIP Removes float casting [scipy-dev] * WIP Adds casting back in * CI [scipy-dev] * WIP Casting is not needed [scipy-dev] * WIP Only clip [scipy-dev] * REV Smaller diff [scipy-dev] * WIP Place expected_mutual_information diff back [scipy-dev] * ENH Uses around * WIP Use where again [scipy-dev] * ENH Adjust comments to match code --- .../metrics/cluster/_expected_mutual_info_fast.pyx | 14 +++++++------- sklearn/metrics/cluster/_supervised.py | 1 + 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx b/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx index b9b94508da046..d2f9cd8578b12 100644 --- a/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx +++ b/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx @@ -22,7 +22,7 @@ def expected_mutual_information(contingency, int n_samples): cdef DOUBLE N, gln_N, emi, term2, term3, gln cdef np.ndarray[DOUBLE] gln_a, gln_b, gln_Na, gln_Nb, gln_nij, log_Nnij cdef np.ndarray[DOUBLE] nijs, term1 - cdef np.ndarray[DOUBLE, ndim=2] log_ab_outer + cdef np.ndarray[DOUBLE] log_a, log_b cdef np.ndarray[np.int32_t] a, b #cdef np.ndarray[int, ndim=2] start, end R, C = contingency.shape @@ -37,10 +37,10 @@ def expected_mutual_information(contingency, int n_samples): # term1 is nij / N term1 = nijs / N # term2 is log((N*nij) / (a * b)) == log(N * nij) - log(a * b) - # term2 uses the outer product - log_ab_outer = np.log(a)[:, np.newaxis] + np.log(b) - # term2 uses N * nij - log_Nnij = np.log(N * nijs) + log_a = np.log(a) + log_b = np.log(b) + # term2 uses log(N * nij) = log(N) + log(nij) + log_Nnij = np.log(N) + np.log(nijs) # term3 is large, and involved many factorials. Calculate these in log # space to stop overflows. gln_a = gammaln(a + 1) @@ -54,12 +54,12 @@ def expected_mutual_information(contingency, int n_samples): start = np.maximum(start, 1) end = np.minimum(np.resize(a, (C, R)).T, np.resize(b, (R, C))) + 1 # emi itself is a summation over the various values. - emi = 0 + emi = 0.0 cdef Py_ssize_t i, j, nij for i in range(R): for j in range(C): for nij in range(start[i,j], end[i,j]): - term2 = log_Nnij[nij] - log_ab_outer[i,j] + term2 = log_Nnij[nij] - log_a[i] - log_b[j] # Numerators are positive, denominators are negative. 
gln = (gln_a[i] + gln_b[j] + gln_Na[i] + gln_Nb[j] - gln_N - gln_nij[nij] - lgamma(a[i] - nij + 1) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 6e4e13f26017a..19d1552518db4 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -795,6 +795,7 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): log_outer = -np.log(outer) + log(pi.sum()) + log(pj.sum()) mi = (contingency_nm * (log_contingency_nm - log(contingency_sum)) + contingency_nm * log_outer) + mi = np.where(np.abs(mi) < np.finfo(mi.dtype).eps, 0.0, mi) return np.clip(mi.sum(), 0.0, None) From a837abe3a36e4f3528d4b15a409c4d833407f477 Mon Sep 17 00:00:00 2001 From: Zito Date: Sat, 16 Jan 2021 10:28:47 -0800 Subject: [PATCH 29/38] DOC description for Calinski-Harabasz Index (#19167) --- doc/modules/clustering.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 010d721fdd073..61c8393a734c8 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1737,7 +1737,7 @@ Ratio Criterion - can be used to evaluate the model, where a higher Calinski-Harabasz score relates to a model with better defined clusters. The index is the ratio of the sum of between-clusters dispersion and of -inter-cluster dispersion for all clusters (where dispersion is defined as the +within-cluster dispersion for all clusters (where dispersion is defined as the sum of distances squared): >>> from sklearn import metrics From dc59bc108716ec3e5cf8f534ec8d44eba33d0843 Mon Sep 17 00:00:00 2001 From: ranjanikrishnan Date: Sun, 17 Jan 2021 21:28:33 +0100 Subject: [PATCH 30/38] DOC Add link to video for contributing: Andreas video Volume 2 (#19180) Co-authored-by: Nicolas Hug --- doc/developers/contributing.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 8a3c460c615a8..fb2c0aa997fe5 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -213,6 +213,11 @@ latest up-to-date workflow. 
 `Transcript `__

+- Sprint-specific instructions and practical tips:
+  `Video `__,
+  `Transcript
+  `__
+
 How to contribute
 -----------------

From a4df528b12ecf29faaac7d6aa82f3fcfa1708581 Mon Sep 17 00:00:00 2001
From: Abhinav Gupta <62496969+abhinavtps@users.noreply.github.com>
Date: Mon, 18 Jan 2021 20:43:19 +0530
Subject: [PATCH 31/38] DOC Replacing swarmplot with stripplot to avoid
 seaborn warning (#19195)
---
 .../plot_linear_model_coefficient_interpretation.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/inspection/plot_linear_model_coefficient_interpretation.py b/examples/inspection/plot_linear_model_coefficient_interpretation.py
index 93a5b430a3542..459b180f00e36 100644
--- a/examples/inspection/plot_linear_model_coefficient_interpretation.py
+++ b/examples/inspection/plot_linear_model_coefficient_interpretation.py
@@ -325,7 +325,7 @@
     columns=feature_names
 )
 plt.figure(figsize=(9, 7))
-sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5)
+sns.stripplot(data=coefs, orient='h', color='k', alpha=0.5)
 sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5)
 plt.axvline(x=0, color='.5')
 plt.xlabel('Coefficient importance')
@@ -376,7 +376,7 @@
     columns=feature_names[:-1]
 )
 plt.figure(figsize=(9, 7))
-sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5)
+sns.stripplot(data=coefs, orient='h', color='k', alpha=0.5)
 sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5)
 plt.axvline(x=0, color='.5')
 plt.title('Coefficient importance and its variability')
@@ -469,7 +469,7 @@
     columns=feature_names
 )
 plt.figure(figsize=(9, 7))
-sns.swarmplot(data=coefs, orient='h', color='k', alpha=0.5)
+sns.stripplot(data=coefs, orient='h', color='k', alpha=0.5)
 sns.boxplot(data=coefs, orient='h', color='cyan', saturation=0.5)
 plt.axvline(x=0, color='.5')
 plt.title('Coefficient variability')

From e79b51689df0b217b9e44b695e10560b82f1cf9b Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 18 Jan 2021 17:25:41 +0100
Subject: [PATCH 32/38] DOC add entry in whats new for 0.24.1 (#19196)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Olivier Grisel
Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com>
---
 doc/whats_new/v0.24.rst | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index ca96e875d342a..cbe26892d772e 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -7,6 +7,17 @@
 Version 0.24.1
 ==============

+**January 2020**
+
+Packaging
+---------
+
+The 0.24.0 scikit-learn wheels were not working with macOS <10.15 due to
+`libomp`. The version of `libomp` used to build the wheels was too recent for
+older macOS versions. This issue has been fixed for 0.24.1 scikit-learn wheels.
+Scikit-learn wheels published on PyPI.org now officially support macOS 10.13
+and later.
+
 Changelog
 ---------

@@ -56,21 +67,9 @@
 Details are listed in the changelog below.

 (While we are trying to better inform users by providing this information, we
 cannot assure that this list is complete.)

-
 Changelog
 ---------

-..
-    Entries should be grouped by module (in alphabetic order) and prefixed with
-    one of the labels: |MajorFeature|, |Feature|, |Efficiency|, |Enhancement|,
-    |Fix| or |API| (see whats_new.rst for descriptions).
-    Entries should be ordered by those labels (e.g. |Fix| after |Efficiency|).
- Changes not specific to a module should be listed under *Multiple Modules* - or *Miscellaneous*. - Entries should end with: - :pr:`123456` by :user:`Joe Bloggs `. - where 123456 is the *pull request* number, not the issue number. - :mod:`sklearn.base` ................... From a9fd32c25b8c643a04449db9cbd4806d84ed765c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 18 Jan 2021 19:50:19 +0100 Subject: [PATCH 33/38] DOC update the version and year release --- doc/templates/index.html | 2 ++ doc/whats_new/v0.24.rst | 2 +- sklearn/__init__.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/templates/index.html b/doc/templates/index.html index d333530ef8376..88affa25830b4 100644 --- a/doc/templates/index.html +++ b/doc/templates/index.html @@ -155,6 +155,8 @@

 News

   • On-going development: What's new (Changelog)
+  • January 2021. scikit-learn 0.24.1 is available for download (Changelog).
+  • December 2020. scikit-learn 0.24.0 is available for download (Changelog).
  • August 2020. scikit-learn 0.23.2 is available for download (Changelog).
diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index cbe26892d772e..2bef032a6cb0c 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -7,7 +7,7 @@
 Version 0.24.1
 ==============

-**January 2020**
+**January 2021**

 Packaging
 ---------
diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index 6aace65cbdfb1..04c20fc5c08fd 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -39,7 +39,7 @@
 # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
 # 'X.Y.dev0' is the canonical version of 'X.Y.dev'
 #
-__version__ = '0.24.0'
+__version__ = '0.24.1'


 # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded

From a7b9f317f1a9e3db2e893a296e6255141b4ecd3e Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 18 Jan 2021 20:06:25 +0100
Subject: [PATCH 34/38] DOC add entry in whats new for numerical instability
 in mutual information (#19200)
---
 doc/whats_new/v0.24.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 2bef032a6cb0c..749d67e83d4af 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -21,6 +21,14 @@
 Changelog
 ---------

+:mod:`sklearn.metrics`
+......................
+
+- |Fix| Fix numerical stability bug that could happen in
+  :func:`metrics.adjusted_mutual_info_score` and
+  :func:`metrics.mutual_info_score` with NumPy 1.20+.
+  :pr:`19179` by `Thomas Fan`_.
+
 :mod:`sklearn.semi_supervised`
 ..............................

From eee3918288b8d9b8d6818d2a0e3a8c697146080f Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan"
Date: Tue, 5 Jan 2021 04:27:02 -0500
Subject: [PATCH 35/38] TST Adapts wminkowski for scipy 1.6.0 (#19096)
---
 sklearn/neighbors/tests/test_dist_metrics.py | 28 ++++++++++++++++++--
 sklearn/neighbors/tests/test_neighbors.py    | 16 +++++++++--
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/sklearn/neighbors/tests/test_dist_metrics.py b/sklearn/neighbors/tests/test_dist_metrics.py
index 60a4e97880e15..441bcc134fe6b 100644
--- a/sklearn/neighbors/tests/test_dist_metrics.py
+++ b/sklearn/neighbors/tests/test_dist_metrics.py
@@ -55,7 +55,19 @@ def test_cdist(metric):
     keys = argdict.keys()
     for vals in itertools.product(*argdict.values()):
         kwargs = dict(zip(keys, vals))
-        D_true = cdist(X1, X2, metric, **kwargs)
+        if metric == "wminkowski":
+            if sp_version >= parse_version("1.8.0"):
+                pytest.skip("wminkowski will be removed in SciPy 1.8.0")
+
+            # wminkowski is deprecated in SciPy 1.6.0 and removed in 1.8.0
+            ExceptionToAssert = None
+            if sp_version >= parse_version("1.6.0"):
+                ExceptionToAssert = DeprecationWarning
+            with pytest.warns(ExceptionToAssert):
+                D_true = cdist(X1, X2, metric, **kwargs)
+        else:
+            D_true = cdist(X1, X2, metric, **kwargs)
+
         check_cdist(metric, kwargs, D_true)

@@ -83,7 +95,19 @@ def test_pdist(metric):
     keys = argdict.keys()
     for vals in itertools.product(*argdict.values()):
         kwargs = dict(zip(keys, vals))
-        D_true = cdist(X1, X1, metric, **kwargs)
+        if metric == "wminkowski":
+            if sp_version >= parse_version("1.8.0"):
+                pytest.skip("wminkowski will be removed in SciPy 1.8.0")
+
+            # wminkowski is deprecated in SciPy 1.6.0 and removed in 1.8.0
+            ExceptionToAssert = None
+            if sp_version >= parse_version("1.6.0"):
+                ExceptionToAssert = DeprecationWarning
+            with pytest.warns(ExceptionToAssert):
+                D_true = cdist(X1, X1, metric, **kwargs)
+        else:
+            D_true = cdist(X1, X1, metric, **kwargs)
+
         check_pdist(metric,
kwargs, D_true)
diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index 513df1edb1bec..2b6c9a48d545d 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -26,6 +26,7 @@
 from sklearn.utils._testing import assert_raise_message
 from sklearn.utils._testing import ignore_warnings
 from sklearn.utils.validation import check_random_state
+from sklearn.utils.fixes import sp_version, parse_version

 import joblib

@@ -1244,6 +1245,9 @@ def test_neighbors_metrics(n_samples=20, n_features=3,
     test = rng.rand(n_query_pts, n_features)

     for metric, metric_params in metrics:
+        if metric == "wminkowski" and sp_version >= parse_version("1.8.0"):
+            # wminkowski will be removed in SciPy 1.8.0
+            continue
         results = {}
         p = metric_params.pop('p', 2)
         for algorithm in algorithms:
@@ -1265,8 +1269,16 @@ def test_neighbors_metrics(n_samples=20, n_features=3,
                           if metric == 'haversine' else slice(None))

             neigh.fit(X[:, feature_sl])
-            results[algorithm] = neigh.kneighbors(test[:, feature_sl],
-                                                  return_distance=True)
+
+            # wminkowski is deprecated in SciPy 1.6.0 and removed in 1.8.0
+            ExceptionToAssert = None
+            if (metric == "wminkowski" and algorithm == 'brute'
+                    and sp_version >= parse_version("1.6.0")):
+                ExceptionToAssert = DeprecationWarning
+
+            with pytest.warns(ExceptionToAssert):
+                results[algorithm] = neigh.kneighbors(test[:, feature_sl],
+                                                      return_distance=True)

     assert_array_almost_equal(results['brute'][0], results['ball_tree'][0])
     assert_array_almost_equal(results['brute'][1], results['ball_tree'][1])

From 4c3f0dc3e16f7ccc7758c6b3e9b368bed086fc10 Mon Sep 17 00:00:00 2001
From: Chiara Marmo
Date: Sat, 2 Jan 2021 15:19:32 +0100
Subject: [PATCH 36/38] TST Fix scipy DeprecationWarning from wminkowski in
 nightly (#18930)
---
 sklearn/metrics/tests/test_pairwise.py | 45 ++++++++++++++++++++++++--
 1 file changed, 42 insertions(+), 3 deletions(-)

diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py
index 88c285421fca6..7640a6a2b8e70 100644
--- a/sklearn/metrics/tests/test_pairwise.py
+++ b/sklearn/metrics/tests/test_pairwise.py
@@ -4,8 +4,16 @@
 from numpy import linalg

 from scipy.sparse import dok_matrix, csr_matrix, issparse
-from scipy.spatial.distance import cosine, cityblock, minkowski, wminkowski
+from scipy.spatial.distance import cosine, cityblock, minkowski
 from scipy.spatial.distance import cdist, pdist, squareform
+try:
+    from scipy.spatial.distance import wminkowski
+except ImportError:
+    # In scipy 1.6.0, wminkowski is deprecated and minkowski
+    # should be used instead.
+    from scipy.spatial.distance import minkowski as wminkowski
+
+from sklearn.utils.fixes import sp_version, parse_version

 import pytest

@@ -233,6 +241,7 @@ def test_pairwise_precomputed_non_negative():
         pairwise_distances(np.full((5, 5), -1), metric='precomputed')


+_minkowski_kwds = {'w': np.arange(1, 5).astype('double', copy=False), 'p': 1}
 _wminkowski_kwds = {'w': np.arange(1, 5).astype('double', copy=False), 'p': 1}


@@ -245,8 +254,38 @@ def callable_rbf_kernel(x, y, **kwds):
 @pytest.mark.parametrize(
         'func, metric, kwds',
         [(pairwise_distances, 'euclidean', {}),
-         (pairwise_distances, wminkowski, _wminkowski_kwds),
-         (pairwise_distances, 'wminkowski', _wminkowski_kwds),
+         pytest.param(
+             pairwise_distances, minkowski, _minkowski_kwds,
+             marks=pytest.mark.skipif(
+                 sp_version < parse_version("1.0"),
+                 reason="minkowski does not accept the w "
+                        "parameter prior to scipy 1.0."
+             )
+         ),
+         pytest.param(
+             pairwise_distances, 'minkowski', _minkowski_kwds,
+             marks=pytest.mark.skipif(
+                 sp_version < parse_version("1.0"),
+                 reason="minkowski does not accept the w "
+                        "parameter prior to scipy 1.0."
+             )
+         ),
+         pytest.param(
+             pairwise_distances, wminkowski, _wminkowski_kwds,
+             marks=pytest.mark.skipif(
+                 sp_version >= parse_version("1.6.0"),
+                 reason="wminkowski is now minkowski "
+                        "and it has already been tested."
+             )
+         ),
+         pytest.param(
+             pairwise_distances, 'wminkowski', _wminkowski_kwds,
+             marks=pytest.mark.skipif(
+                 sp_version >= parse_version("1.6.0"),
+                 reason="wminkowski is now minkowski "
+                        "and it has already been tested."
+             )
+         ),
          (pairwise_kernels, 'polynomial', {'degree': 1}),
          (pairwise_kernels, callable_rbf_kernel, {'gamma': .1})])
 @pytest.mark.parametrize('array_constr', [np.array, csr_matrix])

From cbb9320771590655211d0e0faea7e6e643c5a895 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Tue, 19 Jan 2021 10:00:38 +0100
Subject: [PATCH 37/38] Trigger wheel builder workflow: [cd build]

From cf21428a5770ca77b741fe8c7215d9dfd7caa417 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Tue, 19 Jan 2021 11:09:01 +0100
Subject: [PATCH 38/38] DOC fix year release 0.24.1
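As a closing illustration of the numerical-stability change from
[PATCH 28/38] (and its changelog entry in [PATCH 34/38]), here is a minimal
standalone sketch of the clip-to-zero strategy; the array values are
hypothetical stand-ins for per-cell mutual-information contributions:

    import numpy as np

    # Cancellation in log-space arithmetic can leave contributions that
    # should be exactly zero as tiny positive or negative residues.
    mi = np.array([0.25, -2.2e-17, 1.1e-16])

    # Snap anything smaller in magnitude than machine epsilon to exactly
    # zero, then clip the total so the score is never reported as negative.
    mi = np.where(np.abs(mi) < np.finfo(mi.dtype).eps, 0.0, mi)
    print(np.clip(mi.sum(), 0.0, None))  # 0.25

This np.where/np.clip pair is what [PATCH 28/38] applies inside
mutual_info_score, which is why adjusted_mutual_info_score stops returning
spurious non-zero scores with NumPy 1.20+.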