diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 1a42c533fb2ee..0443abdd818a4 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -77,7 +77,7 @@ jobs:
       TEST_DOCSTRINGS: 'true'
       # Tests that require large downloads over the networks are skipped in CI.
       # Here we make sure, that they are still run on a regular basis.
-      SKLEARN_SKIP_NETWORK_TESTS: '0'
+      SKLEARN_RUN_NETWORK_TESTS: 'true'
 
   # Will run all the time regardless of linting outcome.
 - template: build_tools/azure/posix.yml
diff --git a/build_tools/azure/posix-32.yml b/build_tools/azure/posix-32.yml
index 5e4689a2505e5..d0ea1f241f00a 100644
--- a/build_tools/azure/posix-32.yml
+++ b/build_tools/azure/posix-32.yml
@@ -16,7 +16,7 @@ jobs:
     JUNITXML: 'test-data.xml'
     OMP_NUM_THREADS: '2'
     OPENBLAS_NUM_THREADS: '2'
-    SKLEARN_SKIP_NETWORK_TESTS: '1'
+    SKLEARN_RUN_NETWORK_TESTS: 'false'
     NUMPY_VERSION: 'latest'
     SCIPY_VERSION: 'latest'
     CYTHON_VERSION: 'latest'
@@ -62,7 +62,7 @@ jobs:
         -e THREADPOOLCTL_VERSION=$THREADPOOLCTL_VERSION
         -e OMP_NUM_THREADS=$OMP_NUM_THREADS
         -e OPENBLAS_NUM_THREADS=$OPENBLAS_NUM_THREADS
-        -e SKLEARN_SKIP_NETWORK_TESTS=$SKLEARN_SKIP_NETWORK_TESTS
+        -e SKLEARN_RUN_NETWORK_TESTS=$SKLEARN_RUN_NETWORK_TESTS
         i386/ubuntu:18.04
         sleep 1000000
       displayName: 'Start container'
diff --git a/build_tools/azure/posix.yml b/build_tools/azure/posix.yml
index ae5726aab0b65..ebe91a15041ec 100644
--- a/build_tools/azure/posix.yml
+++ b/build_tools/azure/posix.yml
@@ -17,7 +17,7 @@ jobs:
    JUNITXML: 'test-data.xml'
     OMP_NUM_THREADS: '2'
     OPENBLAS_NUM_THREADS: '2'
-    SKLEARN_SKIP_NETWORK_TESTS: '1'
+    SKLEARN_RUN_NETWORK_TESTS: 'false'
     CCACHE_DIR: $(Pipeline.Workspace)/ccache
     CCACHE_COMPRESS: '1'
     NUMPY_VERSION: 'latest'
diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh
index b08cda50cfd60..adcc0d18710e2 100755
--- a/build_tools/azure/test_script.sh
+++ b/build_tools/azure/test_script.sh
@@ -23,6 +23,7 @@ pip list
 
 TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML"
 
+
 if [[ "$COVERAGE" == "true" ]]; then
     # Note: --cov-report= is used to disable to long text output report in the
     # CI logs. The coverage data is consolidated by codecov to get an online
@@ -47,5 +48,11 @@ cp setup.cfg $TEST_DIR
 cd $TEST_DIR
 
 set -x
-$TEST_CMD --pyargs sklearn
+if [[ "$SKLEARN_RUN_NETWORK_TESTS" == "true" ]]; then
+    # Tests that require large downloads over the networks are skipped in CI.
+    # Here we make sure, that they are still run on a regular basis.
+    $TEST_CMD --pyargs sklearn -m 'not skipnetwork'
+else
+    $TEST_CMD --pyargs sklearn
+fi
 set +x
diff --git a/build_tools/azure/windows.yml b/build_tools/azure/windows.yml
index 8a5edd4b93019..dcecb2ab2c596 100644
--- a/build_tools/azure/windows.yml
+++ b/build_tools/azure/windows.yml
@@ -15,7 +15,7 @@ jobs:
   variables:
     VIRTUALENV: 'testvenv'
     JUNITXML: 'test-data.xml'
-    SKLEARN_SKIP_NETWORK_TESTS: '1'
+    SKLEARN_RUN_NETWORK_TESTS: 'false'
     PYTEST_VERSION: '5.2.1'
     PYTEST_XDIST: 'true'
     TMP_FOLDER: '$(Agent.WorkFolder)\tmp_folder'
diff --git a/build_tools/travis/test_script.sh b/build_tools/travis/test_script.sh
index cb5a3dbfeed33..bccf5fde320e5 100755
--- a/build_tools/travis/test_script.sh
+++ b/build_tools/travis/test_script.sh
@@ -17,6 +17,12 @@ python -c "import platform; print(f'{platform.machine()}')"
 
 TEST_CMD="pytest --showlocals --durations=20 --pyargs"
 
+# Tests that require large downloads over the networks are skipped in CI.
+# Here we make sure, that they are still run on a regular basis.
+if [[ "$SKLEARN_RUN_NETWORK_TESTS" == "true" ]]; then
+    TEST_CMD="$TEST_CMD -m 'not skipnetwork'"
+fi
+
 # Run the tests on the installed version
 mkdir -p $TEST_DIR
diff --git a/conftest.py b/conftest.py
index 5c48de4ac36a3..0f2c50d5e86e4 100644
--- a/conftest.py
+++ b/conftest.py
@@ -14,6 +14,14 @@
 from sklearn.utils import _IS_32BIT
 from sklearn.externals import _pilutil
+from sklearn.datasets import fetch_20newsgroups
+from sklearn.datasets import fetch_20newsgroups_vectorized
+from sklearn.datasets import fetch_california_housing
+from sklearn.datasets import fetch_covtype
+from sklearn.datasets import fetch_kddcup99
+from sklearn.datasets import fetch_olivetti_faces
+from sklearn.datasets import fetch_rcv1
+
 from sklearn._min_dependencies import PYTEST_MIN_VERSION
 from sklearn.utils.fixes import np_version, parse_version
@@ -24,9 +32,36 @@
                       .format(PYTEST_MIN_VERSION))
 
 
-def pytest_addoption(parser):
-    parser.addoption("--skip-network", action="store_true", default=False,
-                     help="skip network tests")
+dataset_fetchers = {
+    'fetch_20newsgroups_fxt': fetch_20newsgroups,
+    'fetch_20newsgroups_vectorized_fxt': fetch_20newsgroups_vectorized,
+    'fetch_california_housing_fxt': fetch_california_housing,
+    'fetch_covtype_fxt': fetch_covtype,
+    'fetch_kddcup99_fxt': fetch_kddcup99,
+    'fetch_olivetti_faces_fxt': fetch_olivetti_faces,
+    'fetch_rcv1_fxt': fetch_rcv1,
+}
+
+
+# fetching a dataset with this fixture will never download if missing
+def _fetch_fixture(f):
+    def wrapped(*args, **kwargs):
+        kwargs['download_if_missing'] = False
+        try:
+            return f(*args, **kwargs)
+        except IOError:
+            pytest.skip("test requires -m 'not skipnetwork' to run")
+    return pytest.fixture(lambda: wrapped)
+
+
+fetch_20newsgroups_fxt = _fetch_fixture(fetch_20newsgroups)
+fetch_20newsgroups_vectorized_fxt = \
+    _fetch_fixture(fetch_20newsgroups_vectorized)
+fetch_california_housing_fxt = _fetch_fixture(fetch_california_housing)
+fetch_covtype_fxt = _fetch_fixture(fetch_covtype)
+fetch_kddcup99_fxt = _fetch_fixture(fetch_kddcup99)
+fetch_olivetti_faces_fxt = _fetch_fixture(fetch_olivetti_faces)
+fetch_rcv1_fxt = _fetch_fixture(fetch_rcv1)
 
 
 def pytest_collection_modifyitems(config, items):
@@ -50,14 +85,32 @@ def pytest_collection_modifyitems(config, items):
         )
         item.add_marker(marker)
 
-    # Skip tests which require internet if the flag is provided
-    if (config.getoption("--skip-network")
-            or int(os.environ.get("SKLEARN_SKIP_NETWORK_TESTS", "0"))):
-        skip_network = pytest.mark.skip(
-            reason="test requires internet connectivity")
-        for item in items:
-            if "network" in item.keywords:
-                item.add_marker(skip_network)
+    run_network_tests = 'not skipnetwork' in config.getoption("markexpr")
+    skip_network = pytest.mark.skip(
+        reason="test requires -m 'not skipnetwork' to run")
+
+    # download datasets during collection to avoid thread unsafe behavior
+    # when running pytest in parallel with pytest-xdist
+    dataset_features_set = set(dataset_fetchers)
+    datasets_to_download = set()
+
+    for item in items:
+        item_keywords = set(item.keywords)
+        dataset_to_fetch = item_keywords & dataset_features_set
+        if not dataset_to_fetch:
+            continue
+
+        if run_network_tests:
+            datasets_to_download |= dataset_to_fetch
+        else:
+            # network tests are skipped
+            item.add_marker(skip_network)
+
+    # download datasets that are needed to avoid thread unsafe behavior
+    # by pytest-xdist
+    if run_network_tests:
+        for name in datasets_to_download:
+            dataset_fetchers[name]()
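A minimal sketch of how a test module would consume one of these ``*_fxt``
fixtures (the test name, dataset, and assertion are illustrative, not part of
the patch)::

    import pytest

    @pytest.mark.network
    def test_rcv1_has_samples(fetch_rcv1_fxt):
        # The fixture forwards to fetch_rcv1(download_if_missing=False) and
        # skips the test when the dataset is not already cached locally.
        rcv1 = fetch_rcv1_fxt()
        assert rcv1.data.shape[0] > 0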
 
     # numpy changed the str/repr formatting of numpy arrays in 1.14. We want to
     # run doctests only for numpy >= 1.14.
@@ -92,6 +145,12 @@ def pytest_collection_modifyitems(config, items):
             item.add_marker(skip_marker)
 
 
+def pytest_runtest_setup(item):
+    run_network_tests = 'not skipnetwork' in item.config.getoption("markexpr")
+    if "network" in item.keywords and not run_network_tests:
+        pytest.skip("test requires -m 'not skipnetwork' to run")
+
+
 def pytest_configure(config):
     import sys
     sys._is_pytest_session = True
diff --git a/doc/computing/computational_performance.rst b/doc/computing/computational_performance.rst
index d47ac6f614183..e943ffef80688 100644
--- a/doc/computing/computational_performance.rst
+++ b/doc/computing/computational_performance.rst
@@ -368,3 +368,203 @@ Links
 - :ref:`scikit-learn developer performance documentation `
 - `Scipy sparse matrix formats documentation `_
+
+.. _parallelism:
+
+Parallelism
+-----------
+
+Some scikit-learn estimators and utilities can parallelize costly operations
+using multiple CPU cores, thanks to the following components:
+
+- via the `joblib `_ library. In this case the number of threads or
+  processes can be controlled with the ``n_jobs`` parameter.
+- via OpenMP, used in C or Cython code.
+
+In addition, some of the numpy routines that are used internally by
+scikit-learn may also be parallelized if numpy is installed with specific
+numerical libraries such as MKL, OpenBLAS, or BLIS.
+
+We describe these 3 scenarios in the following subsections.
+
+Joblib-based parallelism
+........................
+
+When the underlying implementation uses joblib, the number of workers
+(threads or processes) that are spawned in parallel can be controlled via the
+``n_jobs`` parameter.
+
+.. note::
+
+    Where (and how) parallelization happens in the estimators is currently
+    poorly documented. Please help us by improving our docs and tackle `issue
+    14228 `_!
+
+Joblib is able to support both multi-processing and multi-threading. Whether
+joblib chooses to spawn a thread or a process depends on the **backend**
+that it's using.
+
+Scikit-learn generally relies on the ``loky`` backend, which is joblib's
+default backend. Loky is a multi-processing backend. When doing
+multi-processing, in order to avoid duplicating the memory in each process
+(which isn't reasonable with big datasets), joblib will create a `memmap `_
+that all processes can share, when the data is bigger than 1MB.
+
+In some specific cases (when the code that is run in parallel releases the
+GIL), scikit-learn will indicate to ``joblib`` that a multi-threading
+backend is preferable.
+
+As a user, you may control the backend that joblib will use (regardless of
+what scikit-learn recommends) by using a context manager::
+
+    from joblib import parallel_backend
+
+    with parallel_backend('threading', n_jobs=2):
+        # Your scikit-learn code here
+
+Please refer to the `joblib's docs `_ for more details.
+
+In practice, whether parallelism is helpful at improving runtime depends on
+many factors. It is usually a good idea to experiment rather than assuming
+that increasing the number of workers is always a good thing. In some cases
+it can be highly detrimental to performance to run multiple copies of some
+estimators or functions in parallel (see oversubscription below).
+
+OpenMP-based parallelism
+........................
+
+OpenMP is used to parallelize code written in Cython or C, relying on
+multi-threading exclusively. By default (and unless joblib is trying to
+avoid oversubscription), the implementation will use as many threads as
+possible.
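As an illustrative aside (assuming ``threadpoolctl`` is installed, which the
CI jobs above already pin via ``THREADPOOLCTL_VERSION``), the native thread
pools used by this OpenMP-based code can be inspected and capped at runtime::

    from threadpoolctl import threadpool_info, threadpool_limits

    # List the native thread pools (OpenMP, OpenBLAS, MKL, ...) loaded in
    # this process, with the number of threads each one would use.
    for pool in threadpool_info():
        print(pool["user_api"], pool["num_threads"])

    # Temporarily cap OpenMP-parallelized code to 2 threads.
    with threadpool_limits(limits=2, user_api="openmp"):
        pass  # fit scikit-learn estimators here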
+
+You can control the exact number of threads that are used via the
+``OMP_NUM_THREADS`` environment variable::
+
+    OMP_NUM_THREADS=4 python my_script.py
+
+Parallel Numpy routines from numerical libraries
+................................................
+
+Scikit-learn relies heavily on NumPy and SciPy, which internally call
+multi-threaded linear algebra routines implemented in libraries such as MKL,
+OpenBLAS or BLIS.
+
+The number of threads used by the OpenBLAS, MKL or BLIS libraries can be set
+via the ``MKL_NUM_THREADS``, ``OPENBLAS_NUM_THREADS``, and
+``BLIS_NUM_THREADS`` environment variables.
+
+Please note that scikit-learn has no direct control over these
+implementations. Scikit-learn solely relies on Numpy and Scipy.
+
+.. note::
+
+    At the time of writing (2019), NumPy and SciPy packages distributed on
+    pypi.org (used by ``pip``) and on the conda-forge channel are linked
+    with OpenBLAS, while conda packages shipped on the "defaults" channel
+    from anaconda.org are linked by default with MKL.
+
+
+Oversubscription: spawning too many threads
+...........................................
+
+It is generally recommended to avoid using significantly more processes or
+threads than the number of CPUs on a machine. Over-subscription happens when
+a program is running too many threads at the same time.
+
+Suppose you have a machine with 8 CPUs. Consider a case where you're running
+a :class:`~GridSearchCV` (parallelized with joblib) with ``n_jobs=8`` over
+a :class:`~HistGradientBoostingClassifier` (parallelized with OpenMP). Each
+instance of :class:`~HistGradientBoostingClassifier` will spawn 8 threads
+(since you have 8 CPUs). That's a total of ``8 * 8 = 64`` threads, which
+leads to oversubscription of physical CPU resources and to scheduling
+overhead.
+
+Oversubscription can arise in the exact same fashion with parallelized
+routines from MKL, OpenBLAS or BLIS that are nested in joblib calls.
+
+Starting from ``joblib >= 0.14``, when the ``loky`` backend is used (which
+is the default), joblib will tell its child **processes** to limit the
+number of threads they can use, so as to avoid oversubscription. In practice
+the heuristic that joblib uses is to tell the processes to use
+``max_threads = n_cpus // n_jobs``, via their corresponding environment
+variable. Back to our example from above, since the joblib backend of
+:class:`~GridSearchCV` is ``loky``, each process will only be able to use
+1 thread instead of 8, thus mitigating the oversubscription issue.
+
+Note that:
+
+- Manually setting one of the environment variables (``OMP_NUM_THREADS``,
+  ``MKL_NUM_THREADS``, ``OPENBLAS_NUM_THREADS``, or ``BLIS_NUM_THREADS``)
+  will take precedence over what joblib tries to do. The total number of
+  threads will be ``n_jobs * _NUM_THREADS``. Note that setting this
+  limit will also impact your computations in the main process, which will
+  only use ``_NUM_THREADS``. Joblib exposes a context manager for
+  finer control over the number of threads in its workers (see joblib docs
+  linked below).
+- Joblib is currently unable to avoid oversubscription in a
+  multi-threading context. It can only do so with the ``loky`` backend
+  (which spawns processes).
+
+You will find additional details about joblib mitigation of oversubscription
+in the `joblib documentation `_.
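A hypothetical sketch of the nested-parallelism scenario described above (the
data and parameter grid are invented); with ``joblib >= 0.14`` and the default
``loky`` backend, each of the 8 workers is limited to roughly ``n_cpus // 8``
OpenMP threads::

    # needed on scikit-learn versions where HistGradientBoosting* is experimental
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
    from sklearn.ensemble import HistGradientBoostingClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=1000, random_state=0)

    # joblib spawns 8 loky processes; each one runs an OpenMP-parallelized fit.
    search = GridSearchCV(
        HistGradientBoostingClassifier(random_state=0),
        param_grid={"max_leaf_nodes": [15, 31, 63]},
        n_jobs=8,
    )
    search.fit(X, y)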
+
+
+Configuration switches
+----------------------
+
+Python runtime
+..............
+
+:func:`sklearn.set_config` controls the following behaviors:
+
+:assume_finite:
+
+    used to skip validation, which enables faster computations but may
+    lead to segmentation faults if the data contains NaNs.
+
+:working_memory:
+
+    the optimal size of temporary arrays used by some algorithms.
+
+.. _environment_variable:
+
+Environment variables
+......................
+
+These environment variables should be set before importing scikit-learn.
+
+:SKLEARN_SITE_JOBLIB:
+
+    When this environment variable is set to a non-zero value,
+    scikit-learn uses the site joblib rather than its vendored version.
+    Consequently, joblib must be installed for scikit-learn to run.
+    Note that using the site joblib is at your own risk: the versions of
+    scikit-learn and joblib need to be compatible. Currently, joblib 0.11+
+    is supported. In addition, dumps from joblib.Memory might be incompatible,
+    and you might lose some caches and have to redownload some datasets.
+
+    .. deprecated:: 0.21
+
+       As of version 0.21 this parameter has no effect, vendored joblib was
+       removed and site joblib is always used.
+
+:SKLEARN_ASSUME_FINITE:
+
+    Sets the default value for the `assume_finite` argument of
+    :func:`sklearn.set_config`.
+
+:SKLEARN_WORKING_MEMORY:
+
+    Sets the default value for the `working_memory` argument of
+    :func:`sklearn.set_config`.
+
+:SKLEARN_SEED:
+
+    Sets the seed of the global random generator when running the tests,
+    for reproducibility.
diff --git a/doc/conftest.py b/doc/conftest.py
index 4496bb74152ac..3f563187c8465 100644
--- a/doc/conftest.py
+++ b/doc/conftest.py
@@ -5,7 +5,6 @@
 from sklearn.utils import IS_PYPY
 from sklearn.utils._testing import SkipTest
-from sklearn.utils._testing import check_skip_network
 from sklearn.datasets import get_data_home
 from sklearn.datasets._base import _pkl_filepath
 from sklearn.datasets._twenty_newsgroups import CACHE_NAME
@@ -18,7 +17,6 @@ def setup_labeled_faces():
 
 
 def setup_rcv1():
-    check_skip_network()
     # skip the test in rcv1.rst if the dataset is not already loaded
     rcv1_dir = join(get_data_home(), "RCV1")
     if not exists(rcv1_dir):
@@ -35,7 +33,6 @@ def setup_twenty_newsgroups():
 def setup_working_with_text_data():
     if IS_PYPY and os.environ.get('CI', None):
         raise SkipTest('Skipping too slow test with PyPy on CI')
-    check_skip_network()
     cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
     if not exists(cache_path):
         raise SkipTest("Skipping dataset loading doctests")
diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst
index 78c1175620c4f..674780d46d780 100644
--- a/doc/developers/contributing.rst
+++ b/doc/developers/contributing.rst
@@ -493,6 +493,11 @@ a benchmark script and profiling output (see :ref:`monitoring_performances`).
 Also check out the :ref:`performance-howto` guide for more details on
 profiling and Cython optimizations.
 
+* Tests that use the network, such as those for the dataset functions prefixed
+  with `fetch_*`, are off by default. These tests can be run with::
+
+      $ pytest sklearn -m 'not skipnetwork'
+
 .. note::
 
    The current state of the scikit-learn code base is not compliant with
diff --git a/sklearn/datasets/tests/conftest.py b/sklearn/datasets/tests/conftest.py
index 4612cd5deb4bc..8f34f7c46cb21 100644
--- a/sklearn/datasets/tests/conftest.py
+++ b/sklearn/datasets/tests/conftest.py
@@ -1,67 +1,5 @@
-""" Network tests are only run, if data is already locally available,
-or if download is specifically requested by environment variable."""
 import builtins
-from functools import wraps
-from os import environ
 import pytest
 
-from sklearn.datasets import fetch_20newsgroups
-from sklearn.datasets import fetch_20newsgroups_vectorized
-from sklearn.datasets import fetch_california_housing
-from sklearn.datasets import fetch_covtype
-from sklearn.datasets import fetch_kddcup99
-from sklearn.datasets import fetch_olivetti_faces
-from sklearn.datasets import fetch_rcv1
-
-
-def _wrapped_fetch(f, dataset_name):
-    """ Fetch dataset (download if missing and requested by environment) """
-    download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0'
-
-    @wraps(f)
-    def wrapped(*args, **kwargs):
-        kwargs['download_if_missing'] = download_if_missing
-        try:
-            return f(*args, **kwargs)
-        except IOError:
-            pytest.skip("Download {} to run this test".format(dataset_name))
-    return wrapped
-
-
-@pytest.fixture
-def fetch_20newsgroups_fxt():
-    return _wrapped_fetch(fetch_20newsgroups, dataset_name='20newsgroups')
-
-
-@pytest.fixture
-def fetch_20newsgroups_vectorized_fxt():
-    return _wrapped_fetch(fetch_20newsgroups_vectorized,
-                          dataset_name='20newsgroups_vectorized')
-
-
-@pytest.fixture
-def fetch_california_housing_fxt():
-    return _wrapped_fetch(fetch_california_housing,
-                          dataset_name='california_housing')
-
-
-@pytest.fixture
-def fetch_covtype_fxt():
-    return _wrapped_fetch(fetch_covtype, dataset_name='covtype')
-
-
-@pytest.fixture
-def fetch_kddcup99_fxt():
-    return _wrapped_fetch(fetch_kddcup99, dataset_name='kddcup99')
-
-
-@pytest.fixture
-def fetch_olivetti_faces_fxt():
-    return _wrapped_fetch(fetch_olivetti_faces, dataset_name='olivetti_faces')
-
-
-@pytest.fixture
-def fetch_rcv1_fxt():
-    return _wrapped_fetch(fetch_rcv1, dataset_name='rcv1')
 
 
 @pytest.fixture
diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py
index 256b79db4865c..289603c7d609b 100644
--- a/sklearn/ensemble/tests/test_gradient_boosting.py
+++ b/sklearn/ensemble/tests/test_gradient_boosting.py
@@ -13,8 +13,7 @@
 from sklearn import datasets
 from sklearn.base import clone
-from sklearn.datasets import (make_classification, fetch_california_housing,
-                              make_regression)
+from sklearn.datasets import make_classification, make_regression
 from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.ensemble import GradientBoostingRegressor
 from sklearn.ensemble._gradient_boosting import predict_stages
@@ -345,8 +344,7 @@ def test_max_feature_regression():
     assert deviance < 0.5, "GB failed with deviance %.4f" % deviance
 
 
-@pytest.mark.network
-def test_feature_importance_regression():
+def test_feature_importance_regression(fetch_california_housing_fxt):
     """Test that Gini importance is calculated correctly.
 
     This test follows the example from [1]_ (pg. 373).
 
     .. [1] Friedman, J., Hastie, T., & Tibshirani, R. (2001). The elements
        of statistical learning. New York: Springer series in statistics.
""" - california = fetch_california_housing() + california = fetch_california_housing_fxt() X, y = california.data, california.target X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 779e7b6574e3e..1bef09b86e1df 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -472,11 +472,6 @@ def set_random_state(estimator, random_state=0): pass -def check_skip_network(): - if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 0)): - raise SkipTest("Text tutorial requires large dataset download") - - def _delete_folder(folder_path, warn=False): """Utility function to cleanup a temporary folder if still existing.