diff --git a/.circleci/config.yml b/.circleci/config.yml index 3a1bc848942d3..0e77f30d18ed7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -41,13 +41,12 @@ jobs: # Test examples run with minimal dependencies - MINICONDA_PATH: ~/miniconda - CONDA_ENV_NAME: testenv - - PYTHON_VERSION: 2 - - NUMPY_VERSION: 1.8.2 - # XXX: plot_gpc_xor.py fails with scipy 0.13.3 - - SCIPY_VERSION: 0.14 - - MATPLOTLIB_VERSION: 1.3 - - SCIKIT_IMAGE_VERSION: 0.9.3 - - PANDAS_VERSION: 0.13.1 + - PYTHON_VERSION: "2" + - NUMPY_VERSION: "1.10" + - SCIPY_VERSION: "0.16" + - MATPLOTLIB_VERSION: "1.4" + - SCIKIT_IMAGE_VERSION: "0.11" + - PANDAS_VERSION: "0.17.1" steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh @@ -65,6 +64,21 @@ jobs: path: ~/log.txt destination: log.txt + pypy3: + docker: + - image: pypy:3-6.0.0 + steps: + - restore_cache: + keys: + - pypy3-ccache-{{ .Branch }} + - pypy3-ccache + - checkout + - run: ./build_tools/circle/build_test_pypy.sh + - save_cache: + key: pypy3-ccache-{{ .Branch }}-{{ .BuildNum }} + paths: + - ~/.ccache + - ~/.cache/pip deploy: docker: @@ -89,6 +103,21 @@ workflows: jobs: - python3 - python2 + - pypy3: + filters: + branches: + only: + - 0.20.X - deploy: requires: - python3 + pypy: + triggers: + - schedule: + cron: "0 0 * * *" + filters: + branches: + only: + - master + jobs: + - pypy3 diff --git a/.travis.yml b/.travis.yml index 7196296a386d3..4b0a7d0f4281b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -46,7 +46,7 @@ matrix: CYTHON_VERSION="*" PYAMG_VERSION="*" PILLOW_VERSION="*" JOBLIB_VERSION="*" COVERAGE=true CHECK_PYTEST_SOFT_DEPENDENCY="true" TEST_DOCSTRINGS="true" - SKLEARN_SITE_JOBLIB=1 + SKLEARN_SITE_JOBLIB=1 CHECK_WARNINGS="true" if: type != cron # flake8 linting on diff wrt common ancestor with upstream/master - env: RUN_FLAKE8="true" SKIP_TESTS="true" @@ -58,7 +58,7 @@ matrix: # installed from their CI wheels in a virtualenv with the Python # interpreter provided by travis. - python: 3.6 - env: DISTRIB="scipy-dev" + env: DISTRIB="scipy-dev" CHECK_WARNINGS="true" if: type = cron OR commit_message =~ /\[scipy-dev\]/ install: source build_tools/travis/install.sh diff --git a/AUTHORS.rst b/AUTHORS.rst deleted file mode 100644 index 48427fc0a2b3a..0000000000000 --- a/AUTHORS.rst +++ /dev/null @@ -1,75 +0,0 @@ -.. -*- mode: rst -*- - - -This is a community effort, and as such many people have contributed -to it over the years. - -History -------- - -This project was started in 2007 as a Google Summer of Code project by -David Cournapeau. Later that year, Matthieu Brucher started work on -this project as part of his thesis. - -In 2010 Fabian Pedregosa, Gael Varoquaux, Alexandre Gramfort and Vincent -Michel of INRIA took leadership of the project and made the first public -release, February the 1st 2010. Since then, several releases have appeared -following a ~3 month cycle, and a thriving international community has -been leading the development. - -People ------- - -The following people have been core contributors to scikit-learn's development and maintenance: - -.. 
hlist:: - - * `Mathieu Blondel `_ - * `Matthieu Brucher `_ - * Lars Buitinck - * David Cournapeau - * `Noel Dawe `_ - * Vincent Dubourg - * Edouard Duchesnay - * `Tom Dupré la Tour `_ - * Alexander Fabisch - * `Virgile Fritsch `_ - * `Satra Ghosh `_ - * `Angel Soler Gollonet `_ - * Chris Filo Gorgolewski - * `Alexandre Gramfort `_ - * `Olivier Grisel `_ - * `Jaques Grobler `_ - * `Yaroslav Halchenko `_ - * `Brian Holt `_ - * `Arnaud Joly `_ - * Thouis (Ray) Jones - * `Kyle Kastner `_ - * `Manoj Kumar `_ - * Robert Layton - * `Guillaume Lemaitre `_ - * `Wei Li `_ - * Paolo Losi - * `Gilles Louppe `_ - * `Jan Hendrik Metzen `_ - * Vincent Michel - * Jarrod Millman - * `Andreas Müller `_ (release manager) - * `Vlad Niculae `_ - * `Joel Nothman `_ - * `Alexandre Passos `_ - * `Fabian Pedregosa `_ - * `Peter Prettenhofer `_ - * `Hanmin Qin `_ - * Bertrand Thirion - * `Joris Van den Bossche `_ - * `Jake VanderPlas `_ - * Nelle Varoquaux - * `Gael Varoquaux `_ - * Ron Weiss - * `Roman Yurchak `_ - -Please do not email the authors directly to ask for assistance or report issues. -Instead, please see `What's the best way to ask questions about scikit-learn -`_ -in the FAQ. diff --git a/README.rst b/README.rst index eb1957686acaf..b4d67af56eec8 100644 --- a/README.rst +++ b/README.rst @@ -56,8 +56,8 @@ scikit-learn requires: **Scikit-learn 0.20 is the last version to support Python2.7.** Scikit-learn 0.21 and later will require Python 3.5 or newer. -For running the examples Matplotlib >= 1.3.1 is required. A few examples -require scikit-image >= 0.9.3 and a few examples require pandas >= 0.13.1. +For running the examples Matplotlib >= 1.4 is required. A few examples +require scikit-image >= 0.11.3 and a few examples require pandas >= 0.17.1. scikit-learn also uses CBLAS, the C interface to the Basic Linear Algebra Subprograms library. scikit-learn comes with a reference implementation, but @@ -120,7 +120,7 @@ Testing ~~~~~~~ After installation, you can launch the test suite from outside the -source directory (you will need to have the ``pytest`` package installed):: +source directory (you will need to have ``pytest`` >= 3.3.0 installed):: pytest sklearn diff --git a/appveyor.yml b/appveyor.yml index 5eb4d08a8737d..c8a464723ff6c 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -20,6 +20,7 @@ environment: - PYTHON: "C:\\Python37-x64" PYTHON_VERSION: "3.7.0" PYTHON_ARCH: "64" + CHECK_WARNINGS: "true" - PYTHON: "C:\\Python27" PYTHON_VERSION: "2.7.8" @@ -72,7 +73,13 @@ test_script: # installed library. 
- mkdir "../empty_folder" - cd "../empty_folder" - - pytest --showlocals --durations=20 --pyargs sklearn + - ps: >- + if (Test-Path variable:global:CHECK_WARNINGS) { + $env:PYTEST_ARGS = "-Werror::DeprecationWarning -Werror::FutureWarning" + } else { + $env:PYTEST_ARGS = "" + } + - "pytest --showlocals --durations=20 %PYTEST_ARGS% --pyargs sklearn" # Move back to the project folder - cd "../scikit-learn" diff --git a/build_tools/Makefile b/build_tools/Makefile new file mode 100644 index 0000000000000..68162733b4b11 --- /dev/null +++ b/build_tools/Makefile @@ -0,0 +1,4 @@ +# Makefile for maintenance tools + +authors: + python generate_authors_table.py > ../doc/authors.rst diff --git a/build_tools/circle/build_test_pypy.sh b/build_tools/circle/build_test_pypy.sh index 18fa361821d14..922bbac8e23a6 100755 --- a/build_tools/circle/build_test_pypy.sh +++ b/build_tools/circle/build_test_pypy.sh @@ -18,13 +18,16 @@ source pypy-env/bin/activate python --version which python -pip install --extra-index https://antocuni.github.io/pypy-wheels/ubuntu numpy==1.14.4 Cython pytest +pip install --extra-index https://antocuni.github.io/pypy-wheels/ubuntu numpy Cython pytest pip install "scipy>=1.1.0" sphinx numpydoc docutils ccache -M 512M export CCACHE_COMPRESS=1 export PATH=/usr/lib/ccache:$PATH +export LOKY_MAX_CPU_COUNT="2" -pip install -e . +pip install -vv -e . -make test +python -m pytest sklearn/ +python -m pytest doc/sphinxext/ +python -m pytest $(find doc -name '*.rst' | sort) diff --git a/build_tools/generate_authors_table.py b/build_tools/generate_authors_table.py new file mode 100644 index 0000000000000..ea3796473396d --- /dev/null +++ b/build_tools/generate_authors_table.py @@ -0,0 +1,117 @@ +""" +This script generates an html table of contributors, with names and avatars. +The list is generated from scikit-learn's teams on GitHub, plus a small number +of hard-coded contributors. + +The table should be updated for each new inclusion in the teams. +Generating the table requires admin rights. +""" +from __future__ import print_function + +import sys +import requests +import getpass + +try: + # With authentication: up to 5000 requests per hour. + print("user:", file=sys.stderr) + user = input() + passwd = getpass.getpass() + auth = (user, passwd) +except IndexError: + # Without authentication: up to 60 requests per hour. + auth = None + +ROW_SIZE = 7 +LOGO_URL = 'https://avatars2.githubusercontent.com/u/365630?v=4' + + +def group_iterable(iterable, size): + """Group iterable into lines""" + group = [] + for element in iterable: + group.append(element) + if len(group) == size: + yield group + group = [] + if len(group) != 0: + yield group + + +def get_contributors(): + """Get the list of contributor profiles. 
Require admin rights.""" + # get members of scikit-learn teams on GitHub + members = [] + for team in [11523, 33471]: + for page in [1, 2]: # 30 per page + members.extend(requests.get( + "https://api.github.com/teams/%d/members?page=%d" + % (team, page), auth=auth).json()) + + # keep only the logins + logins = [c['login'] for c in members] + # add missing contributors with GitHub accounts + logins.extend(['dubourg', 'jarrodmillman', 'mbrucher', 'thouis']) + # add missing contributors without GitHub accounts + logins.extend(['Angel Soler Gollonet']) + # remove duplicate + logins = set(logins) + # remove CI + logins.remove('sklearn-ci') + + # get profiles from GitHub + profiles = [get_profile(login) for login in logins] + # sort by last name + profiles = sorted(profiles, key=key) + + return profiles + + +def get_profile(login): + """Get the GitHub profile from login""" + profile = requests.get("https://api.github.com/users/%s" % login, + auth=auth).json() + if 'name' not in profile: + # default profile if the login does not exist + return dict(name=login, avatar_url=LOGO_URL, html_url="") + else: + if profile["name"] is None: + profile["name"] = profile["login"] + + # fix missing names + missing_names = {'bthirion': 'Bertrand Thirion', + 'dubourg': 'Vincent Dubourg', + 'Duchesnay': 'Edouard Duchesnay', + 'Lars': 'Lars Buitinck', + 'MechCoder': 'Manoj Kumar'} + if profile["name"] in missing_names: + profile["name"] = missing_names[profile["name"]] + return profile + + +def key(profile): + """Get the last name in lower case""" + return profile["name"].split(' ')[-1].lower() + + +contributors = get_contributors() + +print(".. raw :: html\n") +print(" ") +print(" ") +print(" " + % (int(100 / ROW_SIZE), ROW_SIZE)) +print(" ") +for row in group_iterable(contributors, size=ROW_SIZE): + print(" ") + for contributor in row: + print(" ") + print(" ") +print("
") + print("
" + % (contributor["html_url"], contributor["avatar_url"])) + print("

%s

" % contributor["name"]) + print("
") diff --git a/build_tools/travis/test_script.sh b/build_tools/travis/test_script.sh index 1cf24d10837c7..5036e19b3a6f0 100755 --- a/build_tools/travis/test_script.sh +++ b/build_tools/travis/test_script.sh @@ -38,6 +38,13 @@ run_tests() { if [[ "$COVERAGE" == "true" ]]; then TEST_CMD="$TEST_CMD --cov sklearn" fi + + if [[ -n "$CHECK_WARNINGS" ]]; then + TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning" + fi + + set -x # print executed commands to the terminal + $TEST_CMD sklearn # Going back to git checkout folder needed to test documentation diff --git a/conftest.py b/conftest.py index 621097bfc47ab..82c4b17faeef0 100644 --- a/conftest.py +++ b/conftest.py @@ -11,6 +11,19 @@ import pytest from _pytest.doctest import DoctestItem +from sklearn.utils.fixes import PY3_OR_LATER + +PYTEST_MIN_VERSION = '3.3.0' + +if LooseVersion(pytest.__version__) < PYTEST_MIN_VERSION: + raise('Your version of pytest is too old, you should have at least ' + 'pytest >= {} installed.'.format(PYTEST_MIN_VERSION)) + + +def pytest_addoption(parser): + parser.addoption("--skip-network", action="store_true", default=False, + help="skip network tests") + def pytest_collection_modifyitems(config, items): @@ -19,22 +32,35 @@ def pytest_collection_modifyitems(config, items): skip_marker = pytest.mark.skip( reason='FeatureHasher is not compatible with PyPy') for item in items: - if item.name == 'sklearn.feature_extraction.hashing.FeatureHasher': + if item.name in ( + 'sklearn.feature_extraction.hashing.FeatureHasher', + 'sklearn.feature_extraction.text.HashingVectorizer'): item.add_marker(skip_marker) + # Skip tests which require internet if the flag is provided + if config.getoption("--skip-network"): + skip_network = pytest.mark.skip( + reason="test requires internet connectivity") + for item in items: + if "network" in item.keywords: + item.add_marker(skip_network) + # numpy changed the str/repr formatting of numpy arrays in 1.14. We want to - # run doctests only for numpy >= 1.14. - skip_doctests = True + # run doctests only for numpy >= 1.14. We want to skip the doctest for + # python 2 due to unicode. + skip_doctests = False + if not PY3_OR_LATER: + skip_doctests = True try: import numpy as np - if LooseVersion(np.__version__) >= LooseVersion('1.14'): - skip_doctests = False + if LooseVersion(np.__version__) < LooseVersion('1.14'): + skip_doctests = True except ImportError: pass if skip_doctests: skip_marker = pytest.mark.skip( - reason='doctests are only run for numpy >= 1.14') + reason='doctests are only run for numpy >= 1.14 and python >= 3') for item in items: if isinstance(item, DoctestItem): diff --git a/doc/about.rst b/doc/about.rst index 90295b96fb6ff..218b0ad897fe4 100644 --- a/doc/about.rst +++ b/doc/about.rst @@ -1,7 +1,31 @@ About us ======== -.. include:: ../AUTHORS.rst +History +------- + +This project was started in 2007 as a Google Summer of Code project by +David Cournapeau. Later that year, Matthieu Brucher started work on +this project as part of his thesis. + +In 2010 Fabian Pedregosa, Gael Varoquaux, Alexandre Gramfort and Vincent +Michel of INRIA took leadership of the project and made the first public +release, February the 1st 2010. Since then, several releases have appeared +following a ~3 month cycle, and a thriving international community has +been leading the development. + +Authors +------- + +The following people have been core contributors to scikit-learn's development +and maintenance: + +.. 
include:: authors.rst + +Please do not email the authors directly to ask for assistance or report issues. +Instead, please see `What's the best way to ask questions about scikit-learn +`_ +in the FAQ. .. seealso:: diff --git a/doc/authors.rst b/doc/authors.rst new file mode 100644 index 0000000000000..0210dff4bef6e --- /dev/null +++ b/doc/authors.rst @@ -0,0 +1,220 @@ +.. raw :: html + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+    <!-- Generated table of contributor cards: each cell shows a GitHub avatar
+         linked to the contributor's profile page; the table markup is elided
+         in this view. Contributors, in order: Mathieu Blondel,
+         Joris Van den Bossche, Matthieu Brucher, Lars Buitinck, David Cournapeau,
+         Noel Dawe, Shiqiao Du, Vincent Dubourg, Edouard Duchesnay, Loïc Estève,
+         Alexander Fabisch, Virgile Fritsch, Satrajit Ghosh, Angel Soler Gollonet,
+         Chris Filo Gorgolewski, Alexandre Gramfort, Olivier Grisel, Jaques Grobler,
+         Yaroslav Halchenko, Brian Holt, Arnaud Joly, Thouis (Ray) Jones,
+         Kyle Kastner, Manoj Kumar, Robert Layton, Guillaume Lemaitre, Wei Li,
+         Paolo Losi, Gilles Louppe, Jan Hendrik Metzen, Vincent Michel,
+         Jarrod Millman, Andreas Mueller, Vlad Niculae, Joel Nothman,
+         Alexandre Passos, Fabian Pedregosa, Peter Prettenhofer, Hanmin Qin,
+         (Venkat) Raghav, Rajagopalan, Jacob Schreiber, Bertrand Thirion,
+         Tom Dupré la Tour, Jake Vanderplas, Nelle Varoquaux, Gael Varoquaux,
+         David Warde-Farley, Ron Weiss, Roman Yurchak. -->
diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst index 947e55f0c4c37..e0640916fbb64 100644 --- a/doc/datasets/index.rst +++ b/doc/datasets/index.rst @@ -351,6 +351,154 @@ features:: _`Faster API-compatible implementation`: https://github.com/mblondel/svmlight-loader +.. + For doctests: + + >>> import numpy as np + >>> import os + +.. _openml: + +Downloading datasets from the openml.org repository +--------------------------------------------------- + +`openml.org `_ is a public repository for machine learning +data and experiments, that allows everybody to upload open datasets. + +The ``sklearn.datasets`` package is able to download datasets +from the repository using the function +:func:`sklearn.datasets.fetch_openml`. + +For example, to download a dataset of gene expressions in mice brains:: + + >>> from sklearn.datasets import fetch_openml + >>> mice = fetch_openml(name='miceprotein', version=4) + +To fully specify a dataset, you need to provide a name and a version, though +the version is optional, see :ref:`openml_versions` below. +The dataset contains a total of 1080 examples belonging to 8 different +classes:: + + >>> mice.data.shape + (1080, 77) + >>> mice.target.shape + (1080,) + >>> np.unique(mice.target) # doctest: +NORMALIZE_WHITESPACE + array(['c-CS-m', 'c-CS-s', 'c-SC-m', 'c-SC-s', 't-CS-m', 't-CS-s', 't-SC-m', 't-SC-s'], dtype=object) + +You can get more information on the dataset by looking at the ``DESCR`` +and ``details`` attributes:: + + >>> print(mice.DESCR) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP + **Author**: Clara Higuera, Katheleen J. Gardiner, Krzysztof J. Cios + **Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression) - 2015 + **Please cite**: Higuera C, Gardiner KJ, Cios KJ (2015) Self-Organizing + Feature Maps Identify Proteins Critical to Learning in a Mouse Model of Down + Syndrome. PLoS ONE 10(6): e0129126... + + >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP + {'id': '40966', 'name': 'MiceProtein', 'version': '4', 'format': 'ARFF', + 'upload_date': '2017-11-08T16:00:15', 'licence': 'Public', + 'url': 'https://www.openml.org/data/v1/download/17928620/MiceProtein.arff', + 'file_id': '17928620', 'default_target_attribute': 'class', + 'row_id_attribute': 'MouseID', + 'ignore_attribute': ['Genotype', 'Treatment', 'Behavior'], + 'tag': ['OpenML-CC18', 'study_135', 'study_98', 'study_99'], + 'visibility': 'public', 'status': 'active', + 'md5_checksum': '3c479a6885bfa0438971388283a1ce32'} + + +The ``DESCR`` contains a free-text description of the data, while ``details`` +contains a dictionary of meta-data stored by openml, like the dataset id. 
+For more details, see the `OpenML documentation +`_ The ``data_id`` of the mice protein dataset +is 40966, and you can use this (or the name) to get more information on the +dataset on the openml website:: + + >>> mice.url + 'https://www.openml.org/d/40966' + +The ``data_id`` also uniquely identifies a dataset from OpenML:: + + >>> mice = fetch_openml(data_id=40966) + >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP + {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF', + 'creator': ..., + 'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url': + 'https://www.openml.org/data/v1/download/1804243/MiceProtein.ARFF', 'file_id': + '1804243', 'default_target_attribute': 'class', 'citation': 'Higuera C, + Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins + Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6): + e0129126. [Web Link] journal.pone.0129126', 'tag': ['OpenML100', 'study_14', + 'study_34'], 'visibility': 'public', 'status': 'active', 'md5_checksum': + '3c479a6885bfa0438971388283a1ce32'} + +.. _openml_versions: + +Dataset Versions +~~~~~~~~~~~~~~~~ + +A dataset is uniquely specified by its ``data_id``, but not necessarily by its +name. Several different "versions" of a dataset with the same name can exist +which can contain entirely different datasets. +If a particular version of a dataset has been found to contain significant +issues, it might be deactivated. Using a name to specify a dataset will yield +the earliest version of a dataset that is still active. That means that +``fetch_openml(name="miceprotein")`` can yield different results at different +times if earlier versions become inactive. +You can see that the dataset with ``data_id`` 40966 that we fetched above is +the version 1 of the "miceprotein" dataset:: + + >>> mice.details['version'] #doctest: +SKIP + '1' + +In fact, this dataset only has one version. The iris dataset on the other hand +has multiple versions:: + + >>> iris = fetch_openml(name="iris") + >>> iris.details['version'] #doctest: +SKIP + '1' + >>> iris.details['id'] #doctest: +SKIP + '61' + + >>> iris_61 = fetch_openml(data_id=61) + >>> iris_61.details['version'] + '1' + >>> iris_61.details['id'] + '61' + + >>> iris_969 = fetch_openml(data_id=969) + >>> iris_969.details['version'] + '3' + >>> iris_969.details['id'] + '969' + +Specifying the dataset by the name "iris" yields the lowest version, version 1, +with the ``data_id`` 61. To make sure you always get this exact dataset, it is +safest to specify it by the dataset ``data_id``. The other dataset, with +``data_id`` 969, is version 3 (version 2 has become inactive), and contains a +binarized version of the data:: + + >>> np.unique(iris_969.target) + array(['N', 'P'], dtype=object) + +You can also specify both the name and the version, which also uniquely +identifies the dataset:: + + >>> iris_version_3 = fetch_openml(name="iris", version=3) + >>> iris_version_3.details['version'] + '3' + >>> iris_version_3.details['id'] + '969' + + +.. topic:: References: + + * Vanschoren, van Rijn, Bischl and Torgo + `"OpenML: networked science in machine learning" + `_, + ACM SIGKDD Explorations Newsletter, 15(2), 49-60, 2014. + .. _external_datasets: Loading from external datasets diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst deleted file mode 100644 index 52dd453919522..0000000000000 --- a/doc/datasets/openml.rst +++ /dev/null @@ -1,148 +0,0 @@ -.. - For doctests: - - >>> import numpy as np - >>> import os - - -.. 
_openml: - -Downloading datasets from the openml.org repository -=================================================== - -`openml.org `_ is a public repository for machine learning -data and experiments, that allows everybody to upload open datasets. - -The ``sklearn.datasets`` package is able to download datasets -from the repository using the function -:func:`sklearn.datasets.fetch_openml`. - -For example, to download a dataset of gene expressions in mice brains:: - - >>> from sklearn.datasets import fetch_openml - >>> mice = fetch_openml(name='miceprotein', version=4) - -To fully specify a dataset, you need to provide a name and a version, though -the version is optional, see :ref:`openml_versions` below. -The dataset contains a total of 1080 examples belonging to 8 different -classes:: - - >>> mice.data.shape - (1080, 77) - >>> mice.target.shape - (1080,) - >>> np.unique(mice.target) # doctest: +NORMALIZE_WHITESPACE - array(['c-CS-m', 'c-CS-s', 'c-SC-m', 'c-SC-s', 't-CS-m', 't-CS-s', 't-SC-m', 't-SC-s'], dtype=object) - -You can get more information on the dataset by looking at the ``DESCR`` -and ``details`` attributes:: - - >>> print(mice.DESCR) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP - **Author**: Clara Higuera, Katheleen J. Gardiner, Krzysztof J. Cios - **Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression) - 2015 - **Please cite**: Higuera C, Gardiner KJ, Cios KJ (2015) Self-Organizing - Feature Maps Identify Proteins Critical to Learning in a Mouse Model of Down - Syndrome. PLoS ONE 10(6): e0129126... - - >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP - {'id': '40966', 'name': 'MiceProtein', 'version': '4', 'format': 'ARFF', - 'upload_date': '2017-11-08T16:00:15', 'licence': 'Public', - 'url': 'https://www.openml.org/data/v1/download/17928620/MiceProtein.arff', - 'file_id': '17928620', 'default_target_attribute': 'class', - 'row_id_attribute': 'MouseID', - 'ignore_attribute': ['Genotype', 'Treatment', 'Behavior'], - 'tag': ['OpenML-CC18', 'study_135', 'study_98', 'study_99'], - 'visibility': 'public', 'status': 'active', - 'md5_checksum': '3c479a6885bfa0438971388283a1ce32'} - - -The ``DESCR`` contains a free-text description of the data, while ``details`` -contains a dictionary of meta-data stored by openml, like the dataset id. -For more details, see the `OpenML documentation -`_ The ``data_id`` of the mice protein dataset -is 40966, and you can use this (or the name) to get more information on the -dataset on the openml website:: - - >>> mice.url - 'https://www.openml.org/d/40966' - -The ``data_id`` also uniquely identifies a dataset from OpenML:: - - >>> mice = fetch_openml(data_id=40966) - >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP - {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF', - 'creator': ..., - 'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url': - 'https://www.openml.org/data/v1/download/1804243/MiceProtein.ARFF', 'file_id': - '1804243', 'default_target_attribute': 'class', 'citation': 'Higuera C, - Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins - Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6): - e0129126. [Web Link] journal.pone.0129126', 'tag': ['OpenML100', 'study_14', - 'study_34'], 'visibility': 'public', 'status': 'active', 'md5_checksum': - '3c479a6885bfa0438971388283a1ce32'} - -.. 
_openml_versions: - -Dataset Versions ----------------- - -A dataset is uniquely specified by its ``data_id``, but not necessarily by its -name. Several different "versions" of a dataset with the same name can exist -which can contain entirely different datasets. -If a particular version of a dataset has been found to contain significant -issues, it might be deactivated. Using a name to specify a dataset will yield -the earliest version of a dataset that is still active. That means that -``fetch_openml(name="miceprotein")`` can yield different results at different -times if earlier versions become inactive. -You can see that the dataset with ``data_id`` 40966 that we fetched above is -the version 1 of the "miceprotein" dataset:: - - >>> mice.details['version'] #doctest: +SKIP - '1' - -In fact, this dataset only has one version. The iris dataset on the other hand -has multiple versions:: - - >>> iris = fetch_openml(name="iris") - >>> iris.details['version'] #doctest: +SKIP - '1' - >>> iris.details['id'] #doctest: +SKIP - '61' - - >>> iris_61 = fetch_openml(data_id=61) - >>> iris_61.details['version'] - '1' - >>> iris_61.details['id'] - '61' - - >>> iris_969 = fetch_openml(data_id=969) - >>> iris_969.details['version'] - '3' - >>> iris_969.details['id'] - '969' - -Specifying the dataset by the name "iris" yields the lowest version, version 1, -with the ``data_id`` 61. To make sure you always get this exact dataset, it is -safest to specify it by the dataset ``data_id``. The other dataset, with -``data_id`` 969, is version 3 (version 2 has become inactive), and contains a -binarized version of the data:: - - >>> np.unique(iris_969.target) - array(['N', 'P'], dtype=object) - -You can also specify both the name and the version, which also uniquely -identifies the dataset:: - - >>> iris_version_3 = fetch_openml(name="iris", version=3) - >>> iris_version_3.details['version'] - '3' - >>> iris_version_3.details['id'] - '969' - - -.. topic:: References: - - * Vanschoren, van Rijn, Bischl and Torgo - `"OpenML: networked science in machine learning" - `_, - ACM SIGKDD Explorations Newsletter, 15(2), 49-60, 2014. diff --git a/doc/developers/advanced_installation.rst b/doc/developers/advanced_installation.rst index 720c11ed98f4c..e146363d0ac4e 100644 --- a/doc/developers/advanced_installation.rst +++ b/doc/developers/advanced_installation.rst @@ -50,7 +50,9 @@ Building Scikit-learn also requires Running tests requires -- pytest +.. |PytestMinVersion| replace:: 3.3.0 + +- pytest >=\ |PytestMinVersion| Some tests also require `pandas `_. @@ -276,9 +278,8 @@ Testing Testing scikit-learn once installed ----------------------------------- -Testing requires having the `pytest -`_ library. Some tests also require having -`pandas ` installed. +Testing requires having `pytest `_ >=\ |PytestMinVersion|\ . +Some tests also require having `pandas ` installed. After installation, the package can be tested by executing *from outside* the source directory:: diff --git a/doc/developers/maintainer.rst b/doc/developers/maintainer.rst index d0d0db8a041bb..a3309abcfbf10 100644 --- a/doc/developers/maintainer.rst +++ b/doc/developers/maintainer.rst @@ -1,8 +1,17 @@ Maintainer / core-developer information ======================================== +Before a release +---------------- + +1. Update authors table:: + + $ cd build_tools; make authors; cd .. + + and commit. 
+ Making a release ------------------- +---------------- For more information see https://github.com/scikit-learn/scikit-learn/wiki/How-to-make-a-release diff --git a/doc/install.rst b/doc/install.rst index 7dbb2287c4063..bb6b67af3e3cb 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -78,7 +78,7 @@ Canopy and Anaconda for all supported platforms `Canopy `_ and `Anaconda -`_ both ship a recent +`_ both ship a recent version of scikit-learn, in addition to a large set of scientific python library for Windows, Mac OSX and Linux. diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 5a291bfaebf17..663ca40b8c7fa 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -353,13 +353,13 @@ Like pipelines, feature unions have a shorthand constructor called Like ``Pipeline``, individual steps may be replaced using ``set_params``, -and ignored by setting to ``None``:: +and ignored by setting to ``'drop'``:: - >>> combined.set_params(kernel_pca=None) + >>> combined.set_params(kernel_pca='drop') ... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS FeatureUnion(n_jobs=None, transformer_list=[('linear_pca', PCA(copy=True,...)), - ('kernel_pca', None)], + ('kernel_pca', 'drop')], transformer_weights=None) .. topic:: Examples: diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index a41c8201a3fa1..5399f13dbc9f4 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -218,7 +218,7 @@ setting ``oob_score=True``. The size of the model with the default parameters is :math:`O( M * N * log (N) )`, where :math:`M` is the number of trees and :math:`N` is the number of samples. In order to reduce the size of the model, you can change these parameters: - ``min_samples_split``, ``max_leaf_nodes`` and ``max_depth``. + ``min_samples_split``, ``max_leaf_nodes``, ``max_depth`` and ``min_samples_leaf``. Parallelization --------------- @@ -393,7 +393,8 @@ The number of weak learners is controlled by the parameter ``n_estimators``. The the final combination. By default, weak learners are decision stumps. Different weak learners can be specified through the ``base_estimator`` parameter. The main parameters to tune to obtain good results are ``n_estimators`` and -the complexity of the base estimators (e.g., its depth ``max_depth``). +the complexity of the base estimators (e.g., its depth ``max_depth`` or +minimum required number of samples to consider a split ``min_samples_split``). .. topic:: Examples: diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst index b3867373cbf11..827cc13592f56 100644 --- a/doc/modules/feature_extraction.rst +++ b/doc/modules/feature_extraction.rst @@ -735,9 +735,9 @@ decide better:: array([[1, 1, 1, 0, 1, 1, 1, 0], [1, 1, 0, 1, 1, 1, 0, 1]]) -In the above example, ``'char_wb`` analyzer is used, which creates n-grams +In the above example, ``char_wb`` analyzer is used, which creates n-grams only from characters inside word boundaries (padded with space on each -side). The ``'char'`` analyzer, alternatively, creates n-grams that +side). The ``char`` analyzer, alternatively, creates n-grams that span across words:: >>> ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5)) diff --git a/doc/modules/lda_qda.rst b/doc/modules/lda_qda.rst index 3d45dd78f3179..e1dfb0c03ea4b 100644 --- a/doc/modules/lda_qda.rst +++ b/doc/modules/lda_qda.rst @@ -15,7 +15,7 @@ surface, respectively. 
These classifiers are attractive because they have closed-form solutions that can be easily computed, are inherently multiclass, have proven to work well in -practice and have no hyperparameters to tune. +practice, and have no hyperparameters to tune. .. |ldaqda| image:: ../auto_examples/classification/images/sphx_glr_plot_lda_qda_001.png :target: ../auto_examples/classification/plot_lda_qda.html @@ -43,7 +43,7 @@ linear subspace consisting of the directions which maximize the separation between classes (in a precise sense discussed in the mathematics section below). The dimension of the output is necessarily less than the number of classes, so this is, in general, a rather strong dimensionality reduction, and -only makes senses in a multiclass setting. +only makes sense in a multiclass setting. This is implemented in :func:`discriminant_analysis.LinearDiscriminantAnalysis.transform`. The desired @@ -70,10 +70,10 @@ the class conditional distribution of the data :math:`P(X|y=k)` for each class and we select the class :math:`k` which maximizes this conditional probability. More specifically, for linear and quadratic discriminant analysis, -:math:`P(X|y)` is modelled as a multivariate Gaussian distribution with +:math:`P(X|y)` is modeled as a multivariate Gaussian distribution with density: -.. math:: P(X | y=k) = \frac{1}{(2\pi)^{d/2} |\Sigma_k|^{1/2}}\exp\left(-\frac{1}{2} (X-\mu_k)^t \Sigma_k^{-1} (X-\mu_k)\right) +.. math:: P(X | y=k) = \frac{1}{(2\pi)^{d/2} |\Sigma_k|^{1/2}}\exp\left(-\frac{1}{2} (X-\mu_k)^t \Sigma_k^{-1} (X-\mu_k)\right) where :math:`d` is the number of features. @@ -85,7 +85,7 @@ matrices, or by a regularized estimator: see the section on shrinkage below). In the case of LDA, the Gaussians for each class are assumed to share the same covariance matrix: :math:`\Sigma_k = \Sigma` for all :math:`k`. This leads to -linear decision surfaces between, as can be seen by comparing the +linear decision surfaces, which can be seen by comparing the log-probability ratios :math:`\log[P(y=k | X) / P(y=l | X)]`: .. math:: @@ -127,7 +127,7 @@ classifier, there is a dimensionality reduction by linear projection onto a :math:`K-1` dimensional space. We can reduce the dimension even more, to a chosen :math:`L`, by projecting -onto the linear subspace :math:`H_L` which maximize the variance of the +onto the linear subspace :math:`H_L` which maximizes the variance of the :math:`\mu^*_k` after projection (in effect, we are doing a form of PCA for the transformed class means :math:`\mu^*_k`). 
This :math:`L` corresponds to the ``n_components`` parameter used in the diff --git a/doc/modules/model_persistence.rst b/doc/modules/model_persistence.rst index f5173e5d9f3fe..ccf5755c1c7e9 100644 --- a/doc/modules/model_persistence.rst +++ b/doc/modules/model_persistence.rst @@ -35,7 +35,7 @@ persistence model, namely `pickle >> y[0] 0 -In the specific case of scikit-learn, it may be more interesting to use +In the specific case of scikit-learn, it may be better to use joblib's replacement of pickle (``joblib.dump`` & ``joblib.load``), which is more efficient on objects that carry large numpy arrays internally as is often the case for fitted scikit-learn estimators, but can only pickle to the diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst index 9dbe013bef5d7..3482d4246cda7 100644 --- a/doc/modules/outlier_detection.rst +++ b/doc/modules/outlier_detection.rst @@ -8,9 +8,9 @@ Novelty and Outlier Detection Many applications require being able to decide whether a new observation belongs to the same distribution as existing observations (it is an -`inlier`), or should be considered as different (it is an outlier). +*inlier*), or should be considered as different (it is an *outlier*). Often, this ability is used to clean real data sets. Two important -distinction must be made: +distinctions must be made: :outlier detection: The training data contains outliers which are defined as observations that @@ -35,7 +35,7 @@ a low density region of the training data, considered as normal in this context. The scikit-learn project provides a set of machine learning tools that -can be used both for novelty or outliers detection. This strategy is +can be used both for novelty or outlier detection. This strategy is implemented with objects learning in an unsupervised way from the data:: estimator.fit(X_train) @@ -77,6 +77,18 @@ not available. The scores of abnormality of the training samples are always accessible through the ``negative_outlier_factor_`` attribute. +The behavior of :class:`neighbors.LocalOutlierFactor` is summarized in the +following table. + +===================== ================================ ===================== +Method Outlier detection Novelty detection +===================== ================================ ===================== +``fit_predict`` OK Not available +``predict`` Not available Use only on new data +``decision_function`` Not available Use only on new data +``score_samples`` Use ``negative_outlier_factor_`` Use only on new data +===================== ================================ ===================== + Overview of outlier detection methods ===================================== @@ -162,7 +174,7 @@ Outlier Detection Outlier detection is similar to novelty detection in the sense that the goal is to separate a core of regular observations from some -polluting ones, called "outliers". Yet, in the case of outlier +polluting ones, called *outliers*. Yet, in the case of outlier detection, we don't have a clean data set representing the population of regular observations that can be used to train any tool. @@ -341,19 +353,7 @@ Note that ``fit_predict`` is not available in this case. The scores of abnormality of the training samples are always accessible through the ``negative_outlier_factor_`` attribute. -The behavior of LOF is summarized in the following table. 
- -==================== ================================ ===================== -Method Outlier detection Novelty detection -==================== ================================ ===================== -`fit_predict` OK Not available -`predict` Not available Use only on test data -`decision_function` Not available Use only on test data -`score_samples` Use `negative_outlier_factor_` Use only on test data -==================== ================================ ===================== - - -This strategy is illustrated below. +Novelty detection with Local Outlier Factor is illustrated below. .. figure:: ../auto_examples/neighbors/images/sphx_glr_plot_lof_novelty_detection_001.png :target: ../auto_examples/neighbors/sphx_glr_plot_lof_novelty_detection.html diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index bd065c14f7444..4429dd8b13cf6 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -164,11 +164,12 @@ Each row of the coefficients corresponds to one of the ``n_class`` many order of the "one" class. In the case of "one-vs-one" :class:`SVC`, the layout of the attributes -is a little more involved. In the case of having a linear kernel, -The layout of ``coef_`` and ``intercept_`` is similar to the one -described for :class:`LinearSVC` described above, except that the shape of -``coef_`` is ``[n_class * (n_class - 1) / 2, n_features]``, corresponding to as -many binary classifiers. The order for classes +is a little more involved. In the case of having a linear kernel, the +attributes ``coef_`` and ``intercept_`` have the shape +``[n_class * (n_class - 1) / 2, n_features]`` and +``[n_class * (n_class - 1) / 2]`` respectively. This is similar to the +layout for :class:`LinearSVC` described above, with each row now corresponding +to a binary classifier. The order for classes 0 to n is "0 vs 1", "0 vs 2" , ... "0 vs n", "1 vs 2", "1 vs 3", "1 vs n", . . . "n-1 vs n". diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index 5d448f86a3f11..97797191e5e15 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -166,17 +166,6 @@ render these plots inline automatically:: .. figure:: ../images/iris.pdf :align: center -After being fitted, the model can then be used to predict the class of samples:: - - >>> clf.predict(iris.data[:1, :]) - array([0]) - -Alternatively, the probability of each class can be predicted, which is the -fraction of training samples of the same class in a leaf:: - - >>> clf.predict_proba(iris.data[:1, :]) - array([[1., 0., 0.]]) - .. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_001.png :target: ../auto_examples/tree/plot_iris.html :align: center @@ -330,18 +319,31 @@ Tips on practical use for each additional level the tree grows to. Use ``max_depth`` to control the size of the tree to prevent overfitting. - * Use ``min_samples_split`` to control the number of samples at a leaf node. - A very small number will usually mean the tree will overfit, whereas a - large number will prevent the tree from learning the data. If the sample - size varies greatly, a float number can be used as percentage in this - parameter. Note that ``min_samples_split`` can create arbitrarily - small leaves. + * Use ``min_samples_split`` or ``min_samples_leaf`` to ensure that multiple + samples inform every decision in the tree, by controlling which splits will + be considered. A very small number will usually mean the tree will overfit, + whereas a large number will prevent the tree from learning the data. Try + ``min_samples_leaf=5`` as an initial value. 
If the sample size varies + greatly, a float number can be used as percentage in these two parameters. + While ``min_samples_split`` can create arbitrarily small leaves, + ``min_samples_leaf`` guarantees that each leaf has a minimum size, avoiding + low-variance, over-fit leaf nodes in regression problems. For + classification with few classes, ``min_samples_leaf=1`` is often the best + choice. * Balance your dataset before training to prevent the tree from being biased toward the classes that are dominant. Class balancing can be done by sampling an equal number of samples from each class, or preferably by normalizing the sum of the sample weights (``sample_weight``) for each - class to the same value. + class to the same value. Also note that weight-based pre-pruning criteria, + such as ``min_weight_fraction_leaf``, will then be less biased toward + dominant classes than criteria that are not aware of the sample weights, + like ``min_samples_leaf``. + + * If the samples are weighted, it will be easier to optimize the tree + structure using weight-based pre-pruning criterion such as + ``min_weight_fraction_leaf``, which ensure that leaf nodes contain at least + a fraction of the overall sum of the sample weights. * All decision trees use ``np.float32`` arrays internally. If training data is not in this format, a copy of the dataset will be made. diff --git a/doc/related_projects.rst b/doc/related_projects.rst index 9e5d5a32c0575..ce5f5c24dbf3a 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -183,7 +183,10 @@ and tasks. - `multiisotonic `_ Isotonic regression on multidimensional features. - + +- `scikit-multilearn `_ Multi-label classification with + focus on label space manipulation. + - `seglearn `_ Time series and sequence learning using sliding window segmentation. diff --git a/doc/themes/scikit-learn/layout.html b/doc/themes/scikit-learn/layout.html index 79ddd08093012..21136856aa6d2 100644 --- a/doc/themes/scikit-learn/layout.html +++ b/doc/themes/scikit-learn/layout.html @@ -340,17 +340,13 @@

         Machine Learning in Python
   {% if theme_google_analytics|tobool %}
-    <!-- removed: previous Google Analytics snippet (script contents elided) -->
+    <!-- added: updated Google Analytics snippet (script contents elided) -->
   {% endif %}
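
The ``CHECK_WARNINGS`` flag wired into the Travis and AppVeyor configurations above only
appends two pytest warning filters, so its effect can be approximated outside CI. A minimal
sketch of the equivalent local invocation (the flag name and the ``-Werror`` filters are taken
from the scripts in this diff; the surrounding local setup is illustrative, not part of the
change)::

    # Sketch: reproduce the CHECK_WARNINGS behaviour locally.
    # Assumes pytest >= 3.3.0 and an installed scikit-learn, as required
    # elsewhere in this diff.
    export CHECK_WARNINGS="true"

    PYTEST_ARGS=""
    if [[ -n "$CHECK_WARNINGS" ]]; then
        # Promote the two warning classes checked on CI to hard errors.
        PYTEST_ARGS="-Werror::DeprecationWarning -Werror::FutureWarning"
    fi

    pytest --showlocals --durations=20 $PYTEST_ARGS --pyargs sklearn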