diff --git a/.circleci/config.yml b/.circleci/config.yml index 3a1bc848942d3..0e77f30d18ed7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -41,13 +41,12 @@ jobs: # Test examples run with minimal dependencies - MINICONDA_PATH: ~/miniconda - CONDA_ENV_NAME: testenv - - PYTHON_VERSION: 2 - - NUMPY_VERSION: 1.8.2 - # XXX: plot_gpc_xor.py fails with scipy 0.13.3 - - SCIPY_VERSION: 0.14 - - MATPLOTLIB_VERSION: 1.3 - - SCIKIT_IMAGE_VERSION: 0.9.3 - - PANDAS_VERSION: 0.13.1 + - PYTHON_VERSION: "2" + - NUMPY_VERSION: "1.10" + - SCIPY_VERSION: "0.16" + - MATPLOTLIB_VERSION: "1.4" + - SCIKIT_IMAGE_VERSION: "0.11" + - PANDAS_VERSION: "0.17.1" steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh @@ -65,6 +64,21 @@ jobs: path: ~/log.txt destination: log.txt + pypy3: + docker: + - image: pypy:3-6.0.0 + steps: + - restore_cache: + keys: + - pypy3-ccache-{{ .Branch }} + - pypy3-ccache + - checkout + - run: ./build_tools/circle/build_test_pypy.sh + - save_cache: + key: pypy3-ccache-{{ .Branch }}-{{ .BuildNum }} + paths: + - ~/.ccache + - ~/.cache/pip deploy: docker: @@ -89,6 +103,21 @@ workflows: jobs: - python3 - python2 + - pypy3: + filters: + branches: + only: + - 0.20.X - deploy: requires: - python3 + pypy: + triggers: + - schedule: + cron: "0 0 * * *" + filters: + branches: + only: + - master + jobs: + - pypy3 diff --git a/.travis.yml b/.travis.yml index 7196296a386d3..4b0a7d0f4281b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -46,7 +46,7 @@ matrix: CYTHON_VERSION="*" PYAMG_VERSION="*" PILLOW_VERSION="*" JOBLIB_VERSION="*" COVERAGE=true CHECK_PYTEST_SOFT_DEPENDENCY="true" TEST_DOCSTRINGS="true" - SKLEARN_SITE_JOBLIB=1 + SKLEARN_SITE_JOBLIB=1 CHECK_WARNINGS="true" if: type != cron # flake8 linting on diff wrt common ancestor with upstream/master - env: RUN_FLAKE8="true" SKIP_TESTS="true" @@ -58,7 +58,7 @@ matrix: # installed from their CI wheels in a virtualenv with the Python # interpreter provided by travis. - python: 3.6 - env: DISTRIB="scipy-dev" + env: DISTRIB="scipy-dev" CHECK_WARNINGS="true" if: type = cron OR commit_message =~ /\[scipy-dev\]/ install: source build_tools/travis/install.sh diff --git a/AUTHORS.rst b/AUTHORS.rst deleted file mode 100644 index 48427fc0a2b3a..0000000000000 --- a/AUTHORS.rst +++ /dev/null @@ -1,75 +0,0 @@ -.. -*- mode: rst -*- - - -This is a community effort, and as such many people have contributed -to it over the years. - -History -------- - -This project was started in 2007 as a Google Summer of Code project by -David Cournapeau. Later that year, Matthieu Brucher started work on -this project as part of his thesis. - -In 2010 Fabian Pedregosa, Gael Varoquaux, Alexandre Gramfort and Vincent -Michel of INRIA took leadership of the project and made the first public -release, February the 1st 2010. Since then, several releases have appeared -following a ~3 month cycle, and a thriving international community has -been leading the development. - -People ------- - -The following people have been core contributors to scikit-learn's development and maintenance: - -.. 
hlist:: - - * `Mathieu Blondel `_ - * `Matthieu Brucher `_ - * Lars Buitinck - * David Cournapeau - * `Noel Dawe `_ - * Vincent Dubourg - * Edouard Duchesnay - * `Tom Dupré la Tour `_ - * Alexander Fabisch - * `Virgile Fritsch `_ - * `Satra Ghosh `_ - * `Angel Soler Gollonet `_ - * Chris Filo Gorgolewski - * `Alexandre Gramfort `_ - * `Olivier Grisel `_ - * `Jaques Grobler `_ - * `Yaroslav Halchenko `_ - * `Brian Holt `_ - * `Arnaud Joly `_ - * Thouis (Ray) Jones - * `Kyle Kastner `_ - * `Manoj Kumar `_ - * Robert Layton - * `Guillaume Lemaitre `_ - * `Wei Li `_ - * Paolo Losi - * `Gilles Louppe `_ - * `Jan Hendrik Metzen `_ - * Vincent Michel - * Jarrod Millman - * `Andreas Müller `_ (release manager) - * `Vlad Niculae `_ - * `Joel Nothman `_ - * `Alexandre Passos `_ - * `Fabian Pedregosa `_ - * `Peter Prettenhofer `_ - * `Hanmin Qin `_ - * Bertrand Thirion - * `Joris Van den Bossche `_ - * `Jake VanderPlas `_ - * Nelle Varoquaux - * `Gael Varoquaux `_ - * Ron Weiss - * `Roman Yurchak `_ - -Please do not email the authors directly to ask for assistance or report issues. -Instead, please see `What's the best way to ask questions about scikit-learn -`_ -in the FAQ. diff --git a/README.rst b/README.rst index eb1957686acaf..b4d67af56eec8 100644 --- a/README.rst +++ b/README.rst @@ -56,8 +56,8 @@ scikit-learn requires: **Scikit-learn 0.20 is the last version to support Python2.7.** Scikit-learn 0.21 and later will require Python 3.5 or newer. -For running the examples Matplotlib >= 1.3.1 is required. A few examples -require scikit-image >= 0.9.3 and a few examples require pandas >= 0.13.1. +For running the examples Matplotlib >= 1.4 is required. A few examples +require scikit-image >= 0.11.3 and a few examples require pandas >= 0.17.1. scikit-learn also uses CBLAS, the C interface to the Basic Linear Algebra Subprograms library. scikit-learn comes with a reference implementation, but @@ -120,7 +120,7 @@ Testing ~~~~~~~ After installation, you can launch the test suite from outside the -source directory (you will need to have the ``pytest`` package installed):: +source directory (you will need to have ``pytest`` >= 3.3.0 installed):: pytest sklearn diff --git a/appveyor.yml b/appveyor.yml index 5eb4d08a8737d..c8a464723ff6c 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -20,6 +20,7 @@ environment: - PYTHON: "C:\\Python37-x64" PYTHON_VERSION: "3.7.0" PYTHON_ARCH: "64" + CHECK_WARNINGS: "true" - PYTHON: "C:\\Python27" PYTHON_VERSION: "2.7.8" @@ -72,7 +73,13 @@ test_script: # installed library. 
- mkdir "../empty_folder" - cd "../empty_folder" - - pytest --showlocals --durations=20 --pyargs sklearn + - ps: >- + if (Test-Path variable:global:CHECK_WARNINGS) { + $env:PYTEST_ARGS = "-Werror::DeprecationWarning -Werror::FutureWarning" + } else { + $env:PYTEST_ARGS = "" + } + - "pytest --showlocals --durations=20 %PYTEST_ARGS% --pyargs sklearn" # Move back to the project folder - cd "../scikit-learn" diff --git a/build_tools/Makefile b/build_tools/Makefile new file mode 100644 index 0000000000000..68162733b4b11 --- /dev/null +++ b/build_tools/Makefile @@ -0,0 +1,4 @@ +# Makefile for maintenance tools + +authors: + python generate_authors_table.py > ../doc/authors.rst diff --git a/build_tools/circle/build_test_pypy.sh b/build_tools/circle/build_test_pypy.sh index 18fa361821d14..922bbac8e23a6 100755 --- a/build_tools/circle/build_test_pypy.sh +++ b/build_tools/circle/build_test_pypy.sh @@ -18,13 +18,16 @@ source pypy-env/bin/activate python --version which python -pip install --extra-index https://antocuni.github.io/pypy-wheels/ubuntu numpy==1.14.4 Cython pytest +pip install --extra-index https://antocuni.github.io/pypy-wheels/ubuntu numpy Cython pytest pip install "scipy>=1.1.0" sphinx numpydoc docutils ccache -M 512M export CCACHE_COMPRESS=1 export PATH=/usr/lib/ccache:$PATH +export LOKY_MAX_CPU_COUNT="2" -pip install -e . +pip install -vv -e . -make test +python -m pytest sklearn/ +python -m pytest doc/sphinxext/ +python -m pytest $(find doc -name '*.rst' | sort) diff --git a/build_tools/generate_authors_table.py b/build_tools/generate_authors_table.py new file mode 100644 index 0000000000000..ea3796473396d --- /dev/null +++ b/build_tools/generate_authors_table.py @@ -0,0 +1,117 @@ +""" +This script generates an html table of contributors, with names and avatars. +The list is generated from scikit-learn's teams on GitHub, plus a small number +of hard-coded contributors. + +The table should be updated for each new inclusion in the teams. +Generating the table requires admin rights. +""" +from __future__ import print_function + +import sys +import requests +import getpass + +try: + # With authentication: up to 5000 requests per hour. + print("user:", file=sys.stderr) + user = input() + passwd = getpass.getpass() + auth = (user, passwd) +except IndexError: + # Without authentication: up to 60 requests per hour. + auth = None + +ROW_SIZE = 7 +LOGO_URL = 'https://avatars2.githubusercontent.com/u/365630?v=4' + + +def group_iterable(iterable, size): + """Group iterable into lines""" + group = [] + for element in iterable: + group.append(element) + if len(group) == size: + yield group + group = [] + if len(group) != 0: + yield group + + +def get_contributors(): + """Get the list of contributor profiles. 
Require admin rights.""" + # get members of scikit-learn teams on GitHub + members = [] + for team in [11523, 33471]: + for page in [1, 2]: # 30 per page + members.extend(requests.get( + "https://api.github.com/teams/%d/members?page=%d" + % (team, page), auth=auth).json()) + + # keep only the logins + logins = [c['login'] for c in members] + # add missing contributors with GitHub accounts + logins.extend(['dubourg', 'jarrodmillman', 'mbrucher', 'thouis']) + # add missing contributors without GitHub accounts + logins.extend(['Angel Soler Gollonet']) + # remove duplicate + logins = set(logins) + # remove CI + logins.remove('sklearn-ci') + + # get profiles from GitHub + profiles = [get_profile(login) for login in logins] + # sort by last name + profiles = sorted(profiles, key=key) + + return profiles + + +def get_profile(login): + """Get the GitHub profile from login""" + profile = requests.get("https://api.github.com/users/%s" % login, + auth=auth).json() + if 'name' not in profile: + # default profile if the login does not exist + return dict(name=login, avatar_url=LOGO_URL, html_url="") + else: + if profile["name"] is None: + profile["name"] = profile["login"] + + # fix missing names + missing_names = {'bthirion': 'Bertrand Thirion', + 'dubourg': 'Vincent Dubourg', + 'Duchesnay': 'Edouard Duchesnay', + 'Lars': 'Lars Buitinck', + 'MechCoder': 'Manoj Kumar'} + if profile["name"] in missing_names: + profile["name"] = missing_names[profile["name"]] + return profile + + +def key(profile): + """Get the last name in lower case""" + return profile["name"].split(' ')[-1].lower() + + +contributors = get_contributors() + +print(".. raw :: html\n") +print(" ") +print(" ") +print(" " + % (int(100 / ROW_SIZE), ROW_SIZE)) +print(" ") +for row in group_iterable(contributors, size=ROW_SIZE): + print(" ") + for contributor in row: + print(" ") + print(" ") +print("
") + print("
" + % (contributor["html_url"], contributor["avatar_url"])) + print("

%s

" % contributor["name"]) + print("
") diff --git a/build_tools/travis/test_script.sh b/build_tools/travis/test_script.sh index 1cf24d10837c7..5036e19b3a6f0 100755 --- a/build_tools/travis/test_script.sh +++ b/build_tools/travis/test_script.sh @@ -38,6 +38,13 @@ run_tests() { if [[ "$COVERAGE" == "true" ]]; then TEST_CMD="$TEST_CMD --cov sklearn" fi + + if [[ -n "$CHECK_WARNINGS" ]]; then + TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning" + fi + + set -x # print executed commands to the terminal + $TEST_CMD sklearn # Going back to git checkout folder needed to test documentation diff --git a/conftest.py b/conftest.py index 621097bfc47ab..82c4b17faeef0 100644 --- a/conftest.py +++ b/conftest.py @@ -11,6 +11,19 @@ import pytest from _pytest.doctest import DoctestItem +from sklearn.utils.fixes import PY3_OR_LATER + +PYTEST_MIN_VERSION = '3.3.0' + +if LooseVersion(pytest.__version__) < PYTEST_MIN_VERSION: + raise('Your version of pytest is too old, you should have at least ' + 'pytest >= {} installed.'.format(PYTEST_MIN_VERSION)) + + +def pytest_addoption(parser): + parser.addoption("--skip-network", action="store_true", default=False, + help="skip network tests") + def pytest_collection_modifyitems(config, items): @@ -19,22 +32,35 @@ def pytest_collection_modifyitems(config, items): skip_marker = pytest.mark.skip( reason='FeatureHasher is not compatible with PyPy') for item in items: - if item.name == 'sklearn.feature_extraction.hashing.FeatureHasher': + if item.name in ( + 'sklearn.feature_extraction.hashing.FeatureHasher', + 'sklearn.feature_extraction.text.HashingVectorizer'): item.add_marker(skip_marker) + # Skip tests which require internet if the flag is provided + if config.getoption("--skip-network"): + skip_network = pytest.mark.skip( + reason="test requires internet connectivity") + for item in items: + if "network" in item.keywords: + item.add_marker(skip_network) + # numpy changed the str/repr formatting of numpy arrays in 1.14. We want to - # run doctests only for numpy >= 1.14. - skip_doctests = True + # run doctests only for numpy >= 1.14. We want to skip the doctest for + # python 2 due to unicode. + skip_doctests = False + if not PY3_OR_LATER: + skip_doctests = True try: import numpy as np - if LooseVersion(np.__version__) >= LooseVersion('1.14'): - skip_doctests = False + if LooseVersion(np.__version__) < LooseVersion('1.14'): + skip_doctests = True except ImportError: pass if skip_doctests: skip_marker = pytest.mark.skip( - reason='doctests are only run for numpy >= 1.14') + reason='doctests are only run for numpy >= 1.14 and python >= 3') for item in items: if isinstance(item, DoctestItem): diff --git a/doc/about.rst b/doc/about.rst index 90295b96fb6ff..218b0ad897fe4 100644 --- a/doc/about.rst +++ b/doc/about.rst @@ -1,7 +1,31 @@ About us ======== -.. include:: ../AUTHORS.rst +History +------- + +This project was started in 2007 as a Google Summer of Code project by +David Cournapeau. Later that year, Matthieu Brucher started work on +this project as part of his thesis. + +In 2010 Fabian Pedregosa, Gael Varoquaux, Alexandre Gramfort and Vincent +Michel of INRIA took leadership of the project and made the first public +release, February the 1st 2010. Since then, several releases have appeared +following a ~3 month cycle, and a thriving international community has +been leading the development. + +Authors +------- + +The following people have been core contributors to scikit-learn's development +and maintenance: + +.. 
include:: authors.rst + +Please do not email the authors directly to ask for assistance or report issues. +Instead, please see `What's the best way to ask questions about scikit-learn +`_ +in the FAQ. .. seealso:: diff --git a/doc/authors.rst b/doc/authors.rst new file mode 100644 index 0000000000000..0210dff4bef6e --- /dev/null +++ b/doc/authors.rst @@ -0,0 +1,220 @@ +.. raw :: html + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+    <!-- Generated table of contributor cards: each cell shows a GitHub avatar
+         linked to the contributor's profile page; the table markup is elided
+         in this view. Contributors, in order: Mathieu Blondel,
+         Joris Van den Bossche, Matthieu Brucher, Lars Buitinck, David Cournapeau,
+         Noel Dawe, Shiqiao Du, Vincent Dubourg, Edouard Duchesnay, Loïc Estève,
+         Alexander Fabisch, Virgile Fritsch, Satrajit Ghosh, Angel Soler Gollonet,
+         Chris Filo Gorgolewski, Alexandre Gramfort, Olivier Grisel, Jaques Grobler,
+         Yaroslav Halchenko, Brian Holt, Arnaud Joly, Thouis (Ray) Jones,
+         Kyle Kastner, Manoj Kumar, Robert Layton, Guillaume Lemaitre, Wei Li,
+         Paolo Losi, Gilles Louppe, Jan Hendrik Metzen, Vincent Michel,
+         Jarrod Millman, Andreas Mueller, Vlad Niculae, Joel Nothman,
+         Alexandre Passos, Fabian Pedregosa, Peter Prettenhofer, Hanmin Qin,
+         (Venkat) Raghav, Rajagopalan, Jacob Schreiber, Bertrand Thirion,
+         Tom Dupré la Tour, Jake Vanderplas, Nelle Varoquaux, Gael Varoquaux,
+         David Warde-Farley, Ron Weiss, Roman Yurchak. -->
diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst index 947e55f0c4c37..e0640916fbb64 100644 --- a/doc/datasets/index.rst +++ b/doc/datasets/index.rst @@ -351,6 +351,154 @@ features:: _`Faster API-compatible implementation`: https://github.com/mblondel/svmlight-loader +.. + For doctests: + + >>> import numpy as np + >>> import os + +.. _openml: + +Downloading datasets from the openml.org repository +--------------------------------------------------- + +`openml.org `_ is a public repository for machine learning +data and experiments, that allows everybody to upload open datasets. + +The ``sklearn.datasets`` package is able to download datasets +from the repository using the function +:func:`sklearn.datasets.fetch_openml`. + +For example, to download a dataset of gene expressions in mice brains:: + + >>> from sklearn.datasets import fetch_openml + >>> mice = fetch_openml(name='miceprotein', version=4) + +To fully specify a dataset, you need to provide a name and a version, though +the version is optional, see :ref:`openml_versions` below. +The dataset contains a total of 1080 examples belonging to 8 different +classes:: + + >>> mice.data.shape + (1080, 77) + >>> mice.target.shape + (1080,) + >>> np.unique(mice.target) # doctest: +NORMALIZE_WHITESPACE + array(['c-CS-m', 'c-CS-s', 'c-SC-m', 'c-SC-s', 't-CS-m', 't-CS-s', 't-SC-m', 't-SC-s'], dtype=object) + +You can get more information on the dataset by looking at the ``DESCR`` +and ``details`` attributes:: + + >>> print(mice.DESCR) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP + **Author**: Clara Higuera, Katheleen J. Gardiner, Krzysztof J. Cios + **Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression) - 2015 + **Please cite**: Higuera C, Gardiner KJ, Cios KJ (2015) Self-Organizing + Feature Maps Identify Proteins Critical to Learning in a Mouse Model of Down + Syndrome. PLoS ONE 10(6): e0129126... + + >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP + {'id': '40966', 'name': 'MiceProtein', 'version': '4', 'format': 'ARFF', + 'upload_date': '2017-11-08T16:00:15', 'licence': 'Public', + 'url': 'https://www.openml.org/data/v1/download/17928620/MiceProtein.arff', + 'file_id': '17928620', 'default_target_attribute': 'class', + 'row_id_attribute': 'MouseID', + 'ignore_attribute': ['Genotype', 'Treatment', 'Behavior'], + 'tag': ['OpenML-CC18', 'study_135', 'study_98', 'study_99'], + 'visibility': 'public', 'status': 'active', + 'md5_checksum': '3c479a6885bfa0438971388283a1ce32'} + + +The ``DESCR`` contains a free-text description of the data, while ``details`` +contains a dictionary of meta-data stored by openml, like the dataset id. 
+For more details, see the `OpenML documentation +`_ The ``data_id`` of the mice protein dataset +is 40966, and you can use this (or the name) to get more information on the +dataset on the openml website:: + + >>> mice.url + 'https://www.openml.org/d/40966' + +The ``data_id`` also uniquely identifies a dataset from OpenML:: + + >>> mice = fetch_openml(data_id=40966) + >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP + {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF', + 'creator': ..., + 'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url': + 'https://www.openml.org/data/v1/download/1804243/MiceProtein.ARFF', 'file_id': + '1804243', 'default_target_attribute': 'class', 'citation': 'Higuera C, + Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins + Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6): + e0129126. [Web Link] journal.pone.0129126', 'tag': ['OpenML100', 'study_14', + 'study_34'], 'visibility': 'public', 'status': 'active', 'md5_checksum': + '3c479a6885bfa0438971388283a1ce32'} + +.. _openml_versions: + +Dataset Versions +~~~~~~~~~~~~~~~~ + +A dataset is uniquely specified by its ``data_id``, but not necessarily by its +name. Several different "versions" of a dataset with the same name can exist +which can contain entirely different datasets. +If a particular version of a dataset has been found to contain significant +issues, it might be deactivated. Using a name to specify a dataset will yield +the earliest version of a dataset that is still active. That means that +``fetch_openml(name="miceprotein")`` can yield different results at different +times if earlier versions become inactive. +You can see that the dataset with ``data_id`` 40966 that we fetched above is +the version 1 of the "miceprotein" dataset:: + + >>> mice.details['version'] #doctest: +SKIP + '1' + +In fact, this dataset only has one version. The iris dataset on the other hand +has multiple versions:: + + >>> iris = fetch_openml(name="iris") + >>> iris.details['version'] #doctest: +SKIP + '1' + >>> iris.details['id'] #doctest: +SKIP + '61' + + >>> iris_61 = fetch_openml(data_id=61) + >>> iris_61.details['version'] + '1' + >>> iris_61.details['id'] + '61' + + >>> iris_969 = fetch_openml(data_id=969) + >>> iris_969.details['version'] + '3' + >>> iris_969.details['id'] + '969' + +Specifying the dataset by the name "iris" yields the lowest version, version 1, +with the ``data_id`` 61. To make sure you always get this exact dataset, it is +safest to specify it by the dataset ``data_id``. The other dataset, with +``data_id`` 969, is version 3 (version 2 has become inactive), and contains a +binarized version of the data:: + + >>> np.unique(iris_969.target) + array(['N', 'P'], dtype=object) + +You can also specify both the name and the version, which also uniquely +identifies the dataset:: + + >>> iris_version_3 = fetch_openml(name="iris", version=3) + >>> iris_version_3.details['version'] + '3' + >>> iris_version_3.details['id'] + '969' + + +.. topic:: References: + + * Vanschoren, van Rijn, Bischl and Torgo + `"OpenML: networked science in machine learning" + `_, + ACM SIGKDD Explorations Newsletter, 15(2), 49-60, 2014. + .. _external_datasets: Loading from external datasets diff --git a/doc/datasets/openml.rst b/doc/datasets/openml.rst deleted file mode 100644 index 52dd453919522..0000000000000 --- a/doc/datasets/openml.rst +++ /dev/null @@ -1,148 +0,0 @@ -.. - For doctests: - - >>> import numpy as np - >>> import os - - -.. 
_openml: - -Downloading datasets from the openml.org repository -=================================================== - -`openml.org `_ is a public repository for machine learning -data and experiments, that allows everybody to upload open datasets. - -The ``sklearn.datasets`` package is able to download datasets -from the repository using the function -:func:`sklearn.datasets.fetch_openml`. - -For example, to download a dataset of gene expressions in mice brains:: - - >>> from sklearn.datasets import fetch_openml - >>> mice = fetch_openml(name='miceprotein', version=4) - -To fully specify a dataset, you need to provide a name and a version, though -the version is optional, see :ref:`openml_versions` below. -The dataset contains a total of 1080 examples belonging to 8 different -classes:: - - >>> mice.data.shape - (1080, 77) - >>> mice.target.shape - (1080,) - >>> np.unique(mice.target) # doctest: +NORMALIZE_WHITESPACE - array(['c-CS-m', 'c-CS-s', 'c-SC-m', 'c-SC-s', 't-CS-m', 't-CS-s', 't-SC-m', 't-SC-s'], dtype=object) - -You can get more information on the dataset by looking at the ``DESCR`` -and ``details`` attributes:: - - >>> print(mice.DESCR) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP - **Author**: Clara Higuera, Katheleen J. Gardiner, Krzysztof J. Cios - **Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression) - 2015 - **Please cite**: Higuera C, Gardiner KJ, Cios KJ (2015) Self-Organizing - Feature Maps Identify Proteins Critical to Learning in a Mouse Model of Down - Syndrome. PLoS ONE 10(6): e0129126... - - >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP - {'id': '40966', 'name': 'MiceProtein', 'version': '4', 'format': 'ARFF', - 'upload_date': '2017-11-08T16:00:15', 'licence': 'Public', - 'url': 'https://www.openml.org/data/v1/download/17928620/MiceProtein.arff', - 'file_id': '17928620', 'default_target_attribute': 'class', - 'row_id_attribute': 'MouseID', - 'ignore_attribute': ['Genotype', 'Treatment', 'Behavior'], - 'tag': ['OpenML-CC18', 'study_135', 'study_98', 'study_99'], - 'visibility': 'public', 'status': 'active', - 'md5_checksum': '3c479a6885bfa0438971388283a1ce32'} - - -The ``DESCR`` contains a free-text description of the data, while ``details`` -contains a dictionary of meta-data stored by openml, like the dataset id. -For more details, see the `OpenML documentation -`_ The ``data_id`` of the mice protein dataset -is 40966, and you can use this (or the name) to get more information on the -dataset on the openml website:: - - >>> mice.url - 'https://www.openml.org/d/40966' - -The ``data_id`` also uniquely identifies a dataset from OpenML:: - - >>> mice = fetch_openml(data_id=40966) - >>> mice.details # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS +SKIP - {'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF', - 'creator': ..., - 'upload_date': '2016-02-17T14:32:49', 'licence': 'Public', 'url': - 'https://www.openml.org/data/v1/download/1804243/MiceProtein.ARFF', 'file_id': - '1804243', 'default_target_attribute': 'class', 'citation': 'Higuera C, - Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins - Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6): - e0129126. [Web Link] journal.pone.0129126', 'tag': ['OpenML100', 'study_14', - 'study_34'], 'visibility': 'public', 'status': 'active', 'md5_checksum': - '3c479a6885bfa0438971388283a1ce32'} - -.. 
_openml_versions: - -Dataset Versions ----------------- - -A dataset is uniquely specified by its ``data_id``, but not necessarily by its -name. Several different "versions" of a dataset with the same name can exist -which can contain entirely different datasets. -If a particular version of a dataset has been found to contain significant -issues, it might be deactivated. Using a name to specify a dataset will yield -the earliest version of a dataset that is still active. That means that -``fetch_openml(name="miceprotein")`` can yield different results at different -times if earlier versions become inactive. -You can see that the dataset with ``data_id`` 40966 that we fetched above is -the version 1 of the "miceprotein" dataset:: - - >>> mice.details['version'] #doctest: +SKIP - '1' - -In fact, this dataset only has one version. The iris dataset on the other hand -has multiple versions:: - - >>> iris = fetch_openml(name="iris") - >>> iris.details['version'] #doctest: +SKIP - '1' - >>> iris.details['id'] #doctest: +SKIP - '61' - - >>> iris_61 = fetch_openml(data_id=61) - >>> iris_61.details['version'] - '1' - >>> iris_61.details['id'] - '61' - - >>> iris_969 = fetch_openml(data_id=969) - >>> iris_969.details['version'] - '3' - >>> iris_969.details['id'] - '969' - -Specifying the dataset by the name "iris" yields the lowest version, version 1, -with the ``data_id`` 61. To make sure you always get this exact dataset, it is -safest to specify it by the dataset ``data_id``. The other dataset, with -``data_id`` 969, is version 3 (version 2 has become inactive), and contains a -binarized version of the data:: - - >>> np.unique(iris_969.target) - array(['N', 'P'], dtype=object) - -You can also specify both the name and the version, which also uniquely -identifies the dataset:: - - >>> iris_version_3 = fetch_openml(name="iris", version=3) - >>> iris_version_3.details['version'] - '3' - >>> iris_version_3.details['id'] - '969' - - -.. topic:: References: - - * Vanschoren, van Rijn, Bischl and Torgo - `"OpenML: networked science in machine learning" - `_, - ACM SIGKDD Explorations Newsletter, 15(2), 49-60, 2014. diff --git a/doc/developers/advanced_installation.rst b/doc/developers/advanced_installation.rst index 720c11ed98f4c..e146363d0ac4e 100644 --- a/doc/developers/advanced_installation.rst +++ b/doc/developers/advanced_installation.rst @@ -50,7 +50,9 @@ Building Scikit-learn also requires Running tests requires -- pytest +.. |PytestMinVersion| replace:: 3.3.0 + +- pytest >=\ |PytestMinVersion| Some tests also require `pandas `_. @@ -276,9 +278,8 @@ Testing Testing scikit-learn once installed ----------------------------------- -Testing requires having the `pytest -`_ library. Some tests also require having -`pandas ` installed. +Testing requires having `pytest `_ >=\ |PytestMinVersion|\ . +Some tests also require having `pandas ` installed. After installation, the package can be tested by executing *from outside* the source directory:: diff --git a/doc/developers/maintainer.rst b/doc/developers/maintainer.rst index d0d0db8a041bb..a3309abcfbf10 100644 --- a/doc/developers/maintainer.rst +++ b/doc/developers/maintainer.rst @@ -1,8 +1,17 @@ Maintainer / core-developer information ======================================== +Before a release +---------------- + +1. Update authors table:: + + $ cd build_tools; make authors; cd .. + + and commit. 
+ Making a release ------------------- +---------------- For more information see https://github.com/scikit-learn/scikit-learn/wiki/How-to-make-a-release diff --git a/doc/install.rst b/doc/install.rst index 7dbb2287c4063..bb6b67af3e3cb 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -78,7 +78,7 @@ Canopy and Anaconda for all supported platforms `Canopy `_ and `Anaconda -`_ both ship a recent +`_ both ship a recent version of scikit-learn, in addition to a large set of scientific python library for Windows, Mac OSX and Linux. diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 5a291bfaebf17..663ca40b8c7fa 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -353,13 +353,13 @@ Like pipelines, feature unions have a shorthand constructor called Like ``Pipeline``, individual steps may be replaced using ``set_params``, -and ignored by setting to ``None``:: +and ignored by setting to ``'drop'``:: - >>> combined.set_params(kernel_pca=None) + >>> combined.set_params(kernel_pca='drop') ... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS FeatureUnion(n_jobs=None, transformer_list=[('linear_pca', PCA(copy=True,...)), - ('kernel_pca', None)], + ('kernel_pca', 'drop')], transformer_weights=None) .. topic:: Examples: diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index a41c8201a3fa1..5399f13dbc9f4 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -218,7 +218,7 @@ setting ``oob_score=True``. The size of the model with the default parameters is :math:`O( M * N * log (N) )`, where :math:`M` is the number of trees and :math:`N` is the number of samples. In order to reduce the size of the model, you can change these parameters: - ``min_samples_split``, ``max_leaf_nodes`` and ``max_depth``. + ``min_samples_split``, ``max_leaf_nodes``, ``max_depth`` and ``min_samples_leaf``. Parallelization --------------- @@ -393,7 +393,8 @@ The number of weak learners is controlled by the parameter ``n_estimators``. The the final combination. By default, weak learners are decision stumps. Different weak learners can be specified through the ``base_estimator`` parameter. The main parameters to tune to obtain good results are ``n_estimators`` and -the complexity of the base estimators (e.g., its depth ``max_depth``). +the complexity of the base estimators (e.g., its depth ``max_depth`` or +minimum required number of samples to consider a split ``min_samples_split``). .. topic:: Examples: diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst index b3867373cbf11..827cc13592f56 100644 --- a/doc/modules/feature_extraction.rst +++ b/doc/modules/feature_extraction.rst @@ -735,9 +735,9 @@ decide better:: array([[1, 1, 1, 0, 1, 1, 1, 0], [1, 1, 0, 1, 1, 1, 0, 1]]) -In the above example, ``'char_wb`` analyzer is used, which creates n-grams +In the above example, ``char_wb`` analyzer is used, which creates n-grams only from characters inside word boundaries (padded with space on each -side). The ``'char'`` analyzer, alternatively, creates n-grams that +side). The ``char`` analyzer, alternatively, creates n-grams that span across words:: >>> ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5)) diff --git a/doc/modules/lda_qda.rst b/doc/modules/lda_qda.rst index 3d45dd78f3179..e1dfb0c03ea4b 100644 --- a/doc/modules/lda_qda.rst +++ b/doc/modules/lda_qda.rst @@ -15,7 +15,7 @@ surface, respectively. 
These classifiers are attractive because they have closed-form solutions that can be easily computed, are inherently multiclass, have proven to work well in -practice and have no hyperparameters to tune. +practice, and have no hyperparameters to tune. .. |ldaqda| image:: ../auto_examples/classification/images/sphx_glr_plot_lda_qda_001.png :target: ../auto_examples/classification/plot_lda_qda.html @@ -43,7 +43,7 @@ linear subspace consisting of the directions which maximize the separation between classes (in a precise sense discussed in the mathematics section below). The dimension of the output is necessarily less than the number of classes, so this is, in general, a rather strong dimensionality reduction, and -only makes senses in a multiclass setting. +only makes sense in a multiclass setting. This is implemented in :func:`discriminant_analysis.LinearDiscriminantAnalysis.transform`. The desired @@ -70,10 +70,10 @@ the class conditional distribution of the data :math:`P(X|y=k)` for each class and we select the class :math:`k` which maximizes this conditional probability. More specifically, for linear and quadratic discriminant analysis, -:math:`P(X|y)` is modelled as a multivariate Gaussian distribution with +:math:`P(X|y)` is modeled as a multivariate Gaussian distribution with density: -.. math:: P(X | y=k) = \frac{1}{(2\pi)^{d/2} |\Sigma_k|^{1/2}}\exp\left(-\frac{1}{2} (X-\mu_k)^t \Sigma_k^{-1} (X-\mu_k)\right) +.. math:: P(X | y=k) = \frac{1}{(2\pi)^{d/2} |\Sigma_k|^{1/2}}\exp\left(-\frac{1}{2} (X-\mu_k)^t \Sigma_k^{-1} (X-\mu_k)\right) where :math:`d` is the number of features. @@ -85,7 +85,7 @@ matrices, or by a regularized estimator: see the section on shrinkage below). In the case of LDA, the Gaussians for each class are assumed to share the same covariance matrix: :math:`\Sigma_k = \Sigma` for all :math:`k`. This leads to -linear decision surfaces between, as can be seen by comparing the +linear decision surfaces, which can be seen by comparing the log-probability ratios :math:`\log[P(y=k | X) / P(y=l | X)]`: .. math:: @@ -127,7 +127,7 @@ classifier, there is a dimensionality reduction by linear projection onto a :math:`K-1` dimensional space. We can reduce the dimension even more, to a chosen :math:`L`, by projecting -onto the linear subspace :math:`H_L` which maximize the variance of the +onto the linear subspace :math:`H_L` which maximizes the variance of the :math:`\mu^*_k` after projection (in effect, we are doing a form of PCA for the transformed class means :math:`\mu^*_k`). 
This :math:`L` corresponds to the ``n_components`` parameter used in the diff --git a/doc/modules/model_persistence.rst b/doc/modules/model_persistence.rst index f5173e5d9f3fe..ccf5755c1c7e9 100644 --- a/doc/modules/model_persistence.rst +++ b/doc/modules/model_persistence.rst @@ -35,7 +35,7 @@ persistence model, namely `pickle >> y[0] 0 -In the specific case of scikit-learn, it may be more interesting to use +In the specific case of scikit-learn, it may be better to use joblib's replacement of pickle (``joblib.dump`` & ``joblib.load``), which is more efficient on objects that carry large numpy arrays internally as is often the case for fitted scikit-learn estimators, but can only pickle to the diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst index 9dbe013bef5d7..3482d4246cda7 100644 --- a/doc/modules/outlier_detection.rst +++ b/doc/modules/outlier_detection.rst @@ -8,9 +8,9 @@ Novelty and Outlier Detection Many applications require being able to decide whether a new observation belongs to the same distribution as existing observations (it is an -`inlier`), or should be considered as different (it is an outlier). +*inlier*), or should be considered as different (it is an *outlier*). Often, this ability is used to clean real data sets. Two important -distinction must be made: +distinctions must be made: :outlier detection: The training data contains outliers which are defined as observations that @@ -35,7 +35,7 @@ a low density region of the training data, considered as normal in this context. The scikit-learn project provides a set of machine learning tools that -can be used both for novelty or outliers detection. This strategy is +can be used both for novelty or outlier detection. This strategy is implemented with objects learning in an unsupervised way from the data:: estimator.fit(X_train) @@ -77,6 +77,18 @@ not available. The scores of abnormality of the training samples are always accessible through the ``negative_outlier_factor_`` attribute. +The behavior of :class:`neighbors.LocalOutlierFactor` is summarized in the +following table. + +===================== ================================ ===================== +Method Outlier detection Novelty detection +===================== ================================ ===================== +``fit_predict`` OK Not available +``predict`` Not available Use only on new data +``decision_function`` Not available Use only on new data +``score_samples`` Use ``negative_outlier_factor_`` Use only on new data +===================== ================================ ===================== + Overview of outlier detection methods ===================================== @@ -162,7 +174,7 @@ Outlier Detection Outlier detection is similar to novelty detection in the sense that the goal is to separate a core of regular observations from some -polluting ones, called "outliers". Yet, in the case of outlier +polluting ones, called *outliers*. Yet, in the case of outlier detection, we don't have a clean data set representing the population of regular observations that can be used to train any tool. @@ -341,19 +353,7 @@ Note that ``fit_predict`` is not available in this case. The scores of abnormality of the training samples are always accessible through the ``negative_outlier_factor_`` attribute. -The behavior of LOF is summarized in the following table. 
- -==================== ================================ ===================== -Method Outlier detection Novelty detection -==================== ================================ ===================== -`fit_predict` OK Not available -`predict` Not available Use only on test data -`decision_function` Not available Use only on test data -`score_samples` Use `negative_outlier_factor_` Use only on test data -==================== ================================ ===================== - - -This strategy is illustrated below. +Novelty detection with Local Outlier Factor is illustrated below. .. figure:: ../auto_examples/neighbors/images/sphx_glr_plot_lof_novelty_detection_001.png :target: ../auto_examples/neighbors/sphx_glr_plot_lof_novelty_detection.html diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index bd065c14f7444..4429dd8b13cf6 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -164,11 +164,12 @@ Each row of the coefficients corresponds to one of the ``n_class`` many order of the "one" class. In the case of "one-vs-one" :class:`SVC`, the layout of the attributes -is a little more involved. In the case of having a linear kernel, -The layout of ``coef_`` and ``intercept_`` is similar to the one -described for :class:`LinearSVC` described above, except that the shape of -``coef_`` is ``[n_class * (n_class - 1) / 2, n_features]``, corresponding to as -many binary classifiers. The order for classes +is a little more involved. In the case of having a linear kernel, the +attributes ``coef_`` and ``intercept_`` have the shape +``[n_class * (n_class - 1) / 2, n_features]`` and +``[n_class * (n_class - 1) / 2]`` respectively. This is similar to the +layout for :class:`LinearSVC` described above, with each row now corresponding +to a binary classifier. The order for classes 0 to n is "0 vs 1", "0 vs 2" , ... "0 vs n", "1 vs 2", "1 vs 3", "1 vs n", . . . "n-1 vs n". diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index 5d448f86a3f11..97797191e5e15 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -166,17 +166,6 @@ render these plots inline automatically:: .. figure:: ../images/iris.pdf :align: center -After being fitted, the model can then be used to predict the class of samples:: - - >>> clf.predict(iris.data[:1, :]) - array([0]) - -Alternatively, the probability of each class can be predicted, which is the -fraction of training samples of the same class in a leaf:: - - >>> clf.predict_proba(iris.data[:1, :]) - array([[1., 0., 0.]]) - .. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_001.png :target: ../auto_examples/tree/plot_iris.html :align: center @@ -330,18 +319,31 @@ Tips on practical use for each additional level the tree grows to. Use ``max_depth`` to control the size of the tree to prevent overfitting. - * Use ``min_samples_split`` to control the number of samples at a leaf node. - A very small number will usually mean the tree will overfit, whereas a - large number will prevent the tree from learning the data. If the sample - size varies greatly, a float number can be used as percentage in this - parameter. Note that ``min_samples_split`` can create arbitrarily - small leaves. + * Use ``min_samples_split`` or ``min_samples_leaf`` to ensure that multiple + samples inform every decision in the tree, by controlling which splits will + be considered. A very small number will usually mean the tree will overfit, + whereas a large number will prevent the tree from learning the data. Try + ``min_samples_leaf=5`` as an initial value. 
If the sample size varies + greatly, a float number can be used as percentage in these two parameters. + While ``min_samples_split`` can create arbitrarily small leaves, + ``min_samples_leaf`` guarantees that each leaf has a minimum size, avoiding + low-variance, over-fit leaf nodes in regression problems. For + classification with few classes, ``min_samples_leaf=1`` is often the best + choice. * Balance your dataset before training to prevent the tree from being biased toward the classes that are dominant. Class balancing can be done by sampling an equal number of samples from each class, or preferably by normalizing the sum of the sample weights (``sample_weight``) for each - class to the same value. + class to the same value. Also note that weight-based pre-pruning criteria, + such as ``min_weight_fraction_leaf``, will then be less biased toward + dominant classes than criteria that are not aware of the sample weights, + like ``min_samples_leaf``. + + * If the samples are weighted, it will be easier to optimize the tree + structure using weight-based pre-pruning criterion such as + ``min_weight_fraction_leaf``, which ensure that leaf nodes contain at least + a fraction of the overall sum of the sample weights. * All decision trees use ``np.float32`` arrays internally. If training data is not in this format, a copy of the dataset will be made. diff --git a/doc/related_projects.rst b/doc/related_projects.rst index 9e5d5a32c0575..ce5f5c24dbf3a 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -183,7 +183,10 @@ and tasks. - `multiisotonic `_ Isotonic regression on multidimensional features. - + +- `scikit-multilearn `_ Multi-label classification with + focus on label space manipulation. + - `seglearn `_ Time series and sequence learning using sliding window segmentation. diff --git a/doc/themes/scikit-learn/layout.html b/doc/themes/scikit-learn/layout.html index 79ddd08093012..21136856aa6d2 100644 --- a/doc/themes/scikit-learn/layout.html +++ b/doc/themes/scikit-learn/layout.html @@ -340,17 +340,13 @@

         Machine Learning in Python
   {% if theme_google_analytics|tobool %}
-    <!-- removed: previous Google Analytics snippet (script contents elided) -->
+    <!-- added: updated Google Analytics snippet (script contents elided) -->
   {% endif %}
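
The ``CHECK_WARNINGS`` flag wired into the Travis and AppVeyor configurations above only
appends two pytest warning filters, so its effect can be approximated outside CI. A minimal
sketch of the equivalent local invocation (the flag name and the ``-Werror`` filters are taken
from the scripts in this diff; the surrounding local setup is illustrative, not part of the
change)::

    # Sketch: reproduce the CHECK_WARNINGS behaviour locally.
    # Assumes pytest >= 3.3.0 and an installed scikit-learn, as required
    # elsewhere in this diff.
    export CHECK_WARNINGS="true"

    PYTEST_ARGS=""
    if [[ -n "$CHECK_WARNINGS" ]]; then
        # Promote the two warning classes checked on CI to hard errors.
        PYTEST_ARGS="-Werror::DeprecationWarning -Werror::FutureWarning"
    fi

    pytest --showlocals --durations=20 $PYTEST_ARGS --pyargs sklearn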