diff --git a/.circleci/artifact_path b/.circleci/artifact_path new file mode 100644 index 000000000..a03587f29 --- /dev/null +++ b/.circleci/artifact_path @@ -0,0 +1 @@ +0/doc/_changed.html \ No newline at end of file diff --git a/.circleci/config.yml b/.circleci/config.yml index c3890036b..1a53edff5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,53 +1,141 @@ version: 2 +# # Copied from https://github.com/scikit-learn/scikit-learn/blob/master/.circleci/config.yml jobs: - build: + doc-min-dependencies: docker: - - image: continuumio/miniconda3 + - image: circleci/python:3.7.3-stretch + environment: + - OMP_NUM_THREADS: 2 + - MKL_NUM_THREADS: 2 + - MINICONDA_PATH: ~/miniconda + - CONDA_ENV_NAME: testenv + - PYTHON_VERSION: 3.7 steps: - checkout + - run: ./build_tools/circle/checkout_merge_commit.sh - restore_cache: - key: deps-{{ .Branch }}-{{ checksum "setup.py" }}-{{ checksum "build_tools/circle/install.sh" }} - - run: - name: Install dependencies - command: | - bash build_tools/circle/install.sh - no_output_timeout: 1024s + keys: + - doc-min-deps-ccache-{{ .Branch }} + - doc-min-deps-ccache + - run: ./build_tools/circle/build_doc.sh - save_cache: - key: deps-{{ .Branch }}-{{ checksum "setup.py" }}-{{ checksum "build_tools/circle/install.sh" }} + key: doc-min-deps-ccache-{{ .Branch }}-{{ .BuildNum }} paths: - - /opt/conda/pkgs + - ~/.ccache + - ~/.cache/pip + - store_artifacts: + path: doc/_build/html/stable + destination: doc + - store_artifacts: + path: ~/log.txt + destination: log.txt + + doc: + docker: + - image: circleci/python:3.7.3-stretch + environment: + - OMP_NUM_THREADS: 2 + - MKL_NUM_THREADS: 2 + - MINICONDA_PATH: ~/miniconda + - CONDA_ENV_NAME: testenv + - PYTHON_VERSION: 3 + steps: + - checkout + - run: ./build_tools/circle/checkout_merge_commit.sh + - restore_cache: + keys: + - doc-ccache-{{ .Branch }} + - doc-ccache + - run: ./build_tools/circle/build_doc.sh + - save_cache: + key: doc-ccache-{{ .Branch }}-{{ .BuildNum }} + paths: + - ~/.ccache - ~/.cache/pip - - run: - name: Run build - command: | - bash build_tools/circle/execute.sh - if grep -q "Traceback (most recent call last):" nb_to_md.txt; then false; else true; fi - no_output_timeout: 3600s - store_artifacts: - path: ~/doc + path: doc/_build/html/stable destination: doc + - store_artifacts: + path: ~/log.txt + destination: log.txt + # Persists generated documentation so that it can be attached and deployed + # in the 'deploy' step. - persist_to_workspace: - root: ~/doc + root: doc/_build/html paths: . + lint: + docker: + - image: circleci/python:3.6 + steps: + - checkout + - run: ./build_tools/circle/checkout_merge_commit.sh + - run: + name: dependencies + command: sudo pip install flake8 + - run: + name: linting + command: ./build_tools/circle/linting.sh + + pypy3: + docker: + - image: pypy:3.6-7.2.0 + steps: + - restore_cache: + keys: + - pypy3-ccache-{{ .Branch }} + - pypy3-ccache + - checkout + - run: ./build_tools/circle/build_test_pypy.sh + - save_cache: + key: pypy3-ccache-{{ .Branch }}-{{ .BuildNum }} + paths: + - ~/.ccache + - ~/.cache/pip + deploy: docker: - - image: circleci/python:3.6.7 + - image: circleci/python:3.6 steps: - checkout + - run: ./build_tools/circle/checkout_merge_commit.sh + # Attach documentation generated in the 'doc' step so that it can be + # deployed. 
- attach_workspace: - at: ~/doc + at: doc/_build/html + - run: ls -ltrh doc/_build/html/stable - deploy: - command: bash build_tools/circle/deploy.sh + command: | + if [[ "${CIRCLE_BRANCH}" =~ ^master$|^[0-9]+\.[0-9]+\.X$ ]]; then + bash build_tools/circle/push_doc.sh doc/_build/html/stable + fi workflows: version: 2 build-doc-and-deploy: jobs: - - build + - lint + - doc: + requires: + - lint + - doc-min-dependencies: + requires: + - lint + - pypy3: + filters: + branches: + only: + - 0.20.X - deploy: requires: - - build + - doc + pypy: + triggers: + - schedule: + cron: "0 0 * * *" filters: branches: - only: master + only: + - master + jobs: + - pypy3 \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 8f690c549..d79ac469d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,61 +9,31 @@ cache: env: global: # Directory where tests are run from - - TEST_DIR=/tmp/sklearn + - TEST_DIR=/tmp/skopt - OMP_NUM_THREADS=4 - OPENBLAS_NUM_THREADS=4 matrix: include: - # Linux environment to test scikit-learn against numpy and scipy master - # installed from their CI wheels in a virtualenv with the Python - # interpreter provided by travis. - - name: "Python 3.5 - scikit 0.19.2" + - name: "Python 3.7 - scikit 0.24.2" python: "3.7" - env: DISTRIB="conda" PYTHON_VERSION="3.5" - NUMPY_VERSION="*" SCIPY_VERSION="*" PYAML_VERSION="*" - SCIKIT_LEARN_VERSION="0.19.2" MATPLOTLIB_VERSION="*" COVERAGE="false" - JOBLIB_VERSION="0.11" - - name: "Python 3.6 - scikit 0.20.4" - python: "3.7" - env: DISTRIB="conda" PYTHON_VERSION="3.6" - NUMPY_VERSION="*" SCIPY_VERSION="*" PYAML_VERSION="*" - SCIKIT_LEARN_VERSION="0.20.4" MATPLOTLIB_VERSION="*" COVERAGE="false" - JOBLIB_VERSION="0.12" - - name: "Python 3.7 - scikit 0.21.3" - python: "3.7" - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="*" SCIPY_VERSION="*" PYAML_VERSION="*" - SCIKIT_LEARN_VERSION="0.21.3" MATPLOTLIB_VERSION="*" COVERAGE="true" - JOBLIB_VERSION="*" - - name: "Python 3.7 - scikit 0.22.1" + env: DISTRIB="conda" PYTHON_VERSION="3.7" COVERAGE="false" + NUMPY_VERSION="1.19.1" SCIPY_VERSION="1.5.2" PYAML_VERSION="20.4.0" + SCIKIT_LEARN_VERSION="0.24.2" JOBLIB_VERSION="0.16.0" + - name: "Python 3.7 - sdist check" python: "3.7" env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="*" SCIPY_VERSION="*" PYAML_VERSION="*" - SCIKIT_LEARN_VERSION="0.22.1" MATPLOTLIB_VERSION="*" COVERAGE="true" - JOBLIB_VERSION="*" - - name: "Python 3.8 latest package versions" - python: "3.7" - env: DISTRIB="conda" PYTHON_VERSION="3.8" COVERAGE="false" NUMPY_VERSION="*" SCIPY_VERSION="*" PYAML_VERSION="*" - SCIKIT_LEARN_VERSION="*" JOBLIB_VERSION="*" + SCIKIT_LEARN_VERSION="*" MATPLOTLIB_VERSION="*" COVERAGE="false" + JOBLIB_VERSION="*" SDIST="true" install: source build_tools/travis/install.sh - script: - - if [ ${COVERAGE} == "true" ]; - then pytest --cov=skopt --durations=10; else - pytest --durations=10; - fi - -after_success: - - if [ ${COVERAGE} == "true" ]; then - pip install codecov; - codecov; - fi - + - bash build_tools/travis/test_script.sh + - bash build_tools/travis/test_docs.sh +after_success: source build_tools/travis/after_success.sh deploy: provider: pypi user: __token__ @@ -73,7 +43,7 @@ deploy: on: tags: true repo: scikit-optimize/scikit-optimize - condition: "$PYTHON_VERSION = 3.6" + condition: "$PYTHON_VERSION = 3.7" skip_cleanup: true skip_existing: true password: diff --git a/.zenodo.json b/.zenodo.json index 7c95c0c3e..71ed3e4b0 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -4,7 +4,8 @@ "creators": [ { "affiliation": "Wild 
Tree Tech", - "name": "Head, Tim" + "name": "Head, Tim", + "orcid": "0000-0003-0931-3698" }, { "affiliation": "Google Brain", @@ -16,103 +17,12 @@ }, { "affiliation": "ULi\u00e8ge", - "name": "Louppe, Gilles" + "name": "Louppe, Gilles", + "orcid": "0000-0002-2082-3106" }, { "affiliation": "Saarland University", "name": "Shcherbatyi, Iaroslav" - }, - { - "name": "fcharras" - }, - { - "name": "Z\u00e9 Vin\u00edcius" - }, - { - "name": "cmmalone" - }, - { - "name": "Christopher Schr\u00f6der" - }, - { - "name": "nel215" - }, - { - "affiliation": "@yldio", - "name": "Nuno Campos" - }, - { - "name": "Todd Young" - }, - { - "affiliation": "Politecnico di Milano", - "name": "Stefano Cereda" - }, - { - "name": "Thomas Fan" - }, - { - "name": "rene-rex" - }, - { - "affiliation": "Columbia University", - "name": "Kejia (KJ) Shi" - }, - { - "affiliation": "Biomedical Informatics Department, Emory School of Medicine", - "name": "Justus Schwabedal" - }, - { - "name": "carlosdanielcsantos" - }, - { - "affiliation": "Hvass Laboratories", - "name": "Hvass-Labs" - }, - { - "affiliation": "Technical University of Munich", - "name": "Mikhail Pak" - }, - { - "name": "SoManyUsernamesTaken" - }, - { - "affiliation": "UC Berkeley", - "name": "Fred Callaway" - }, - { - "name": "Lo\u00efc Est\u00e8ve" - }, - { - "affiliation": "ENS de Cachan - Paris Saclay University", - "name": "Lilian Besson" - }, - { - "name": "Mehdi Cherti" - }, - { - "affiliation": "Paderborn University", - "name": "Karlson Pfannschmidt" - }, - { - "affiliation": "Toptal", - "name": "Fabian Linzberger" - }, - { - "affiliation": "@point8", - "name": "Christophe Cauet" - }, - { - "affiliation": "10clouds", - "name": "Anna Gut" - }, - { - "affiliation": "Columbia University Data Science Institute", - "name": "Andreas Mueller" - }, - { - "affiliation": "DFKI", - "name": "Alexander Fabisch" } ], "keywords": [ @@ -124,6 +34,13 @@ "hyperparameter", "bayesian-optimization" ], - "license": "bsd-license", + "license": "bsd-3-clause", + "related_identifiers": [ + { + "identifier": "https://scikit-optimize.github.io", + "relation": "documents", + "scheme": "url" + } + ], "upload_type": "software" } \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 0caf27d91..f9d8b4e29 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,178 +1,5 @@ # Release history -## Version 0.7.1 - -### New features - -* Sphinx documentation -* notebooks are replaced by sphinx-gallery -* New StringEncoder, can be used in Categoricals -* Remove string conversion in Identity -* dtype can be set in Integer and Real - -### Bug fixes - -* Fix categorical space (issue #821) -* int can be set as dtype to fix issue #790 - -### Maintenance - -* Old pdoc scripts are removed and replaced by sphinx - -## Version 0.7 - -### New features - -* Models queue has now a customizable size (model_queue_size). -* Add log-uniform prior to Integer space -* Support for plotting categorical dimensions - -### Bug fixes - -* Allow BayesSearchCV to work with sklearn 0.21 -* Reduce the amount of deprecation warnings in unit tests - -### Maintenance - -* joblib instead of sklearn.externals.joblib -* Improve travis CI unit tests (Different sklearn version are checked) -* Added `versioneer` support, to keep things simple and to fix pypi deploy - -## Version 0.6 - -Highly composite six. - -### New features - -* `plot_regret` function for plotting the cumulative regret; -The purpose of such plot is to access how much an optimizer -is effective at picking good points. 
-* `CheckpointSaver` that can be used to save a -checkpoint after each iteration with skopt.dump -* `Space.from_yaml()` - to allow for external file to define Space parameters - -### Bug fixes - -* Fixed numpy broadcasting issues in gaussian_ei, gaussian_pi -* Fixed build with newest scikit-learn -* Use native python types inside BayesSearchCV -* Include fit_params in BayesSearchCV refit - -### Maintenance - -* Added `versioneer` support, to reduce changes with new version of the `skopt` - -## Version 0.5.2 - -### Bug fixes - -* Separated `n_points` from `n_jobs` in `BayesSearchCV`. -* Dimensions now support boolean np.arrays. - -### Maintenance - -* `matplotlib` is now an optional requirement (install with `pip install 'scikit-optimize[plots]'`) - -## Version 0.5 - -High five! - -### New features - -* Single element dimension definition, which can be used to -fix the value of a dimension during optimization. -* `total_iterations` property of `BayesSearchCV` that -counts total iterations needed to explore all subspaces. -* Add iteration event handler for `BayesSearchCV`, useful -for early stopping inside `BayesSearchCV` search loop. -* added `utils.use_named_args` decorator to help with unpacking named dimensions -when calling an objective function. - -### Bug fixes - -* Removed redundant estimator fitting inside `BayesSearchCV`. -* Fixed the log10 transform for Real dimensions that would lead to values being - out of bounds. - -## Version 0.4 - -Go forth! - -### New features - -* Support early stopping of optimization loop. -* Benchmarking scripts to evaluate performance of different surrogate models. -* Support for parallel evaluations of the objective function via several - constant liar stategies. -* BayesSearchCV as a drop in replacement for scikit-learn's GridSearchCV. -* New acquisition functions "EIps" and "PIps" that takes into account - function compute time. - -### Bug fixes - -* Fixed inference of dimensions of type Real. - -### API changes - -* Change interface of GradientBoostingQuantileRegressor's predict method to - match return type of other regressors -* Dimensions of type Real are now inclusive of upper bound. - - -## Version 0.3 - -Third time's a charm. - -### New features - -* Accuracy improvements of the optimization of the acquisition function -by pre-selecting good candidates as starting points when -using `acq_optimizer='lbfgs'`. -* Support a ask-and-tell interface. Check out the `Optimizer` class if you need -fine grained control over the iterations. -* Parallelize L-BFGS minimization runs over the acquisition function. -* Implement weighted hamming distance kernel for problems with only categorical dimensions. -* New acquisition function `gp_hedge` that probabilistically chooses one of `EI`, `PI` -or `LCB` at every iteration depending upon the cumulative gain. - -### Bug fixes -* Warnings are now raised if a point is chosen as the candidate optimum multiple -times. -* Infinite gradients that were raised in the kernel gradient computation are -now fixed. -* Integer dimensions are now normalized to [0, 1] internally in `gp_minimize`. - -### API Changes. -* The default `acq_optimizer` function has changed from `"auto"` to `"lbfgs"` -in `gp_minimize`. - - -## Version 0.2 - -### New features - -* Speed improvements when using `gp_minimize` with `acq_optimizer='lbfgs'` and -`acq_optimizer='auto'` when all the search-space dimensions are Real. -* Persistence of minimization results using `skopt.dump` and `skopt.load`. 
-* Support for using arbitrary estimators that implement a -`return_std` argument in their `predict` method by means of `base_minimize` from `skopt.optimizer.` -* Support for tuning noise in `gp_minimize` using the `noise` argument. -* `TimerCallback` in `skopt.callbacks` to log the time between iterations of -the minimization loop. - - -## Version 0.1 - -First light! - -### New features - -* Bayesian optimization via `gp_minimize`. -* Tree-based sequential model-based optimization via `forest_minimize` and `gbrt_minimize`, with support for multi-threading. -* Support of LCB, EI and PI as acquisition functions. -* Plotting functions for inspecting convergence, evaluations and the objective function. -* API for specifying and sampling from a parameter space. - +See https://scikit-optimize.github.io/dev/whats_new.html # Contributors diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..0dbeb1883 --- /dev/null +++ b/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2016-2020 The scikit-optimize developers. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/LICENSE.md b/LICENSE.md deleted file mode 100644 index cf5772813..000000000 --- a/LICENSE.md +++ /dev/null @@ -1,32 +0,0 @@ -New BSD License - -Copyright (c) 2016-2020 The scikit-optimize developers. - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - a. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - b. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - c. Neither the name of the scikit-optimize developers nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written - permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH -DAMAGE. diff --git a/MANIFEST.in b/MANIFEST.in index 95de3d03e..f0036cd4a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,4 +2,6 @@ include *.md include *.rst recursive-include doc * recursive-include examples * -include README.rst \ No newline at end of file +include LICENSE +include README.rst +include pyproject.toml \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..13d6b3799 --- /dev/null +++ b/Makefile @@ -0,0 +1,67 @@ +# simple makefile to simplify repetitive build env management tasks under posix + +# caution: testing won't work on windows, see README + +PYTHON ?= python +CYTHON ?= cython +PYTEST ?= pytest +CTAGS ?= ctags + +# skip doctests on 32bit python +BITS := $(shell python -c 'import struct; print(8 * struct.calcsize("P"))') + +all: clean inplace test + +clean-ctags: + rm -f tags + +clean: clean-ctags + $(PYTHON) setup.py clean + rm -rf dist + # TODO: Remove in when all modules are removed. + $(PYTHON) sklearn/_build_utils/deprecated_modules.py + +in: inplace # just a shortcut +inplace: + $(PYTHON) setup.py build_ext -i + +test-code: in + $(PYTEST) --showlocals -v skopt --durations=20 +test-sphinxext: + $(PYTEST) --showlocals -v doc/sphinxext/ +test-doc: +ifeq ($(BITS),64) + $(PYTEST) $(shell find doc -name '*.rst' | sort) +endif +test-code-parallel: in + $(PYTEST) -n auto --showlocals -v skopt --durations=20 + +test-coverage: + rm -rf coverage .coverage + $(PYTEST) skopt --showlocals -v --cov=sklearn --cov-report=html:coverage +test-coverage-parallel: + rm -rf coverage .coverage .coverage.* + $(PYTEST) skopt -n auto --showlocals -v --cov=sklearn --cov-report=html:coverage + +test: test-code test-sphinxext test-doc + +trailing-spaces: + find skopt -name "*.py" -exec perl -pi -e 's/[ \t]*$$//' {} \; + +ctags: + # make tags for symbol based navigation in emacs and vim + # Install with: sudo apt-get install exuberant-ctags + $(CTAGS) --python-kinds=-i -R skopt + +doc: inplace + $(MAKE) -C doc html + +doc-noplot: inplace + $(MAKE) -C doc html-noplot + +code-analysis: + flake8 sklearn | grep -v __init__ | grep -v external + pylint -E -i y skopt/ -d E1103,E0611,E1101 + +flake8-diff: + ./build_tools/circle/linting.sh diff --git a/README.rst b/README.rst index e6bbe971f..9ba472283 100644 --- a/README.rst +++ b/README.rst @@ -29,8 +29,7 @@ Important links - Static documentation - `Static documentation `__ -- Example notebooks - can be found in the - `examples directory `_. +- Example notebooks - can be found in examples_. 
- Issue tracker - https://github.com/scikit-optimize/scikit-optimize/issues - Releases - https://pypi.python.org/pypi/scikit-optimize @@ -38,8 +37,16 @@ Important links Install ------- -The latest released version of scikit-optimize is v0.7.1, which you can install -with: +scikit-optimize requires + +* Python >= 3.6 +* NumPy (>= 1.13.3) +* SciPy (>= 0.19.1) +* joblib (>= 0.11) +* scikit-learn >= 0.20 +* matplotlib >= 2.0.0 + +You can install the latest release with: :: pip install scikit-optimize @@ -97,9 +104,8 @@ class: Read our `introduction to bayesian -optimization `__ -and the other -`examples `__. +optimization `__ +and the other examples_. Development @@ -107,7 +113,7 @@ Development The library is still experimental and under heavy development. Checkout the `next -milestone `__ +milestone `__ for the plans for the next release or look at some `easy issues `__ to get started contributing. @@ -139,7 +145,7 @@ create a new issue and work through the following checklist: * update the version tag in ``__init__.py`` * update the version tag mentioned in the README * check if the dependencies in ``setup.py`` are valid or need unpinning -* check that the ``CHANGELOG.md`` is up to date +* check that the ``doc/whats_new/v0.X.rst`` is up to date * did the last build of master succeed? * create a `new release `__ * ping `conda-forge `__ @@ -198,3 +204,4 @@ recognition, feel free to add them to the "Made possible by" list. :target: https://gitter.im/scikit-optimize/Lobby .. |Zenodo DOI| image:: https://zenodo.org/badge/54340642.svg :target: https://zenodo.org/badge/latestdoi/54340642 +.. _examples: https://scikit-optimize.github.io/stable/auto_examples/index.html diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh new file mode 100755 index 000000000..e3e57484f --- /dev/null +++ b/build_tools/circle/build_doc.sh @@ -0,0 +1,260 @@ +#!/usr/bin/env bash +set -x +set -e +# Copied from https://github.com/scikit-learn/scikit-learn/blob/master/build_tools/circle/build_doc.sh +# The scikit-learn developers. +# License: BSD-style +# +# Decide what kind of documentation build to run, and run it. +# +# If the last commit message has a "[doc skip]" marker, do not build +# the doc. On the contrary if a "[doc build]" marker is found, build the doc +# instead of relying on the subsequent rules. +# +# We always build the documentation for jobs that are not related to a specific +# PR (e.g. a merge to master or a maintenance branch). +# +# If this is a PR, do a full build if there are some files in this PR that are +# under the "doc/" or "examples/" folders, otherwise perform a quick build. +# +# If the inspection of the current commit fails for any reason, the default +# behavior is to quick build the documentation. 
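#
# For illustration only (hypothetical commit messages, not part of the build
# rules themselves):
#   "MNT tweak CI config [doc skip]"        -> get_build_type() below reports SKIP and the job exits before building
#   "DOC rework plotting guide [doc build]" -> a full documentation build is forced, even on a pull request
#   a PR touching only skopt/*.py           -> quick build; no examples are executed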
+ +get_build_type() { + if [ -z "$CIRCLE_SHA1" ] + then + echo SKIP: undefined CIRCLE_SHA1 + return + fi + commit_msg=$(git log --format=%B -n 1 $CIRCLE_SHA1) + if [ -z "$commit_msg" ] + then + echo QUICK BUILD: failed to inspect commit $CIRCLE_SHA1 + return + fi + if [[ "$commit_msg" =~ \[doc\ skip\] ]] + then + echo SKIP: [doc skip] marker found + return + fi + if [[ "$commit_msg" =~ \[doc\ quick\] ]] + then + echo QUICK: [doc quick] marker found + return + fi + if [[ "$commit_msg" =~ \[doc\ build\] ]] + then + echo BUILD: [doc build] marker found + return + fi + if [ -z "$CI_PULL_REQUEST" ] + then + echo BUILD: not a pull request + return + fi + git_range="origin/master...$CIRCLE_SHA1" + git fetch origin master >&2 || (echo QUICK BUILD: failed to get changed filenames for $git_range; return) + filenames=$(git diff --name-only $git_range) + if [ -z "$filenames" ] + then + echo QUICK BUILD: no changed filenames for $git_range + return + fi + changed_examples=$(echo "$filenames" | grep -E "^examples/(.*/)*plot_") + + # The following is used to extract the list of filenames of example python + # files that sphinx-gallery needs to run to generate png files used as + # figures or images in the .rst files from the documentation. + # If the contributor changes a .rst file in a PR we need to run all + # the examples mentioned in that file to get sphinx build the + # documentation without generating spurious warnings related to missing + # png files. + + if [[ -n "$filenames" ]] + then + # get rst files + rst_files="$(echo "$filenames" | grep -E "rst$")" + + # get lines with figure or images + img_fig_lines="$(echo "$rst_files" | xargs grep -shE "(figure|image)::")" + + # get only auto_examples + auto_example_files="$(echo "$img_fig_lines" | grep auto_examples | awk -F "/" '{print $NF}')" + + # remove "sphx_glr_" from path and accept replace _(\d\d\d|thumb).png with .py + scripts_names="$(echo "$auto_example_files" | sed 's/sphx_glr_//' | sed -E 's/_([[:digit:]][[:digit:]][[:digit:]]|thumb).png/.py/')" + + # get unique values + examples_in_rst="$(echo "$scripts_names" | uniq )" + fi + + # executed only if there are examples in the modified rst files + if [[ -n "$examples_in_rst" ]] + then + if [[ -n "$changed_examples" ]] + then + changed_examples="$changed_examples|$examples_in_rst" + else + changed_examples="$examples_in_rst" + fi + fi + + if [[ -n "$changed_examples" ]] + then + echo BUILD: detected examples/ filename modified in $git_range: $changed_examples + pattern=$(echo "$changed_examples" | paste -sd '|') + # pattern for examples to run is the last line of output + echo "$pattern" + return + fi + echo QUICK BUILD: no examples/ filename modified in $git_range: + echo "$filenames" +} + +build_type=$(get_build_type) +if [[ "$build_type" =~ ^SKIP ]] +then + exit 0 +fi + +if [[ "$CIRCLE_BRANCH" =~ ^master$|^[0-9]+\.[0-9]+\.X$ && -z "$CI_PULL_REQUEST" ]] +then + # PDF linked into HTML + make_args="dist LATEXMKOPTS=-halt-on-error" +elif [[ "$build_type" =~ ^'BUILD: detected examples' ]] +then + # pattern for examples to run is the last line of output + pattern=$(echo "$build_type" | tail -n 1) + make_args="html EXAMPLES_PATTERN=$pattern" +else + make_args=html +fi + +make_args="SPHINXOPTS=-T $make_args" # show full traceback on exception + +# Installing required system packages to support the rendering of math +# notation in the HTML documentation +sudo -E apt-get -yq update +sudo -E apt-get -yq remove texlive-binaries --purge +sudo -E apt-get -yq --no-install-suggests --no-install-recommends 
\ + install dvipng texlive-latex-base texlive-latex-extra \ + texlive-latex-recommended texlive-fonts-recommended \ + latexmk tex-gyre gsfonts ccache + +# deactivate circleci virtualenv and setup a miniconda env instead +if [[ `type -t deactivate` ]]; then + deactivate +fi + +# Install dependencies with miniconda +wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ + -O miniconda.sh +chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH +export PATH="/usr/lib/ccache:$MINICONDA_PATH/bin:$PATH" + +ccache -M 512M +export CCACHE_COMPRESS=1 + +# Old packages coming from the 'free' conda channel have been removed but we +# are using them for our min-dependencies doc generation. See +# https://www.anaconda.com/why-we-removed-the-free-channel-in-conda-4-7/ for +# more details. +if [[ "$CIRCLE_JOB" == "doc-min-dependencies" ]]; then + conda config --set restore_free_channel true +fi + +# packaging won't be needed once setuptools starts shipping packaging>=17.0 +conda create -n $CONDA_ENV_NAME --yes --quiet python="${PYTHON_VERSION:-*}" \ + numpy scipy \ + cython pytest coverage \ + matplotlib sphinx pillow \ + scikit-image pandas \ + joblib memory_profiler packaging + +export MPLBACKEND="agg" +source activate testenv +pip install sphinx-gallery +pip install numpydoc + +# Build and install scikit-optimize in dev mode +python setup.py build_ext --inplace -j 3 +python setup.py develop + +export OMP_NUM_THREADS=1 + +if [[ "$CIRCLE_BRANCH" =~ ^master$ && -z "$CI_PULL_REQUEST" ]] +then + # List available documentation versions if on master + python build_tools/circle/list_versions.py > doc/versions.rst +fi + +# Install this noise maker on CircleCI to prevent +# "Too long with no output (exceeded 10m0s): context deadline exceeded" +while true; do sleep $((60 * 5)); echo -e '\nStill working ...\n'; done & +noise_maker=$! + +# The pipefail is requested to propagate exit code +set -o pipefail && cd doc && make $make_args 2>&1 | tee ~/log.txt + +kill $noise_maker + +# Insert the version warning for deployment +find _build/html/stable -name "*.html" | xargs sed -i '/<\/body>/ i \ +\ ' + +cd - +set +o pipefail + +affected_doc_paths() { + files=$(git diff --name-only origin/master...$CIRCLE_SHA1) + echo "$files" | grep ^doc/.*\.rst | sed 's/^doc\/\(.*\)\.rst$/\1.html/' + echo "$files" | grep ^examples/.*.py | sed 's/^\(.*\)\.py$/auto_\1.html/' + skopt_files=$(echo "$files" | grep '^skopt/') + if [ -n "$skopt_files" ] + then + grep -hlR -f<(echo "$skopt_files" | sed 's/^/scikit-optimize\/blob\/[a-z0-9]*\//') doc/_build/html/stable/modules/generated | cut -d/ -f5- + fi +} + +affected_doc_warnings() { + files=$(git diff --name-only origin/master...$CIRCLE_SHA1) + # Look for sphinx warnings only in files affected by the PR + if [ -n "$files" ] + then + for af in ${files[@]} + do + warn+=`grep WARNING ~/log.txt | grep $af` + done + fi + echo "$warn" +} + +if [ -n "$CI_PULL_REQUEST" ] +then + echo "The following documentation warnings may have been generated by PR #$CI_PULL_REQUEST:" + warnings=$(affected_doc_warnings) + if [ -z "$warnings" ] + then + warnings="/home/circleci/project/ no warnings" + fi + echo "$warnings" + + echo "The following documentation files may have been changed by PR #$CI_PULL_REQUEST:" + affected=$(affected_doc_paths) + echo "$affected" + ( + echo '
    ' + echo "$affected" | sed 's|.*|
  • & [dev, stable]
  • |' + echo '

General: Home | API Reference | Examples

' + echo 'Sphinx Warnings in affected files
    ' + echo "$warnings" | sed 's/\/home\/circleci\/project\//
  • /g' + echo '
' + ) > 'doc/_build/html/stable/_changed.html' + + if [ "$warnings" != "/home/circleci/project/ no warnings" ] + then + echo "Sphinx generated warnings when building the documentation related to files modified in this PR." + echo "Please check doc/_build/html/stable/_changed.html" + exit 1 + fi +fi diff --git a/build_tools/circle/build_test_pypy.sh b/build_tools/circle/build_test_pypy.sh new file mode 100755 index 000000000..c858d9add --- /dev/null +++ b/build_tools/circle/build_test_pypy.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +set -x +set -e +# Copied from https://github.com/scikit-learn/scikit-learn/blob/master/build_tools/circle/build_test_pypy.sh +# The scikit-learn developers. +# License: BSD-style +apt-get -yq update +apt-get -yq install libatlas-base-dev liblapack-dev gfortran ccache libopenblas-dev + +pip install virtualenv + +if command -v pypy3; then + virtualenv -p $(command -v pypy3) pypy-env +elif command -v pypy; then + virtualenv -p $(command -v pypy) pypy-env +fi + +source pypy-env/bin/activate + +python --version +which python + +pip install -U pip + +# pins versions to install wheel from https://antocuni.github.io/pypy-wheels/manylinux2010 +pip install --extra-index-url https://antocuni.github.io/pypy-wheels/manylinux2010 numpy==1.18.0 scipy==1.3.2 + +# Install Cython directly +pip install https://antocuni.github.io/pypy-wheels/ubuntu/Cython/Cython-0.29.14-py3-none-any.whl +pip install sphinx numpydoc docutils joblib pillow pytest matplotlib + +ccache -M 512M +export CCACHE_COMPRESS=1 +export PATH=/usr/lib/ccache:$PATH +export LOKY_MAX_CPU_COUNT="2" +export OMP_NUM_THREADS="1" + +python setup.py build_ext --inplace -j 3 +pip install --no-build-isolation -e . + +# Check that Python implementation is PyPy +python - << EOL +import platform +from skopt import IS_PYPY +assert IS_PYPY is True, "platform={}!=PyPy".format(platform.python_implementation()) +EOL + +python -m pytest skopt/ +python -m pytest doc/sphinxext/ +python -m pytest $(find doc -name '*.rst' | sort) \ No newline at end of file diff --git a/build_tools/circle/checkout_merge_commit.sh b/build_tools/circle/checkout_merge_commit.sh new file mode 100755 index 000000000..73947081b --- /dev/null +++ b/build_tools/circle/checkout_merge_commit.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copied from https://github.com/scikit-learn/scikit-learn/blob/master/build_tools/circle/checkout_merge_commit.sh +# The scikit-learn developers. +# License: BSD-style + +# Add `master` branch to the update list. +# Otherwise CircleCI will give us a cached one. +FETCH_REFS="+master:master" + +# Update PR refs for testing. +if [[ -n "${CIRCLE_PR_NUMBER}" ]] +then + FETCH_REFS="${FETCH_REFS} +refs/pull/${CIRCLE_PR_NUMBER}/head:pr/${CIRCLE_PR_NUMBER}/head" + FETCH_REFS="${FETCH_REFS} +refs/pull/${CIRCLE_PR_NUMBER}/merge:pr/${CIRCLE_PR_NUMBER}/merge" +fi + +# Retrieve the refs. +git fetch -u origin ${FETCH_REFS} + +# Checkout the PR merge ref. +if [[ -n "${CIRCLE_PR_NUMBER}" ]] +then + git checkout -qf "pr/${CIRCLE_PR_NUMBER}/merge" || ( + echo Could not fetch merge commit. >&2 + echo There may be conflicts in merging PR \#${CIRCLE_PR_NUMBER} with master. >&2; + exit 1) +fi + +# Check for merge conflicts. 
+if [[ -n "${CIRCLE_PR_NUMBER}" ]] +then + git branch --merged | grep master > /dev/null + git branch --merged | grep "pr/${CIRCLE_PR_NUMBER}/head" > /dev/null +fi \ No newline at end of file diff --git a/build_tools/circle/deploy.sh b/build_tools/circle/deploy.sh deleted file mode 100644 index 824f7ba23..000000000 --- a/build_tools/circle/deploy.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Almost copied verbatim from https://github.com/scikit-learn/scikit-learn/blob/master/build_tools/circle/push_doc.sh -export SKOPT_HOME=$(pwd) - -if [ -z $CIRCLE_PROJECT_USERNAME ]; -then USERNAME="skoptci"; -else USERNAME=$CIRCLE_PROJECT_USERNAME; -fi - -MSG="Pushing the docs for revision for branch: $CIRCLE_BRANCH, commit $CIRCLE_SHA1" - -# Copying to github pages -echo "Copying built files" -git clone -b master "git@github.com:scikit-optimize/scikit-optimize.github.io" deploy -cd deploy -git rm -r space -git rm -r optimizer -git rm -r learning -cd .. -for entry in ${HOME}/doc/skopt/* -do - echo "$entry" -done - -cp -r ${HOME}/doc/skopt/* deploy -# Move into deployment directory -cd deploy - -# Commit changes, allowing empty changes (when unchanged) -echo "Committing and pushing to Github" -echo "$USERNAME" -git config --global user.name $USERNAME -git config --global user.email "skoptci@gmail.com" -git config --global push.default matching -git add -A -git commit --allow-empty -m "$MSG" -git push - -echo "$MSG" diff --git a/build_tools/circle/execute.sh b/build_tools/circle/execute.sh deleted file mode 100644 index 28006266a..000000000 --- a/build_tools/circle/execute.sh +++ /dev/null @@ -1,19 +0,0 @@ -source activate testenv -export SKOPT_HOME=$(pwd) - -python --version -python -c "import numpy; print('numpy %s' % numpy.__version__)" -python -c "import scipy; print('scipy %s' % scipy.__version__)" - - -mkdir -p ${HOME}/doc/skopt - -cd ~ -cd ${SKOPT_HOME}/doc && sphinx-build -M html ${SKOPT_HOME}/doc ${SKOPT_HOME}/doc/_build # -W --keep-going - -for entry in ${SKOPT_HOME}/doc/_build/* -do - echo "$entry" -done - -cp -r ${SKOPT_HOME}/doc/_build/html/* ${HOME}/doc/skopt diff --git a/build_tools/circle/install.sh b/build_tools/circle/install.sh deleted file mode 100644 index f0a67d38c..000000000 --- a/build_tools/circle/install.sh +++ /dev/null @@ -1,12 +0,0 @@ -conda update -n base conda -conda create -n testenv --yes python pip pytest nose -source activate testenv - -python -m pip install -e '.[plots]' -export SKOPT_HOME=$(pwd) - -python -m pip install sphinx sphinx-gallery numpydoc memory_profiler - -# importing matplotlib once builds the font caches. This avoids -# having warnings in our example notebooks -python -c "import matplotlib.pyplot as plt" diff --git a/build_tools/circle/linting.sh b/build_tools/circle/linting.sh new file mode 100755 index 000000000..bdeeaa0cd --- /dev/null +++ b/build_tools/circle/linting.sh @@ -0,0 +1,159 @@ +#!/bin/bash +# Copied from https://github.com/scikit-learn/scikit-learn/blob/master/build_tools/circle/linting.sh +# The scikit-learn developers. +# License: BSD-style +# +# This script is used in CircleCI to check that PRs do not add obvious +# flake8 violations. It relies on two things: +# - find common ancestor between branch and +# scikit-optimize/scikit-optimize remote +# - run flake8 --diff on the diff between the branch and the common +# ancestor +# +# Additional features: +# - the line numbers in Travis match the local branch on the PR +# author machine. 
+# - ./build_tools/circle/flake8_diff.sh can be run locally for quick +# turn-around + +set -e +# pipefail is necessary to propagate exit codes +set -o pipefail + +PROJECT=scikit-optimize/scikit-optimize +PROJECT_URL=https://github.com/$PROJECT.git + +# Find the remote with the project name (upstream in most cases) +REMOTE=$(git remote -v | grep $PROJECT | cut -f1 | head -1 || echo '') + +# Add a temporary remote if needed. For example this is necessary when +# Travis is configured to run in a fork. In this case 'origin' is the +# fork and not the reference repo we want to diff against. +if [[ -z "$REMOTE" ]]; then + TMP_REMOTE=tmp_reference_upstream + REMOTE=$TMP_REMOTE + git remote add $REMOTE $PROJECT_URL +fi + +echo "Remotes:" +echo '--------------------------------------------------------------------------------' +git remote --verbose + +# Travis does the git clone with a limited depth (50 at the time of +# writing). This may not be enough to find the common ancestor with +# $REMOTE/master so we unshallow the git checkout +if [[ -a .git/shallow ]]; then + echo -e '\nTrying to unshallow the repo:' + echo '--------------------------------------------------------------------------------' + git fetch --unshallow +fi + +if [[ "$TRAVIS" == "true" ]]; then + if [[ "$TRAVIS_PULL_REQUEST" == "false" ]] + then + # In main repo, using TRAVIS_COMMIT_RANGE to test the commits + # that were pushed into a branch + if [[ "$PROJECT" == "$TRAVIS_REPO_SLUG" ]]; then + if [[ -z "$TRAVIS_COMMIT_RANGE" ]]; then + echo "New branch, no commit range from Travis so passing this test by convention" + exit 0 + fi + COMMIT_RANGE=$TRAVIS_COMMIT_RANGE + fi + else + # We want to fetch the code as it is in the PR branch and not + # the result of the merge into master. This way line numbers + # reported by Travis will match with the local code. 
+ LOCAL_BRANCH_REF=travis_pr_$TRAVIS_PULL_REQUEST + # In Travis the PR target is always origin + git fetch origin pull/$TRAVIS_PULL_REQUEST/head:refs/$LOCAL_BRANCH_REF + fi +fi + +# If not using the commit range from Travis we need to find the common +# ancestor between $LOCAL_BRANCH_REF and $REMOTE/master +if [[ -z "$COMMIT_RANGE" ]]; then + if [[ -z "$LOCAL_BRANCH_REF" ]]; then + LOCAL_BRANCH_REF=$(git rev-parse --abbrev-ref HEAD) + fi + echo -e "\nLast 2 commits in $LOCAL_BRANCH_REF:" + echo '--------------------------------------------------------------------------------' + git --no-pager log -2 $LOCAL_BRANCH_REF + + REMOTE_MASTER_REF="$REMOTE/master" + # Make sure that $REMOTE_MASTER_REF is a valid reference + echo -e "\nFetching $REMOTE_MASTER_REF" + echo '--------------------------------------------------------------------------------' + git fetch $REMOTE master:refs/remotes/$REMOTE_MASTER_REF + LOCAL_BRANCH_SHORT_HASH=$(git rev-parse --short $LOCAL_BRANCH_REF) + REMOTE_MASTER_SHORT_HASH=$(git rev-parse --short $REMOTE_MASTER_REF) + + COMMIT=$(git merge-base $LOCAL_BRANCH_REF $REMOTE_MASTER_REF) || \ + echo "No common ancestor found for $(git show $LOCAL_BRANCH_REF -q) and $(git show $REMOTE_MASTER_REF -q)" + + if [ -z "$COMMIT" ]; then + exit 1 + fi + + COMMIT_SHORT_HASH=$(git rev-parse --short $COMMIT) + + echo -e "\nCommon ancestor between $LOCAL_BRANCH_REF ($LOCAL_BRANCH_SHORT_HASH)"\ + "and $REMOTE_MASTER_REF ($REMOTE_MASTER_SHORT_HASH) is $COMMIT_SHORT_HASH:" + echo '--------------------------------------------------------------------------------' + git --no-pager show --no-patch $COMMIT_SHORT_HASH + + COMMIT_RANGE="$COMMIT_SHORT_HASH..$LOCAL_BRANCH_SHORT_HASH" + + if [[ -n "$TMP_REMOTE" ]]; then + git remote remove $TMP_REMOTE + fi + +else + echo "Got the commit range from Travis: $COMMIT_RANGE" +fi + +echo -e '\nRunning flake8 on the diff in the range' "$COMMIT_RANGE" \ + "($(git rev-list $COMMIT_RANGE | wc -l) commit(s)):" +echo '--------------------------------------------------------------------------------' + +# We need the following command to exit with 0 hence the echo in case +# there is no match +MODIFIED_FILES="$(git diff --name-only $COMMIT_RANGE | \ + grep -v 'doc/sphinxext' || echo "no_match")" + +check_files() { + files="$1" + shift + options="$*" + if [ -n "$files" ]; then + # Conservative approach: diff without context (--unified=0) so that code + # that was not changed does not create failures + git diff --unified=0 $COMMIT_RANGE -- $files | flake8 --diff --show-source $options + fi +} + +if [[ "$MODIFIED_FILES" == "no_match" ]]; then + echo "No file outside doc/sphinxext has been modified" +else + + check_files "$(echo "$MODIFIED_FILES" | grep -v ^examples)" + check_files "$(echo "$MODIFIED_FILES" | grep ^examples)" \ + --config ./examples/.flake8 +fi +echo -e "No problem detected by flake8\n" + +# For docstrings and warnings of deprecated attributes to be rendered +# properly, the property decorator must come before the deprecated decorator +# (else they are treated as functions) + +# do not error when grep -B1 "@property" finds nothing +set +e +bad_deprecation_property_order=`git grep -A 10 "@property" -- "*.py" | awk '/@property/,/def /' | grep -B1 "@deprecated"` + +if [ ! 
-z "$bad_deprecation_property_order" ] +then + echo "property decorator should come before deprecated decorator" + echo "found the following occurrencies:" + echo $bad_deprecation_property_order + exit 1 +fi \ No newline at end of file diff --git a/build_tools/circle/list_versions.py b/build_tools/circle/list_versions.py new file mode 100644 index 000000000..b63db3cc2 --- /dev/null +++ b/build_tools/circle/list_versions.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +# Copied from https://github.com/scikit-learn/scikit-learn/blob/master/ +# build_tools/circle/list_versions.sh +# The scikit-learn developers. +# License: BSD-style +# List all available versions of the documentation +import json +import re +import sys + +from distutils.version import LooseVersion +from urllib.request import urlopen + + +def json_urlread(url): + try: + return json.loads(urlopen(url).read().decode('utf8')) + except Exception: + print('Error reading', url, file=sys.stderr) + raise + + +def human_readable_data_quantity(quantity, multiple=1024): + # https://stackoverflow.com/questions/1094841/ + # reusable-library-to-get-human-readable-version-of-file-size + if quantity == 0: + quantity = +0 + SUFFIXES = ["B"] + [i + {1000: "B", 1024: "iB"}[multiple] + for i in "KMGTPEZY"] + for suffix in SUFFIXES: + if quantity < multiple or suffix == SUFFIXES[-1]: + if suffix == SUFFIXES[0]: + return "%d %s" % (quantity, suffix) + else: + return "%.1f %s" % (quantity, suffix) + else: + quantity /= multiple + + +def get_pdf_size(version): + api_url = ROOT_URL + '%s/_downloads' % version + for path_details in json_urlread(api_url): + if path_details['name'] == 'scikit-optimize-docs.pdf': + return human_readable_data_quantity(path_details['size'], 1000) + + +print(':orphan:') +print() +heading = 'Available documentation for Scikit-optimize' +print(heading) +print('=' * len(heading)) +print() +print('Web-based documentation is available for versions listed below:') +print() + +ROOT_URL = 'https://api.github.com/repos/scikit-optimize/scikit-optimize.github.io/contents/' # noqa +RAW_FMT = 'https://raw.githubusercontent.com/scikit-optimize/scikit-optimize.github.io/master/%s/index.html' # noqa +VERSION_RE = re.compile(r"scikit-optimize ([\w\.\-]+) documentation") +NAMED_DIRS = ['dev', 'stable'] + +# Gather data for each version directory, including symlinks +dirs = {} +symlinks = {} +root_listing = json_urlread(ROOT_URL) +for path_details in root_listing: + name = path_details['name'] + if not (name[:1].isdigit() or name in NAMED_DIRS): + continue + if path_details['type'] == 'dir': + html = urlopen(RAW_FMT % name).read().decode('utf8') + version_num = VERSION_RE.search(html).group(1) + pdf_size = get_pdf_size(name) + dirs[name] = (version_num, pdf_size) + + if path_details['type'] == 'symlink': + symlinks[name] = json_urlread(path_details['_links']['self'])['target'] + + +# Symlinks should have same data as target +for src, dst in symlinks.items(): + if dst in dirs: + dirs[src] = dirs[dst] + +# Output in order: dev, stable, decreasing other version +seen = set() +for name in (NAMED_DIRS + + sorted((k for k in dirs if k[:1].isdigit()), + key=LooseVersion, reverse=True)): + if name not in dirs: + continue + version_num, pdf_size = dirs[name] + if version_num in seen: + # symlink came first + continue + else: + seen.add(version_num) + name_display = '' if name[:1].isdigit() else ' (%s)' % name + path = 'https://scikit-optimize.github.io/%s/' % name + out = ('* `Scikit-optimize %s%s documentation <%s>`_' + % (version_num, name_display, 
path)) + if pdf_size is not None: + out += (' (`PDF %s <%s/_downloads/scikit-optimize-docs.pdf>`_)' + % (pdf_size, path)) + print(out) diff --git a/build_tools/circle/push_doc.sh b/build_tools/circle/push_doc.sh new file mode 100755 index 000000000..1c8eae252 --- /dev/null +++ b/build_tools/circle/push_doc.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# Copied from https://github.com/scikit-learn/scikit-learn/blob/master/build_tools/circle/push_doc.sh +# The scikit-learn developers. +# License: BSD-style +# +# This script is meant to be called in the "deploy" step defined in +# circle.yml. See https://circleci.com/docs/ for more details. +# The behavior of the script is controlled by environment variable defined +# in the circle.yml in the top level folder of the project. + +set -ex + +if [ -z $CIRCLE_PROJECT_USERNAME ]; +then USERNAME="skoptci"; +else USERNAME=$CIRCLE_PROJECT_USERNAME; +fi + +DOC_REPO="scikit-optimize.github.io" +GENERATED_DOC_DIR=$1 + +if [[ -z "$GENERATED_DOC_DIR" ]]; then + echo "Need to pass directory of the generated doc as argument" + echo "Usage: $0 " + exit 1 +fi + +# Absolute path needed because we use cd further down in this script +GENERATED_DOC_DIR=$(readlink -f $GENERATED_DOC_DIR) + +if [ "$CIRCLE_BRANCH" = "master" ] +then + dir=dev +else + # Strip off .X + dir="${CIRCLE_BRANCH::-2}" +fi + +MSG="Pushing the docs to $dir/ for branch: $CIRCLE_BRANCH, commit $CIRCLE_SHA1" + +cd $HOME +if [ ! -d $DOC_REPO ]; +then git clone --depth 1 --no-checkout "git@github.com:scikit-optimize/"$DOC_REPO".git"; +fi +cd $DOC_REPO + +# check if it's a new branch + +echo $dir > .git/info/sparse-checkout +if ! git show HEAD:$dir >/dev/null +then + # directory does not exist. Need to make it so sparse checkout works + mkdir $dir + touch $dir/index.html + git add $dir +fi +git checkout master +git reset --hard origin/master +if [ -d $dir ] +then + git rm -rf $dir/ && rm -rf $dir/ +fi +cp -R $GENERATED_DOC_DIR $dir +git config user.email "skoptci@gmail.com" +git config user.name $USERNAME +git config push.default matching +git add -f $dir/ +git commit -m "$MSG" $dir +git push +echo $MSG \ No newline at end of file diff --git a/build_tools/travis/after_success.sh b/build_tools/travis/after_success.sh new file mode 100644 index 000000000..494f86b6e --- /dev/null +++ b/build_tools/travis/after_success.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# This script is meant to be called by the "after_success" step defined in +# .travis.yml. See https://docs.travis-ci.com/ for more details. + +# License: 3-clause BSD + +set -e + +if [[ "$COVERAGE" == "true" ]]; then + # Need to run codecov from a git checkout, so we copy .coverage + # from TEST_DIR where pytest has been run + cp $TEST_DIR/.coverage $TRAVIS_BUILD_DIR + + # Ignore codecov failures as the codecov server is not + # very reliable but we don't want travis to report a failure + # in the github UI just because the coverage report failed to + # be published. 
+ codecov --root $TRAVIS_BUILD_DIR || echo "codecov upload failed" +fi \ No newline at end of file diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 04cd1bf92..f6237c0a1 100644 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -64,7 +64,12 @@ if [[ "$COVERAGE" == "true" ]]; then pip install pytest-cov coverage coveralls fi -pip install -e '.[plots]' +if [[ "$SDIST" == "true" ]]; then + python setup.py sdist + pip install twine +else + pip install -e '.[plots]' +fi python --version python -c "import numpy; print('numpy %s' % numpy.__version__)" python -c "import scipy; print('scipy %s' % scipy.__version__)" diff --git a/build_tools/travis/test_docs.sh b/build_tools/travis/test_docs.sh new file mode 100644 index 000000000..3df03926f --- /dev/null +++ b/build_tools/travis/test_docs.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +set -e +set -x +if [[ "$SDIST" != "true" ]]; then + make test-doc +fi diff --git a/build_tools/travis/test_script.sh b/build_tools/travis/test_script.sh new file mode 100644 index 000000000..42eb17249 --- /dev/null +++ b/build_tools/travis/test_script.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# +# Copied from https://github.com/scikit-learn/scikit-learn/blob/master/build_tools/travis/test_script.sh +# The scikit-learn developers. +# License: BSD-style +# +# This script is meant to be called by the "script" step defined in +# .travis.yml. See https://docs.travis-ci.com/ for more details. +# The behavior of the script is controlled by environment variabled defined +# in the .travis.yml in the top level folder of the project. + +# License: 3-clause BSD + +set -e + +python --version +python -c "import numpy; print('numpy %s' % numpy.__version__)" +python -c "import scipy; print('scipy %s' % scipy.__version__)" +python -c "import sklearn; print('sklearn %s' % sklearn.__version__)" +python -c "\ +try: + import skopt + print('skopt %s' % skopt.__version__) +except ImportError: + pass +" +python -c "import multiprocessing as mp; print('%d CPUs' % mp.cpu_count())" + +run_tests() { + TEST_CMD="pytest --showlocals --durations=20 --pyargs" + + # Get into a temp directory to run test from the installed scikit-learn and + # check if we do not leave artifacts + mkdir -p $TEST_DIR + # We need the setup.cfg for the pytest settings + cp setup.cfg $TEST_DIR + cd $TEST_DIR + + # Skip tests that require large downloads over the network to save bandwidth + # usage as travis workers are stateless and therefore traditional local + # disk caching does not work. + export SKOPT_SKIP_NETWORK_TESTS=1 + + if [[ "$COVERAGE" == "true" ]]; then + TEST_CMD="$TEST_CMD --cov skopt" + fi + + if [[ -n "$CHECK_WARNINGS" ]]; then + TEST_CMD="$TEST_CMD -Werror::DeprecationWarning -Werror::FutureWarning" + fi + + set -x # print executed commands to the terminal + + $TEST_CMD skopt +} + +run_package_check() { + + TEST_CMD="twine check dist/*" + set -x + $TEST_CMD +} + +if [[ "$SDIST" == "true" ]]; then + run_package_check +else + run_tests +fi \ No newline at end of file diff --git a/conftest.py b/conftest.py new file mode 100644 index 000000000..f2a991049 --- /dev/null +++ b/conftest.py @@ -0,0 +1,83 @@ +# Even if empty this file is useful so that when running from the root folder +# ./sklearn is added to sys.path by pytest. See +# https://docs.pytest.org/en/latest/pythonpath.html for more details. 
For +# example, this allows to build extensions in place and run pytest +# doc/modules/clustering.rst and use sklearn from the local folder rather than +# the one from site-packages. + +import platform +import sys +from distutils.version import LooseVersion +import os + +import pytest +from _pytest.doctest import DoctestItem +from skopt import _IS_32BIT + + +PYTEST_MIN_VERSION = '3.3.0' + +if LooseVersion(pytest.__version__) < PYTEST_MIN_VERSION: + raise ImportError('Your version of pytest is too old, you should have ' + 'at least pytest >= {} installed.' + .format(PYTEST_MIN_VERSION)) + + +def pytest_addoption(parser): + parser.addoption("--skip-network", action="store_true", default=False, + help="skip network tests") + + +def pytest_collection_modifyitems(config, items): + # FeatureHasher is not compatible with PyPy + if platform.python_implementation() == 'PyPy': + skip_marker = pytest.mark.skip( + reason='FeatureHasher is not compatible with PyPy') + for item in items: + if item.name.endswith(('_hash.FeatureHasher', + 'text.HashingVectorizer')): + item.add_marker(skip_marker) + + # Skip tests which require internet if the flag is provided + if config.getoption("--skip-network"): + skip_network = pytest.mark.skip( + reason="test requires internet connectivity") + for item in items: + if "network" in item.keywords: + item.add_marker(skip_network) + + # numpy changed the str/repr formatting of numpy arrays in 1.14. We want to + # run doctests only for numpy >= 1.14. + skip_doctests = False + try: + import numpy as np + if LooseVersion(np.__version__) < LooseVersion('1.14'): + reason = 'doctests are only run for numpy >= 1.14' + skip_doctests = True + elif _IS_32BIT: + reason = ('doctest are only run when the default numpy int is ' + '64 bits.') + skip_doctests = True + elif sys.platform.startswith("win32"): + reason = ("doctests are not run for Windows because numpy arrays " + "repr is inconsistent across platforms.") + skip_doctests = True + except ImportError: + pass + + if skip_doctests: + skip_marker = pytest.mark.skip(reason=reason) + + for item in items: + if isinstance(item, DoctestItem): + item.add_marker(skip_marker) + + +def pytest_configure(config): + import sys + sys._is_pytest_session = True + + +def pytest_unconfigure(config): + import sys + del sys._is_pytest_session diff --git a/doc/Makefile b/doc/Makefile index a0e8bf588..73e661410 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -1,22 +1,110 @@ -# Minimal makefile for Sphinx documentation +# Makefile for Sphinx documentation # # You can set these variables from the command line. -SPHINXOPTS = -W --keep-going -SPHINXBUILD = sphinx-build -SPHINXPROJ = scikit-optimize -SOURCEDIR = source -BUILDDIR = build +SPHINXOPTS = -j auto +SPHINXBUILD ?= sphinx-build +PAPER = +BUILDDIR = _build +ifneq ($(EXAMPLES_PATTERN),) + EXAMPLES_PATTERN_OPTS := -D sphinx_gallery_conf.filename_pattern="$(EXAMPLES_PATTERN)" +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -T -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS)\ + $(EXAMPLES_PATTERN_OPTS) . + + +.PHONY: help clean html dirhtml pickle json latex latexpdf changes linkcheck doctest optipng + +all: html-noplot -# Put it first so that "make" without argument is like "make help". 
help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + -rm -rf auto_examples/ + -rm -rf generated/* + -rm -rf modules/generated/ + +html: + # These two lines make the build a bit more lengthy, and the + # the embedding of images more robust + rm -rf $(BUILDDIR)/html/_images + #rm -rf _build/doctrees/ + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html/stable + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html/stable" + +html-noplot: + $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html/stable + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html/stable." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + make -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +# download-data: +# python -c "from sklearn.datasets._lfw import _check_fetch_lfw; _check_fetch_lfw()" -.PHONY: help Makefile +# Optimize PNG files. Needs OptiPNG. Change the -P argument to the number of +# cores you have available, so -P 64 if you have a real computer ;) +optipng: + find _build auto_examples */generated -name '*.png' -print0 \ + | xargs -0 -n 1 -P 4 optipng -o10 -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
-%: Makefile - mkdir ./source/notebooks - cp -r ../examples/* ./source/notebooks - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) +dist: html latexpdf + cp _build/latex/scikit-optimize.pdf _build/html/stable/_downloads/scikit-optimize-docs.pdf diff --git a/doc/conf.py b/doc/conf.py index 780b5dbd8..370802f18 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -15,33 +15,33 @@ # import os # import sys # sys.path.insert(0, os.path.abspath('.')) - import warnings import os import re +from packaging.version import parse # import pkg_resources import sys import skopt sys.path.insert(0, os.path.abspath('sphinxext')) - from github_link import make_linkcode_resolve import sphinx_gallery -__version__ = ".".join(skopt.__version__.split(".")[:2]) + # __version__ = pkg_resources.get_distribution('skopt').version on_rtd = os.environ.get('READTHEDOCS', None) == 'True' # -- Project information ----------------------------------------------------- project = 'scikit-optimize' -copyright = '2017 - 2020, The scikit-optimize contributors.' +copyright = '2017 - 2020, scikit-optimize contributors (BSD License)' author = 'The scikit-optimize contributors' # The short X.Y version -version = __version__ +version = parse(skopt.__version__).base_version +version = ".".join(version.split(".")[:2]) # The full version, including alpha/beta/rc tags -release = __version__ +release = skopt.__version__ # -- General configuration --------------------------------------------------- @@ -308,7 +308,8 @@ def __call__(self, directory): # thumbnails for the front page of the scikit-learn home page. # key: first image in set # values: (number of plot in set, height of thumbnail) -carousel_thumbs = {'sphx_glr_plot_ask-and-tell_002.png': 600, +carousel_thumbs = {'sphx_glr_sklearn-gridsearchcv-replacement_001.png': 600, + 'sphx_glr_plot_ask-and-tell_002.png': 600, 'sphx_glr_bayesian-optimization_004.png': 600, 'sphx_glr_strategy-comparison_002.png': 600, 'sphx_glr_visualizing-results_008.png': 600} diff --git a/doc/conftest.py b/doc/conftest.py new file mode 100644 index 000000000..a52e06264 --- /dev/null +++ b/doc/conftest.py @@ -0,0 +1,12 @@ +import os +from os.path import exists +from os.path import join +import warnings + +import numpy as np + +from skopt import IS_PYPY + + +def pytest_runtest_setup(item): + fname = item.fspath.strpath diff --git a/doc/contents.rst b/doc/contents.rst index 93b93c808..9ee878ddd 100644 --- a/doc/contents.rst +++ b/doc/contents.rst @@ -14,7 +14,9 @@ Table Of Contents .. toctree:: :maxdepth: 2 + preface getting_started user_guide auto_examples/index modules/classes + development diff --git a/doc/development.rst b/doc/development.rst index 59bde45fa..d756fdbea 100644 --- a/doc/development.rst +++ b/doc/development.rst @@ -4,7 +4,7 @@ Development The library is still experimental and under heavy development. Checkout the `next -milestone `__ +milestone `__ for the plans for the next release or look at some `easy issues `__ to get started contributing. diff --git a/doc/getting_started.rst b/doc/getting_started.rst index 39379ef45..4836c0758 100644 --- a/doc/getting_started.rst +++ b/doc/getting_started.rst @@ -1,8 +1,10 @@ -.. currentmodule:: skopt =============== Getting started =============== + +.. currentmodule:: skopt + Scikit-Optimize, or ``skopt``, is a simple and efficient library to minimize (very) expensive and noisy black-box functions. It implements several methods for sequential model-based optimization. 
``skopt`` aims @@ -27,27 +29,29 @@ Finding a minimum Find the minimum of the noisy function ``f(x)`` over the range ``-2 < x < 2`` with :class:`skopt`:: - import numpy as np - from skopt import gp_minimize - - def f(x): - return (np.sin(5 * x[0]) * (1 - np.tanh(x[0] ** 2)) * - np.random.randn() * 0.1) - - res = gp_minimize(f, [(-2.0, 2.0)]) + >>> import numpy as np + >>> from skopt import gp_minimize + >>> np.random.seed(123) + >>> def f(x): + ... return (np.sin(5 * x[0]) * (1 - np.tanh(x[0] ** 2)) * + ... np.random.randn() * 0.1) + >>> + >>> res = gp_minimize(f, [(-2.0, 2.0)], n_calls=20) + >>> print("x*=%.2f f(x*)=%.2f" % (res.x[0], res.fun)) + x*=0.85 f(x*)=-0.06 For more control over the optimization loop you can use the :class:`skopt.Optimizer` class:: - from skopt import Optimizer - - opt = Optimizer([(-2.0, 2.0)]) - - for i in range(20): - suggested = opt.ask() - y = f(suggested) - opt.tell(suggested, y) - print('iteration:', i, suggested, y) + >>> from skopt import Optimizer + >>> opt = Optimizer([(-2.0, 2.0)]) + >>> + >>> for i in range(20): + ... suggested = opt.ask() + ... y = f(suggested) + ... res = opt.tell(suggested, y) + >>> print("x*=%.2f f(x*)=%.2f" % (res.x[0], res.fun)) + x*=0.27 f(x*)=-0.15 For more, read our :ref:`sphx_glr_auto_examples_bayesian-optimization.py` and the other `examples `_. diff --git a/doc/install.rst b/doc/install.rst index 620ad836b..2744eb7b0 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -4,7 +4,14 @@ Installation ============ -scikit-optimize supports Python 3.5 or newer. +scikit-optimize requires: + +* Python >= 3.6 +* NumPy (>= 1.13.3) +* SciPy (>= 0.19.1) +* joblib (>= 0.11) +* scikit-learn >= 0.20 +* matplotlib >= 2.0.0 The newest release can be installed via pip: diff --git a/doc/modules/acquisition.rst b/doc/modules/acquisition.rst index 3ecf2e116..5332833b8 100644 --- a/doc/modules/acquisition.rst +++ b/doc/modules/acquisition.rst @@ -4,3 +4,48 @@ Acquisition =========== +Function to minimize over the posterior distribution. + +:class:`gaussian_lcb` +--------------------- +Use the lower confidence bound to estimate the acquisition +values. + +The trade-off between exploitation and exploration is left to +be controlled by the user through the parameter ``kappa``. + +:class:`gaussian_pi` +-------------------- +Use the probability of improvement to calculate the acquisition values. + +The conditional probability `P(y=f(x) | x)` forms a Gaussian with a +certain mean and standard deviation approximated by the model. + +The PI condition is derived by computing ``E[u(f(x))]`` +where ``u(f(x)) = 1`` if ``f(x) < y_opt`` and ``u(f(x)) = 0`` +if ``f(x) > y_opt``. + +This means that the PI condition does not take into account how much better the +predictions are than the previous values, since it gives an equal reward +to all of them. + +Note that the value returned by this function should be maximized to +obtain the ``X`` with maximum improvement. + + +:class:`gaussian_ei` +-------------------- +Use the expected improvement to calculate the acquisition values. + +The conditional probability `P(y=f(x) | x)` forms a Gaussian with a certain +mean and standard deviation approximated by the model. + +The EI condition is derived by computing ``E[u(f(x))]`` +where ``u(f(x)) = 0`` if ``f(x) > y_opt`` and ``u(f(x)) = y_opt - f(x)`` +if ``f(x) < y_opt``. + +This solves one of the issues of the PI condition by giving a reward +proportional to the amount of improvement gained. + +Note that the value returned by this function should be maximized to +obtain the ``X`` with maximum improvement.
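The three acquisition rules above can also be written directly in terms of a surrogate's posterior mean ``mu`` and standard deviation ``sigma``. The following is a standalone numeric sketch for the minimization case; it is not the patch's code and not skopt's call signatures (skopt's own implementations live in ``skopt.acquisition``)::

    # Sketch of LCB, PI and EI for minimization, assuming a Gaussian posterior
    # with mean `mu` and standard deviation `sigma` at a candidate point;
    # `y_opt` is the best objective value observed so far.
    from scipy.stats import norm


    def lcb(mu, sigma, kappa=1.96):
        # Lower confidence bound: smaller values are more promising.
        return mu - kappa * sigma


    def probability_of_improvement(mu, sigma, y_opt, xi=0.01):
        # P(f(x) < y_opt - xi); every improvement is rewarded equally.
        return norm.cdf((y_opt - xi - mu) / sigma)


    def expected_improvement(mu, sigma, y_opt, xi=0.01):
        # E[max(y_opt - xi - f(x), 0)]; larger improvements earn larger rewards.
        z = (y_opt - xi - mu) / sigma
        return (y_opt - xi - mu) * norm.cdf(z) + sigma * norm.pdf(z)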
diff --git a/doc/modules/bayessearchcv.rst b/doc/modules/bayessearchcv.rst index 1a2b1f383..dfa402c12 100644 --- a/doc/modules/bayessearchcv.rst +++ b/doc/modules/bayessearchcv.rst @@ -1,3 +1,5 @@ +.. currentmodule:: skopt + .. _bayessearchcv: BayesSearchCV, a GridSearchCV compatible estimator @@ -5,3 +7,17 @@ BayesSearchCV, a GridSearchCV compatible estimator Use ``BayesSearchCV`` as a replacement for scikit-learn's GridSearchCV. +BayesSearchCV implements a "fit" and a "score" method. +It also implements "predict", "predict_proba", "decision_function", +"transform" and "inverse_transform" if they are implemented in the +estimator used. + +The parameters of the estimator used to apply these methods are optimized +by cross-validated search over parameter settings. + +In contrast to GridSearchCV, not all parameter values are tried out, but +rather a fixed number of parameter settings is sampled from the specified +distributions. The number of parameter settings that are tried is +given by ``n_iter``. + +Parameters are presented as a list of :class:`skopt.space.Dimension` objects. \ No newline at end of file
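A short usage sketch of the search described above; the estimator, the dimensions and the budget below are made-up choices for illustration and are not part of this patch::

    # Tune an SVC with BayesSearchCV; n_iter controls how many parameter
    # settings are sampled from the given dimensions.
    from sklearn.datasets import load_iris
    from sklearn.svm import SVC
    from skopt import BayesSearchCV
    from skopt.space import Categorical, Real

    X, y = load_iris(return_X_y=True)

    search = BayesSearchCV(
        SVC(),
        {
            "C": Real(1e-3, 1e3, prior="log-uniform"),
            "gamma": Real(1e-4, 1e1, prior="log-uniform"),
            "kernel": Categorical(["rbf", "poly"]),
        },
        n_iter=16,
        cv=3,
        random_state=0,
    )
    search.fit(X, y)  # "fit" drives the cross-validated search
    print(search.best_params_, search.best_score_)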
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index a43b295ef..5d364e38c 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -33,6 +33,7 @@ Functions dummy_minimize dump expected_minimum + expected_minimum_random_sampling forest_minimize gbrt_minimize gp_minimize @@ -187,9 +188,14 @@ details. :template: function.rst plots.partial_dependence + plots.partial_dependence_1D + plots.partial_dependence_2D plots.plot_convergence plots.plot_evaluations + plots.plot_gaussian_process plots.plot_objective + plots.plot_objective_2D + plots.plot_histogram plots.plot_regret .. _utils_ref: @@ -211,14 +217,38 @@ details. :template: function.rst utils.cook_estimator + utils.cook_initial_point_generator utils.dimensions_aslist utils.expected_minimum + utils.expected_minimum_random_sampling utils.dump utils.load utils.point_asdict utils.point_aslist utils.use_named_args +.. _sampler_ref: + +:mod:`skopt.sampler`: Samplers +============================== + +.. automodule:: skopt.sampler + :no-members: + :no-inherited-members: + +**User guide:** See the :ref:`sampler` section for further details. + +.. currentmodule:: skopt + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + sampler.Lhs + sampler.Sobol + sampler.Halton + sampler.Hammersly + .. _space_ref: @@ -272,5 +302,7 @@ details. space.transformers.Normalize space.transformers.Pipeline space.transformers.Transformer + space.transformers.LabelEncoder + space.transformers.StringEncoder diff --git a/doc/modules/minimize_functions.rst b/doc/modules/minimize_functions.rst index 0e738a115..1b05cd551 100644 --- a/doc/modules/minimize_functions.rst +++ b/doc/modules/minimize_functions.rst @@ -8,7 +8,42 @@ These are easy to get started with. They mirror the ``scipy.optimize`` API and provide a high level interface to various pre-configured optimizers. -* :class:`dummy_minimize` -* :class:`forest_minimize` -* :class:`gbrt_minimize` -* :class:`gp_minimize` +:class:`dummy_minimize` +----------------------- +Random search by uniform sampling within the given bounds. + +:class:`forest_minimize` +------------------------ +Sequential optimization using decision trees. + +A tree-based regression model is used to model the expensive-to-evaluate +function `func`. The model is improved by sequentially evaluating +the expensive function at the next best point, thereby finding the +minimum of `func` with as few evaluations as possible. + +:class:`gbrt_minimize` +---------------------- +Sequential optimization using gradient boosted trees. + +Gradient boosted regression trees are used to model the (very) +expensive-to-evaluate function `func`. The model is improved +by sequentially evaluating the expensive function at the next +best point, thereby finding the minimum of `func` with as +few evaluations as possible. + +:class:`gp_minimize` +-------------------- +Bayesian optimization using Gaussian Processes. + +If every function evaluation is expensive, for instance +when the parameters are the hyperparameters of a neural network +and the function evaluation is the mean cross-validation score across +ten folds, optimizing the hyperparameters by standard optimization +routines would take forever! + +The idea is to approximate the function using a Gaussian process. +In other words, the function values are assumed to follow a multivariate +Gaussian. The covariance of the function values is given by a +GP kernel between the parameters. A smart choice for the +next parameter to evaluate can then be made with the acquisition function +over the Gaussian prior, which is much quicker to evaluate.
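The four functions above share the same calling convention, so a sketch written for one of them carries over to the others; the toy objective and the call budget are invented for illustration::

    # Same 1-D toy problem solved with two of the optimizers described above;
    # only the surrogate model behind the suggestions differs.
    import numpy as np
    from skopt import forest_minimize, gp_minimize


    def objective(x):
        # Stand-in for an expensive-to-evaluate function of one real parameter.
        return float(np.sin(5 * x[0]) * (1 - np.tanh(x[0] ** 2)))


    bounds = [(-2.0, 2.0)]
    res_gp = gp_minimize(objective, bounds, n_calls=20, random_state=0)
    res_forest = forest_minimize(objective, bounds, n_calls=20, random_state=0)
    print(res_gp.x, res_gp.fun)
    print(res_forest.x, res_forest.fun)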
diff --git a/doc/modules/plots.rst b/doc/modules/plots.rst index d33c3f65f..77f8b3ee0 100644 --- a/doc/modules/plots.rst +++ b/doc/modules/plots.rst @@ -1,13 +1,39 @@ -.. currentmodule:: skopt.plots .. _plots: +============== Plotting tools ============== -Plotting functions. -* :class:`partial_dependence` -* :class:`plot_convergence` -* :class:`plot_evaluations` -* :class:`plot_objective` -* :class:`plot_regret` \ No newline at end of file +.. currentmodule:: skopt.plots + +Plotting functions can be used to visualize the optimization process. + +plot_convergence +================ +:class:`plot_convergence` plots one or several convergence traces. + +.. figure:: ../auto_examples/images/sphx_glr_hyperparameter-optimization_001.png + :target: ../auto_examples/hyperparameter-optimization.html + :align: center + +plot_evaluations +================ +:class:`plot_evaluations` visualizes the order in which points were sampled. + +.. figure:: ../auto_examples/plots/images/sphx_glr_visualizing-results_002.png + :target: ../auto_examples/plots/visualizing-results.html + :align: center + +plot_objective +============== +:class:`plot_objective` creates pairwise dependence plots of the objective function. + +.. figure:: ../auto_examples/plots/images/sphx_glr_partial-dependence-plot_001.png + :target: ../auto_examples/plots/partial-dependence-plot.html + :align: center + + +plot_regret +=========== +:class:`plot_regret` plots one or several cumulative regret traces. diff --git a/doc/modules/sampler.rst b/doc/modules/sampler.rst new file mode 100644 index 000000000..eeb151f6d --- /dev/null +++ b/doc/modules/sampler.rst @@ -0,0 +1,6 @@ +.. currentmodule:: skopt.sampler + +.. _sampler: + +Sampling methods +================ diff --git a/doc/modules/space.rst b/doc/modules/space.rst index 4525732d6..9eac0857c 100644 --- a/doc/modules/space.rst +++ b/doc/modules/space.rst @@ -2,6 +2,18 @@ .. _space: -Space define the optimization space -=================================== +Space +===== +:class:`Space` defines the optimization space, which contains one or more dimensions of the following types: +:class:`Real` +------------- +Search space dimension that can take on any real value. + +:class:`Integer` +---------------- +Search space dimension that can take on integer values. + +:class:`Categorical` +-------------------- +Search space dimension that can take on categorical values. \ No newline at end of file
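A small sketch of a mixed space built from the three dimension types; the names and bounds are arbitrary choices for illustration::

    # One dimension of each type, gathered into a Space, then sampled to show
    # the kind of points an optimizer would be asked to evaluate.
    from skopt.space import Categorical, Integer, Real, Space

    space = Space([
        Real(1e-4, 1e-1, prior="log-uniform", name="learning_rate"),
        Integer(1, 10, name="max_depth"),
        Categorical(["relu", "tanh"], name="activation"),
    ])

    samples = space.rvs(n_samples=3, random_state=0)
    print(samples)  # three points, one value per dimension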

diff --git a/doc/modules/transformers.rst b/doc/modules/transformers.rst new file mode 100644 index 000000000..34693f287 --- /dev/null +++ b/doc/modules/transformers.rst @@ -0,0 +1,7 @@ +.. currentmodule:: skopt.space.transformers + +.. _transformers: + +Transformers +============ + diff --git a/doc/modules/utils.rst b/doc/modules/utils.rst index a7e7fa0dd..62134ca88 100644 --- a/doc/modules/utils.rst +++ b/doc/modules/utils.rst @@ -7,3 +7,31 @@ Utility functions This is a list of public utility functions. Other functions in this module are meant for internal use. +:func:`use_named_args` +---------------------- +This utility function makes it possible to use objective functions with named arguments:: + + >>> # Define the search-space dimensions. They must all have names! + >>> from skopt.space import Real + >>> from skopt.utils import use_named_args + >>> dim1 = Real(name='foo', low=0.0, high=1.0) + >>> dim2 = Real(name='bar', low=0.0, high=1.0) + >>> dim3 = Real(name='baz', low=0.0, high=1.0) + >>> + >>> # Gather the search-space dimensions in a list. + >>> dimensions = [dim1, dim2, dim3] + >>> + >>> # Define the objective function with named arguments + >>> # and use this function-decorator to specify the + >>> # search-space dimensions. + >>> @use_named_args(dimensions=dimensions) + ... def my_objective_function(foo, bar, baz): + ... return foo ** 2 + bar ** 4 + baz ** 8 + +:func:`dump` +------------ +Store a skopt optimization result into a file. + +:func:`load` +------------ +Reconstruct a skopt optimization result from a file persisted with :func:`dump`. diff --git a/doc/preface.rst b/doc/preface.rst new file mode 100644 index 000000000..c80f619cd --- /dev/null +++ b/doc/preface.rst @@ -0,0 +1,25 @@ +.. This helps define the TOC ordering for "about us" sections. Particularly + useful for PDF output as this section is not linked from elsewhere. + +.. Places global toc into the sidebar + +:globalsidebartoc: True + +.. _preface_menu: + +.. include:: includes/big_toc_css.rst +.. include:: tune_toc.rst + +========================== +Welcome to scikit-optimize +========================== + +| + +.. toctree:: + :maxdepth: 2 + + install + whats_new + +| \ No newline at end of file diff --git a/doc/requirements.txt b/doc/requirements.txt index 1ab1b285d..4713469ee 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -4,5 +4,5 @@ matplotlib pandas ipywidgets sphinx -sphinx-gallery +sphinx-gallery>=0.6 numpydoc diff --git a/doc/templates/index.html b/doc/templates/index.html index 8244df212..7aff01909 100644 --- a/doc/templates/index.html +++ b/doc/templates/index.html
@@ -8,7 +8,7 @@ [HTML markup stripped during extraction; context shows the masthead text "scikit-optimize" and "Sequential model-based optimization in Python", and the hunk retargets the "What's New in {{ release }}" link between "Getting Started" and "GitHub"]
@@ -35,6 +35,7 @@ [markup stripped; one line added near the "Sequential model-based" blurb]
@@ -55,11 +56,11 @@ [markup stripped; the "Visualizing optimization results" entry is updated]
@@ -106,10 +107,13 @@ News [markup stripped; the "On-going development: What's new (Changelog)" link is retargeted and the release list is refreshed]
-   • Feb 2020. scikit-optimize 0.7.1 (Changelog).
-   • Jan 2020. scikit-optimize 0.7 (Changelog).
+   • Sep 2020. scikit-optimize 0.8.1 (Changelog).
+   • Sep 2020. scikit-optimize 0.8 (Changelog).
+   • Feb 2020. scikit-optimize 0.7.2 (Changelog).
+   • Feb 2020. scikit-optimize 0.7.1 (Changelog).
+   • Jan 2020. scikit-optimize 0.7 (Changelog).
  • April 2018. scikit-optimize 0.6 (Changelog).
  • Mar 2018. scikit-optimize 0.5 (Changelog).
  • Aug 2017. scikit-optimize 0.4 (Changelog). diff --git a/doc/themes/scikit-learn-modern/javascript.html b/doc/themes/scikit-learn-modern/javascript.html index f62c657d1..4d9685653 100644 --- a/doc/themes/scikit-learn-modern/javascript.html +++ b/doc/themes/scikit-learn-modern/javascript.html @@ -10,7 +10,7 @@