From 6df4eac1a0eb71ebbe2d665aa96a82f19aff52fe Mon Sep 17 00:00:00 2001 From: Christof Angermueller Date: Mon, 16 Feb 2015 21:48:35 +0000 Subject: [PATCH 1/4] Update documentation of predict_proba in tree module --- doc/modules/tree.rst | 16 ++++++++++++++-- sklearn/tree/tree.py | 3 +++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index 5fdd13a38ec5e..7df76f39d0b07 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -101,11 +101,17 @@ holding the class labels for the training samples:: >>> clf = tree.DecisionTreeClassifier() >>> clf = clf.fit(X, Y) -After being fitted, the model can then be used to predict new values:: +After being fitted, the model can then be used to predict the class of samples:: >>> clf.predict([[2., 2.]]) array([1]) +Alternatively, the probability of each class can be predicted, which is the +fraction of training samples of the same class in a leaf:: + + >>> clf.predict_proba([[2., 2.]]) + array([[ 0., 1.]]) + :class:`DecisionTreeClassifier` is capable of both binary (where the labels are [-1, 1]) classification and multiclass (where the labels are [0, ..., K-1]) classification. @@ -155,11 +161,17 @@ a PDF file (or any other supported file type) directly in Python:: .. figure:: ../images/iris.pdf :align: center -After being fitted, the model can then be used to predict new values:: +After being fitted, the model can then be used to predict the class of samples:: >>> clf.predict(iris.data[:1, :]) array([0]) +Alternatively, the probability of each class can be predicted, which is the +fraction of training samples of the same class in a leaf:: + + >>> clf.predict_proba(iris.data[:1, :]) + array([[ 1., 0., 0.]]) + .. 
figure:: ../auto_examples/tree/images/plot_iris_001.png :target: ../auto_examples/tree/plot_iris.html :align: center diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 8ee0ae55f83dd..8d4a900d31324 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -545,6 +545,9 @@ def __init__(self, def predict_proba(self, X): """Predict class probabilities of the input samples X. + The predicted class probability is the fraction of samples of the same + class in a leaf. + Parameters ---------- X : array-like or sparse matrix of shape = [n_samples, n_features] From 6035c988fa75f7aac2767eef761c37cc4be9ddba Mon Sep 17 00:00:00 2001 From: Christof Angermueller Date: Fri, 10 Apr 2015 19:12:44 +0100 Subject: [PATCH 2/4] Add conventions section to user guide --- doc/tutorial/basic/tutorial.rst | 66 +++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/doc/tutorial/basic/tutorial.rst b/doc/tutorial/basic/tutorial.rst index 5ac2f61c19563..13c1f18b0f111 100644 --- a/doc/tutorial/basic/tutorial.rst +++ b/doc/tutorial/basic/tutorial.rst @@ -250,3 +250,69 @@ Note that pickle has some security and maintainability issues. Please refer to section :ref:`model_persistence` for more detailed information about model persistence with scikit-learn. + +Conventions +----------- + +scikit-learn estimators follow certain rules to make their behavior more +predictive. + + +Type casting +~~~~~~~~~~~~ + +Unless otherwise specified, input will be cast to ``float64``, and regression +targets will be ``float64``. Consider the following example:: + + >>> import numpy as np + >>> from sklearn import random_projection + + >>> X = np.random.rand(10, 2000) + >>> X = np.array(X, dtype='float32') + >>> X.dtype + dtype('float32') + + >>> transformer = random_projection.GaussianRandomProjection() + >>> X_new = transformer.fit_transform(X) + >>> X_new.dtype + dtype('float64') + +The input data ``X`` is ``float32``, which is cast to ``float64`` by +``fit_transform(X)``. 
+ + +Refitting and updating parameters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Hyper-parameters of an estimator can be updated after it has be constructed by +changing the corresponding member variables. Calling ``fit()`` more than once +will overwrite what was learned by previous ``fit()``:: + + >>> import numpy as np + >>> from sklearn.svm import SVC + + >>> np.random.seed(0) + >>> X = np.random.rand(100, 10) + >>> y = np.random.binomial(1, 0.5, 100) + >>> XX = np.random.rand(5, 10) + + >>> clf = SVC() + >>> clf.kernel = 'linear' + >>> clf.fit(X, y) + SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, + kernel='linear', max_iter=-1, probability=False, random_state=None, + shrinking=True, tol=0.001, verbose=False) + >>> clf.predict(XX) + array([1, 0, 1, 1, 0]) + + >>> clf.kernel = 'rbf' + >>> clf.fit(X, y) + SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, + kernel='rbf', max_iter=-1, probability=False, random_state=None, + shrinking=True, tol=0.001, verbose=False) + >>> clf.predict(XX) + array([0, 0, 0, 1, 0]) + +Here, the default kernel ``rbf`` is first changed to ``linear`` after the +estimator has been constructed via ``SVC()``, and changed back to ``rbf`` to +refit the estimator and to make a second prediction. From 58315248f43af9db21c74fa8b8a7e849cbf48a97 Mon Sep 17 00:00:00 2001 From: Christof Angermueller Date: Fri, 10 Apr 2015 22:01:47 +0100 Subject: [PATCH 3/4] Update conventions section in userguide --- doc/tutorial/basic/tutorial.rst | 36 ++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/doc/tutorial/basic/tutorial.rst b/doc/tutorial/basic/tutorial.rst index 13c1f18b0f111..171340f566e73 100644 --- a/doc/tutorial/basic/tutorial.rst +++ b/doc/tutorial/basic/tutorial.rst @@ -261,8 +261,7 @@ predictive. Type casting ~~~~~~~~~~~~ -Unless otherwise specified, input will be cast to ``float64``, and regression -targets will be ``float64``. 
Consider the following example:: +Unless otherwise specified, input will be cast to ``float64``:: >>> import numpy as np >>> from sklearn import random_projection @@ -277,16 +276,43 @@ targets will be ``float64``. Consider the following example:: >>> X_new.dtype dtype('float64') -The input data ``X`` is ``float32``, which is cast to ``float64`` by +In this example, ``X`` is ``float32``, which is cast to ``float64`` by ``fit_transform(X)``. +Regression targets are cast to ``float64``, classification targets are +maintained:: + >>> from sklearn import datasets + >>> from sklearn.svm import SVC + + >>> iris = datasets.load_iris() + >>> clf = SVC() + >>> clf.fit(iris.data, iris.target) + SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, + kernel='rbf', max_iter=-1, probability=False, random_state=None, + shrinking=True, tol=0.001, verbose=False) + + >>> clf.predict(iris.data[:3]) + array([0, 0, 0]) + + >>> clf.fit(iris.data, iris.target_names[iris.target]) + SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, + kernel='rbf', max_iter=-1, probability=False, random_state=None, + shrinking=True, tol=0.001, verbose=False) + + >>> clf.predict(iris.data[:3]) # doctest: +NORMALIZE_WHITESPACE + array(['setosa', 'setosa', 'setosa'], dtype='<U10') + +Here, the first ``predict()`` returns an integer array, since ``iris.target`` +(an integer array) was used in fitting. The second ``predict()`` returns a string +array, since ``iris.target_names`` was for fitting. Refitting and updating parameters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Hyper-parameters of an estimator can be updated after it has be constructed by +Hyper-parameters of an estimator can be updated after it has been constructed by changing the corresponding member variables. Calling ``fit()`` more than once -will overwrite what was learned by previous ``fit()``:: +will overwrite what was learned by any previous ``fit()``:: >>> import numpy as np >>> from sklearn.svm import SVC From 234be23d3d086443caefc96c3b2b221a811c019b Mon Sep 17 00:00:00 2001 From: Christof Angermueller Date: Sun, 12 Apr 2015 18:15:16 +0100 Subject: [PATCH 4/4] Use RandomState() in tutorial and rename variables --- doc/tutorial/basic/tutorial.rst | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/doc/tutorial/basic/tutorial.rst b/doc/tutorial/basic/tutorial.rst index 171340f566e73..615e13fe8e1f0 100644 --- a/doc/tutorial/basic/tutorial.rst +++ b/doc/tutorial/basic/tutorial.rst @@ -266,7 +266,8 @@ Unless otherwise specified, input will be cast to ``float64``:: >>> import numpy as np >>> from
sklearn import random_projection - >>> X = np.random.rand(10, 2000) + >>> rng = np.random.RandomState(0) + >>> X = rng.rand(10, 2000) >>> X = np.array(X, dtype='float32') >>> X.dtype dtype('float32') @@ -310,33 +311,39 @@ array, since ``iris.target_names`` was for fitting. Refitting and updating parameters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Hyper-parameters of an estimator can be updated after it has been constructed by -changing the corresponding member variables. Calling ``fit()`` more than once -will overwrite what was learned by any previous ``fit()``:: +Hyper-parameters of an estimator can be updated after it has been constructed +via the :func:`sklearn.pipeline.Pipeline.set_params` method. Calling ``fit()`` +more than once will overwrite what was learned by any previous ``fit()``:: >>> import numpy as np >>> from sklearn.svm import SVC - >>> np.random.seed(0) - >>> X = np.random.rand(100, 10) - >>> y = np.random.binomial(1, 0.5, 100) - >>> XX = np.random.rand(5, 10) + >>> rng = np.random.RandomState(0) + >>> X = rng.rand(100, 10) + >>> y = rng.binomial(1, 0.5, 100) + >>> X_test = rng.rand(5, 10) >>> clf = SVC() - >>> clf.kernel = 'linear' + >>> clf.set_params(kernel='linear') + SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, + kernel='linear', max_iter=-1, probability=False, random_state=None, + shrinking=True, tol=0.001, verbose=False) >>> clf.fit(X, y) SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, kernel='linear', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) - >>> clf.predict(XX) + >>> clf.predict(X_test) array([1, 0, 1, 1, 0]) - >>> clf.kernel = 'rbf' + >>> clf.set_params(kernel='rbf') + SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, + kernel='rbf', max_iter=-1, probability=False, random_state=None, + shrinking=True, tol=0.001, verbose=False) >>> clf.fit(X, y) SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, 
degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) - >>> clf.predict(XX) + >>> clf.predict(X_test) array([0, 0, 0, 1, 0]) Here, the default kernel ``rbf`` is first changed to ``linear`` after the