Merge pull request #1 from ogrisel/glouppe-ensemble-rebased · larsmans/scikit-learn@c3cd700
Commit c3cd700
Merge pull request #1 from ogrisel/glouppe-ensemble-rebased
2 parents: f071368 + 50d8ac2
2 files changed: +85 additions, -29 deletions

doc/modules/ensemble.rst (36 additions, 25 deletions)
@@ -34,7 +34,7 @@ Forests of randomized trees
 The ``sklearn.ensemble`` module includes two averaging algorithms based on
 randomized :ref:`decision trees <tree>`: the RandomForest algorithm and the
 Extra-Trees method. Both algorithms are perturb-and-combine techniques
-specifically designed for trees.
+specifically designed for trees::
 
     >>> from sklearn.ensemble import RandomForestClassifier
     >>> X = [[0, 0], [1, 1]]
@@ -60,39 +60,50 @@ features is used, but instead of looking for the most discriminative thresholds,
 thresholds are drawn at random for each candidate feature and the best of these
 randomly-generated thresholds is picked as the splitting rule. This usually
 allows to reduce the variance of the model a bit more, at the expense of a
-slightly greater increase in bias.
+slightly greater increase in bias::
 
     >>> from sklearn.cross_validation import cross_val_score
     >>> from sklearn.datasets import make_blobs
     >>> from sklearn.ensemble import RandomForestClassifier
     >>> from sklearn.ensemble import ExtraTreesClassifier
     >>> from sklearn.tree import DecisionTreeClassifier
-    >>> X, y = make_blobs(n_samples=10000, n_features=10, centers=100)
-    >>> clf = DecisionTreeClassifier(max_depth=None, min_split=1)
+
+    >>> X, y = make_blobs(n_samples=10000, n_features=10, centers=100,
+    ...     random_state=0)
+
+    >>> clf = DecisionTreeClassifier(max_depth=None, min_split=1,
+    ...     random_state=0)
     >>> scores = cross_val_score(clf, X, y)
-    >>> scores.mean()
-    0.97609967955403809
-    >>> clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_split=1)
+    >>> scores.mean()  # doctest: +ELLIPSIS
+    0.978...
+
+    >>> clf = RandomForestClassifier(n_estimators=10, max_depth=None,
+    ...     min_split=1, random_state=0)
     >>> scores = cross_val_score(clf, X, y)
-    >>> scores.mean()
-    0.99510028987301846
-    >>> clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_split=1)
+    >>> scores.mean()  # doctest: +ELLIPSIS
+    0.992...
+
+    >>> clf = ExtraTreesClassifier(n_estimators=10, max_depth=None,
+    ...     min_split=1, random_state=0)
     >>> scores = cross_val_score(clf, X, y)
-    >>> scores.mean()
-    1.0
-
-The main parameters to adjust when using these methods is ``n_estimators`` and
-``max_features``. The former is the number of trees in the forest. The larger
-the better, but also the longer it will take to compute. The latter is the size
-of the random subsets of features to consider when splitting a node. The lower
-the greater the reduction of variance, but also the greater the increase in
-bias. Empiricial good default values are ``max_features=M`` in random forests,
-and ``max_features=sqrt(M)`` in extra-trees (where ``M`` is the number of
-features in the data). The best results are also usually reached when setting
-``max_depth=None`` in combination with ``min_split=1`` (i.e., when fully
-developping the trees). Finally, note that bootstrap samples are used by default
-in random forests (``bootstrap=True``) while the default strategy is to use the
-original datasets for building extra-trees (``bootstrap=False``).
+    >>> scores.mean() > 0.999
+    True
+
+The main parameters to adjust when using these methods are ``n_estimators``
+and ``max_features``. The former is the number of trees in the
+forest. The larger the better, but also the longer it will take to
+compute. The latter is the size of the random subsets of features to
+consider when splitting a node. The lower the greater the reduction of
+variance, but also the greater the increase in bias. Empirical good
+default values are ``max_features=n_features`` in random forests, and
+``max_features=sqrt(n_features)`` in extra-trees (where ``n_features``
+is the number of features in the data). The best results are also
+usually reached when setting ``max_depth=None`` in combination with
+``min_split=1`` (i.e., when fully developing the trees).
+
+Finally, note that bootstrap samples are used by default in random forests
+(``bootstrap=True``) while the default strategy is to use the original
+datasets for building extra-trees (``bootstrap=False``).
 
 .. topic:: Examples:
 
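The doctest added above predates later API changes, so here is a rough sketch of the same single-tree vs. random-forest vs. extra-trees comparison against a current scikit-learn. Note the renames (``sklearn.cross_validation`` became ``sklearn.model_selection``, and ``min_split`` became ``min_samples_split``, whose modern minimum is 2); the exact scores will differ from those in the diff, and the loop structure is mine, not the documentation's.

    # Hedged adaptation of the documented comparison to the modern
    # scikit-learn API; scores will not match the historical doctest.
    from sklearn.model_selection import cross_val_score
    from sklearn.datasets import make_blobs
    from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
    from sklearn.tree import DecisionTreeClassifier

    X, y = make_blobs(n_samples=10000, n_features=10, centers=100,
                      random_state=0)

    for clf in (
        DecisionTreeClassifier(max_depth=None, min_samples_split=2,
                               random_state=0),
        RandomForestClassifier(n_estimators=10, max_depth=None,
                               min_samples_split=2, random_state=0),
        ExtraTreesClassifier(n_estimators=10, max_depth=None,
                             min_samples_split=2, random_state=0),
    ):
        # Expect the single tree to score lowest and extra-trees highest,
        # mirroring the ordering shown in the doctest above.
        scores = cross_val_score(clf, X, y)
        print(type(clf).__name__, scores.mean())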
sklearn/ensemble/forest.py (49 additions, 4 deletions)
@@ -1,6 +1,33 @@
-"""
-This module gathers forest of trees-based ensemble methods, including random
-forests and extra-trees.
+"""Forest of trees-based ensemble methods
+
+Those methods include random forests and extremely randomized trees.
+
+The module structure is the following:
+
+- The ``Forest`` base class implements a common ``fit`` method for all
+  the estimators in the module. The ``fit`` method of the base ``Forest``
+  class calls the ``fit`` method of each sub-estimator on random samples
+  (with replacement, a.k.a. bootstrap) of the training set.
+
+  The init of the sub-estimator is further delegated to the
+  ``BaseEnsemble`` constructor.
+
+- The ``ForestClassifier`` and ``ForestRegressor`` base classes further
+  implement the prediction logic by computing an average of the predicted
+  outcomes of the sub-estimators.
+
+- The ``RandomForestClassifier`` and ``RandomForestRegressor`` derived
+  classes provide the user with concrete implementations of
+  the forest ensemble method using classical, deterministic
+  ``DecisionTreeClassifier`` and ``DecisionTreeRegressor`` as default
+  sub-estimator implementation.
+
+- The ``ExtraTreesClassifier`` and ``ExtraTreesRegressor`` derived
+  classes provide the user with concrete implementations of the
+  forest ensemble method using the extremely randomized trees
+  ``ExtraTreeClassifier`` and ``ExtraTreeRegressor`` as default
+  sub-estimator implementation.
+
 """
 
 # Authors: Gilles Louppe, Brian Holt
@@ -9,7 +36,7 @@
 import numpy as np
 
 from ..base import clone
-from ..base import BaseEstimator, ClassifierMixin, RegressorMixin
+from ..base import ClassifierMixin, RegressorMixin
 from ..tree import DecisionTreeClassifier, DecisionTreeRegressor, \
                    ExtraTreeClassifier, ExtraTreeRegressor
 from ..utils import check_random_state
@@ -216,6 +243,10 @@ def predict(self, X):
 class RandomForestClassifier(ForestClassifier):
     """A random forest classifier.
 
+    A random forest is a meta estimator that fits a number of classical
+    decision trees on various sub-samples of the dataset and uses averaging
+    to improve the predictive accuracy and control over-fitting.
+
     Parameters
     ----------
     base_estimator : object, optional (default=None)
@@ -275,6 +306,10 @@ def __init__(self, base_estimator=None,
 class RandomForestRegressor(ForestRegressor):
     """A random forest regressor.
 
+    A random forest is a meta estimator that fits a number of classical
+    decision trees on various sub-samples of the dataset and uses averaging
+    to improve the predictive accuracy and control over-fitting.
+
     Parameters
     ----------
     base_estimator : object, optional (default=None)
@@ -334,6 +369,11 @@ def __init__(self, base_estimator=None,
 class ExtraTreesClassifier(ForestClassifier):
     """An extra-trees classifier.
 
+    This class implements a meta estimator that fits a number of
+    randomized decision trees (a.k.a. extra-trees) on various sub-samples
+    of the dataset and uses averaging to improve the predictive accuracy
+    and control over-fitting.
+
     Parameters
     ----------
     base_estimator : object, optional (default=None)
@@ -394,6 +434,11 @@ def __init__(self, base_estimator=None,
 class ExtraTreesRegressor(ForestRegressor):
     """An extra-trees regressor.
 
+    This class implements a meta estimator that fits a number of
+    randomized decision trees (a.k.a. extra-trees) on various sub-samples
+    of the dataset and uses averaging to improve the predictive accuracy
+    and control over-fitting.
+
     Parameters
     ----------
     base_estimator : object, optional (default=None)
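To make the structure that the new module docstring describes concrete, here is a minimal standalone sketch of a forest that fits cloned sub-estimators on bootstrap samples and averages their predicted probabilities. This is NOT the actual forest.py implementation: ``ToyForestClassifier`` is a hypothetical name, and it assumes every bootstrap sample contains all classes.

    # Minimal sketch of the fit/predict structure described by the
    # docstring above -- a toy, not scikit-learn's forest.py.
    import numpy as np
    from sklearn.base import clone
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.utils import check_random_state


    class ToyForestClassifier:
        """Fit sub-estimators on bootstrap samples; average their votes."""

        def __init__(self, base_estimator=None, n_estimators=10,
                     bootstrap=True, random_state=None):
            self.base_estimator = base_estimator or DecisionTreeClassifier()
            self.n_estimators = n_estimators
            self.bootstrap = bootstrap
            self.random_state = random_state

        def fit(self, X, y):
            X, y = np.asarray(X), np.asarray(y)
            rng = check_random_state(self.random_state)
            self.classes_ = np.unique(y)
            self.estimators_ = []
            for _ in range(self.n_estimators):
                tree = clone(self.base_estimator)
                if self.bootstrap:
                    # Random samples with replacement, as in random forests.
                    idx = rng.randint(0, X.shape[0], X.shape[0])
                    tree.fit(X[idx], y[idx])
                else:
                    # Extra-trees default: train on the original dataset.
                    tree.fit(X, y)
                self.estimators_.append(tree)
            return self

        def predict_proba(self, X):
            # Average the sub-estimators' predicted class probabilities,
            # as the ForestClassifier base class does per the docstring.
            # Simplification: assumes every tree saw all the classes.
            probas = [tree.predict_proba(X) for tree in self.estimators_]
            return np.mean(probas, axis=0)

        def predict(self, X):
            return self.classes_[np.argmax(self.predict_proba(X), axis=1)]

A quick usage check of the sketch:

    from sklearn.datasets import make_blobs

    X, y = make_blobs(n_samples=200, centers=3, random_state=0)
    forest = ToyForestClassifier(n_estimators=10, random_state=0).fit(X, y)
    print(forest.predict(X[:5]))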
