WIP: Add support for scikit-learn 0.22 · sebp/scikit-survival@c374789 · GitHub
WIP: Add support for scikit-learn 0.22
- Deprecate presort (scikit-learn/scikit-learn#14907)
- Add Minimal Cost-Complexity Pruning to Decision Trees (scikit-learn/scikit-learn#12887)
- Add bootstrap sample size limit to forest ensembles (scikit-learn/scikit-learn#14682)
1 parent 543f976 commit c374789

File tree

7 files changed: +204, -69 lines


requirements/prod.txt

Lines changed: 1 addition & 1 deletion
@@ -6,4 +6,4 @@ numpy
 osqp !=0.6.0,!=0.6.1
 pandas >=0.21,<0.26
 scipy >=1.0,!=1.3.0
-scikit-learn >=0.21.0,<0.22
+scikit-learn >=0.22.0,<0.23
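
As a quick sanity check that an environment matches the new pin (a minimal sketch, not part of this commit):

import sklearn

# this branch targets the 0.22 series only (scikit-learn >=0.22.0,<0.23)
assert sklearn.__version__.startswith("0.22"), sklearn.__version__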

sksurv/ensemble/boosting.py

Lines changed: 17 additions & 25 deletions
@@ -11,6 +11,7 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 import numbers
+import warnings

 import numpy

@@ -473,11 +474,8 @@ class GradientBoostingSurvivalAnalysis(BaseGradientBoosting, SurvivalAnalysisMix
         Best nodes are defined as relative reduction in impurity.
         If None then unlimited number of leaf nodes.

-    presort : bool or 'auto', optional, default: 'auto'
-        Whether to presort the data to speed up the finding of best splits in
-        fitting. Auto mode by default will use presorting on dense data and
-        default to normal sorting on sparse data. Setting presort to true on
-        sparse data will raise an error.
+    presort : deprecated, optional, default: 'deprecated'
+        This parameter is deprecated and will be removed in a future version.

     subsample : float, optional, default: 1.0
         The fraction of samples to be used for fitting the individual regression

@@ -498,6 +496,10 @@ class GradientBoostingSurvivalAnalysis(BaseGradientBoosting, SurvivalAnalysisMix
         once in a while (the more trees the lower the frequency). If greater
         than 1 then it prints progress and performance for every tree.

+    ccp_alpha : non-negative float, optional, default: 0.0.
+        Complexity parameter used for Minimal Cost-Complexity Pruning. The
+        subtree with the largest cost complexity that is smaller than
+        ``ccp_alpha`` will be chosen. By default, no pruning is performed.

     Attributes
     ----------

@@ -543,9 +545,10 @@ def __init__(self, loss="coxph", learning_rate=0.1, n_estimators=100,
                  max_depth=3, min_impurity_split=None,
                  min_impurity_decrease=0., random_state=None,
                  max_features=None, max_leaf_nodes=None,
-                 presort='auto',
+                 presort='deprecated',
                  subsample=1.0, dropout_rate=0.0,
-                 verbose=0):
+                 verbose=0,
+                 ccp_alpha=0.0):
         super().__init__(loss=loss,
                          learning_rate=learning_rate,
                          n_estimators=n_estimators,

@@ -562,7 +565,8 @@ def __init__(self, loss="coxph", learning_rate=0.1, n_estimators=100,
                          max_features=max_features,
                          max_leaf_nodes=max_leaf_nodes,
                          presort=presort,
-                         verbose=verbose)
+                         verbose=verbose,
+                         ccp_alpha=ccp_alpha)
         self.dropout_rate = dropout_rate

     def _check_params(self):

@@ -594,10 +598,11 @@ def _check_params(self):

         self.max_features_ = max_features

-        allowed_presort = ('auto', True, False)
-        if self.presort not in allowed_presort:
-            raise ValueError("'presort' should be in {}. Got {!r} instead."
-                             .format(allowed_presort, self.presort))
+        if self.presort != 'deprecated':
+            warnings.warn("The parameter 'presort' is deprecated and has no "
+                          "effect. It will be removed in v0.24. You can "
+                          "suppress this warning by not passing any value "
+                          "to the 'presort' parameter.", DeprecationWarning)

         if self.loss not in LOSS_FUNCTIONS:
             raise ValueError("Loss {!r} not supported.".format(self.loss))

@@ -835,20 +840,7 @@ def fit(self, X, y, sample_weight=None, monitor=None):
         # The rng state must be preserved if warm_start is True
         self._rng = check_random_state(self.random_state)

-        if self.presort is True and issparse(X):
-            raise ValueError(
-                "Presorting is not supported for sparse matrices.")
-
-        presort = self.presort
-        # Allow presort to be 'auto', which means True if the dataset is dense,
-        # otherwise it will be False.
-        if presort == 'auto':
-            presort = not issparse(X)
-
         X_idx_sorted = None
-        if presort:
-            X_idx_sorted = numpy.asfortranarray(numpy.argsort(X, axis=0),
-                                                dtype=numpy.int32)

         # fit the boosting stages
         y = numpy.fromiter(zip(event, time), dtype=[('event', numpy.bool), ('time', numpy.float64)])
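
For illustration, a minimal sketch of the new behaviour in this file, assuming this branch is installed; the survival data below is synthetic and made up for the example. ccp_alpha is forwarded to the underlying regression trees, and passing any explicit value for presort now only emits a DeprecationWarning:

import warnings

import numpy
from sksurv.ensemble import GradientBoostingSurvivalAnalysis

# synthetic right-censored survival data, for illustration only
rng = numpy.random.RandomState(0)
X = rng.normal(size=(200, 5))
event = rng.binomial(1, 0.7, size=200).astype(bool)
time = rng.exponential(scale=10., size=200)
y = numpy.fromiter(zip(event, time), dtype=[('event', bool), ('time', float)])

# each base learner is pruned with minimal cost-complexity pruning
est = GradientBoostingSurvivalAnalysis(n_estimators=50, ccp_alpha=0.01,
                                       random_state=0)
est.fit(X, y)

# 'presort' is still accepted, but any explicit value triggers a warning
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    GradientBoostingSurvivalAnalysis(n_estimators=10, presort=True).fit(X, y)
assert any(issubclass(w.category, DeprecationWarning) for w in caught)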

sksurv/ensemble/forest.py

Lines changed: 28 additions & 7 deletions
@@ -2,8 +2,8 @@
 import warnings
 import numpy as np
 from sklearn.ensemble.base import _partition_estimators
-from sklearn.ensemble.forest import BaseForest, _accumulate_prediction, \
-    _generate_unsampled_indices, _parallel_build_trees
+from sklearn.ensemble._forest import BaseForest, _accumulate_prediction, \
+    _generate_unsampled_indices, _get_n_samples_bootstrap, _parallel_build_trees
 from sklearn.tree._tree import DTYPE
 from sklearn.utils._joblib import Parallel, delayed
 from sklearn.utils.fixes import _joblib_parallel_args

@@ -117,6 +117,11 @@ class RandomSurvivalForest(BaseForest, SurvivalAnalysisMixin):
         and add more estimators to the ensemble, otherwise, just fit a whole
         new forest.

+    ccp_alpha : non-negative float, optional, default: 0.0.
+        Complexity parameter used for Minimal Cost-Complexity Pruning. The
+        subtree with the largest cost complexity that is smaller than
+        ``ccp_alpha`` will be chosen. By default, no pruning is performed.
+
     Attributes
     ----------
     estimators_ : list of SurvivalTree instances

@@ -177,7 +182,9 @@ def __init__(self,
                  n_jobs=None,
                  random_state=None,
                  verbose=0,
-                 warm_start=False):
+                 warm_start=False,
+                 ccp_alpha=0.0,
+                 max_samples=None):
         super().__init__(
             base_estimator=SurvivalTree(),
             n_estimators=n_estimators,

@@ -187,20 +194,23 @@ def __init__(self,
                               "min_weight_fraction_leaf",
                               "max_features",
                               "max_leaf_nodes",
-                              "random_state"),
+                              "random_state",
+                              "ccp_alpha"),
             bootstrap=bootstrap,
             oob_score=oob_score,
             n_jobs=n_jobs,
             random_state=random_state,
             verbose=verbose,
-            warm_start=warm_start)
+            warm_start=warm_start,
+            max_samples=max_samples)

         self.max_depth = max_depth
         self.min_samples_split = min_samples_split
         self.min_samples_leaf = min_samples_leaf
         self.min_weight_fraction_leaf = min_weight_fraction_leaf
         self.max_features = max_features
         self.max_leaf_nodes = max_leaf_nodes
+        self.ccp_alpha = ccp_alpha

     @property
     def feature_importances_(self):

@@ -234,6 +244,12 @@ def fit(self, X, y, sample_weight=None):
         y_numeric[:, 0] = time.astype(np.float64)
         y_numeric[:, 1] = event.astype(np.float64)

+        # Get bootstrap sample size
+        n_samples_bootstrap = _get_n_samples_bootstrap(
+            n_samples=X.shape[0],
+            max_samples=self.max_samples
+        )
+
         # Check parameters
         self._validate_estimator()

@@ -277,7 +293,8 @@ def fit(self, X, y, sample_weight=None):
             **_joblib_parallel_args(prefer='threads'))(
             delayed(_parallel_build_trees)(
                 t, self, X, (y_numeric, self.event_times_), sample_weight, i, len(trees),
-                verbose=self.verbose)
+                verbose=self.verbose,
+                n_samples_bootstrap=n_samples_bootstrap)
             for i, t in enumerate(trees))

         # Collect newly grown trees

@@ -298,9 +315,13 @@ def _set_oob_score(self, X, y):
         predictions = np.zeros(n_samples)
         n_predictions = np.zeros(n_samples)

+        n_samples_bootstrap = _get_n_samples_bootstrap(
+            n_samples, self.max_samples
+        )
+
         for estimator in self.estimators_:
             unsampled_indices = _generate_unsampled_indices(
-                estimator.random_state, n_samples)
+                estimator.random_state, n_samples, n_samples_bootstrap)
             p_estimator = estimator.predict(
                 X[unsampled_indices, :], check_input=False)
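
A minimal sketch of the two new forest parameters, again assuming this branch is installed and using made-up data: max_samples limits the size of each bootstrap sample (a float is treated as a fraction of the training set by _get_n_samples_bootstrap), and ccp_alpha is passed through to every SurvivalTree:

import numpy
from sksurv.ensemble import RandomSurvivalForest

# synthetic right-censored survival data, for illustration only
rng = numpy.random.RandomState(0)
X = rng.normal(size=(200, 5))
event = rng.binomial(1, 0.7, size=200).astype(bool)
time = rng.exponential(scale=10., size=200)
y = numpy.fromiter(zip(event, time), dtype=[('event', bool), ('time', float)])

# each tree is grown on a bootstrap sample of at most half the data;
# out-of-bag scoring uses the same bootstrap size
rsf = RandomSurvivalForest(n_estimators=20, max_samples=0.5, ccp_alpha=0.01,
                           oob_score=True, random_state=0)
rsf.fit(X, y)
print(rsf.oob_score_)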

sksurv/tree/tree.py

Lines changed: 75 additions & 20 deletions
@@ -1,11 +1,15 @@
 from math import ceil
 import numbers
+import warnings
 import numpy as np
-from sklearn.base import BaseEstimator
+from sklearn.base import BaseEstimator, clone
 from sklearn.tree import _tree
 from sklearn.tree._splitter import Splitter
 from sklearn.tree._tree import BestFirstTreeBuilder, DepthFirstTreeBuilder, Tree
+from sklearn.tree._tree import _build_pruned_tree_ccp
+from sklearn.tree._tree import ccp_pruning_path
 from sklearn.tree.tree import DENSE_SPLITTERS
+from sklearn.utils import Bunch
 from sklearn.utils.validation import check_array, check_is_fitted, check_random_state

 from ..base import SurvivalAnalysisMixin

@@ -89,12 +93,13 @@ class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
         Best nodes are defined as relative reduction in impurity.
         If None then unlimited number of leaf nodes.

-    presort : bool, optional, default: False
-        Whether to presort the data to speed up the finding of best splits in
-        fitting. For the default settings of a decision tree on large
-        datasets, setting this to true may slow down the training process.
-        When using either a smaller dataset or a restricted depth, this may
-        speed up the training.
+    presort : deprecated, optional, default: 'deprecated'
+        This parameter is deprecated and will be removed in a future version.
+
+    ccp_alpha : non-negative float, optional, default: 0.0.
+        Complexity parameter used for Minimal Cost-Complexity Pruning. The
+        subtree with the largest cost complexity that is smaller than
+        ``ccp_alpha`` will be chosen. By default, no pruning is performed.

     Attributes
     ----------

@@ -132,7 +137,8 @@ def __init__(self,
                  max_features=None,
                  random_state=None,
                  max_leaf_nodes=None,
-                 presort=False):
+                 presort='deprecated',
+                 ccp_alpha=0.0):
         self.splitter = splitter
         self.max_depth = max_depth
         self.min_samples_split = min_samples_split

@@ -142,6 +148,7 @@ def __init__(self,
         self.random_state = random_state
         self.max_leaf_nodes = max_leaf_nodes
         self.presort = presort
+        self.ccp_alpha = ccp_alpha

     def fit(self, X, y, sample_weight=None, check_input=True,
             X_idx_sorted=None):

@@ -186,10 +193,6 @@ def fit(self, X, y, sample_weight=None, check_input=True,
         n_samples, self.n_features_ = X.shape
         params = self._check_params(n_samples)

-        if params["presort"]:
-            X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0),
-                                             dtype=np.int32)
-
         self.n_outputs_ = self.event_times_.shape[0]
         # one "class" for CHF, one for survival function
         self.n_classes_ = np.ones(self.n_outputs_, dtype=np.intp) * 2

@@ -204,8 +207,7 @@ def fit(self, X, y, sample_weight=None, check_input=True,
                              self.max_features_,
                              params["min_samples_leaf"],
                              params["min_weight_leaf"],
-                             random_state,
-                             self.presort)
+                             random_state)

         self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_)

@@ -230,8 +232,59 @@ def fit(self, X, y, sample_weight=None, check_input=True,
         builder.build(self.tree_, X, y_numeric, sample_weight, X_idx_sorted)

+        self._prune_tree()
+
         return self

+    def _prune_tree(self):
+        """Prune tree using Minimal Cost-Complexity Pruning."""
+        check_is_fitted(self)
+
+        if self.ccp_alpha < 0.0:
+            raise ValueError("ccp_alpha must be greater than or equal to 0")
+
+        if self.ccp_alpha == 0.0:
+            return
+
+        # build pruned tree
+        n_classes = np.atleast_1d(self.n_classes_)
+        pruned_tree = Tree(self.n_features_, n_classes, self.n_outputs_)
+        _build_pruned_tree_ccp(pruned_tree, self.tree_, self.ccp_alpha)
+
+        self.tree_ = pruned_tree
+
+    def cost_complexity_pruning_path(self, X, y, sample_weight=None):
+        """Compute the pruning path during Minimal Cost-Complexity Pruning.
+
+        See :ref:`minimal_cost_complexity_pruning` for details on the
+        pruning process.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The training input samples. Internally, it will be converted to
+            ``dtype=np.float32`` and if a sparse matrix is provided
+            to a sparse ``csc_matrix``.
+
+        y : structured array, shape = (n_samples,)
+            A structured array containing the binary event indicator
+            as first field, and time of event or time of censoring as
+            second field.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights. If None, then samples are equally weighted. Splits
+            that would create child nodes with net zero or negative weight are
+            ignored while searching for a split in each node.
+
+        Returns
+        -------
+        ccp_path : Bunch
+            Dictionary-like object, with attributes:
+
+            ccp_alphas : ndarray
+                Effective alphas of subtree during pruning.
+
+            impurities : ndarray
+                Sum of the impurities of the subtree leaves for the
+                corresponding alpha value in ``ccp_alphas``.
+        """
+        est = clone(self).set_params(ccp_alpha=0.0)
+        est.fit(X, y, sample_weight=sample_weight)
+        return Bunch(**ccp_pruning_path(est.tree_))
+
     def _check_params(self, n_samples):
         # Check parameters
         max_depth = ((2 ** 31) - 1 if self.max_depth is None

@@ -252,11 +305,14 @@ def _check_params(self, n_samples):
         min_weight_leaf = self.min_weight_fraction_leaf * n_samples
         min_impurity_split = 1e-7

-        allowed_presort = ('auto', True, False)
-        if self.presort not in allowed_presort:
-            raise ValueError("'presort' should be in {}. Got {!r} instead."
-                             .format(allowed_presort, self.presort))
-        presort = True if self.presort == 'auto' else self.presort
+        if self.presort != 'deprecated':
+            warnings.warn("The parameter 'presort' is deprecated and has no "
+                          "effect. It will be removed in v0.24. You can "
+                          "suppress this warning by not passing any value "
+                          "to the 'presort' parameter.", DeprecationWarning)
+
+        if self.ccp_alpha < 0.0:
+            raise ValueError("ccp_alpha must be greater than or equal to 0")

         return {
             "max_depth": max_depth,

@@ -265,7 +321,6 @@ def _check_params(self, n_samples):
             "min_samples_split": min_samples_split,
             "min_impurity_split": min_impurity_split,
             "min_weight_leaf": min_weight_leaf,
-            "presort": presort,
         }

     def _check_max_leaf_nodes(self):
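
The new cost_complexity_pruning_path mirrors the scikit-learn 0.22 API for decision trees. A hedged sketch of how it could be used to choose ccp_alpha, assuming this branch is installed and using the same kind of synthetic data as above:

import numpy
from sksurv.tree import SurvivalTree

# synthetic right-censored survival data, for illustration only
rng = numpy.random.RandomState(0)
X = rng.normal(size=(200, 5))
event = rng.binomial(1, 0.7, size=200).astype(bool)
time = rng.exponential(scale=10., size=200)
y = numpy.fromiter(zip(event, time), dtype=[('event', bool), ('time', float)])

# effective alphas and total leaf impurities along the pruning path
path = SurvivalTree().cost_complexity_pruning_path(X, y)

# larger alphas prune more aggressively and yield smaller trees
for alpha in path.ccp_alphas:
    tree = SurvivalTree(ccp_alpha=alpha).fit(X, y)
    print("%.4f -> %d nodes" % (alpha, tree.tree_.node_count))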
