From f55dc4733a26a2b9142db79e05812841d332fc2f Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 24 Oct 2022 11:54:26 -0400
Subject: [PATCH 1/2] Refactor Python tree class

---
 sklearn/tree/_classes.py | 216 +++++++++++++++++++++------------------
 1 file changed, 119 insertions(+), 97 deletions(-)

diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
index e2e41f9aea78b..615b9e2d9ec9d 100644
--- a/sklearn/tree/_classes.py
+++ b/sklearn/tree/_classes.py
@@ -173,79 +173,80 @@ def get_n_leaves(self):
         check_is_fitted(self)
         return self.tree_.n_leaves
 
-    def fit(self, X, y, sample_weight=None, check_input=True):
+    def fit(self, X, y=None, sample_weight=None, check_input=True):
 
         self._validate_params()
         random_state = check_random_state(self.random_state)
 
-        if check_input:
-            # Need to validate separately here.
-            # We can't pass multi_output=True because that would allow y to be
-            # csr.
-            check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
-            check_y_params = dict(ensure_2d=False, dtype=None)
-            X, y = self._validate_data(
-                X, y, validate_separately=(check_X_params, check_y_params)
-            )
-            if issparse(X):
-                X.sort_indices()
-
-                if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
-                    raise ValueError(
-                        "No support for np.int64 index based sparse matrices"
-                    )
-
-            if self.criterion == "poisson":
-                if np.any(y < 0):
-                    raise ValueError(
-                        "Some value(s) of y are negative which is"
-                        " not allowed for Poisson regression."
-                    )
-                if np.sum(y) <= 0:
-                    raise ValueError(
-                        "Sum of y is not positive which is "
-                        "necessary for Poisson regression."
-                    )
-
-        # Determine output settings
-        n_samples, self.n_features_in_ = X.shape
-        is_classification = is_classifier(self)
-
-        y = np.atleast_1d(y)
-        expanded_class_weight = None
-
-        if y.ndim == 1:
-            # reshape is necessary to preserve the data contiguity against vs
-            # [:, np.newaxis] that does not.
-            y = np.reshape(y, (-1, 1))
-
-        self.n_outputs_ = y.shape[1]
+        if y is not None:
+            if check_input:
+                # Need to validate separately here.
+                # We can't pass multi_output=True because that would allow y to be
+                # csr.
+                check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
+                check_y_params = dict(ensure_2d=False, dtype=None)
+                X, y = self._validate_data(
+                    X, y, validate_separately=(check_X_params, check_y_params)
+                )
+                if issparse(X):
+                    X.sort_indices()
+
+                    if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
+                        raise ValueError(
+                            "No support for np.int64 index based sparse matrices"
+                        )
+
+                if self.criterion == "poisson":
+                    if np.any(y < 0):
+                        raise ValueError(
+                            "Some value(s) of y are negative which is"
+                            " not allowed for Poisson regression."
+                        )
+                    if np.sum(y) <= 0:
+                        raise ValueError(
+                            "Sum of y is not positive which is "
+                            "necessary for Poisson regression."
+                        )
+
+            # Determine output settings
+            n_samples, self.n_features_in_ = X.shape
+            is_classification = is_classifier(self)
+
+            y = np.atleast_1d(y)
+            expanded_class_weight = None
+
+            if y.ndim == 1:
+                # reshape is necessary to preserve the data contiguity against vs
+                # [:, np.newaxis] that does not.
+                y = np.reshape(y, (-1, 1))
+
+            self.n_outputs_ = y.shape[1]
 
-        if is_classification:
-            check_classification_targets(y)
-            y = np.copy(y)
+            if is_classification:
+                check_classification_targets(y)
+                y = np.copy(y)
 
-            self.classes_ = []
-            self.n_classes_ = []
+                self.classes_ = []
+                self.n_classes_ = []
 
-            if self.class_weight is not None:
-                y_original = np.copy(y)
+                if self.class_weight is not None:
+                    y_original = np.copy(y)
 
-            y_encoded = np.zeros(y.shape, dtype=int)
-            for k in range(self.n_outputs_):
-                classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True)
-                self.classes_.append(classes_k)
-                self.n_classes_.append(classes_k.shape[0])
-            y = y_encoded
-
-            if self.class_weight is not None:
-                expanded_class_weight = compute_sample_weight(
-                    self.class_weight, y_original
-                )
+                y_encoded = np.zeros(y.shape, dtype=int)
+                for k in range(self.n_outputs_):
+                    classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True)
+                    self.classes_.append(classes_k)
+                    self.n_classes_.append(classes_k.shape[0])
+                y = y_encoded
+
+                if self.class_weight is not None:
+                    expanded_class_weight = compute_sample_weight(
+                        self.class_weight, y_original
+                    )
 
-            self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)
+                self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)
 
-        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
-            y = np.ascontiguousarray(y, dtype=DOUBLE)
+            if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
+                y = np.ascontiguousarray(y, dtype=DOUBLE)
 
         max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth
 
@@ -320,40 +321,16 @@ def fit(self, X, y, sample_weight=None, check_input=True):
         min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight)
 
         # Build tree
-        criterion = self.criterion
-        if not isinstance(criterion, Criterion):
-            if is_classification:
-                criterion = CRITERIA_CLF[self.criterion](
-                    self.n_outputs_, self.n_classes_
-                )
-            else:
-                criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples)
-        else:
-            # Make a deepcopy in case the criterion has mutable attributes that
-            # might be shared and modified concurrently during parallel fitting
-            criterion = copy.deepcopy(criterion)
-
-        SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS
+        # set the Cython criterion functionality 
+        criterion = self._set_criterion(n_samples, is_classification)
 
-        splitter = self.splitter
-        if not isinstance(self.splitter, Splitter):
-            splitter = SPLITTERS[self.splitter](
-                criterion,
-                self.max_features_,
-                min_samples_leaf,
-                min_weight_leaf,
-                random_state,
-            )
-
-        if is_classifier(self):
-            self.tree_ = Tree(self.n_features_in_, self.n_classes_, self.n_outputs_)
-        else:
-            self.tree_ = Tree(
-                self.n_features_in_,
-                # TODO: tree shouldn't need this in this case
-                np.array([1] * self.n_outputs_, dtype=np.intp),
-                self.n_outputs_,
-            )
+        # set the Cython splitter functionality 
+        X_issparse = issparse(X)
+        splitter = self._set_splitter(X_issparse, criterion, min_samples_leaf, min_weight_leaf, random_state)
+
+        # set the Cython tree functionality 
+        tree = self._set_tree()
+        self.tree_ = tree
 
         # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
         if max_leaf_nodes < 0:
@@ -385,6 +362,47 @@ def fit(self, X, y, sample_weight=None, check_input=True):
         self._prune_tree()
 
         return self
+    
+    def _set_criterion(self, n_samples, is_classification):
+        criterion = self.criterion
+        if not isinstance(criterion, Criterion):
+            if is_classification:
+                criterion = CRITERIA_CLF[self.criterion](
+                    self.n_outputs_, self.n_classes_
+                )
+            else:
+                criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples)
+        else:
+            # Make a deepcopy in case the criterion has mutable attributes that
+            # might be shared and modified concurrently during parallel fitting
+            criterion = copy.deepcopy(criterion)
+        return criterion
+
+    def _set_tree(self):
+        if is_classifier(self):
+            tree = Tree(self.n_features_in_, self.n_classes_, self.n_outputs_)
+        else:
+            tree = Tree(
+                self.n_features_in_,
+                # TODO: tree shouldn't need this in this case
+                np.array([1] * self.n_outputs_, dtype=np.intp),
+                self.n_outputs_,
+            )
+        return tree
+
+    def _set_splitter(self, X_issparse, criterion, min_samples_leaf, min_weight_leaf, random_state):
+        SPLITTERS = SPARSE_SPLITTERS if X_issparse else DENSE_SPLITTERS
+
+        splitter = self.splitter
+        if not isinstance(self.splitter, Splitter):
+            splitter = SPLITTERS[self.splitter](
+                criterion,
+                self.max_features_,
+                min_samples_leaf,
+                min_weight_leaf,
+                random_state,
+            )
+        return splitter
 
     def _validate_X_predict(self, X, check_input):
         """Validate the training data on predict (probabilities)."""
@@ -587,6 +605,10 @@ def feature_importances_(self):
         return self.tree_.compute_feature_importances()
 
 
+class SupervisedDecisionTree(BaseDecisionTree):
+    def
+
+
 # =============================================================================
 # Public estimators
 # =============================================================================
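Patch 1 makes two separable changes. The first makes `y` optional in `BaseDecisionTree.fit`: all target validation and encoding now sits under `if y is not None:`, which is the entry point an unsupervised tree would need. A minimal standalone sketch of that guard pattern (the `TreeSketch` class and its attributes are illustrative only, not scikit-learn code):

import numpy as np

class TreeSketch:
    """Toy estimator showing the optional-y control flow of the patched fit()."""

    def fit(self, X, y=None):
        X = np.asarray(X, dtype=np.float64)
        self.n_features_in_ = X.shape[1]
        if y is not None:
            # Supervised path: validate and reshape targets, as the patch
            # does inside its ``if y is not None:`` block.
            y = np.atleast_1d(np.asarray(y))
            if y.ndim == 1:
                y = y.reshape(-1, 1)
            self.n_outputs_ = y.shape[1]
        else:
            # Unsupervised path: nothing to validate or encode.
            self.n_outputs_ = 1
        return self

# Both call styles are now accepted:
TreeSketch().fit([[0.0], [1.0]], y=[0, 1])
TreeSketch().fit([[0.0], [1.0]])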
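The second change is an extract-method refactor: the inline construction of the Cython criterion, splitter and tree objects moves into the overridable helpers `_set_criterion`, `_set_splitter` and `_set_tree`, turning `fit` into a template method. The shape of the result in miniature (all names below are illustrative stand-ins for the Cython classes, not the scikit-learn API):

class BaseTreeSketch:
    """fit() is the template; subclasses override the _set_* factories."""

    def fit(self, X, y=None):
        criterion = self._set_criterion()
        splitter = self._set_splitter(criterion)
        self.tree_ = self._set_tree()
        # ... grow self.tree_ using splitter ...
        return self

    def _set_criterion(self):
        return "gini"

    def _set_splitter(self, criterion):
        return ("best", criterion)

    def _set_tree(self):
        return []

class UnsupervisedTreeSketch(BaseTreeSketch):
    # Only the criterion changes; fit() and the other factories are reused,
    # which is the payoff of the refactor. "variance" is a made-up criterion.
    def _set_criterion(self):
        return "variance"

Note that patch 1 also leaves a syntactically incomplete `class SupervisedDecisionTree(BaseDecisionTree):` stub at the bottom of the file; patch 2 below removes it.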
From e7ffb86884487a00db96bf6f4bf540783cd8bc1f Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 24 Oct 2022 17:07:13 -0400
Subject: [PATCH 2/2] Fix flake8

---
 sklearn/tree/_classes.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
index 615b9e2d9ec9d..81ce686af25bc 100644
--- a/sklearn/tree/_classes.py
+++ b/sklearn/tree/_classes.py
@@ -321,14 +321,16 @@ def fit(self, X, y=None, sample_weight=None, check_input=True):
         min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight)
 
         # Build tree
-        # set the Cython criterion functionality 
+        # set the Cython criterion functionality
         criterion = self._set_criterion(n_samples, is_classification)
 
-        # set the Cython splitter functionality 
+        # set the Cython splitter functionality
         X_issparse = issparse(X)
-        splitter = self._set_splitter(X_issparse, criterion, min_samples_leaf, min_weight_leaf, random_state)
-
-        # set the Cython tree functionality 
+        splitter = self._set_splitter(
+            X_issparse, criterion, min_samples_leaf, min_weight_leaf, random_state
+        )
+
+        # set the Cython tree functionality
         tree = self._set_tree()
         self.tree_ = tree
 
@@ -362,7 +364,7 @@ def fit(self, X, y=None, sample_weight=None, check_input=True):
         self._prune_tree()
 
         return self
-    
+
     def _set_criterion(self, n_samples, is_classification):
         criterion = self.criterion
         if not isinstance(criterion, Criterion):
@@ -390,7 +392,9 @@ def _set_tree(self):
         )
         return tree
 
-    def _set_splitter(self, X_issparse, criterion, min_samples_leaf, min_weight_leaf, random_state):
+    def _set_splitter(
+        self, X_issparse, criterion, min_samples_leaf, min_weight_leaf, random_state
+    ):
         SPLITTERS = SPARSE_SPLITTERS if X_issparse else DENSE_SPLITTERS
 
         splitter = self.splitter
@@ -605,10 +609,6 @@ def feature_importances_(self):
         return self.tree_.compute_feature_importances()
 
 
-class SupervisedDecisionTree(BaseDecisionTree):
-    def
-
-
 # =============================================================================
 # Public estimators
 # =============================================================================
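Patch 2 only reflows what patch 1 added (whitespace-only fixes to the new comments, re-wrapping the over-long `_set_splitter` signature and call site) and deletes the incomplete `SupervisedDecisionTree` stub, so the series as a whole is intended to be behaviour-preserving. A quick characterization check one might run on a build with both patches applied (this snippet is not part of the series):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
clf = DecisionTreeClassifier(random_state=0).fit(X, y)

# An unpruned tree reproduces the training labels on iris, and the
# public attributes set during fit() keep their usual values.
assert clf.score(X, y) == 1.0
assert clf.n_features_in_ == 4
assert clf.n_outputs_ == 1
assert clf.n_classes_ == 3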