FEA Add missing-value support for ExtraTreeClassifier and ExtraTreeRegressor · scikit-learn/scikit-learn@dddf2f0

Commit dddf2f0

Authored by Adam Li (adam2392)
FEA Add missing-value support for ExtraTreeClassifier and ExtraTreeRegressor (#27966)
1 parent 4bdd398 commit dddf2f0

File tree

7 files changed: +286 -63 lines

doc/modules/tree.rst

+28 -2
@@ -579,11 +579,21 @@ Note that it fits much slower than the MSE criterion.
 Missing Values Support
 ======================
 
-:class:`DecisionTreeClassifier` and :class:`DecisionTreeRegressor`
-have built-in support for missing values when `splitter='best'` and criterion is
+:class:`DecisionTreeClassifier` and :class:`DecisionTreeRegressor`
+have built-in support for missing values using `splitter='best'`, where
+the splits are determined in a greedy fashion.
+:class:`ExtraTreeClassifier` and :class:`ExtraTreeRegressor` have built-in
+support for missing values with `splitter='random'`, where the splits
+are determined randomly. For more details on how the splitters differ on
+non-missing values, see the :ref:`Forest section <forest>`.
+
+The criteria supported when there are missing values are
 `'gini'`, `'entropy'`, or `'log_loss'` for classification, or
 `'squared_error'`, `'friedman_mse'`, or `'poisson'` for regression.
 
+First we will describe how :class:`DecisionTreeClassifier` and
+:class:`DecisionTreeRegressor` handle missing values in the data.
+
 For each potential threshold on the non-missing data, the splitter will evaluate
 the split with all the missing values going to the left node or the right node.
@@ -634,6 +644,22 @@ Decisions are made as follows:
     >>> tree.predict(X_test)
     array([1])
 
+:class:`ExtraTreeClassifier` and :class:`ExtraTreeRegressor` handle missing values
+in a slightly different way. When splitting a node, a random threshold will be chosen
+to split the non-missing values on. Then the non-missing values will be sent to the
+left and right child based on the randomly selected threshold, while the missing
+values will also be randomly sent to the left or right child. This is repeated for
+every feature considered at each split. The best split among these is chosen.
+
+During prediction, the treatment of missing values is the same as that of the
+decision tree:
+
+- By default when predicting, the samples with missing values are classified
+  with the class used in the split found during training.
+
+- If no missing values are seen during training for a given feature, then during
+  prediction missing values are mapped to the child with the most samples.
+
 .. _minimal_cost_complexity_pruning:
 
 Minimal Cost-Complexity Pruning
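
The documented behavior can be checked end to end. Below is a minimal sketch, assuming a scikit-learn build that includes this commit; the toy data is illustrative only:

import numpy as np
from sklearn.tree import ExtraTreeClassifier

# One feature with a missing entry; ExtraTree* uses splitter='random'.
X = np.array([[0.0], [1.0], [2.0], [np.nan]])
y = np.array([0, 0, 1, 1])

# During fit, the NaN is sent to a randomly chosen child at each candidate split.
tree = ExtraTreeClassifier(random_state=0).fit(X, y)

# During predict, a NaN follows the side recorded for the chosen split.
print(tree.predict(np.array([[np.nan]])))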

doc/whats_new/v1.6.rst

+9
@@ -204,6 +204,15 @@ Changelog
   deprecated the `base_estimator` parameter in favor of `estimator`.
   :pr:`28494` by :user:`Adam Li <adam2392>`.
 
+:mod:`sklearn.tree`
+...................
+
+- |Feature| :class:`tree.ExtraTreeClassifier` and :class:`tree.ExtraTreeRegressor` now
+  support missing values in the data matrix ``X``. Missing values are handled by
+  randomly moving all of the samples with missing values to the left or right child
+  node as the tree is traversed.
+  :pr:`27966` by :user:`Adam Li <adam2392>`.
+
 Thanks to everyone who has contributed to the maintenance and improvement of
 the project since version 1.5, including:

sklearn/ensemble/_iforest.py

+12 -3
@@ -315,7 +315,9 @@ def fit(self, X, y=None, sample_weight=None):
         self : object
             Fitted estimator.
         """
-        X = self._validate_data(X, accept_sparse=["csc"], dtype=tree_dtype)
+        X = self._validate_data(
+            X, accept_sparse=["csc"], dtype=tree_dtype, force_all_finite=False
+        )
         if issparse(X):
             # Pre-sort indices to avoid that each individual tree of the
             # ensemble sorts the indices.
@@ -515,7 +517,13 @@ def score_samples(self, X):
             model.score(X)
         """
         # Check data
-        X = self._validate_data(X, accept_sparse="csr", dtype=tree_dtype, reset=False)
+        X = self._validate_data(
+            X,
+            accept_sparse="csr",
+            dtype=tree_dtype,
+            reset=False,
+            force_all_finite=False,
+        )
 
         return self._score_samples(X)

@@ -627,7 +635,8 @@ def _more_tags(self):
                 "check_sample_weights_invariance": (
                     "zero sample_weight is not equivalent to removing samples"
                 ),
-            }
+            },
+            "allow_nan": True,
         }
 
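Because IsolationForest is built on randomized trees, relaxing the finiteness check above lets it accept NaN directly. A usage sketch, assuming a build with this change and purely illustrative data:

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 2))
X[::10, 0] = np.nan  # inject missing values into one feature

# With force_all_finite=False in fit/score_samples, NaNs pass validation and
# are routed by the underlying extra-trees splitter.
iso = IsolationForest(random_state=0).fit(X)
print(iso.score_samples(X)[:5])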
sklearn/tree/_classes.py

+2 -2
@@ -1074,7 +1074,7 @@ def predict_log_proba(self, X):
     def _more_tags(self):
         # XXX: NaN is only supported for dense arrays, but we set this for the
         # common test to pass, specifically: check_estimators_nan_inf
-        allow_nan = self.splitter == "best" and self.criterion in {
+        allow_nan = self.splitter in ("best", "random") and self.criterion in {
             "gini",
             "log_loss",
             "entropy",
@@ -1405,7 +1405,7 @@ def _compute_partial_dependence_recursion(self, grid, target_features):
     def _more_tags(self):
         # XXX: NaN is only supported for dense arrays, but we set this for the
         # common test to pass, specifically: check_estimators_nan_inf
-        allow_nan = self.splitter == "best" and self.criterion in {
+        allow_nan = self.splitter in ("best", "random") and self.criterion in {
             "squared_error",
             "friedman_mse",
             "poisson",

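A pure-Python restatement of the gating above may help; tree_allows_nan is a hypothetical helper written for this note, not part of the sklearn API:

def tree_allows_nan(splitter, criterion, is_classifier):
    """Mirror of the _more_tags logic: NaN support is only advertised for
    splitter/criterion pairs that implement it."""
    supported = (
        {"gini", "log_loss", "entropy"}
        if is_classifier
        else {"squared_error", "friedman_mse", "poisson"}
    )
    return splitter in ("best", "random") and criterion in supported

assert tree_allows_nan("random", "gini", True)             # new in this commit
assert not tree_allows_nan("random", "absolute_error", False)
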
sklearn/tree/_splitter.pyx

+133 -22
@@ -19,6 +19,9 @@ from scipy.sparse import issparse
 
 cdef float64_t INFINITY = np.inf
 
+# Allow for 32 bit float comparisons
+cdef float32_t INFINITY_32t = np.inf
+
 # Mitigate precision differences between 32 bit and 64 bit
 cdef float32_t FEATURE_THRESHOLD = 1e-7
 
@@ -479,6 +482,10 @@ cdef inline int node_split_best(
                 current_split.threshold = feature_values[p_prev]
 
                 current_split.n_missing = n_missing
+
+                # If there are no missing values in the training data, then at
+                # test time we send missing values to the branch that contained
+                # the most training samples.
                 if n_missing == 0:
                     current_split.missing_go_to_left = n_left > n_right
                 else:
@@ -680,7 +687,13 @@ cdef inline int node_split_random(
     # Draw random splits and pick the best
    cdef intp_t start = splitter.start
     cdef intp_t end = splitter.end
+    cdef intp_t end_non_missing
+    cdef intp_t n_missing = 0
+    cdef bint has_missing = 0
+    cdef intp_t n_left, n_right
+    cdef bint missing_go_to_left
 
+    cdef intp_t[::1] samples = splitter.samples
     cdef intp_t[::1] features = splitter.features
     cdef intp_t[::1] constant_features = splitter.constant_features
     cdef intp_t n_features = splitter.n_features
@@ -758,12 +771,22 @@ cdef inline int node_split_random(
 
         current_split.feature = features[f_j]
 
-        # Find min, max
+        # Find min, max as we will randomly select a threshold between them
         partitioner.find_min_max(
             current_split.feature, &min_feature_value, &max_feature_value
         )
+        n_missing = partitioner.n_missing
+        end_non_missing = end - n_missing
 
-        if max_feature_value <= min_feature_value + FEATURE_THRESHOLD:
+        if (
+            # All values for this feature are missing, or
+            end_non_missing == start or
+            # This feature is considered constant (max - min <= FEATURE_THRESHOLD)
+            max_feature_value <= min_feature_value + FEATURE_THRESHOLD
+        ):
+            # We consider this feature constant in this case.
+            # Since finding a split with a constant feature is not valuable,
+            # we do not consider this feature for splitting.
             features[f_j], features[n_total_constants] = features[n_total_constants], current_split.feature
 
             n_found_constants += 1
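
The new guard treats a feature as unsplittable when no non-missing values remain or when the observed range is numerically negligible. A schematic Python equivalent (function name is illustrative, not from the source):

FEATURE_THRESHOLD = 1e-7  # same tolerance as in the splitter

def feature_is_unsplittable(start, end_non_missing, min_val, max_val):
    # Either every value in [start, end) is missing, or the non-missing
    # values span a negligible range: skip this feature as constant.
    return end_non_missing == start or max_val <= min_val + FEATURE_THRESHOLD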
@@ -772,6 +795,8 @@ cdef inline int node_split_random(
 
         f_i -= 1
         features[f_i], features[f_j] = features[f_j], features[f_i]
+        has_missing = n_missing != 0
+        criterion.init_missing(n_missing)
 
         # Draw a random threshold
         current_split.threshold = rand_uniform(
@@ -780,15 +805,38 @@ cdef inline int node_split_random(
             random_state,
         )
 
+        if has_missing:
+            # If there are missing values, then we randomly make all missing
+            # values go to the right or left.
+            #
+            # Note: compared to the BestSplitter, we do not evaluate the
+            # edge case where all the missing values go to the right node
+            # and the non-missing values go to the left node. This is because
+            # this would indicate a threshold outside of the observed range
+            # of the feature. However, it is not clear how much probability
+            # weight should be given to this edge case.
+            missing_go_to_left = rand_int(0, 2, random_state)
+        else:
+            missing_go_to_left = 0
+        criterion.missing_go_to_left = missing_go_to_left
+
         if current_split.threshold == max_feature_value:
             current_split.threshold = min_feature_value
 
         # Partition
-        current_split.pos = partitioner.partition_samples(current_split.threshold)
+        current_split.pos = partitioner.partition_samples(
+            current_split.threshold
+        )
+
+        if missing_go_to_left:
+            n_left = current_split.pos - start + n_missing
+            n_right = end_non_missing - current_split.pos
+        else:
+            n_left = current_split.pos - start
+            n_right = end_non_missing - current_split.pos + n_missing
 
         # Reject if min_samples_leaf is not guaranteed
-        if (((current_split.pos - start) < min_samples_leaf) or
-                ((end - current_split.pos) < min_samples_leaf)):
+        if n_left < min_samples_leaf or n_right < min_samples_leaf:
             continue
 
         # Evaluate split
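
In NumPy terms, the child-size bookkeeping above could be sketched as follows; this is a standalone illustration, not the Cython implementation:

import numpy as np

def random_split_counts(values, threshold, rng):
    """Count left/right child sizes when all NaNs go to one random side."""
    values = np.asarray(values, dtype=float)
    missing = np.isnan(values)
    n_missing = int(missing.sum())
    n_left_non_missing = int((values[~missing] <= threshold).sum())
    n_non_missing = values.size - n_missing

    missing_go_to_left = bool(rng.integers(0, 2))  # the coin flip above
    n_left = n_left_non_missing + (n_missing if missing_go_to_left else 0)
    n_right = (n_non_missing - n_left_non_missing) + (0 if missing_go_to_left else n_missing)
    return n_left, n_right, missing_go_to_left

rng = np.random.default_rng(0)
print(random_split_counts([0.1, 0.4, np.nan, 0.9], threshold=0.5, rng=rng))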
@@ -817,26 +865,44 @@ cdef inline int node_split_random(
             current_proxy_improvement = criterion.proxy_impurity_improvement()
 
             if current_proxy_improvement > best_proxy_improvement:
+                current_split.n_missing = n_missing
+
+                # If there are no missing values in the training data, then at
+                # test time we send missing values to the branch that contained
+                # the most training samples.
+                if has_missing:
+                    current_split.missing_go_to_left = missing_go_to_left
+                else:
+                    current_split.missing_go_to_left = n_left > n_right
+
                 best_proxy_improvement = current_proxy_improvement
                 best_split = current_split  # copy
 
     # Reorganize into samples[start:best.pos] + samples[best.pos:end]
     if best_split.pos < end:
         if current_split.feature != best_split.feature:
-            # TODO: Pass in best.n_missing when random splitter supports missing values.
             partitioner.partition_samples_final(
-                best_split.pos, best_split.threshold, best_split.feature, 0
+                best_split.pos,
+                best_split.threshold,
+                best_split.feature,
+                best_split.n_missing
             )
+        criterion.init_missing(best_split.n_missing)
+        criterion.missing_go_to_left = best_split.missing_go_to_left
 
         criterion.reset()
         criterion.update(best_split.pos)
         criterion.children_impurity(
             &best_split.impurity_left, &best_split.impurity_right
         )
         best_split.improvement = criterion.impurity_improvement(
-            impurity, best_split.impurity_left, best_split.impurity_right
+            impurity,
+            best_split.impurity_left,
+            best_split.impurity_right
         )
 
+        shift_missing_values_to_left_if_required(&best_split, samples, end)
+
     # Respect invariant for constant features: the original order of
     # elements in features[:n_known_constants] must be preserved for sibling
     # and child nodes
@@ -941,29 +1007,68 @@ cdef class DensePartitioner:
         float32_t* min_feature_value_out,
         float32_t* max_feature_value_out,
     ) noexcept nogil:
-        """Find the minimum and maximum value for current_feature."""
+        """Find the minimum and maximum value for current_feature.
+
+        Missing values are stored at the end of feature_values.
+        The number of missing values observed in feature_values is stored
+        in self.n_missing.
+        """
         cdef:
-            intp_t p
+            intp_t p, current_end
             float32_t current_feature_value
             const float32_t[:, :] X = self.X
             intp_t[::1] samples = self.samples
-            float32_t min_feature_value = X[samples[self.start], current_feature]
-            float32_t max_feature_value = min_feature_value
+            float32_t min_feature_value = INFINITY_32t
+            float32_t max_feature_value = -INFINITY_32t
             float32_t[::1] feature_values = self.feature_values
+            intp_t n_missing = 0
+            const unsigned char[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask
 
-        feature_values[self.start] = min_feature_value
+        # We are copying the values into an array and finding the min/max of
+        # the array in a manner which utilizes the cache more effectively.
+        # We also need to count the number of missing values present.
+        if missing_values_in_feature_mask is not None and missing_values_in_feature_mask[current_feature]:
+            p, current_end = self.start, self.end - 1
+            # Missing values are placed at the end and do not participate in the
+            # min/max calculation.
+            while p <= current_end:
+                # Find the right-most value that is not missing so that
+                # it can be swapped with missing values towards its left.
+                if isnan(X[samples[current_end], current_feature]):
+                    n_missing += 1
+                    current_end -= 1
+                    continue
 
-        for p in range(self.start + 1, self.end):
-            current_feature_value = X[samples[p], current_feature]
-            feature_values[p] = current_feature_value
+                # X[samples[current_end], current_feature] is a non-missing value
+                if isnan(X[samples[p], current_feature]):
+                    samples[p], samples[current_end] = samples[current_end], samples[p]
+                    n_missing += 1
+                    current_end -= 1
 
-            if current_feature_value < min_feature_value:
-                min_feature_value = current_feature_value
-            elif current_feature_value > max_feature_value:
-                max_feature_value = current_feature_value
+                current_feature_value = X[samples[p], current_feature]
+                feature_values[p] = current_feature_value
+                if current_feature_value < min_feature_value:
+                    min_feature_value = current_feature_value
+                elif current_feature_value > max_feature_value:
+                    max_feature_value = current_feature_value
+                p += 1
+        else:
+            min_feature_value = X[samples[self.start], current_feature]
+            max_feature_value = min_feature_value
+
+            feature_values[self.start] = min_feature_value
+            for p in range(self.start + 1, self.end):
+                current_feature_value = X[samples[p], current_feature]
+                feature_values[p] = current_feature_value
+
+                if current_feature_value < min_feature_value:
+                    min_feature_value = current_feature_value
+                elif current_feature_value > max_feature_value:
+                    max_feature_value = current_feature_value
 
         min_feature_value_out[0] = min_feature_value
         max_feature_value_out[0] = max_feature_value
+        self.n_missing = n_missing
 
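A vectorized Python sketch of the same contract, with the in-place two-pointer swap replaced by boolean masking (illustrative, not the Cython code):

import numpy as np

def find_min_max_with_missing(feature_column):
    """Return (min, max, n_missing) over non-missing entries, mirroring the
    contract of DensePartitioner.find_min_max."""
    col = np.asarray(feature_column, dtype=np.float32)
    missing = np.isnan(col)
    n_missing = int(missing.sum())
    finite = col[~missing]
    if finite.size == 0:
        # No non-missing values: the caller then treats the feature as constant.
        return np.float32(np.inf), np.float32(-np.inf), n_missing
    return finite.min(), finite.max(), n_missing

print(find_min_max_with_missing([3.0, np.nan, 1.0, 2.0]))  # (1.0, 3.0, 1)
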
     cdef inline void next_p(self, intp_t* p_prev, intp_t* p) noexcept nogil:
         """Compute the next p_prev and p for iterating over feature values.
@@ -986,7 +1091,10 @@ cdef class DensePartitioner:
         # (feature_values[p] >= end) or (feature_values[p] > feature_values[p - 1])
         p[0] += 1
 
-    cdef inline intp_t partition_samples(self, float64_t current_threshold) noexcept nogil:
+    cdef inline intp_t partition_samples(
+        self,
+        float64_t current_threshold
+    ) noexcept nogil:
         """Partition samples for feature_values at the current_threshold."""
         cdef:
             intp_t p = self.start
@@ -1233,7 +1341,10 @@ cdef class SparsePartitioner:
         p_prev[0] = p[0]
         p[0] = p_next
 
-    cdef inline intp_t partition_samples(self, float64_t current_threshold) noexcept nogil:
+    cdef inline intp_t partition_samples(
+        self,
+        float64_t current_threshold
+    ) noexcept nogil:
         """Partition samples for feature_values at the current_threshold."""
         return self._partition(current_threshold, self.start_positive)
