@@ -19,6 +19,9 @@ from scipy.sparse import issparse
 cdef float64_t INFINITY = np.inf

+# Allow for 32 bit float comparisons
+cdef float32_t INFINITY_32t = np.inf
+
 # Mitigate precision differences between 32 bit and 64 bit
 cdef float32_t FEATURE_THRESHOLD = 1e-7
@@ -479,6 +482,10 @@ cdef inline int node_split_best(
                     current_split.threshold = feature_values[p_prev]

                     current_split.n_missing = n_missing
+
+                    # If there are no missing values in the training data,
+                    # then at test time missing values are sent to the branch
+                    # that contained the most samples during training.
                     if n_missing == 0:
                         current_split.missing_go_to_left = n_left > n_right
                     else:
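
A minimal plain-Python sketch of the fallback rule described in the comment above (the function name is hypothetical, not part of scikit-learn's API): when no missing values were seen for this feature at fit time, test-time NaNs follow the larger child.

def default_missing_go_to_left(n_left, n_right):
    # No missing values observed during training: route test-time NaNs
    # to whichever child received more training samples.
    return n_left > n_right

assert default_missing_go_to_left(7, 3) is True
assert default_missing_go_to_left(2, 8) is False
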
@@ -680,7 +687,13 @@ cdef inline int node_split_random(
     # Draw random splits and pick the best
     cdef intp_t start = splitter.start
     cdef intp_t end = splitter.end
+    cdef intp_t end_non_missing
+    cdef intp_t n_missing = 0
+    cdef bint has_missing = 0
+    cdef intp_t n_left, n_right
+    cdef bint missing_go_to_left

+    cdef intp_t[::1] samples = splitter.samples
     cdef intp_t[::1] features = splitter.features
     cdef intp_t[::1] constant_features = splitter.constant_features
     cdef intp_t n_features = splitter.n_features
@@ -758,12 +771,22 @@ cdef inline int node_split_random(

         current_split.feature = features[f_j]

-        # Find min, max
+        # Find min, max as we will randomly select a threshold between them
         partitioner.find_min_max(
             current_split.feature, &min_feature_value, &max_feature_value
         )
+        n_missing = partitioner.n_missing
+        end_non_missing = end - n_missing

-        if max_feature_value <= min_feature_value + FEATURE_THRESHOLD:
+        if (
+            # All values for this feature are missing, or
+            end_non_missing == start or
+            # This feature is considered constant (max - min <= FEATURE_THRESHOLD)
+            max_feature_value <= min_feature_value + FEATURE_THRESHOLD
+        ):
+            # We consider this feature constant in this case.
+            # Since finding a split with a constant feature is not valuable,
+            # we do not consider this feature for splitting.
             features[f_j], features[n_total_constants] = features[n_total_constants], current_split.feature

             n_found_constants += 1
@@ -772,6 +795,8 @@ cdef inline int node_split_random(

         f_i -= 1
         features[f_i], features[f_j] = features[f_j], features[f_i]
+        has_missing = n_missing != 0
+        criterion.init_missing(n_missing)

         # Draw a random threshold
         current_split.threshold = rand_uniform(
@@ -780,15 +805,38 @@ cdef inline int node_split_random(
             random_state,
         )

+        if has_missing:
+            # If there are missing values, we randomly send all of them
+            # to either the left or the right child.
+            #
+            # Note: compared to the BestSplitter, we do not evaluate the
+            # edge case where all the missing values go to the right node
+            # and the non-missing values go to the left node, because that
+            # would correspond to a threshold outside the observed range of
+            # the feature. It is not clear how much probability weight
+            # should be given to this edge case.
+            missing_go_to_left = rand_int(0, 2, random_state)
+        else:
+            missing_go_to_left = 0
+        criterion.missing_go_to_left = missing_go_to_left
+
         if current_split.threshold == max_feature_value:
             current_split.threshold = min_feature_value

         # Partition
-        current_split.pos = partitioner.partition_samples(current_split.threshold)
+        current_split.pos = partitioner.partition_samples(
+            current_split.threshold
+        )
+
+        if missing_go_to_left:
+            n_left = current_split.pos - start + n_missing
+            n_right = end_non_missing - current_split.pos
+        else:
+            n_left = current_split.pos - start
+            n_right = end_non_missing - current_split.pos + n_missing

         # Reject if min_samples_leaf is not guaranteed
-        if (((current_split.pos - start) < min_samples_leaf) or
-                ((end - current_split.pos) < min_samples_leaf)):
+        if n_left < min_samples_leaf or n_right < min_samples_leaf:
             continue

         # Evaluate split
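
Since the n_missing samples are kept at the tail of samples[start:end], the child sizes above follow from simple counting. A plain-Python sketch (hypothetical helper; it assumes pos only partitions the non-missing range, as in the hunk above):

def child_sizes(pos, start, end, n_missing, missing_go_to_left):
    # Non-missing values occupy [start, end_non_missing); missing values
    # occupy [end_non_missing, end) and all go to one side.
    end_non_missing = end - n_missing
    if missing_go_to_left:
        return pos - start + n_missing, end_non_missing - pos
    return pos - start, end_non_missing - pos + n_missing

# Both routings partition the same end - start samples:
assert child_sizes(5, 0, 10, 2, True) == (7, 3)
assert child_sizes(5, 0, 10, 2, False) == (5, 5)
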
@@ -817,26 +865,44 @@ cdef inline int node_split_random(
             current_proxy_improvement = criterion.proxy_impurity_improvement()

             if current_proxy_improvement > best_proxy_improvement:
+                current_split.n_missing = n_missing
+
+                # If there are no missing values in the training data,
+                # then at test time missing values are sent to the branch
+                # that contained the most samples during training.
+                if has_missing:
+                    current_split.missing_go_to_left = missing_go_to_left
+                else:
+                    current_split.missing_go_to_left = n_left > n_right
+
                 best_proxy_improvement = current_proxy_improvement
                 best_split = current_split  # copy

     # Reorganize into samples[start:best.pos] + samples[best.pos:end]
     if best_split.pos < end:
         if current_split.feature != best_split.feature:
-            # TODO: Pass in best.n_missing when random splitter supports missing values.
             partitioner.partition_samples_final(
-                best_split.pos, best_split.threshold, best_split.feature, 0
+                best_split.pos,
+                best_split.threshold,
+                best_split.feature,
+                best_split.n_missing
             )
+        criterion.init_missing(best_split.n_missing)
+        criterion.missing_go_to_left = best_split.missing_go_to_left

         criterion.reset()
         criterion.update(best_split.pos)
         criterion.children_impurity(
             &best_split.impurity_left, &best_split.impurity_right
         )
         best_split.improvement = criterion.impurity_improvement(
-            impurity, best_split.impurity_left, best_split.impurity_right
+            impurity,
+            best_split.impurity_left,
+            best_split.impurity_right
         )

+        shift_missing_values_to_left_if_required(&best_split, samples, end)
+
     # Respect invariant for constant features: the original order of
     # elements in features[:n_known_constants] must be preserved for sibling
     # and child nodes
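
The call to shift_missing_values_to_left_if_required above fixes up the final sample ordering: the partitioner leaves missing samples at the tail of the range, so when the best split routes them left they must be swapped down next to best.pos. A rough plain-Python sketch of that intent (an assumption about the helper's behavior; the real Cython helper takes a SplitRecord pointer and updates best.pos in place):

def shift_missing_left(samples, pos, end, n_missing):
    # Missing samples currently sit in samples[end - n_missing:end].
    # Swap them down so samples[start:pos + n_missing] forms the left child.
    for p in range(n_missing):
        i, j = pos + p, end - 1 - p
        samples[i], samples[j] = samples[j], samples[i]
    return pos + n_missing  # split position now includes the missing samples
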
@@ -941,29 +1007,68 @@ cdef class DensePartitioner:
         float32_t* min_feature_value_out,
         float32_t* max_feature_value_out,
     ) noexcept nogil:
-        """Find the minimum and maximum value for current_feature."""
+        """Find the minimum and maximum value for current_feature.
+
+        Missing values are stored at the end of feature_values.
+        The number of missing values observed in feature_values is stored
+        in self.n_missing.
+        """
         cdef:
-            intp_t p
+            intp_t p, current_end
             float32_t current_feature_value
             const float32_t[:, :] X = self.X
             intp_t[::1] samples = self.samples
-            float32_t min_feature_value = X[samples[self.start], current_feature]
-            float32_t max_feature_value = min_feature_value
+            float32_t min_feature_value = INFINITY_32t
+            float32_t max_feature_value = -INFINITY_32t
             float32_t[::1] feature_values = self.feature_values
+            intp_t n_missing = 0
+            const unsigned char[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask

-        feature_values[self.start] = min_feature_value
+        # We copy the values into an array and find the min/max of that array
+        # in a manner that utilizes the cache more effectively. We also need
+        # to count the number of missing values.
+        if missing_values_in_feature_mask is not None and missing_values_in_feature_mask[current_feature]:
+            p, current_end = self.start, self.end - 1
+            # Missing values are placed at the end and do not participate in
+            # the min/max calculation.
+            while p <= current_end:
+                # Find the right-most value that is not missing so that
+                # it can be swapped with missing values towards its left.
+                if isnan(X[samples[current_end], current_feature]):
+                    n_missing += 1
+                    current_end -= 1
+                    continue

-        for p in range(self.start + 1, self.end):
-            current_feature_value = X[samples[p], current_feature]
-            feature_values[p] = current_feature_value
+                # X[samples[current_end], current_feature] is a non-missing value
+                if isnan(X[samples[p], current_feature]):
+                    samples[p], samples[current_end] = samples[current_end], samples[p]
+                    n_missing += 1
+                    current_end -= 1

-            if current_feature_value < min_feature_value:
-                min_feature_value = current_feature_value
-            elif current_feature_value > max_feature_value:
-                max_feature_value = current_feature_value
+                current_feature_value = X[samples[p], current_feature]
+                feature_values[p] = current_feature_value
+                if current_feature_value < min_feature_value:
+                    min_feature_value = current_feature_value
+                elif current_feature_value > max_feature_value:
+                    max_feature_value = current_feature_value
+                p += 1
+        else:
+            min_feature_value = X[samples[self.start], current_feature]
+            max_feature_value = min_feature_value
+
+            feature_values[self.start] = min_feature_value
+            for p in range(self.start + 1, self.end):
+                current_feature_value = X[samples[p], current_feature]
+                feature_values[p] = current_feature_value
+
+                if current_feature_value < min_feature_value:
+                    min_feature_value = current_feature_value
+                elif current_feature_value > max_feature_value:
+                    max_feature_value = current_feature_value

         min_feature_value_out[0] = min_feature_value
         max_feature_value_out[0] = max_feature_value
+        self.n_missing = n_missing

     cdef inline void next_p(self, intp_t* p_prev, intp_t* p) noexcept nogil:
         """Compute the next p_prev and p for iterating over feature values.
@@ -986,7 +1091,10 @@ cdef class DensePartitioner:
         # (feature_values[p] >= end) or (feature_values[p] > feature_values[p - 1])
         p[0] += 1

-    cdef inline intp_t partition_samples(self, float64_t current_threshold) noexcept nogil:
+    cdef inline intp_t partition_samples(
+        self,
+        float64_t current_threshold
+    ) noexcept nogil:
         """Partition samples for feature_values at the current_threshold."""
         cdef:
             intp_t p = self.start
@@ -1233,7 +1341,10 @@ cdef class SparsePartitioner:
         p_prev[0] = p[0]
         p[0] = p_next

-    cdef inline intp_t partition_samples(self, float64_t current_threshold) noexcept nogil:
+    cdef inline intp_t partition_samples(
+        self,
+        float64_t current_threshold
+    ) noexcept nogil:
         """Partition samples for feature_values at the current_threshold."""
         return self._partition(current_threshold, self.start_positive)