@@ -28,6 +28,9 @@ from ._utils cimport log
28
28
from ._utils cimport safe_realloc
29
29
from ._utils cimport sizet_ptr_to_ndarray
30
30
31
+ cdef int LEFT = 0
32
+ cdef int RIGHT = 1
33
+
31
34
cdef class Criterion:
32
35
""" Interface for impurity criteria.
33
36
@@ -49,8 +52,9 @@ cdef class Criterion:
49
52
pass
50
53
51
54
cdef void init(self , DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
52
- double weighted_n_samples, SIZE_t* samples, SIZE_t start,
53
- SIZE_t end) nogil:
55
+ double weighted_n_samples, SIZE_t* samples,
56
+ SIZE_t start, SIZE_t end,
57
+ SIZE_t start_missing, SIZE_t end_missing) nogil:
54
58
""" Placeholder for a method which will initialize the criterion.
55
59
56
60
Parameters
@@ -66,11 +70,17 @@ cdef class Criterion:
66
70
The total weight of the samples being considered
67
71
samples: array-like, dtype=SIZE_t
68
72
Indices of the samples in X and y, where samples[start:end]
69
- correspond to the samples in this node
73
+ correspond to the non-missing samples and
74
+ samples[start_missing:end_missing] correspond to the missing valued
75
+ samples in this node
70
76
start: SIZE_t
71
- The first sample to be used on this node
77
+ The first non-missing-valued sample to be used on this node
72
78
end: SIZE_t
73
- The last sample used on this node
79
+ The last non-missing-valued sample used on this node
80
+ start_missing: SIZE_t
81
+ The first missing-valued sample to be used on this node
82
+ end_missing: SIZE_t
83
+ The last missing-valued sample used on this node
74
84
75
85
"""
76
86
@@ -106,6 +116,23 @@ cdef class Criterion:
106
116
107
117
pass
108
118
119
+ cdef void move_missing(self , bint direction) nogil:
120
+ """ Updated statistics by moving the missing-valued samples to l/r.
121
+
122
+ This updates the collected statistics by moving the missing-valued
123
+ samples (samples[start_missing:end_missing]) to the direction as
124
+ specified.
125
+
126
+ Parameters
127
+ ----------
128
+ direction: bint
129
+ 0 (false) to move the missing-valued samples left.
130
+ 1 (true) to move the missing-valued samples right.
131
+
132
+ """
133
+
134
+ pass
135
+
109
136
cdef double node_impurity(self ) nogil:
110
137
""" Placeholder for calculating the impurity of the node.
111
138
@@ -198,9 +225,9 @@ cdef class Criterion:
198
225
self .children_impurity(& impurity_left, & impurity_right)
199
226
200
227
return ((self .weighted_n_node_samples / self .weighted_n_samples) *
201
- (impurity - (self .weighted_n_right /
228
+ (impurity - (self .weighted_n_right /
202
229
self .weighted_n_node_samples * impurity_right)
203
- - (self .weighted_n_left /
230
+ - (self .weighted_n_left /
204
231
self .weighted_n_node_samples * impurity_left)))
205
232
206
233
@@ -227,11 +254,18 @@ cdef class ClassificationCriterion(Criterion):
227
254
self .sample_weight = NULL
228
255
229
256
self .samples = NULL
230
- self .start = 0
231
- self .pos = 0
232
- self .end = 0
257
+ self .start_nonmissing = 0
258
+ self .pos_nonmissing = 0
259
+ self .end_nonmissing = 0
260
+
261
+ self .start_missing = 0
262
+ self .end_missing = 0
233
263
234
264
self .n_outputs = n_outputs
265
+
266
+ self .n_missing = 0
267
+ self .n_nonmissing = 0
268
+
235
269
self .n_node_samples = 0
236
270
self .weighted_n_node_samples = 0.0
237
271
self .weighted_n_left = 0.0
@@ -263,7 +297,7 @@ cdef class ClassificationCriterion(Criterion):
263
297
self .sum_left = < double * > calloc(n_elements, sizeof(double ))
264
298
self .sum_right = < double * > calloc(n_elements, sizeof(double ))
265
299
266
- if (self .sum_total == NULL or
300
+ if (self .sum_total == NULL or
267
301
self .sum_left == NULL or
268
302
self .sum_right == NULL ):
269
303
raise MemoryError ()
@@ -281,7 +315,8 @@ cdef class ClassificationCriterion(Criterion):
281
315
282
316
cdef void init(self , DOUBLE_t* y, SIZE_t y_stride,
283
317
DOUBLE_t* sample_weight, double weighted_n_samples,
284
- SIZE_t* samples, SIZE_t start, SIZE_t end) nogil:
318
+ SIZE_t* samples,
319
+ SIZE_t start, SIZE_t end) nogil:
285
320
""" Initialize the criterion at node samples[start:end] and
286
321
children samples[start:start] and samples[start:end].
287
322
@@ -298,10 +333,14 @@ cdef class ClassificationCriterion(Criterion):
298
333
The total weight of all samples
299
334
samples: array-like, dtype=SIZE_t
300
335
A mask on the samples, showing which ones we want to use
301
- start: SIZE_t
302
- The first sample to use in the mask
303
- end: SIZE_t
304
- The last sample to use in the mask
336
+ start_nonmissing: SIZE_t
337
+ The first non-missing-valued sample to be used on this node
338
+ end_nonmissing: SIZE_t
339
+ The last non-missing-valued sample used on this node
340
+ start_missing: SIZE_t
341
+ The first missing-valued sample to be used on this node
342
+ end_missing: SIZE_t
343
+ The last missing-valued sample used on this node
305
344
"""
306
345
307
346
self .y = y
@@ -328,7 +367,8 @@ cdef class ClassificationCriterion(Criterion):
328
367
memset(sum_total + offset, 0 , n_classes[k] * sizeof(double ))
329
368
offset += self .sum_stride
330
369
331
- for p in range (start, end):
370
+ for p in (range (start_nonmissing, end_nonmissing) +
371
+ range (start_missing, end_missing)):
332
372
i = samples[p]
333
373
334
374
# w is originally set to be 1.0, meaning that if no sample weights
@@ -722,7 +762,7 @@ cdef class RegressionCriterion(Criterion):
722
762
self .sum_left = < double * > calloc(n_outputs, sizeof(double ))
723
763
self .sum_right = < double * > calloc(n_outputs, sizeof(double ))
724
764
725
- if (self .sum_total == NULL or
765
+ if (self .sum_total == NULL or
726
766
self .sum_left == NULL or
727
767
self .sum_right == NULL ):
728
768
raise MemoryError ()
@@ -847,7 +887,7 @@ cdef class RegressionCriterion(Criterion):
847
887
848
888
self .weighted_n_left -= w
849
889
850
- self .weighted_n_right = (self .weighted_n_node_samples -
890
+ self .weighted_n_right = (self .weighted_n_node_samples -
851
891
self .weighted_n_left)
852
892
for k in range (self .n_outputs):
853
893
sum_right[k] = sum_total[k] - sum_left[k]
@@ -957,7 +997,7 @@ cdef class MSE(RegressionCriterion):
957
997
958
998
for k in range (self .n_outputs):
959
999
impurity_left[0 ] -= (sum_left[k] / self .weighted_n_left) ** 2.0
960
- impurity_right[0 ] -= (sum_right[k] / self .weighted_n_right) ** 2.0
1000
+ impurity_right[0 ] -= (sum_right[k] / self .weighted_n_right) ** 2.0
961
1001
962
1002
impurity_left[0 ] /= self .n_outputs
963
1003
impurity_right[0 ] /= self .n_outputs
@@ -1019,6 +1059,6 @@ cdef class FriedmanMSE(MSE):
1019
1059
diff = (self .weighted_n_right * total_sum_left -
1020
1060
self .weighted_n_left * total_sum_right) / self .n_outputs
1021
1061
1022
- return (diff * diff / (self .weighted_n_left * self .weighted_n_right *
1062
+ return (diff * diff / (self .weighted_n_left * self .weighted_n_right *
1023
1063
self .weighted_n_node_samples))
1024
-
1064
+
0 commit comments