finallyyy did ittttttttt · scikit-learn/scikit-learn@29ef725 · GitHub

Commit 29ef725

finallyyy did ittttttttt
1 parent 0b22005 commit 29ef725

3 files changed: +31 -116 lines changed


sklearn/tree/_criterion.pyx

Lines changed: 25 additions & 111 deletions
@@ -81,9 +81,6 @@ cdef class Criterion:
         n_missing: SIZE_t
             The number of missing-valued samples in this node. Alternatively,
             this is the size of the missing_samples array.
-        missing_samples: array-like, dtype=DOUBLE_t
-            Indices of the samples in X and y, where samples[start:end]
-            correspond to the non-missing valued samples alone.
         """

         pass
@@ -328,8 +325,8 @@ cdef class ClassificationCriterion(Criterion):
     cdef void init(self, DOUBLE_t* y, SIZE_t y_stride,
                    DOUBLE_t* sample_weight, double weighted_n_samples,
                    SIZE_t* samples, SIZE_t start, SIZE_t end,
-                   SIZE_t* missing_samples, SIZE_t n_missing,
-                   SIZE_t missing_direction) nogil:
+                   SIZE_t* missing_samples, SIZE_t n_missing,
+                   SIZE_t missing_direction=MISSING_DIR_LEFT) nogil:
         """Initialize the criterion at node samples[start:end] and
         children samples[start:start] and samples[start:end].

@@ -428,21 +425,19 @@ cdef class ClassificationCriterion(Criterion):

         self.pos = self.start

-        self.weighted_n_left = 0
         # SELFNOTE should we put the missing_samples too on the right to simplify calculations (missing_direction=1)??
-        self.weighted_n_right = self.weighted_n_node_samples
-
         # SELFNOTE The case of wighted_n_missing being 0, will be taken care
         # SELFNOTE at the time of building missing_samples array
-        if self.n_missing != 0:
-            # Move missing samples left
-            if self.missing_direction == 0:
-                self.weighted_n_left = self.weighted_n_node_missing
-                self.weighted_n_right = self.weighted_n_node_nonmissing
-            # Move missing samples right
-            elif self.missing_direction == 1:
-                self.weighted_n_right = self.weighted_n_node_missing
-                self.weighted_n_left = self.weighted_n_node_nonmissing
+        # SELFNOTE should we put the missing_samples too on the left to simplify calculations (missing_direction=0)??
+        if self.n_missing <= 0:
+            self.weighted_n_left = self.weighted_n_node_samples
+            self.weighted_n_right = 0.0
+        elif self.missing_direction == MISSING_DIR_LEFT:
+            self.weighted_n_left = self.weighted_n_node_missing
+            self.weighted_n_right = self.weighted_n_node_nonmissing
+        else:
+            self.weighted_n_left = self.weighted_n_node_nonmissing
+            self.weighted_n_right = self.weighted_n_node_missing

         cdef double* sum_total = self.sum_total
         cdef double* sum_left = self.sum_left
@@ -464,13 +459,16 @@ cdef class ClassificationCriterion(Criterion):
         """Reset the criterion at pos=end."""
         self.pos = self.end

-        # SELFNOTE should we put the missing_samples too on the left to simplify calculations (missing_direction=0)??
-        self.weighted_n_left = self.weighted_n_node_samples
-        self.weighted_n_right = 0.0
-
-        if self.n_missing > 0:
-            # SELFNOTE we are leaving missing samples untouched
-            self.missing_direction = 0
+        # SELFNOTE should we put the missing_samples too on the right to simplify calculations (missing_direction=0)??
+        if self.n_missing <= 0:
+            self.weighted_n_left = 0.0
+            self.weighted_n_right = self.weighted_n_node_samples
+        elif self.missing_direction == MISSING_DIR_LEFT:
+            self.weighted_n_left = self.weighted_n_node_missing
+            self.weighted_n_right = self.weighted_n_node_nonmissing
+        else:
+            self.weighted_n_left = self.weighted_n_node_nonmissing
+            self.weighted_n_right = self.weighted_n_node_missing

         cdef double* sum_total = self.sum_total
         cdef double* sum_left = self.sum_left
@@ -487,87 +485,6 @@ cdef class ClassificationCriterion(Criterion):
             sum_left += self.sum_stride
             sum_right += self.sum_stride

-    cdef void update(self, SIZE_t new_pos) nogil:
-        """Updated statistics by moving samples[pos:new_pos] to the left child.
-
-        Parameters
-        ----------
-        new_pos: SIZE_t
-            The new ending position for which to move samples from the right
-            child to the left child.
-        """
-        cdef DOUBLE_t* y = self.y
-        cdef SIZE_t pos = self.pos
-        cdef SIZE_t end = self.end
-
-        cdef double* sum_left = self.sum_left
-        cdef double* sum_right = self.sum_right
-        cdef double* sum_total = self.sum_total
-
-        cdef SIZE_t* n_classes = self.n_classes
-        cdef SIZE_t* samples = self.samples
-        cdef DOUBLE_t* sample_weight = self.sample_weight
-
-        cdef SIZE_t i
-        cdef SIZE_t p
-        cdef SIZE_t k
-        cdef SIZE_t c
-        cdef SIZE_t label_index
-        cdef DOUBLE_t w = 1.0
-
-        # Update statistics up to new_pos
-        #
-        # Given that
-        #   sum_left[x] + sum_right[x] = sum_total[x]
-        # and that sum_total is known, we are going to update
-        # sum_left from the direction that require the least amount
-        # of computations, i.e. from pos to new_pos or from end to new_pos.
-
-        if (new_pos - pos) <= (end - new_pos):
-            for p in range(pos, new_pos):
-                i = samples[p]
-
-                if sample_weight != NULL:
-                    w = sample_weight[i]
-
-                for k in range(self.n_outputs):
-                    label_index = (k * self.sum_stride +
-                                   <SIZE_t> y[i * self.y_stride + k])
-                    sum_left[label_index] += w
-
-                self.weighted_n_left += w
-
-        else:
-            self.reverse_reset()
-
-            for p in range(end - 1, new_pos - 1, -1):
-                i = samples[p]
-
-                if sample_weight != NULL:
-                    w = sample_weight[i]
-
-                for k in range(self.n_outputs):
-                    label_index = (k * self.sum_stride +
-                                   <SIZE_t> y[i * self.y_stride + k])
-                    sum_left[label_index] -= w
-
-                self.weighted_n_left -= w
-
-        # Update right part statistics
-        self.weighted_n_right = self.weighted_n_node_samples - self.weighted_n_left
-        for k in range(self.n_outputs):
-            for c in range(n_classes[k]):
-                sum_right[c] = sum_total[c] - sum_left[c]
-
-            sum_right += self.sum_stride
-            sum_left += self.sum_stride
-            sum_total += self.sum_stride
-
-        self.pos = new_pos
-
-    # SELFNOTE overloading works in cython
-    # SELFNOTE we need a separate function to avoid adding overheads when n_missing=0
-    # SELFNOTE dont set a default value for missing_direction, it will conflict with the function above
     cdef void update(self, SIZE_t new_pos, SIZE_t missing_direction) nogil:
         """Updated statistics by moving samples[pos:new_pos] to the left child.

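The removed single-argument update() leans on the identity sum_left[x] + sum_right[x] = sum_total[x]: only sum_left is maintained incrementally, walking in from whichever end needs fewer samples, and sum_right is derived by subtraction. A minimal single-output NumPy sketch of that idea (array shapes, names and the helper itself are illustrative, not the library's API):

import numpy as np

def move_to_left(y, w, sum_total, sum_left, pos, new_pos, end):
    """Advance the split point from pos to new_pos for a single output.

    y[p] is the class of the p-th node sample, w[p] its weight; sum_* are
    per-class weighted counts.  Only sum_left is updated incrementally.
    """
    if (new_pos - pos) <= (end - new_pos):
        # Fewer samples in [pos, new_pos): add them to the left child.
        for p in range(pos, new_pos):
            sum_left[y[p]] += w[p]
    else:
        # Fewer samples in [new_pos, end): start from "everything left"
        # (the reverse_reset state) and subtract what stays on the right.
        sum_left[:] = sum_total
        for p in range(end - 1, new_pos - 1, -1):
            sum_left[y[p]] -= w[p]
    sum_right = sum_total - sum_left   # the identity does the rest
    return sum_left, sum_right

y = np.array([0, 1, 1, 0, 1]); w = np.ones(5)
sum_total = np.array([2.0, 3.0]); sum_left = np.zeros(2)
print(move_to_left(y, w, sum_total, sum_left, 0, 4, 5))   # cheaper from the right end: ([2., 2.], [0., 1.])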
@@ -584,11 +501,6 @@ cdef class ClassificationCriterion(Criterion):
            1 - To send the missing values right
            2 - To ignore the missing values
         """
-        # SELFNOTE Splitter will filter this case of n_missing = 0
-        # SELFNOTE remove
-        # if self.n_missing <= 0:
-        #     return self.update(new_pos)
-
         cdef DOUBLE_t* y = self.y
         cdef SIZE_t pos = self.pos
         cdef SIZE_t end = self.end
@@ -600,7 +512,6 @@ cdef class ClassificationCriterion(Criterion):

         cdef SIZE_t* n_classes = self.n_classes
         cdef SIZE_t* samples = self.samples
-        cdef SIZE_t* missing_samples = self.samples
         cdef DOUBLE_t* sample_weight = self.sample_weight

         cdef SIZE_t i
@@ -648,6 +559,9 @@ cdef class ClassificationCriterion(Criterion):

                 self.weighted_n_left -= w

+        if n_missing > 0 and self.missing_direction == MISSING_DIR_LEFT:
+            self.weighted_n_left += self.weighted_n_node_missing
+
         # Update right part statistics
         self.weighted_n_right = self.weighted_n_node_samples - self.weighted_n_left
         for k in range(self.n_outputs):
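With the missing-valued block pinned to one side, the sweep over non-missing samples only needs a constant-time correction at the end, as in the hunk above. A hedged sketch of that final bookkeeping step (the names mirror the diff, but the standalone function is hypothetical, not the commit's code):

def finish_update(weighted_n_left_nonmissing, weighted_n_node_samples,
                  weighted_n_node_missing, n_missing, missing_direction,
                  MISSING_DIR_LEFT=0):
    """Fold the whole missing-valued block onto the left side when that is its
    assigned direction, then derive the right-hand weight from the node total."""
    weighted_n_left = weighted_n_left_nonmissing
    if n_missing > 0 and missing_direction == MISSING_DIR_LEFT:
        weighted_n_left += weighted_n_node_missing
    weighted_n_right = weighted_n_node_samples - weighted_n_left
    return weighted_n_left, weighted_n_right

# 6.0 weight of non-missing samples already moved left, node total 13.0,
# of which 3.0 is missing-valued and routed left:
print(finish_update(6.0, 13.0, 3.0, n_missing=2, missing_direction=0))  # (9.0, 4.0)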

sklearn/tree/_splitter.pyx

Lines changed: 5 additions & 4 deletions
@@ -180,8 +180,7 @@ cdef class Splitter:
         # typically is very less compared to normal size
         for i in range(n_features):
             n_missing[i] = np.count_nonzero(missing_mask[:, i])
-        max_missing = int_max(
-            max_missing)
+        max_missing = int_max(max_missing)

         # Create a new array which will be used to store nonzero weighted
         # missing-valued samples from the feature of interest
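The per-feature counting that this hunk tidies up can be pictured with a plain NumPy toy. The mask values below are invented; only np.count_nonzero comes from the hunk, and the running-maximum line stands in for the int_max(...) bookkeeping as an assumption about the surrounding code.

import numpy as np

# Toy mask: 5 samples x 3 features, True marks a missing value.
missing_mask = np.array([[True,  False, False],
                         [False, False, True ],
                         [True,  False, False],
                         [False, False, True ],
                         [False, False, True ]])

n_features = missing_mask.shape[1]
n_missing = np.array([np.count_nonzero(missing_mask[:, i]) for i in range(n_features)])
max_missing = int(n_missing.max())   # stands in for the int_max(...) running maximum

print(n_missing)    # [2 0 3]
print(max_missing)  # 3 -> upper bound on the missing_samples buffer any single feature needs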
@@ -259,6 +258,7 @@ cdef class Splitter:
         self.start = start
         self.end = end
         self.n_missing = n_missing
+        self.missing_direction = MISSING_DIR_RIGHT

         self.criterion.init(self.y,
                             self.y_stride,
@@ -268,7 +268,8 @@ cdef class Splitter:
                             start,
                             end,
                             self.missing_samples,
-                            n_missing)
+                            n_missing,
+                            self.missing_direction)

         weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples

@@ -583,7 +584,7 @@ cdef class BestSplitter(BaseDenseSplitter):

                 if p < end:
                     # Send missing-valued samples both ways and get the best
-                    for missing_direction in range(1)
+                    for missing_direction in range(1):
                         # Reject if min_samples_leaf is not guaranteed
                         if (((p - start + n_missing * (1 - missing_direction)) < min_samples_leaf) or
                                 ((end - p + n_missing * missing_direction) < min_samples_leaf)):
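To see what the guard above does with the missing count, here is a small worked Python check. The wrapper function is illustrative only; the expressions inside mirror the condition in the hunk, with missing_direction 0 taken to mean left and 1 to mean right.

def violates_min_samples_leaf(p, start, end, n_missing, missing_direction, min_samples_leaf):
    # Non-missing samples split at position p; the n_missing block counts toward one leaf.
    n_left = (p - start) + n_missing * (1 - missing_direction)
    n_right = (end - p) + n_missing * missing_direction
    return n_left < min_samples_leaf or n_right < min_samples_leaf

# 4 non-missing samples go left, 5 go right, plus 3 missing-valued samples:
print(violates_min_samples_leaf(4, 0, 9, 3, missing_direction=1, min_samples_leaf=5))  # True  (left leaf has only 4)
print(violates_min_samples_leaf(4, 0, 9, 3, missing_direction=0, min_samples_leaf=5))  # False (7 left, 5 right)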

sklearn/tree/_tree.pyx

Lines changed: 1 addition & 1 deletion
@@ -198,7 +198,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):

         cdef bint include_missing = <bint> self.allow_missing
         cdef bint include_missing_r = MISSING_DIR_UNDEFINED
-        cdef bint include_missing_l = MISSING_DIR_UNDEFINED
+        cdef bint include_missing_l = MISSING_DIR_UNDEFINED

         cdef SplitRecord split
         cdef SIZE_t node_id

0 commit comments