@@ -81,9 +81,6 @@ cdef class Criterion:
81
81
n_missing: SIZE_t
82
82
The number of missing-valued samples in this node. Alternatively,
83
83
this is the size of the missing_samples array.
84
- missing_samples: array-like, dtype=DOUBLE_t
85
- Indices of the samples in X and y, where samples[start:end]
86
- correspond to the non-missing valued samples alone.
87
84
"""
88
85
89
86
pass
@@ -328,8 +325,8 @@ cdef class ClassificationCriterion(Criterion):
328
325
cdef void init(self , DOUBLE_t* y, SIZE_t y_stride,
329
326
DOUBLE_t* sample_weight, double weighted_n_samples,
330
327
SIZE_t* samples, SIZE_t start, SIZE_t end,
331
- SIZE_t* missing_samples, SIZE_t n_missing,
332
- SIZE_t missing_direction) nogil:
328
+ SIZE_t* missing_samples, SIZE_t n_missing) nogil:
329
+ SIZE_t missing_direction= MISSING_DIR_LEFT ) nogil:
333
330
""" Initialize the criterion at node samples[start:end] and
334
331
children samples[start:start] and samples[start:end].
335
332
@@ -428,21 +425,19 @@ cdef class ClassificationCriterion(Criterion):
428
425
429
426
self .pos = self .start
430
427
431
- self .weighted_n_left = 0
432
428
# SELFNOTE should we put the missing_samples too on the right to simplify calculations (missing_direction=1)??
433
- self .weighted_n_right = self .weighted_n_node_samples
434
-
435
429
# SELFNOTE The case of weighted_n_missing being 0 will be taken care
436
430
# SELFNOTE at the time of building missing_samples array
437
- if self .n_missing != 0 :
438
- # Move missing samples left
439
- if self .missing_direction == 0 :
440
- self .weighted_n_left = self .weighted_n_node_missing
441
- self .weighted_n_right = self .weighted_n_node_nonmissing
442
- # Move missing samples right
443
- elif self .missing_direction == 1 :
444
- self .weighted_n_right = self .weighted_n_node_missing
445
- self .weighted_n_left = self .weighted_n_node_nonmissing
431
+ # SELFNOTE should we put the missing_samples too on the left to simplify calculations (missing_direction=0)??
432
+ if self .n_missing <= 0 :
433
+ self .weighted_n_left = self .weighted_n_node_samples
434
+ self .weighted_n_right = 0.0
435
+ elif self .missing_direction == MISSING_DIR_LEFT:
436
+ self .weighted_n_left = self .weighted_n_node_missing
437
+ self .weighted_n_right = self .weighted_n_node_nonmissing
438
+ else :
439
+ self .weighted_n_left = self .weighted_n_node_nonmissing
440
+ self .weighted_n_right = self .weighted_n_node_missing
446
441
447
442
cdef double * sum_total = self .sum_total
448
443
cdef double * sum_left = self .sum_left
@@ -464,13 +459,16 @@ cdef class ClassificationCriterion(Criterion):
464
459
""" Reset the criterion at pos=end."""
465
460
self .pos = self .end
466
461
467
- # SELFNOTE should we put the missing_samples too on the left to simplify calculations (missing_direction=0)??
468
- self .weighted_n_left = self .weighted_n_node_samples
469
- self .weighted_n_right = 0.0
470
-
471
- if self .n_missing > 0 :
472
- # SELFNOTE we are leaving missing samples untouched
473
- self .missing_direction = 0
462
+ # SELFNOTE should we put the missing_samples too on the right to simplify calculations (missing_direction=1)??
463
+ if self .n_missing <= 0 :
464
+ self .weighted_n_left = 0.0
465
+ self .weighted_n_right = self .weighted_n_node_samples
466
+ elif self .missing_direction == MISSING_DIR_LEFT:
467
+ self .weighted_n_left = self .weighted_n_node_missing
468
+ self .weighted_n_right = self .weighted_n_node_nonmissing
469
+ else :
470
+ self .weighted_n_left = self .weighted_n_node_nonmissing
471
+ self .weighted_n_right = self .weighted_n_node_missing
474
472
475
473
cdef double * sum_total = self .sum_total
476
474
cdef double * sum_left = self .sum_left
@@ -487,87 +485,6 @@ cdef class ClassificationCriterion(Criterion):
487
485
sum_left += self .sum_stride
488
486
sum_right += self .sum_stride
489
487
490
- cdef void update(self , SIZE_t new_pos) nogil:
491
- """ Updated statistics by moving samples[pos:new_pos] to the left child.
492
-
493
- Parameters
494
- ----------
495
- new_pos: SIZE_t
496
- The new ending position for which to move samples from the right
497
- child to the left child.
498
- """
499
- cdef DOUBLE_t* y = self .y
500
- cdef SIZE_t pos = self .pos
501
- cdef SIZE_t end = self .end
502
-
503
- cdef double * sum_left = self .sum_left
504
- cdef double * sum_right = self .sum_right
505
- cdef double * sum_total = self .sum_total
506
-
507
- cdef SIZE_t* n_classes = self .n_classes
508
- cdef SIZE_t* samples = self .samples
509
- cdef DOUBLE_t* sample_weight = self .sample_weight
510
-
511
- cdef SIZE_t i
512
- cdef SIZE_t p
513
- cdef SIZE_t k
514
- cdef SIZE_t c
515
- cdef SIZE_t label_index
516
- cdef DOUBLE_t w = 1.0
517
-
518
- # Update statistics up to new_pos
519
- #
520
- # Given that
521
- # sum_left[x] + sum_right[x] = sum_total[x]
522
- # and that sum_total is known, we are going to update
523
- # sum_left from the direction that require the least amount
524
- # of computations, i.e. from pos to new_pos or from end to new_pos.
525
-
526
- if (new_pos - pos) <= (end - new_pos):
527
- for p in range (pos, new_pos):
528
- i = samples[p]
529
-
530
- if sample_weight != NULL :
531
- w = sample_weight[i]
532
-
533
- for k in range (self .n_outputs):
534
- label_index = (k * self .sum_stride +
535
- < SIZE_t> y[i * self .y_stride + k])
536
- sum_left[label_index] += w
537
-
538
- self .weighted_n_left += w
539
-
540
- else :
541
- self .reverse_reset()
542
-
543
- for p in range (end - 1 , new_pos - 1 , - 1 ):
544
- i = samples[p]
545
-
546
- if sample_weight != NULL :
547
- w = sample_weight[i]
548
-
549
- for k in range (self .n_outputs):
550
- label_index = (k * self .sum_stride +
551
- < SIZE_t> y[i * self .y_stride + k])
552
- sum_left[label_index] -= w
553
-
554
- self .weighted_n_left -= w
555
-
556
- # Update right part statistics
557
- self .weighted_n_right = self .weighted_n_node_samples - self .weighted_n_left
558
- for k in range (self .n_outputs):
559
- for c in range (n_classes[k]):
560
- sum_right[c] = sum_total[c] - sum_left[c]
561
-
562
- sum_right += self .sum_stride
563
- sum_left += self .sum_stride
564
- sum_total += self .sum_stride
565
-
566
- self .pos = new_pos
567
-
568
- # SELFNOTE overloading works in cython
569
- # SELFNOTE we need a separate function to avoid adding overheads when n_missing=0
570
- # SELFNOTE dont set a default value for missing_direction, it will conflict with the function above
571
488
cdef void update(self , SIZE_t new_pos, SIZE_t missing_direction) nogil:
572
489
""" Updated statistics by moving samples[pos:new_pos] to the left child.
573
490
@@ -584,11 +501,6 @@ cdef class ClassificationCriterion(Criterion):
584
501
1 - To send the missing values right
585
502
2 - To ignore the missing values
586
503
"""
587
- # SELFNOTE Splitter will filter this case of n_missing = 0
588
- # SELFNOTE remove
589
- # if self.n_missing <= 0:
590
- # return self.update(new_pos)
591
-
592
504
cdef DOUBLE_t* y = self .y
593
505
cdef SIZE_t pos = self .pos
594
506
cdef SIZE_t end = self .end
@@ -600,7 +512,6 @@ cdef class ClassificationCriterion(Criterion):
600
512
601
513
cdef SIZE_t* n_classes = self .n_classes
602
514
cdef SIZE_t* samples = self .samples
603
- cdef SIZE_t* missing_samples = self .samples
604
515
cdef DOUBLE_t* sample_weight = self .sample_weight
605
516
606
517
cdef SIZE_t i
@@ -648,6 +559,9 @@ cdef class ClassificationCriterion(Criterion):
648
559
649
560
self .weighted_n_left -= w
650
561
562
+ if n_missing > 0 and self .missing_direction == MISSING_DIR_LEFT:
563
+ self .weighted_n_left += self .weighted_n_node_missing
564
+
651
565
# Update right part statistics
652
566
self .weighted_n_right = self .weighted_n_node_samples - self .weighted_n_left
653
567
for k in range (self .n_outputs):
0 commit comments