8000 OLDSTASH_Nov · scikit-learn/scikit-learn@91c8164 · GitHub
[go: up one dir, main page]

Skip to content

Commit 91c8164

Browse files
committed
OLDSTASH_Nov
1 parent 025495c commit 91c8164

File tree

6 files changed

+189
-67
lines changed

6 files changed

+189
-67
lines changed

sklearn/tree/_criterion.pyx

Lines changed: 62 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ from ._utils cimport log
2828
from ._utils cimport safe_realloc
2929
from ._utils cimport sizet_ptr_to_ndarray
3030

31+
cdef int LEFT = 0
32+
cdef int RIGHT = 1
33+
3134
cdef class Criterion:
3235
"""Interface for impurity criteria.
3336
@@ -49,8 +52,9 @@ cdef class Criterion:
4952
pass
5053

5154
cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
52-
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
53-
SIZE_t end) nogil:
55+
double weighted_n_samples, SIZE_t* samples,
56+
SIZE_t start, SIZE_t end,
57+
SIZE_t start_missing, SIZE_t end_missing) nogil:
5458
"""Placeholder for a method which will initialize the criterion.
5559
5660
Parameters
@@ -66,11 +70,17 @@ cdef class Criterion:
6670
The total w 10000 eight of the samples being considered
6771
samples: array-like, dtype=DOUBLE_t
6872
Indices of the samples in X and y, where samples[start:end]
69-
correspond to the samples in this node
73+
correspond to the non-missing samples and
74+
samples[start_missing:end_missing] correspond to the missing valued
75+
samples in this node
7076
start: SIZE_t
71-
The first sample to be used on this node
77+
The first non-missing-valued sample to be used on this node
7278
end: SIZE_t
73-
The last sample used on this node
79+
The last non-missing-valued sample used on this node
80+
start_missing: SIZE_t
81+
The first missing-valued sample to be used on this node
82+
end_missing: SIZE_t
83+
The last missing-valued sample used on this node
7484
7585
"""
7686

@@ -106,6 +116,23 @@ cdef class Criterion:
106116

107117
pass
108118

119+
cdef void move_missing(self, bint direction) nogil:
120+
"""Updated statistics by moving the missing-valued samples to l/r.
121+
122+
This updates the collected statistics by moving the missing-valued
123+
samples (samples[start_missing:end_nonmissing]) to the direction as
124+
specified.
125+
126+
Parameters
127+
----------
128+
direction: bint
129+
0 (false) to move the missing-valued samples left.
130+
1 (true) to move the missing-valued samples right.
131+
132+
"""
133+
134+
pass
135+
109136
cdef double node_impurity(self) nogil:
110137
"""Placeholder for calculating the impurity of the node.
111138
@@ -198,9 +225,9 @@ cdef class Criterion:
198225
self.children_impurity(&impurity_left, &impurity_right)
199226

200227
return ((self.weighted_n_node_samples / self.weighted_n_samples) *
201-
(impurity - (self.weighted_n_right /
228+
(impurity - (self.weighted_n_right /
202229
self.weighted_n_node_samples * impurity_right)
203-
- (self.weighted_n_left /
230+
- (self.weighted_n_left /
204231
self.weighted_n_node_samples * impurity_left)))
205232

206233

@@ -227,11 +254,18 @@ cdef class ClassificationCriterion(Criterion):
227254
self.sample_weight = NULL
228255

229256
self.samples = NULL
230-
self.start = 0
231-
self.pos = 0
232-
self.end = 0
257+
self.start_nonmissing = 0
258+
self.pos_nonmissing = 0
259+
self.end_nonmissing = 0
260+
261+
self.start_missing = 0
262+
self.end_missing = 0
233263

234264
self.n_outputs = n_outputs
265+
266+
self.n_missing = 0
267+
self.n_nonmissing = 0
268+
235269
self.n_node_samples = 0
236270
self.weighted_n_node_samples = 0.0
237271
self.weighted_n_left = 0.0
@@ -263,7 +297,7 @@ cdef class ClassificationCriterion(Criterion):
263297
self.sum_left = <double*> calloc(n_elements, sizeof(double))
264298
self.sum_right = <double*> calloc(n_elements, sizeof(double))
265299

266-
if (self.sum_total == NULL or
300+
if (self.sum_total == NULL or
267301
self.sum_left == NULL or
268302
self.sum_right == NULL):
269303
raise MemoryError()
@@ -281,7 +315,8 @@ cdef class ClassificationCriterion(Criterion):
281315

282316
cdef void init(self, DOUBLE_t* y, SIZE_t y_stride,
283317
DOUBLE_t* sample_weight, double weighted_n_samples,
284-
SIZE_t* samples, SIZE_t start, SIZE_t end) nogil:
318+
SIZE_t* samples,
319+
SIZE_t start, SIZE_t end) nogil:
285320
"""Initialize the criterion at node samples[start:end] and
286321
children samples[start:start] and samples[start:end].
287322
@@ -298,10 +333,14 @@ cdef class ClassificationCriterion(Criterion):
298333
The total weight of all samples
299334
samples: array-like, dtype=SIZE_t
300335
A mask on the samples, showing which ones we want to use
301-
start: SIZE_t
302-
The first sample to use in the mask
303-
end: SIZE_t
304-
The last sample to use in the mask
336+
start_nonmissing: SIZE_t
337+
The first non-missing-valued sample to be used on this node
338+
end_nonmissing: SIZE_t
339+
The last non-missing-valued sample used on this node
340+
start_missing: SIZE_t
341+
The first missing-valued sample to be used on this node
342+
end_missing: SIZE_t
343+
The last missing-valued sample used on this node
305344
"""
306345

307346
self.y = y
@@ -328,7 +367,8 @@ cdef class ClassificationCriterion(Criterion):
328367
memset(sum_total + offset, 0, n_classes[k] * sizeof(double))
329368
offset += self.sum_stride
330369

331-
for p in range(start, end):
370+
for p in (range(start_nonmissing, end_nonmissing) +
371+
range(start_missing, end_missing)):
332372
i = samples[p]
333373

334374
# w is originally set to be 1.0, meaning that if no sample weights
@@ -722,7 +762,7 @@ cdef class RegressionCriterion(Criterion):
722762
self.sum_left = <double*> calloc(n_outputs, sizeof(double))
723763
self.sum_right = <double*> calloc(n_outputs, sizeof(double))
724764

725-
if (self.sum_total == NULL or
765+
if (self.sum_total == NULL or
726766
self.sum_left == NULL or
727767
self.sum_right == NULL):
728768
raise MemoryError()
@@ -847,7 +887,7 @@ cdef class RegressionCriterion(Criterion):
847887

848888
self.weighted_n_left -= w
849889

850-
self.weighted_n_right = (self.weighted_n_node_samples -
890+
self.weighted_n_right = (self.weighted_n_node_samples -
851891
self.weighted_n_left)
852892
for k in range(self.n_outputs):
853893
sum_right[k] = sum_total[k] - sum_left[k]
@@ -957,7 +997,7 @@ cdef class MSE(RegressionCriterion):
957997

958998
for k in range(self.n_outputs):
959999
impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0
960-
impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0
1000+
impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0
9611001

9621002
impurity_left[0] /= self.n_outputs
9631003
impurity_right[0] /= self.n_outputs
@@ -1019,6 +1059,6 @@ cdef class FriedmanMSE(MSE):
10191059
diff = (self.weighted_n_right * total_sum_left -
10201060
self.weighted_n_left * total_sum_right) / self.n_outputs
10211061

1022-
return (diff * diff / (self.weighted_n_left * self.weighted_n_right *
1062+
return (diff * diff / (self.weighted_n_left * self.weighted_n_right *
10231063
self.weighted_n_node_samples))
1024-
1064+

sklearn/tree/_splitter.pxd

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,15 @@ ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer
2222

2323
cdef struct SplitRecord:
2424
# Data to track sample split
25-
SIZE_t feature # Which feature to split on.
26-
SIZE_t pos # Split samples array at the given position,
27-
# i.e. count of samples below threshold for feature.
28-
# pos is >= end if the node is a leaf.
29-
double threshold # Threshold to split at.
30-
double improvement # Impurity improvement given parent node.
31-
double impurity_left # Impurity of the left split.
32-
double impurity_right # Impurity of the right split.
25+
SIZE_t feature # Which feature to split on.
26+
SIZE_t pos # Split samples array at the given position,
27+
# i.e. count of samples below threshold for feature.
28+
# pos is >= end if the node is a leaf.
29+
double threshold # Threshold to split at.
30+
double improvement # Impurity improvement given parent node.
31+
double impurity_left # Impurity of the left split.
32+
double impurity_right # Impurity of the right split.
33+
double send_missing_left # Whether to send the missing values left/right
3334

3435
cdef class Splitter:
3536
# The splitter searches in the input space for a feature and a threshold
@@ -47,15 +48,28 @@ cdef class Splitter:
4748
cdef UINT32_t rand_r_state # sklearn_rand_r random number state
4849

4950
cdef SIZE_t* samples # Sample indices in X, y
51+
cdef SIZE_t* missing_samples # Sample indices with missing values
52+
5053
cdef SIZE_t n_samples # X.shape[0]
54+
# TODO selfnote we need n_missing?
55+
5156
cdef double weighted_n_samples # Weighted number of samples
57+
5258
cdef SIZE_t* features # Feature indices in X
5359
cdef SIZE_t* constant_features # Constant features indices
5460
cdef SIZE_t n_features # X.shape[1]
55-
cdef DTYPE_t* feature_values # temp. array holding feature values
61+
cdef DTYPE_t* feature_values # temp. array holding non-missing
62+
# feature values
5663

5764
cdef SIZE_t start # Start position for the current node
58-
cdef SIZE_t end # End position for the current node
65+
# for the non missing values
66+
cdef SIZE_t end # End pos for the current node
67+
# for the non missing values
68+
69+
cdef SIZE_t start_missing # Start position for the current node
70+
# for the missing values
71+
cdef SIZE_t end_missing # End pos for the current node
72+
# for the missing values
5973

6074
cdef bint presort # Whether to use presorting, only
6175
# allowed on dense data
@@ -69,6 +83,9 @@ cdef class Splitter:
6983
# `node_split` reorganizes the node samples `samples[start:end]` in two
7084
# subsets `samples[start:pos]` and `samples[pos:end]`.
7185

86+
# The indices of samples with missing values for a node are grouped into
87+
# the `samples_missing`
88+
7289
# The 1-d `features` array of size n_features contains the features
7390
# indices and allows fast sampling without replacement of features.
7491

@@ -83,7 +100,8 @@ cdef class Splitter:
83100
# Methods
84101
cdef void init(self, object X, np.ndarray y,
85102
DOUBLE_t* sample_weight,
86-
np.ndarray X_idx_sorted=*) except *
103+
np.ndarray X_idx_sorted=*,
104+
object missing_samples) except *
87105

88106
cdef void node_reset(self, SIZE_t start, SIZE_t end,
89107
double* weighted_n_node_samples) nogil
@@ -95,4 +113,4 @@ cdef class Splitter:
95113

96114
cdef void node_value(self, double* dest) nogil
97115

98-
cdef double node_impurity(self) nogil
116+
cdef double node_impurity(self) nogil

sklearn/tree/_splitter.pyx

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ from ._utils cimport RAND_R_MAX
3535
from ._utils cimport safe_realloc
3636

3737
cdef double INFINITY = np.inf
38+
cdef inline int int_min(int a, int b): return a if a <= b else b
3839

3940
# Mitigate precision differences between 32 bit and 64 bit
4041
cdef DTYPE_t FEATURE_THRESHOLD = 1e-7
@@ -50,6 +51,7 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) nogil:
5051
self.feature = 0
5152
self.threshold = 0.
5253
self.improvement = -INFINITY
54+
self.send_missing_left = 0
5355

5456
cdef class Splitter:
5557
"""Abstract splitter class.
@@ -60,7 +62,7 @@ cdef class Splitter:
6062

6163
def __cinit__(self, Criterion criterion, SIZE_t max_features,
6264
SIZE_t min_samples_leaf, double min_weight_leaf,
63< 10000 /td>-
object random_state, bint presort):
65+
object random_state, bint presort, bint handle_missing):
6466
"""
6567
Parameters
6668
----------
@@ -82,12 +84,29 @@ cdef class Splitter:
8284
8385
random_state: object
8486
The user inputted random state to be used for pseudo-randomness
87+
88+
presort : bool, optional (default=False)
89+
Whether to presort the data to speed up the finding of best splits
90+
in fitting.
91+
92+
handle_missing : bool, optional (default=False)
93+
Whether to handle missing values.
8594
"""
8695

8796
self.criterion = criterion
8897

8998
self.samples = NULL
99+
self.n_nonmissing_samples = 0
100+
101+
self.handle_missing = handle_missing # SELFNOTE XXX
102+
103+
self.missing_samples = NULL # SELFNOTE XXX
104+
self.missing_mask = NULL
105+
106+
self.n_missing_samples = 0 # Do I need this??
107+
90108
self.n_samples = 0
109+
91110
self.features = NULL
92111
self.n_features = 0
93112
self.feature_values = NULL
@@ -106,6 +125,7 @@ cdef class Splitter:
106125
"""Destructor."""
107126

108127
free(self.samples)
128+
free(self.missing_samples)
109129
free(self.features)
110130
free(self.constant_features)
111131
free(self.feature_values)
@@ -120,7 +140,8 @@ cdef class Splitter:
120140
object X,
121141
np.ndarray[DOUBLE_t, ndim=2, mode="c"] y,
122142
DOUBLE_t* sample_weight,
123-
np.ndarray X_idx_sorted=None) except *:
143+
np.ndarray X_idx_sorted=None,
144+
np.ndarray missing_mask=None) except *:
124145
"""Initialize the splitter.
125146
126147
Take in the input data X, the target Y, and optional sample weights.
@@ -137,12 +158,24 @@ cdef class Splitter:
137158
The weights of the samples, where higher weighted samples are fit
138159
closer than lower weight samples. If not provided, all samples
139160
are assumed to have uniform weight.
140-
"""
141161
162+
missing_mask: numpy.ndarray, dtype=bool (optional)
163+
The mask to specify the locations of missing values in the dataset.
164+
"""
142165
self.rand_r_state = self.random_state.randint(0, RAND_R_MAX)
143166
cdef SIZE_t n_samples = X.shape[0]
144167

145-
# Create a new array which will be used to store nonzero
168+
if self.handle_missing:
169+
cdef SIZE_t* max_missing = 0
170+
171+
for i in range(X.shape[1]):
172+
max_missing = int_max(np.count_nonzero(missing_mask[:, i]),
173+
max_missing)
174+
175+
cdef SIZE_t* missing_samples = safe_realloc(
176+
&self.missing_samples, max_missing)
177+
178+
# Create a new array which will be used to store nonzero weighted
146179
# samples from the feature of interest
147180
cdef SIZE_t* samples = safe_realloc(&self.samples, n_samples)
148181

@@ -261,7 +294,8 @@ cdef class BaseDenseSplitter(Splitter):
261294
object X,
262295
np.ndarray[DOUBLE_t, ndim=2, mode="c"] y,
263296
DOUBLE_t* sample_weight,
264-
np.ndarray X_idx_sorted=None) except *:
297+
np.ndarray X_idx_sorted=None,
298+
object missing_mask) except *:
265299
"""Initialize the splitter."""
266300

267301
# Call parent init
@@ -302,6 +336,8 @@ cdef class BestSplitter(BaseDenseSplitter):
302336
cdef SIZE_t* samples = self.samples
303337
cdef SIZE_t start = self.start
304338
cdef SIZE_t end = self.end
339+
cdef SIZE_t start_missing = self.start_missing
340+
cdef SIZE_t end_missing = self.end_missing
305341

306342
cdef SIZE_t* features = self.features
307343
cdef SIZE_t* constant_features = self.constant_features
@@ -404,7 +440,7 @@ cdef class BestSplitter(BaseDenseSplitter):
404440
p = start
405441
feature_idx_offset = self.X_idx_sorted_stride * current.feature
406442

407-
for i in range(self.n_total_samples):
443+
for i in range(self.n_total_samples):
408444
j = X_idx_sorted[i + feature_idx_offset]
409445
if sample_mask[j] == 1:
410446
samples[p] = j

0 commit comments

Comments
 (0)
0