9
9
# Fares Hedayati <fares.hedayati@gmail.com>
10
10
# Jacob Schreiber <jmschreiber91@gmail.com>
11
11
# Nelson Liu <nelson@nelsonliu.me>
12
+ # Adam Li <adam2392@gmail.com>
13
+ # Jong Shin <jshinm@gmail.com>
12
14
#
13
15
# License: BSD 3 clause
14
16
@@ -30,15 +32,21 @@ from ._utils cimport WeightedMedianCalculator
30
32
cdef double EPSILON = 10 * np.finfo(' double' ).eps
31
33
32
34
cdef class BaseCriterion:
33
- """ Abstract interface for any criterion.
35
+ """ This is an abstract interface for criterion. For example, a tree model could
36
+ compute impurity in a supervised or an unsupervised manner, on samples of
37
+ covariates, or labels, or both.
38
+
39
+ The downstream classes _must_ implement methods to compute the impurity
40
+ in the current node and in its children nodes.
34
41
35
42
This object stores methods on how to calculate how good a split is using
36
43
a set API.
37
44
38
- The criterion object is maintained such that left and right collected
39
- statistics correspond to samples[start:pos] and samples[pos:end]. So the samples in
40
- the "current" node is samples[start:end], while left and right children nodes are
41
- split with the pointer 'pos' variable.
45
+ Samples in the "current" node are stored in `samples[start:end]` which is
46
+ partitioned around `pos` (an index in `start:end`) so that:
47
+
48
+ - the samples of the left child node are stored in `samples[start:pos]`
49
+ - the samples of the right child node are stored in `samples[pos:end]`
42
50
"""
43
51
def __getstate__ (self ):
44
52
return {}
@@ -173,9 +181,15 @@ cdef class BaseCriterion:
173
181
cdef class Criterion(BaseCriterion):
174
182
""" Interface for impurity criteria.
175
183
176
- This object stores methods on how to calculate how good a split is using
177
- different metrics. This is the base class for any supervised tree criterion
178
- model with a homogeneous float64 dtyped y.
184
+ The supervised criterion computes the impurity of a node and the reduction of
185
+ impurity of a split on that node using the distribution of labels in parent and
186
+ children nodes. It also computes the output statistics
187
+ such as the mean in regression and class probabilities in classification.
188
+
189
+ Instances of this class are responsible for computing the impurity difference of splits.
190
+
191
+ Criterion is the base class for criteria used in supervised tree-based models
192
+ with a homogeneous float64-dtyped y.
179
193
"""
180
194
cdef int init(self , const DOUBLE_t[:, ::1 ] y, DOUBLE_t* sample_weight,
181
195
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
@@ -188,19 +202,18 @@ cdef class Criterion(BaseCriterion):
188
202
Parameters
189
203
----------
190
204
y : array-like, dtype=DOUBLE_t
191
- y is a buffer that can store values for n_outputs target variables
192
- sample_weight : array-like, dtype= DOUBLE_t
193
- The weight of each sample
205
+ y is a buffer that can store values for the `n_outputs` target variables
206
+ sample_weight : pointer to a buffer of DOUBLE_t
207
+ The pointer to the buffer storing each sample weight.
194
208
weighted_n_samples : double
195
- The total weight of the samples being considered
209
+ The sum of the weights of the samples being considered.
196
210
samples : array-like, dtype=SIZE_t
197
211
Indices of the samples in X and y, where samples[start:end]
198
212
correspond to the samples in this node
199
213
start : SIZE_t
200
- The first sample to be used on this node
214
+ The index of the first sample in `samples` to be considered in this node.
201
215
end : SIZE_t
202
- The last sample used on this node
203
-
216
+ The end index (exclusive) of the samples in `samples` to be considered in this node.
204
217
"""
205
218
pass
206
219
0 commit comments