8000 [MRG+3] FIX Memory leak in MAE; Use safe_realloc; Acquire GIL only wh… · maskani-moh/scikit-learn@55f9463 · GitHub
[go: up one dir, main page]

Skip to content

Commit 55f9463

Browse files
raghavrvmaskani-moh
authored andcommitted
[MRG+3] FIX Memory leak in MAE; Use safe_realloc; Acquire GIL only when raising; Propagate all errors to python interpreter level (scikit-learn#7811) (scikit-learn#8002)
* FIX MAE reg. criterion: Use safe_realloc to avoid memory leak * Release GIL in safe_realloc and clean up scaffolding * As gil is released in safe_realloc, no need of a with gil block * Use except * to propagate error in all cdef functions * Don't use except * for functions that return python objects * Don't use except * for the comparison function passed to qsort * Omissions and Errors * Use safe_realloc now that gil is released there * Fix realloc size * Acquire GIL only if we need to raise * Use except * more judiciously; Release gil only when raising; Add comments to clarify * Actually that was unneeded; realloc will also allocate for the first time * StackRecord*, PriorityHeapRecord* to fused type realloc_ptr; Use safe_realloc * Use except -1 to propagate exceptions. This should avoid overheads * Fix docstrings and add return 0 to reset methods * TYPO * REVIEW Remove redundant MemoryError raising calls
1 parent 4aff885 commit 55f9463

File tree

8 files changed

+261
-178
lines changed

8 files changed

+261
-178
lines changed

sklearn/tree/_criterion.pxd

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,12 @@ cdef class Criterion:
5353
# statistics correspond to samples[start:pos] and samples[pos:end].
5454

5555
# Methods
56-
cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
57-
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
58-
SIZE_t end) nogil
59-
cdef void reset(self) nogil
60-
cdef void reverse_reset(self) nogil
61-
cdef void update(self, SIZE_t new_pos) nogil
56+
cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
57+
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
58+
SIZE_t end) nogil except -1
59+
cdef int reset(self) nogil except -1
60+
cdef int reverse_reset(self) nogil except -1
61+
cdef int update(self, SIZE_t new_pos) nogil except -1
6262
cdef double node_impurity(self) nogil
6363
cdef void children_impurity(self, double* impurity_left,
6464
double* impurity_right) nogil

sklearn/tree/_criterion.pyx

Lines changed: 73 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,14 @@ cdef class Criterion:
5151
def __setstate__(self, d):
5252
pass
5353

54-
cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
55-
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
56-
SIZE_t end) nogil:
54+
cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
55+
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
56+
SIZE_t end) nogil except -1:
5757
"""Placeholder for a method which will initialize the criterion.
5858
59+
Returns -1 in case of failure to allocate memory (and raise MemoryError)
60+
or 0 otherwise.
61+
5962
Parameters
6063
----------
6164
y : array-like, dtype=DOUBLE_t
@@ -79,22 +82,22 @@ cdef class Criterion:
7982

8083
pass
8184

82-
cdef void reset(self) nogil:
85+
cdef int reset(self) nogil except -1:
8386
"""Reset the criterion at pos=start.
8487
8588
This method must be implemented by the subclass.
8689
"""
8790

8891
pass
8992

90-
cdef void reverse_reset(self) nogil:
93+
cdef int reverse_reset(self) nogil except -1:
9194
"""Reset the criterion at pos=end.
9295
9396
This method must be implemented by the subclass.
9497
"""
9598
pass
9699

97-
cdef void update(self, SIZE_t new_pos) nogil:
100+
cdef int update(self, SIZE_t new_pos) nogil except -1:
98101
"""Updated statistics by moving samples[pos:new_pos] to the left child.
99102
100103
This updates the collected statistics by moving samples[pos:new_pos]
@@ -281,12 +284,15 @@ cdef class ClassificationCriterion(Criterion):
281284
sizet_ptr_to_ndarray(self.n_classes, self.n_outputs)),
282285
self.__getstate__())
283286

284-
cdef void init(self, DOUBLE_t* y, SIZE_t y_stride,
285-
DOUBLE_t* sample_weight, double weighted_n_samples,
286-
SIZE_t* samples, SIZE_t start, SIZE_t end) nogil:
287+
cdef int init(self, DOUBLE_t* y, SIZE_t y_stride,
288+
DOUBLE_t* sample_weight, double weighted_n_samples,
289+
SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1:
287290
"""Initialize the criterion at node samples[start:end] and
288291
children samples[start:start] and samples[start:end].
289292
293+
Returns -1 in case of failure to allocate memory (and raise MemoryError)
294+
or 0 otherwise.
295+
290296
Parameters
291297
----------
292298
y : array-like, dtype=DOUBLE_t
@@ -347,10 +353,14 @@ cdef class ClassificationCriterion(Criterion):
347353

348354
# Reset to pos=start
349355
self.reset()
356+
return 0
350357

351-
cdef void reset(self) nogil:
352-
"""Reset the criterion at pos=start."""
358+
cdef int reset(self) nogil except -1:
359+
"""Reset the criterion at pos=start
353360
361+
Returns -1 in case of failure to allocate memory (and raise MemoryError)
362+
or 0 otherwise.
363+
"""
354364
self.pos = self.start
355365

356366
self.weighted_n_left = 0.0
@@ -370,9 +380,14 @@ cdef class ClassificationCriterion(Criterion):
370380
sum_total += self.sum_stride
371381
sum_left += self.sum_stride
372382
sum_right += self.sum_stride
383+
return 0
373384

374-
cdef void reverse_reset(self) nogil:
375-
"""Reset the criterion at pos=end."""
385+
cdef int reverse_reset(self) nogil except -1:
386+
"""Reset the criterion at pos=end
387+
388+
Returns -1 in case of failure to allocate memory (and raise MemoryError)
389+
or 0 otherwise.
390+
"""
376391
self.pos = self.end
377392

378393
self.weighted_n_left = self.weighted_n_node_samples
@@ -392,10 +407,14 @@ cdef class ClassificationCriterion(Criterion):
392407
sum_total += self.sum_stride
393408
sum_left += self.sum_stride
394409
sum_right += self.sum_stride
410+
return 0
395411

396-
cdef void update(self, SIZE_t new_pos) nogil:
412+
cdef int update(self, SIZE_t new_pos) nogil except -1:
397413
"""Updated statistics by moving samples[pos:new_pos] to the left child.
398414
415+
Returns -1 in case of failure to allocate memory (and raise MemoryError)
416+
or 0 otherwise.
417+
399418
Parameters
400419
----------
401420
new_pos : SIZE_t
@@ -470,6 +489,7 @@ cdef class ClassificationCriterion(Criterion):
470489
sum_total += self.sum_stride
471490

472491
self.pos = new_pos
492+
return 0
473493

474494
cdef double node_impurity(self) nogil:
475495
pass
@@ -736,9 +756,9 @@ cdef class RegressionCriterion(Criterion):
736756
def __reduce__(self):
737757
return (type(self), (self.n_outputs, self.n_samples), self.__getstate__())
738758

739-
cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
740-
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
741-
SIZE_t end) nogil:
759+
cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
760+
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
761+
SIZE_t end) nogil except -1:
742762
"""Initialize the criterion at node samples[start:end] and
743763
children samples[start:start] and samples[start:end]."""
744764
# Initialize fields
@@ -778,8 +798,9 @@ cdef class RegressionCriterion(Criterion):
778798

779799
# Reset to pos=start
780800
self.reset()
801+
return 0
781802

782-
cdef void reset(self) nogil:
803+
cdef int reset(self) nogil except -1:
783804
"""Reset the criterion at pos=start."""
784805
cdef SIZE_t n_bytes = self.n_outputs * sizeof(double)
785806
memset(self.sum_left, 0, n_bytes)
@@ -788,8 +809,9 @@ cdef class RegressionCriterion(Criterion):
788809
self.weighted_n_left = 0.0
789810
self.weighted_n_right = self.weighted_n_node_samples
790811
self.pos = self.start
812+
return 0
791813

792-
cdef void reverse_reset(self) nogil:
814+
cdef int reverse_reset(self) nogil except -1:
793815
"""Reset the criterion at pos=end."""
794816
cdef SIZE_t n_bytes = self.n_outputs * sizeof(double)
795817
memset(self.sum_right, 0, n_bytes)
@@ -798,8 +820,9 @@ cdef class RegressionCriterion(Criterion):
798820
self.weighted_n_right = 0.0
799821
self.weighted_n_left = self.weighted_n_node_samples
800822
self.pos = self.end
823+
return 0
801824

802-
cdef void update(self, SIZE_t new_pos) nogil:
825+
cdef int update(self, SIZE_t new_pos) nogil except -1:
803826
"""Updated statistics by moving samples[pos:new_pos] to the left."""
804827

805828
cdef double* sum_left = self.sum_left
@@ -859,6 +882,7 @@ cdef class RegressionCriterion(Criterion):
859882
sum_right[k] = sum_total[k] - sum_left[k]
860883

861884
self.pos = new_pos
885+
return 0
862886

863887
cdef double node_impurity(self) nogil:
864888
pass
@@ -1018,19 +1042,16 @@ cdef class MAE(RegressionCriterion):
10181042
# Allocate memory for the accumulators
10191043
safe_realloc(&self.node_medians, n_outputs)
10201044

1021-
if (self.node_medians == NULL):
1022-
raise MemoryError()
1023-
10241045
self.left_child = np.empty(n_outputs, dtype='object')
10251046
self.right_child = np.empty(n_outputs, dtype='object')
10261047
# initialize WeightedMedianCalculators
10271048
for k in range(n_outputs):
10281049
self.left_child[k] = WeightedMedianCalculator(n_samples)
10291050
self.right_child[k] = WeightedMedianCalculator(n_samples)
10301051

1031-
cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
1032-
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
1033-
SIZE_t end) nogil:
1052+
cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
1053+
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
1054+
SIZE_t end) nogil except -1:
10341055
"""Initialize the criterion at node samples[start:end] and
10351056
children samples[start:start] and samples[start:end]."""
10361057

@@ -1068,6 +1089,7 @@ cdef class MAE(RegressionCriterion):
10681089
for k in range(self.n_outputs):
10691090
y_ik = y[i * y_stride + k]
10701091

1092+
# push method ends up calling safe_realloc, hence `except -1`
10711093
# push all values to the right side,
10721094
# since pos = start initially anyway
10731095
(<WeightedMedianCalculator> right_child[k]).push(y_ik, w)
@@ -1079,9 +1101,14 @@ cdef class MAE(RegressionCriterion):
10791101

10801102
# Reset to pos=start
10811103
self.reset()
1104+
return 0
10821105

1083-
cdef void reset(self) nogil:
1084-
"""Reset the criterion at pos=start."""
1106+
cdef int reset(self) nogil except -1:
1107+
"""Reset the criterion at pos=start
1108+
1109+
Returns -1 in case of failure to allocate memory (and raise MemoryError)
1110+
or 0 otherwise.
1111+
"""
10851112

10861113
cdef SIZE_t i, k
10871114
cdef DOUBLE_t value
@@ -1103,11 +1130,17 @@ cdef class MAE(RegressionCriterion):
11031130
# remove everything from left and put it into right
11041131
(<WeightedMedianCalculator> left_child[k]).pop(&value,
11051132
&weight)
1133+
# push method ends up calling safe_realloc, hence `except -1`
11061134
(<WeightedMedianCalculator> right_child[k]).push(value,
11071135
weight)
1136+
return 0
11081137

1109-
cdef void reverse_reset(self) nogil:
1110-
"""Reset the criterion at pos=end."""
1138+
cdef int reverse_reset(self) nogil except -1:
1139+
"""Reset the criterion at pos=end
1140+
1141+
Returns -1 in case of failure to allocate memory (and raise MemoryError)
1142+
or 0 otherwise.
1143+
"""
11111144

11121145
self.weighted_n_right = 0.0
11131146
self.weighted_n_left = self.weighted_n_node_samples
@@ -1126,11 +1159,17 @@ cdef class MAE(RegressionCriterion):
11261159
# remove everything from right and put it into left
11271160
(<WeightedMedianCalculator> right_child[k]).pop(&value,
11281161
&weight)
1162+
# push method ends up calling safe_realloc, hence `except -1`
11291163
(<WeightedMedianCalculator> left_child[k]).push(value,
11301164
weight)
1165+
return 0
11311166

1132-
cdef void update(self, SIZE_t new_pos) nogil:
1133-
"""Updated statistics by moving samples[pos:new_pos] to the left."""
1167+
cdef int update(self, SIZE_t new_pos) nogil except -1:
1168+
"""Updated statistics by moving samples[pos:new_pos] to the left
1169+
1170+
Returns -1 in case of failure to allocate memory (and raise MemoryError)
1171+
or 0 otherwise.
1172+
"""
11341173

11351174
cdef DOUBLE_t* sample_weight = self.sample_weight
11361175
cdef SIZE_t* samples = self.samples
@@ -1162,6 +1201,7 @@ cdef class MAE(RegressionCriterion):
11621201
y_ik = y[i * self.y_stride + k]
11631202
# remove y_ik and its weight w from right and add to left
11641203
(<WeightedMedianCalculator> right_child[k]).remove(y_ik, w)
1204+
# push method ends up calling safe_realloc, hence except -1
11651205
(<WeightedMedianCalculator> left_child[k]).push(y_ik, w)
11661206

11671207
self.weighted_n_left += w
@@ -1185,6 +1225,7 @@ cdef class MAE(RegressionCriterion):
11851225
self.weighted_n_right = (self.weighted_n_node_samples -
11861226
self.weighted_n_left)
11871227
self.pos = new_pos
1228+
return 0
11881229

11891230
cdef void node_value(self, double* dest) nogil:
11901231
"""Computes the node value of samples[start:end] into dest."""

sklearn/tree/_splitter.pxd

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -81,18 +81,18 @@ cdef class Splitter:
8181
# This allows optimization with depth-based tree building.
8282

8383
# Methods
84-
cdef void init(self, object X, np.ndarray y,
85-
DOUBLE_t* sample_weight,
86-
np.ndarray X_idx_sorted=*) except *
84+
cdef int init(self, object X, np.ndarray y,
85+
DOUBLE_t* sample_weight,
86+
np.ndarray X_idx_sorted=*) except -1
8787

88-
cdef void node_reset(self, SIZE_t start, SIZE_t end,
89-
double* weighted_n_node_samples) nogil
88+
cdef int node_reset(self, SIZE_t start, SIZE_t end,
89+
double* weighted_n_node_samples) nogil except -1
9090

91-
cdef void node_split(self,
92-
double impurity, # Impurity of the node
93-
SplitRecord* split,
94-
SIZE_t* n_constant_features) nogil
91+
cdef int node_split(self,
92+
double impurity, # Impurity of the node
93+
SplitRecord* split,
94+
SIZE_t* n_constant_features) nogil except -1
9595

9696
cdef void node_value(self, double* dest) nogil
9797

98-
cdef double node_impurity(self) nogil
98+
cdef double node_impurity(self) nogil

0 commit comments

Comments
 (0)
0